From f7b631384fc70967bbe727b281edde32b7925d75 Mon Sep 17 00:00:00 2001 From: James Pine Date: Sat, 7 Mar 2026 00:59:01 -0800 Subject: [PATCH 01/11] refactor: rewrite browser tools from single action enum to 12 flat tools with index-based ARIA tree Replace the single BrowserTool with 12 separate tools (browser_launch, browser_navigate, browser_snapshot, browser_click, browser_type, browser_press_key, browser_screenshot, browser_evaluate, browser_tab_open, browser_tab_list, browser_tab_close, browser_close) that share a BrowserContext with Arc>. Port extract_dom.js from browser-use-rs to build an ARIA tree with numeric indices and CSS selectors. browser_snapshot returns YAML-like output where interactive elements are tagged [index=N]. Click/type/press_key accept an index that resolves to a CSS selector at call time, eliminating stale CDP node ID issues. Add snapshot caching on BrowserState, invalidated on mutations. Update worker prompt and capability fragments for the new tool names and workflow. --- .../en/fragments/worker_capabilities.md.j2 | 2 +- prompts/en/tools/browser_description.md.j2 | 2 +- prompts/en/worker.md.j2 | 25 +- src/tools.rs | 26 +- src/tools/browser.rs | 1927 ++++++++++------- src/tools/browser_extract_dom.js | 734 +++++++ 6 files changed, 1855 insertions(+), 861 deletions(-) create mode 100644 src/tools/browser_extract_dom.js diff --git a/prompts/en/fragments/worker_capabilities.md.j2 b/prompts/en/fragments/worker_capabilities.md.j2 index dfab94c2e..ccef7726d 100644 --- a/prompts/en/fragments/worker_capabilities.md.j2 +++ b/prompts/en/fragments/worker_capabilities.md.j2 @@ -18,7 +18,7 @@ When you spawn a worker, it runs independently with a task description and retur - **exec** — run subprocesses with environment control - **set_status** — update worker status visible in your status block {%- if browser_enabled %} -- **browser** — browse web pages, take screenshots, click elements, fill forms +- **browser_*** — suite of browser tools: navigate, snapshot, click, type, screenshot, press_key, evaluate, tab management {%- endif %} {%- if web_search_enabled %} - **web_search** — search the web via Brave Search API diff --git a/prompts/en/tools/browser_description.md.j2 b/prompts/en/tools/browser_description.md.j2 index 26500c354..2a21dd338 100644 --- a/prompts/en/tools/browser_description.md.j2 +++ b/prompts/en/tools/browser_description.md.j2 @@ -1 +1 @@ -Browser automation tool. Launch a headless Chrome browser, navigate pages, interact with elements, take screenshots, and extract page content. Workflow: launch → navigate → snapshot (get element refs) → act (click/type by ref) → screenshot. Element refs like "e1", "e2" are assigned during snapshot and used in act calls. \ No newline at end of file +Browser automation tools. Navigate pages, get ARIA snapshots with numbered element indices, click/type by index, take screenshots. Workflow: browser_navigate → browser_snapshot → browser_click/browser_type by index → browser_screenshot. diff --git a/prompts/en/worker.md.j2 b/prompts/en/worker.md.j2 index 111f0b18f..6c072b6fd 100644 --- a/prompts/en/worker.md.j2 +++ b/prompts/en/worker.md.j2 @@ -89,28 +89,27 @@ Path restrictions apply: you cannot write to identity files (SOUL.md, IDENTITY.m Run a subprocess with specific arguments. Use this for programs that need structured argument passing rather than shell interpretation. -### browser +### Browser tools (browser_*) -Automate a headless Chrome browser. Use this for web scraping, testing web interfaces, filling out forms, or any task requiring browser interaction. +Automate a headless Chrome browser. Each action is a separate tool. **Workflow:** -1. `launch` — Start the browser -2. `navigate` — Go to a URL -3. `snapshot` — Get the page's accessibility tree with element refs (e1, e2, e3...) -4. `act` — Interact with elements by ref: `click`, `type`, `press_key`, `hover`, `scroll_into_view`, `focus` -5. `screenshot` — Capture the page or a specific element +1. `browser_launch` — Start the browser (or auto-launched by `browser_navigate`) +2. `browser_navigate` — Go to a URL +3. `browser_snapshot` — Get the page's ARIA accessibility tree with numbered element indices +4. `browser_click` — Click an element by index: `{"index": 5}` +5. `browser_type` — Type text into an input by index: `{"index": 3, "text": "hello"}` +6. `browser_screenshot` — Capture the page {%- if browser_persist_session %} -6. `close` — Detach from the browser when done (tabs and session are preserved for the next worker) +7. `browser_close` — Detach from the browser when done (tabs and session preserved) {%- else %} -6. `close` — Shut down the browser when done +7. `browser_close` — Shut down the browser when done {%- endif %} -**Multi-tab support:** Use `open` to create new tabs, `tabs` to list them, `focus` to switch between them, `close_tab` to close one. +**Element indices** are assigned during `browser_snapshot` and look like `[index=0]`, `[index=1]`. Always snapshot before interacting — indices reset on each snapshot. -**Element refs** are assigned during `snapshot` and look like "e1", "e2". Always snapshot before interacting — refs reset on each snapshot or navigation. - -**Additional actions:** `content` (get page HTML), `evaluate` (run JavaScript, if enabled in config). +**Other tools:** `browser_press_key` (Enter, Tab, Escape), `browser_evaluate` (run JS), `browser_tab_open`, `browser_tab_list`, `browser_tab_close`. ### secret_set diff --git a/src/tools.rs b/src/tools.rs index 8d27b802b..7ba9e8c10 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -66,8 +66,8 @@ pub use attachment_recall::{ }; pub use branch_tool::{BranchArgs, BranchError, BranchOutput, BranchTool}; pub use browser::{ - ActKind, BrowserAction, BrowserArgs, BrowserError, BrowserOutput, BrowserTool, ElementSummary, - SharedBrowserHandle, TabInfo, + BrowserError, BrowserOutput, SharedBrowserHandle, TabInfo, new_shared_browser_handle, + register_browser_tools, }; pub use cancel::{CancelArgs, CancelError, CancelOutput, CancelTool}; pub use channel_recall::{ @@ -507,16 +507,7 @@ pub fn create_worker_tool_server( } if browser_config.enabled { - let browser_tool = if let Some(shared) = runtime_config - .shared_browser - .as_ref() - .filter(|_| browser_config.persist_session) - { - BrowserTool::new_shared(shared.clone(), browser_config, screenshot_dir) - } else { - BrowserTool::new(browser_config, screenshot_dir) - }; - server = server.tool(browser_tool); + server = register_browser_tools(server, browser_config, screenshot_dir, &runtime_config); } if let Some(key) = brave_search_key { @@ -598,16 +589,7 @@ pub fn create_cortex_chat_tool_server( .tool(ExecTool::new(workspace, sandbox)); if browser_config.enabled { - let browser_tool = if let Some(shared) = runtime_config - .shared_browser - .as_ref() - .filter(|_| browser_config.persist_session) - { - BrowserTool::new_shared(shared.clone(), browser_config, screenshot_dir) - } else { - BrowserTool::new(browser_config, screenshot_dir) - }; - server = server.tool(browser_tool); + server = register_browser_tools(server, browser_config, screenshot_dir, &runtime_config); } if let Some(key) = brave_search_key { diff --git a/src/tools/browser.rs b/src/tools/browser.rs index 90186b262..9361fa310 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -1,16 +1,19 @@ -//! Browser automation tool for workers. +//! Browser automation tools for workers. //! -//! Provides navigation, element interaction, screenshots, and page observation -//! via headless Chrome using chromiumoxide. Uses an accessibility-tree based -//! ref system for LLM-friendly element addressing. +//! Provides a suite of separate tools (navigate, click, type, snapshot, etc.) +//! backed by a shared browser state. Uses an ARIA-tree DOM extraction script +//! (ported from browser-use-rs) that assigns numeric indices to interactive +//! elements. The LLM sees a compact YAML snapshot and interacts via index. +//! +//! Element resolution: index → CSS selector (built from DOM position during +//! extraction) → chromiumoxide `page.find_element()`. This avoids stale CDP +//! node IDs and works with both native HTML elements and ARIA widgets. use crate::config::BrowserConfig; + use chromiumoxide::browser::{Browser, BrowserConfig as ChromeConfig}; use chromiumoxide::fetcher::{BrowserFetcher, BrowserFetcherOptions}; use chromiumoxide::page::ScreenshotParams; -use chromiumoxide_cdp::cdp::browser_protocol::accessibility::{ - EnableParams as AxEnableParams, GetFullAxTreeParams, -}; use chromiumoxide_cdp::cdp::browser_protocol::input::{ DispatchKeyEventParams, DispatchKeyEventType, }; @@ -22,12 +25,16 @@ use rig::tool::Tool; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use std::collections::HashMap; -use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; +use std::net::{IpAddr, Ipv4Addr}; use std::path::{Path, PathBuf}; use std::sync::Arc; use tokio::sync::Mutex; use tokio::task::JoinHandle; +// --------------------------------------------------------------------------- +// URL validation (SSRF protection) +// --------------------------------------------------------------------------- + /// Validate that a URL is safe for the browser to navigate to. /// Blocks private/loopback IPs, link-local addresses, and cloud metadata endpoints /// to prevent server-side request forgery. @@ -48,7 +55,6 @@ fn validate_url(url: &str) -> Result<(), BrowserError> { return Err(BrowserError::new("URL has no host")); }; - // Block cloud metadata endpoints regardless of how the IP resolves if host == "metadata.google.internal" || host == "169.254.169.254" || host == "metadata.aws.internal" @@ -58,7 +64,6 @@ fn validate_url(url: &str) -> Result<(), BrowserError> { )); } - // If the host parses as an IP address, check against blocked ranges if let Ok(ip) = host.parse::() && is_blocked_ip(ip) { @@ -67,7 +72,6 @@ fn validate_url(url: &str) -> Result<(), BrowserError> { ))); } - // IPv6 addresses in brackets (url crate strips them for host_str) if let Some(stripped) = host.strip_prefix('[').and_then(|h| h.strip_suffix(']')) && let Ok(ip) = stripped.parse::() && is_blocked_ip(ip) @@ -80,44 +84,40 @@ fn validate_url(url: &str) -> Result<(), BrowserError> { Ok(()) } -/// Returns true if the IP address belongs to a private, loopback, or -/// link-local range that should not be reachable from the browser tool. fn is_blocked_ip(ip: IpAddr) -> bool { match ip { IpAddr::V4(v4) => { - v4.is_loopback() // 127.0.0.0/8 - || v4.is_private() // 10/8, 172.16/12, 192.168/16 - || v4.is_link_local() // 169.254.0.0/16 - || v4.is_broadcast() // 255.255.255.255 - || v4.is_unspecified() // 0.0.0.0 - || is_v4_cgnat(v4) // 100.64.0.0/10 + v4.is_loopback() + || v4.is_private() + || v4.is_link_local() + || v4.is_broadcast() + || v4.is_unspecified() + || is_v4_cgnat(v4) } IpAddr::V6(v6) => { - v6.is_loopback() // ::1 - || v6.is_unspecified() // :: - || is_v6_unique_local(v6) // fd00::/8 (fc00::/7) - || is_v6_link_local(v6) // fe80::/10 - || is_v4_mapped_blocked(v6) + v6.is_loopback() + || v6.is_unspecified() + || is_v6_unique_local(v6) + || is_v6_link_local(v6) + || is_v4_mapped_blocked(v6) } } } fn is_v4_cgnat(ip: Ipv4Addr) -> bool { let octets = ip.octets(); - octets[0] == 100 && (octets[1] & 0xC0) == 64 // 100.64.0.0/10 + octets[0] == 100 && (octets[1] & 0xC0) == 64 } -fn is_v6_unique_local(ip: Ipv6Addr) -> bool { - (ip.segments()[0] & 0xFE00) == 0xFC00 // fc00::/7 +fn is_v6_unique_local(ip: std::net::Ipv6Addr) -> bool { + (ip.segments()[0] & 0xFE00) == 0xFC00 } -fn is_v6_link_local(ip: Ipv6Addr) -> bool { - (ip.segments()[0] & 0xFFC0) == 0xFE80 // fe80::/10 +fn is_v6_link_local(ip: std::net::Ipv6Addr) -> bool { + (ip.segments()[0] & 0xFFC0) == 0xFE80 } -/// Check if an IPv6 address is a v4-mapped address (::ffff:x.x.x.x) -/// pointing to a blocked IPv4 range. -fn is_v4_mapped_blocked(ip: Ipv6Addr) -> bool { +fn is_v4_mapped_blocked(ip: std::net::Ipv6Addr) -> bool { if let Some(v4) = ip.to_ipv4_mapped() { is_blocked_ip(IpAddr::V4(v4)) } else { @@ -125,6 +125,212 @@ fn is_v4_mapped_blocked(ip: Ipv6Addr) -> bool { } } +// --------------------------------------------------------------------------- +// DOM snapshot types (ported from browser-use-rs) +// --------------------------------------------------------------------------- + +/// An ARIA tree snapshot of a page, extracted via injected JavaScript. +/// +/// Contains the tree structure for LLM display and a parallel `selectors` +/// array that maps element indices to CSS selectors for interaction. +#[derive(Debug, Clone, Deserialize)] +pub(crate) struct DomSnapshot { + pub root: AriaNode, + /// CSS selector for each indexed element. `selectors[i]` corresponds to the + /// element with `index == i` in the ARIA tree. + pub selectors: Vec, + #[allow(dead_code)] + pub iframe_indices: Vec, + /// Extraction error from the JS side, if any. + pub error: Option, +} + +/// A node in the ARIA accessibility tree. +#[derive(Debug, Clone, Deserialize)] +pub(crate) struct AriaNode { + pub role: String, + pub name: String, + #[serde(default)] + pub index: Option, + #[serde(default)] + pub children: Vec, + #[serde(default)] + pub props: HashMap, + #[serde(default)] + pub active: Option, + #[serde(default)] + pub checked: Option, + #[serde(default)] + pub disabled: Option, + #[serde(default)] + pub expanded: Option, + #[serde(default)] + pub level: Option, + #[serde(default)] + pub pressed: Option, + #[serde(default)] + pub selected: Option, + #[serde(default)] + pub box_info: Option, +} + +#[derive(Debug, Clone, Deserialize)] +pub(crate) struct BoxInfo { + #[allow(dead_code)] + pub visible: bool, + pub cursor: Option, +} + +/// A child of an `AriaNode` — either a text string or a nested node. +#[derive(Debug, Clone, Deserialize)] +#[serde(untagged)] +pub(crate) enum AriaChild { + Text(String), + Node(Box), +} + +/// The JS script injected into pages to extract the ARIA tree. +/// Ported from browser-use-rs's `extract_dom.js`. +const EXTRACT_DOM_JS: &str = include_str!("browser_extract_dom.js"); + +impl DomSnapshot { + /// Render the ARIA tree as compact YAML-like text for LLM consumption. + pub fn render(&self) -> String { + let mut output = String::with_capacity(4096); + render_node(&self.root, 0, &mut output); + output + } + + /// Look up the CSS selector for an element index. + pub fn selector_for_index(&self, index: usize) -> Option<&str> { + self.selectors.get(index).map(|s| s.as_str()) + } +} + +fn render_node(node: &AriaNode, depth: usize, output: &mut String) { + let indent = " ".repeat(depth); + + // Skip the root fragment — just render children + if node.role == "fragment" { + for child in &node.children { + render_child(child, depth, output); + } + return; + } + + // Skip generic nodes without index or name — they're structural noise + if node.role == "generic" && node.index.is_none() && node.name.is_empty() { + for child in &node.children { + render_child(child, depth, output); + } + return; + } + + // Build the line: `- role "name" [attrs]:` + output.push_str(&indent); + output.push_str("- "); + output.push_str(&node.role); + + if !node.name.is_empty() { + output.push_str(" \""); + // Escape quotes in name for YAML safety + output.push_str(&node.name.replace('"', "\\\"")); + output.push('"'); + } + + // Attributes + if let Some(index) = node.index { + output.push_str(&format!(" [index={index}]")); + } + if let Some(level) = node.level { + output.push_str(&format!(" [level={level}]")); + } + if let Some(true) = node.active { + output.push_str(" [active]"); + } + if let Some(true) = node.disabled { + output.push_str(" [disabled]"); + } + if let Some(true) = node.selected { + output.push_str(" [selected]"); + } + if let Some(ref checked) = node.checked { + match checked { + serde_json::Value::Bool(true) => output.push_str(" [checked]"), + serde_json::Value::Bool(false) => output.push_str(" [unchecked]"), + serde_json::Value::String(s) if s == "mixed" => output.push_str(" [checked=mixed]"), + _ => {} + } + } + if let Some(ref pressed) = node.pressed { + match pressed { + serde_json::Value::Bool(true) => output.push_str(" [pressed]"), + serde_json::Value::Bool(false) => {} + serde_json::Value::String(s) if s == "mixed" => output.push_str(" [pressed=mixed]"), + _ => {} + } + } + if let Some(true) = node.expanded { + output.push_str(" [expanded]"); + } else if let Some(false) = node.expanded { + output.push_str(" [collapsed]"); + } + if let Some(ref box_info) = node.box_info + && box_info.cursor.as_deref() == Some("pointer") + { + output.push_str(" [cursor=pointer]"); + } + + // Props on separate indented lines + let has_children = !node.children.is_empty(); + let has_props = !node.props.is_empty(); + + if has_children || has_props { + output.push_str(":\n"); + } else { + output.push('\n'); + } + + for (key, value) in &node.props { + output.push_str(&indent); + output.push_str(" /"); + output.push_str(key); + output.push_str(": "); + output.push_str(value); + output.push('\n'); + } + + for child in &node.children { + render_child(child, depth + 1, output); + } +} + +fn render_child(child: &AriaChild, depth: usize, output: &mut String) { + match child { + AriaChild::Text(text) => { + let trimmed = text.trim(); + if !trimmed.is_empty() { + let indent = " ".repeat(depth); + // Truncate long text for LLM context efficiency + let display = if trimmed.len() > 200 { + format!("{}...", &trimmed[..200]) + } else { + trimmed.to_string() + }; + output.push_str(&indent); + output.push_str("- text \""); + output.push_str(&display.replace('"', "\\\"")); + output.push_str("\"\n"); + } + } + AriaChild::Node(node) => render_node(node, depth, output), + } +} + +// --------------------------------------------------------------------------- +// Shared browser state +// --------------------------------------------------------------------------- + /// Opaque handle to shared browser state that persists across worker lifetimes. /// /// Held by `RuntimeConfig` when `persist_session = true`. All workers for the @@ -136,38 +342,22 @@ pub fn new_shared_browser_handle() -> SharedBrowserHandle { Arc::new(Mutex::new(BrowserState::new())) } -/// Tool for browser automation (worker-only). -#[derive(Debug, Clone)] -pub struct BrowserTool { - state: Arc>, - config: BrowserConfig, - screenshot_dir: PathBuf, -} - /// Internal browser state managed across tool invocations. /// /// When `persist_session` is enabled this struct lives in `RuntimeConfig` (via /// `SharedBrowserHandle`) and is shared across worker lifetimes. Otherwise each -/// `BrowserTool` owns its own instance. +/// tool set owns its own instance. pub struct BrowserState { browser: Option, - /// Background task driving the CDP WebSocket handler. _handler_task: Option>, - /// Tracked pages by target ID. pages: HashMap, - /// Currently active page target ID. active_target: Option, - /// Element ref map from the last snapshot, keyed by ref like "e1". - element_refs: HashMap, - /// Counter for generating element refs. - next_ref: usize, - /// Chrome's user data directory. For ephemeral sessions this is a random - /// temp dir cleaned up on drop. For persistent sessions this is a stable - /// path under the instance dir that survives agent restarts. + /// Cached DOM snapshot from the last `browser_snapshot` call. Invalidated + /// on navigation, tab switch, and explicit snapshot refresh. + snapshot: Option, user_data_dir: Option, /// When true, `user_data_dir` is a stable path that should NOT be deleted - /// on drop — it holds cookies, localStorage, and login sessions that must - /// survive across agent restarts. + /// on drop — it holds cookies, localStorage, and login sessions. persistent_profile: bool, } @@ -178,28 +368,27 @@ impl BrowserState { _handler_task: None, pages: HashMap::new(), active_target: None, - element_refs: HashMap::new(), - next_ref: 0, + snapshot: None, user_data_dir: None, persistent_profile: false, } } + + /// Invalidate the cached snapshot. Called after any page-mutating action. + fn invalidate_snapshot(&mut self) { + self.snapshot = None; + } } impl Drop for BrowserState { fn drop(&mut self) { - // Browser and handler task are dropped automatically — - // chromiumoxide's Child has kill_on_drop(true). - - // Persistent profiles store cookies, localStorage, and login sessions - // that must survive across agent restarts — never delete them. + // Persistent profiles store cookies and login sessions that must + // survive across agent restarts — never delete them. if self.persistent_profile { return; } if let Some(dir) = self.user_data_dir.take() { - // Offload sync fs cleanup to a blocking thread so we don't stall - // the tokio worker that's dropping this state. if let Ok(handle) = tokio::runtime::Handle::try_current() { handle.spawn_blocking(move || { if let Err(error) = std::fs::remove_dir_all(&dir) { @@ -210,14 +399,11 @@ impl Drop for BrowserState { ); } }); - } else { - // Dropped outside a tokio runtime (unlikely) — clean up inline. - if let Err(error) = std::fs::remove_dir_all(&dir) { - eprintln!( - "failed to clean up browser user data dir {}: {error}", - dir.display() - ); - } + } else if let Err(error) = std::fs::remove_dir_all(&dir) { + eprintln!( + "failed to clean up browser user data dir {}: {error}", + dir.display() + ); } } } @@ -229,53 +415,16 @@ impl std::fmt::Debug for BrowserState { .field("has_browser", &self.browser.is_some()) .field("pages", &self.pages.len()) .field("active_target", &self.active_target) - .field("element_refs", &self.element_refs.len()) + .field("has_snapshot", &self.snapshot.is_some()) .field("persistent_profile", &self.persistent_profile) .finish() } } -/// Stored info about an element from the accessibility tree snapshot. -#[derive(Debug, Clone)] -#[allow(dead_code)] -struct ElementRef { - role: String, - name: Option, - description: Option, - /// The AX node ID from the accessibility tree. - ax_node_id: String, - backend_node_id: Option, -} - -impl BrowserTool { - /// Create a tool with its own isolated browser state (default, non-persistent). - pub fn new(config: BrowserConfig, screenshot_dir: PathBuf) -> Self { - Self { - state: Arc::new(Mutex::new(BrowserState::new())), - config, - screenshot_dir, - } - } - - /// Create a tool backed by a shared browser state handle. - /// - /// Used when `persist_session = true`. Multiple workers share the same - /// `SharedBrowserHandle`, so the browser process and tabs survive across - /// worker lifetimes. - pub fn new_shared( - shared_state: SharedBrowserHandle, - config: BrowserConfig, - screenshot_dir: PathBuf, - ) -> Self { - Self { - state: shared_state, - config, - screenshot_dir, - } - } -} +// --------------------------------------------------------------------------- +// Error type +// --------------------------------------------------------------------------- -/// Error type for browser tool operations. #[derive(Debug, thiserror::Error)] #[error("Browser error: {message}")] pub struct BrowserError { @@ -290,104 +439,26 @@ impl BrowserError { } } -/// The action to perform. -#[derive(Debug, Clone, Deserialize, JsonSchema)] -#[serde(rename_all = "snake_case")] -pub enum BrowserAction { - /// Launch the browser. Must be called before any other action. - Launch, - /// Navigate the active tab to a URL. - Navigate, - /// Open a new tab, optionally at a URL. - Open, - /// List all open tabs. - Tabs, - /// Focus a tab by its target ID. - Focus, - /// Close a tab by its target ID (or the active tab if omitted). - CloseTab, - /// Get an accessibility tree snapshot of the active page with element refs. - Snapshot, - /// Perform an interaction on an element by ref. - Act, - /// Take a screenshot of the active page or a specific element. - Screenshot, - /// Evaluate JavaScript in the active page. - Evaluate, - /// Get the page HTML content. - Content, - /// Shut down the browser. - Close, -} - -/// The kind of interaction to perform via the `act` action. -#[derive(Debug, Clone, Deserialize, JsonSchema)] -#[serde(rename_all = "snake_case")] -pub enum ActKind { - /// Click an element by ref. - Click, - /// Type text into an element by ref. - Type, - /// Press a keyboard key (e.g., "Enter", "Tab", "Escape"). - PressKey, - /// Hover over an element by ref. - Hover, - /// Scroll an element into the viewport by ref. - ScrollIntoView, - /// Focus an element by ref. - Focus, -} - -/// Arguments for the browser tool. -#[derive(Debug, Deserialize, JsonSchema)] -pub struct BrowserArgs { - /// The action to perform. - pub action: BrowserAction, - /// URL for navigate/open actions. - pub url: Option, - /// Target ID for focus/close_tab actions. - pub target_id: Option, - /// Element reference (e.g., "e3") for act/screenshot actions. - pub element_ref: Option, - /// Kind of interaction for the act action. - pub act_kind: Option, - /// Text to type for act:type. - pub text: Option, - /// Key to press for act:press_key (e.g., "Enter", "Tab"). - pub key: Option, - /// Whether to take a full-page screenshot. - #[serde(default)] - pub full_page: bool, - /// JavaScript expression to evaluate. - pub script: Option, -} +// --------------------------------------------------------------------------- +// Common output type +// --------------------------------------------------------------------------- -/// Output from the browser tool. #[derive(Debug, Serialize)] pub struct BrowserOutput { - /// Whether the action succeeded. pub success: bool, - /// Human-readable result message. pub message: String, - /// Page title (when available). #[serde(skip_serializing_if = "Option::is_none")] pub title: Option, - /// Current URL (when available). #[serde(skip_serializing_if = "Option::is_none")] pub url: Option, - /// Element snapshot data from the accessibility tree. #[serde(skip_serializing_if = "Option::is_none")] - pub elements: Option>, - /// List of open tabs. + pub snapshot: Option, #[serde(skip_serializing_if = "Option::is_none")] pub tabs: Option>, - /// Path to saved screenshot file. #[serde(skip_serializing_if = "Option::is_none")] pub screenshot_path: Option, - /// JavaScript evaluation result. #[serde(skip_serializing_if = "Option::is_none")] pub eval_result: Option, - /// Page HTML content. #[serde(skip_serializing_if = "Option::is_none")] pub content: Option, } @@ -399,7 +470,7 @@ impl BrowserOutput { message: message.into(), title: None, url: None, - elements: None, + snapshot: None, tabs: None, screenshot_path: None, eval_result: None, @@ -414,25 +485,6 @@ impl BrowserOutput { } } -/// Summary of an interactive element from the accessibility tree. -#[derive(Debug, Clone, Serialize)] -pub struct ElementSummary { - /// Short ref like "e1", "e2" for use in subsequent act calls. - pub ref_id: String, - /// ARIA role (e.g., "button", "textbox", "link"). - pub role: String, - /// Accessible name. - #[serde(skip_serializing_if = "Option::is_none")] - pub name: Option, - /// Accessible description. - #[serde(skip_serializing_if = "Option::is_none")] - pub description: Option, - /// Current value (for inputs, sliders, etc.). - #[serde(skip_serializing_if = "Option::is_none")] - pub value: Option, -} - -/// Info about an open browser tab. #[derive(Debug, Clone, Serialize)] pub struct TabInfo { pub target_id: String, @@ -441,141 +493,135 @@ pub struct TabInfo { pub active: bool, } -/// Roles that are interactive and worth assigning refs to. -const INTERACTIVE_ROLES: &[&str] = &[ - "button", - "checkbox", - "combobox", - "link", - "listbox", - "menu", - "menubar", - "menuitem", - "menuitemcheckbox", - "menuitemradio", - "option", - "radio", - "scrollbar", - "searchbox", - "slider", - "spinbutton", - "switch", - "tab", - "textbox", - "treeitem", -]; - -/// Max elements to assign refs to in a single snapshot. -const MAX_ELEMENT_REFS: usize = 200; - -impl Tool for BrowserTool { - const NAME: &'static str = "browser"; +// --------------------------------------------------------------------------- +// Shared helper struct that all tools reference +// --------------------------------------------------------------------------- - type Error = BrowserError; - type Args = BrowserArgs; - type Output = BrowserOutput; +/// Shared context cloned into each browser tool. Holds the browser state mutex, +/// config, and screenshot directory. +#[derive(Debug, Clone)] +pub(crate) struct BrowserContext { + state: Arc>, + config: BrowserConfig, + screenshot_dir: PathBuf, +} - async fn definition(&self, _prompt: String) -> ToolDefinition { - ToolDefinition { - name: Self::NAME.to_string(), - description: crate::prompts::text::get("tools/browser").to_string(), - parameters: serde_json::json!({ - "type": "object", - "properties": { - "action": { - "type": "string", - "enum": ["launch", "navigate", "open", "tabs", "focus", "close_tab", - "snapshot", "act", "screenshot", "evaluate", "content", "close"], - "description": "The browser action to perform" - }, - "url": { - "type": "string", - "description": "URL for navigate/open actions" - }, - "target_id": { - "type": "string", - "description": "Tab target ID for focus/close_tab actions" - }, - "element_ref": { - "type": "string", - "description": "Element reference from snapshot (e.g., \"e3\") for act/screenshot" - }, - "act_kind": { - "type": "string", - "enum": ["click", "type", "press_key", "hover", "scroll_into_view", "focus"], - "description": "Kind of interaction for the act action" - }, - "text": { - "type": "string", - "description": "Text to type for act:type" - }, - "key": { - "type": "string", - "description": "Key to press for act:press_key (e.g., \"Enter\", \"Tab\", \"Escape\")" - }, - "full_page": { - "type": "boolean", - "default": false, - "description": "Take full-page screenshot instead of viewport only" - }, - "script": { - "type": "string", - "description": "JavaScript expression to evaluate (requires evaluate_enabled in config)" - } - }, - "required": ["action"] - }), +impl BrowserContext { + fn new( + state: Arc>, + config: BrowserConfig, + screenshot_dir: PathBuf, + ) -> Self { + Self { + state, + config, + screenshot_dir, } } - async fn call(&self, args: Self::Args) -> Result { - match args.action { - BrowserAction::Launch => self.handle_launch().await, - BrowserAction::Navigate => self.handle_navigate(args.url).await, - BrowserAction::Open => self.handle_open(args.url).await, - BrowserAction::Tabs => self.handle_tabs().await, - BrowserAction::Focus => self.handle_focus(args.target_id).await, - BrowserAction::CloseTab => self.handle_close_tab(args.target_id).await, - BrowserAction::Snapshot => self.handle_snapshot().await, - BrowserAction::Act => { - self.handle_act(args.act_kind, args.element_ref, args.text, args.key) - .await - } - BrowserAction::Screenshot => { - self.handle_screenshot(args.element_ref, args.full_page) - .await - } - BrowserAction::Evaluate => self.handle_evaluate(args.script).await, - BrowserAction::Content => self.handle_content().await, - BrowserAction::Close => self.handle_close().await, + /// Get the active page or return an error. Does NOT hold the lock — caller + /// must pass a reference to the already-locked state. + fn require_active_page<'a>( + &self, + state: &'a BrowserState, + ) -> Result<&'a chromiumoxide::Page, BrowserError> { + let target = state + .active_target + .as_ref() + .ok_or_else(|| BrowserError::new("no active tab — navigate or open a tab first"))?; + state + .pages + .get(target) + .ok_or_else(|| BrowserError::new("active tab no longer exists")) + } + + /// Extract the DOM snapshot from the active page via injected JavaScript. + /// Caches the result on `BrowserState` so repeated reads don't re-extract. + async fn extract_snapshot<'a>( + &self, + state: &'a mut BrowserState, + ) -> Result<&'a DomSnapshot, BrowserError> { + if let Some(ref snapshot) = state.snapshot { + return Ok(snapshot); + } + + let page = self.require_active_page(state)?; + + let result = page + .evaluate(EXTRACT_DOM_JS) + .await + .map_err(|error| BrowserError::new(format!("DOM extraction failed: {error}")))?; + + let value = result.value().cloned().unwrap_or(serde_json::Value::Null); + + // The JS wraps the result in JSON.stringify(), so we get a string back + let json_value = if let Some(json_str) = value.as_str() { + serde_json::from_str::(json_str).unwrap_or(serde_json::Value::Null) + } else { + value + }; + + let snapshot: DomSnapshot = serde_json::from_value(json_value) + .map_err(|error| BrowserError::new(format!("failed to parse DOM snapshot: {error}")))?; + + if let Some(ref error) = snapshot.error { + tracing::warn!(%error, "DOM extraction JS reported an error"); } + + state.snapshot = Some(snapshot); + Ok(state.snapshot.as_ref().expect("just stored")) } -} -impl BrowserTool { - async fn handle_launch(&self) -> Result { - // Quick check under the lock — if a browser is already running and - // we're in persistent mode, reconnect to existing tabs. + /// Resolve a numeric element index to a chromiumoxide Element on the active page. + /// Uses the cached snapshot's CSS selectors. + async fn find_element_by_index( + &self, + state: &mut BrowserState, + index: usize, + ) -> Result { + // Ensure snapshot is cached + self.extract_snapshot(state).await?; + let snapshot = state.snapshot.as_ref().expect("just extracted"); + + let selector = snapshot.selector_for_index(index).ok_or_else(|| { + BrowserError::new(format!( + "element index {index} not found — run browser_snapshot to get fresh indices \ + (max index in current snapshot: {})", + snapshot.selectors.len().saturating_sub(1) + )) + })?; + + if selector.is_empty() { + return Err(BrowserError::new(format!( + "element index {index} has an empty CSS selector — the element may be in an iframe. \ + Try browser_evaluate with a custom querySelector instead." + ))); + } + + let page = self.require_active_page(state)?; + + page.find_element(selector).await.map_err(|error| { + BrowserError::new(format!( + "element at index {index} not found via selector '{selector}': {error}. \ + The page may have changed — run browser_snapshot again." + )) + }) + } + + /// Launch the browser if not already running. Returns a status message. + async fn ensure_launched(&self) -> Result { { let mut state = self.state.lock().await; if state.browser.is_some() { if self.config.persist_session { return self.reconnect_existing_tabs(&mut state).await; } - return Ok(BrowserOutput::success("Browser already running")); + return Ok("Browser already running".to_string()); } } - // Resolve the Chrome executable path (may download ~150MB on first use): - // 1. Explicit config override - // 2. CHROME / CHROME_PATH env vars - // 3. chromiumoxide default detection (system PATH + well-known paths) - // 4. Auto-download via BrowserFetcher (cached in chrome_cache_dir) let executable = resolve_chrome_executable(&self.config).await?; - // Persistent sessions use a stable profile dir under chrome_cache_dir so - // cookies, localStorage, and login sessions survive across agent restarts. - // Ephemeral sessions use a random temp dir to avoid singleton lock collisions. let (user_data_dir, persistent_profile) = if self.config.persist_session { (self.config.chrome_cache_dir.join("profile"), true) } else { @@ -584,6 +630,14 @@ impl BrowserTool { (dir, false) }; + if persistent_profile { + let lock_file = user_data_dir.join("SingletonLock"); + if lock_file.exists() { + tracing::debug!(path = %lock_file.display(), "removing stale Chrome SingletonLock"); + let _ = std::fs::remove_file(&lock_file); + } + } + let mut builder = ChromeConfig::builder() .no_sandbox() .chrome_executable(&executable) @@ -610,18 +664,12 @@ impl BrowserTool { let handler_task = tokio::spawn(async move { while handler.next().await.is_some() {} }); - // Re-acquire the lock only to store state. let mut state = self.state.lock().await; - // Guard against a concurrent launch that won the race. + // Guard against concurrent launch race if state.browser.is_some() { - // Another call launched while we were downloading/starting. Clean up - // the browser we just created and return success. drop(browser); handler_task.abort(); - // Clean up the temp user data dir — but only for ephemeral sessions. - // Persistent profiles use a shared stable path that the winner is - // actively using. if !persistent_profile { let dir = user_data_dir; tokio::spawn(async move { @@ -634,11 +682,10 @@ impl BrowserTool { } }); } - if self.config.persist_session { return self.reconnect_existing_tabs(&mut state).await; } - return Ok(BrowserOutput::success("Browser already running")); + return Ok("Browser already running".to_string()); } state.browser = Some(browser); @@ -647,24 +694,14 @@ impl BrowserTool { state.persistent_profile = persistent_profile; tracing::info!("browser launched"); - Ok(BrowserOutput::success("Browser launched successfully")) + Ok("Browser launched successfully".to_string()) } /// Discover existing tabs from the browser and rebuild the page map. - /// - /// Called when a worker connects to an already-running persistent browser. - /// Rebuilds `state.pages` from `browser.pages()` so stale entries (tabs - /// closed externally) are pruned. Validates that `active_target` still - /// points to a live page. - /// - /// Note: holds the mutex across CDP calls. `Browser::pages()` and the - /// per-tab title/url queries are quick CDP round-trips, and concurrent - /// browser use during reconnect is rare (workers typically call `launch` - /// once at the start of their run). async fn reconnect_existing_tabs( &self, state: &mut BrowserState, - ) -> Result { + ) -> Result { let browser = state .browser .as_ref() @@ -674,7 +711,6 @@ impl BrowserTool { BrowserError::new(format!("failed to enumerate existing tabs: {error}")) })?; - // Rebuild the page map from the live browser, pruning stale entries. let previous_ids: std::collections::HashSet = state.pages.keys().cloned().collect(); let mut refreshed_pages = HashMap::with_capacity(pages.len()); for page in pages { @@ -685,9 +721,10 @@ impl BrowserTool { .keys() .filter(|id| !previous_ids.contains(*id)) .count(); + state.pages = refreshed_pages; + state.invalidate_snapshot(); - // Ensure active_target points to a valid page. let active_valid = state .active_target .as_ref() @@ -697,387 +734,443 @@ impl BrowserTool { } let tab_count = state.pages.len(); - let mut tabs = Vec::with_capacity(tab_count); - for (id, page) in &state.pages { - let title = page.get_title().await.ok().flatten(); - let url = page.url().await.ok().flatten(); - let is_active = state.active_target.as_deref() == Some(id); - tabs.push(TabInfo { - target_id: id.clone(), - url, - title, - active: is_active, - }); + tracing::info!(tab_count, discovered, "reconnected to persistent browser"); + + Ok(format!( + "Connected to persistent browser ({tab_count} tab{} open, {discovered} newly discovered)", + if tab_count == 1 { "" } else { "s" } + )) + } +} + +// --------------------------------------------------------------------------- +// Tool: browser_launch +// --------------------------------------------------------------------------- + +#[derive(Debug, Clone)] +pub struct BrowserLaunchTool { + context: BrowserContext, +} + +#[derive(Debug, Deserialize, JsonSchema)] +pub struct BrowserLaunchArgs {} + +impl Tool for BrowserLaunchTool { + const NAME: &'static str = "browser_launch"; + type Error = BrowserError; + type Args = BrowserLaunchArgs; + type Output = BrowserOutput; + + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: "Launch the browser. Must be called before any other browser tool." + .to_string(), + parameters: serde_json::json!({ "type": "object", "properties": {} }), } + } - tracing::info!(tab_count, discovered, "reconnected to persistent browser"); + async fn call(&self, _args: Self::Args) -> Result { + let message = self.context.ensure_launched().await?; + Ok(BrowserOutput::success(message)) + } +} - Ok(BrowserOutput { - success: true, - message: format!( - "Connected to persistent browser ({tab_count} tab{} open, {discovered} newly discovered)", - if tab_count == 1 { "" } else { "s" } - ), - url: None, - title: None, - elements: None, - tabs: Some(tabs), - screenshot_path: None, - eval_result: None, - content: None, - }) +// --------------------------------------------------------------------------- +// Tool: browser_navigate +// --------------------------------------------------------------------------- + +#[derive(Debug, Clone)] +pub struct BrowserNavigateTool { + context: BrowserContext, +} + +#[derive(Debug, Deserialize, JsonSchema)] +pub struct BrowserNavigateArgs { + /// The URL to navigate to. + pub url: String, +} + +impl Tool for BrowserNavigateTool { + const NAME: &'static str = "browser_navigate"; + type Error = BrowserError; + type Args = BrowserNavigateArgs; + type Output = BrowserOutput; + + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: "Navigate the active tab to a URL. Auto-launches the browser if needed." + .to_string(), + parameters: serde_json::json!({ + "type": "object", + "properties": { + "url": { "type": "string", "description": "The URL to navigate to" } + }, + "required": ["url"] + }), + } } - async fn handle_navigate(&self, url: Option) -> Result { - let Some(url) = url else { - return Err(BrowserError::new("url is required for navigate action")); - }; + async fn call(&self, args: Self::Args) -> Result { + validate_url(&args.url)?; + self.context.ensure_launched().await?; - validate_url(&url)?; + let mut state = self.context.state.lock().await; - let mut state = self.state.lock().await; - let page = self.get_or_create_page(&mut state, Some(&url)).await?; + // Get or create the active page + let page = get_or_create_page(&self.context, &mut state, Some(&args.url)).await?; - page.goto(&url) + page.goto(&args.url) .await .map_err(|error| BrowserError::new(format!("navigation failed: {error}")))?; let title = page.get_title().await.ok().flatten(); let current_url = page.url().await.ok().flatten(); + state.invalidate_snapshot(); - // Clear stale element refs on navigation - state.element_refs.clear(); - state.next_ref = 0; - - Ok( - BrowserOutput::success(format!("Navigated to {url}")) - .with_page_info(title, current_url), - ) + Ok(BrowserOutput::success(format!("Navigated to {}", args.url)) + .with_page_info(title, current_url)) } +} - async fn handle_open(&self, url: Option) -> Result { - let mut state = self.state.lock().await; - let browser = state - .browser - .as_ref() - .ok_or_else(|| BrowserError::new("browser not launched — call launch first"))?; - - let target_url = url.as_deref().unwrap_or("about:blank"); +// --------------------------------------------------------------------------- +// Tool: browser_snapshot +// --------------------------------------------------------------------------- - if target_url != "about:blank" { - validate_url(target_url)?; - } - - let page = browser - .new_page(target_url) - .await - .map_err(|error| BrowserError::new(format!("failed to open tab: {error}")))?; - - let target_id = page_target_id(&page); - let title = page.get_title().await.ok().flatten(); - let current_url = page.url().await.ok().flatten(); +#[derive(Debug, Clone)] +pub struct BrowserSnapshotTool { + context: BrowserContext, +} - state.pages.insert(target_id.clone(), page); - state.active_target = Some(target_id.clone()); +#[derive(Debug, Deserialize, JsonSchema)] +pub struct BrowserSnapshotArgs {} - // Clear refs when switching pages - state.element_refs.clear(); - state.next_ref = 0; +impl Tool for BrowserSnapshotTool { + const NAME: &'static str = "browser_snapshot"; + type Error = BrowserError; + type Args = BrowserSnapshotArgs; + type Output = BrowserOutput; - Ok(BrowserOutput { - tabs: None, - elements: None, - screenshot_path: None, - eval_result: None, - content: None, - success: true, - message: format!("Opened new tab (target: {target_id})"), - title, - url: current_url, - }) + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: "Get the page's ARIA accessibility tree with numbered element indices. \ + Use the [index=N] values in browser_click, browser_type, etc." + .to_string(), + parameters: serde_json::json!({ "type": "object", "properties": {} }), + } } - async fn handle_tabs(&self) -> Result { - let state = self.state.lock().await; + async fn call(&self, _args: Self::Args) -> Result { + let mut state = self.context.state.lock().await; - let mut tabs = Vec::new(); - for (target_id, page) in &state.pages { - let title = page.get_title().await.ok().flatten(); - let url = page.url().await.ok().flatten(); - let active = state.active_target.as_ref() == Some(target_id); + // Force a fresh snapshot + state.invalidate_snapshot(); + let snapshot = self.context.extract_snapshot(&mut state).await?; - tabs.push(TabInfo { - target_id: target_id.clone(), - title, - url, - active, - }); - } + let rendered = snapshot.render(); + let element_count = snapshot.selectors.len(); + let title = self + .context + .require_active_page(&state)? + .get_title() + .await + .ok() + .flatten(); + let url = self + .context + .require_active_page(&state)? + .url() + .await + .ok() + .flatten(); - let count = tabs.len(); Ok(BrowserOutput { success: true, - message: format!("{count} tab(s) open"), - title: None, - url: None, - elements: None, - tabs: Some(tabs), + message: format!("{element_count} interactive element(s) found"), + title, + url, + snapshot: Some(rendered), + tabs: None, screenshot_path: None, eval_result: None, content: None, }) } +} - async fn handle_focus(&self, target_id: Option) -> Result { - let Some(target_id) = target_id else { - return Err(BrowserError::new("target_id is required for focus action")); - }; +// --------------------------------------------------------------------------- +// Tool: browser_click +// --------------------------------------------------------------------------- - let mut state = self.state.lock().await; +#[derive(Debug, Clone)] +pub struct BrowserClickTool { + context: BrowserContext, +} - if !state.pages.contains_key(&target_id) { - return Err(BrowserError::new(format!( - "no tab with target_id '{target_id}'" - ))); - } +#[derive(Debug, Deserialize, JsonSchema)] +pub struct BrowserClickArgs { + /// The element index from the snapshot (e.g., 5). + pub index: usize, +} - state.active_target = Some(target_id.clone()); - state.element_refs.clear(); - state.next_ref = 0; +impl Tool for BrowserClickTool { + const NAME: &'static str = "browser_click"; + type Error = BrowserError; + type Args = BrowserClickArgs; + type Output = BrowserOutput; - Ok(BrowserOutput::success(format!("Focused tab {target_id}"))) + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: "Click an element by its index from browser_snapshot.".to_string(), + parameters: serde_json::json!({ + "type": "object", + "properties": { + "index": { "type": "integer", "description": "Element index from snapshot" } + }, + "required": ["index"] + }), + } } - async fn handle_close_tab( - &self, - target_id: Option, - ) -> Result { - let mut state = self.state.lock().await; + async fn call(&self, args: Self::Args) -> Result { + let mut state = self.context.state.lock().await; + let element = self + .context + .find_element_by_index(&mut state, args.index) + .await?; + + element + .click() + .await + .map_err(|error| BrowserError::new(format!("click failed: {error}")))?; - let id = target_id - .or_else(|| state.active_target.clone()) - .ok_or_else(|| BrowserError::new("no active tab to close"))?; + state.invalidate_snapshot(); - let page = state - .pages - .remove(&id) - .ok_or_else(|| BrowserError::new(format!("no tab with target_id '{id}'")))?; + Ok(BrowserOutput::success(format!( + "Clicked element at index {}", + args.index + ))) + } +} - page.close() - .await - .map_err(|error| BrowserError::new(format!("failed to close tab: {error}")))?; +// --------------------------------------------------------------------------- +// Tool: browser_type +// --------------------------------------------------------------------------- - if state.active_target.as_ref() == Some(&id) { - state.active_target = state.pages.keys().next().cloned(); - } +#[derive(Debug, Clone)] +pub struct BrowserTypeTool { + context: BrowserContext, +} - state.element_refs.clear(); - state.next_ref = 0; +#[derive(Debug, Deserialize, JsonSchema)] +pub struct BrowserTypeArgs { + /// The element index from the snapshot. + pub index: usize, + /// The text to type into the element. + pub text: String, + /// Whether to clear the field before typing. Defaults to true. + #[serde(default = "default_true")] + pub clear: bool, +} - Ok(BrowserOutput::success(format!("Closed tab {id}"))) - } +fn default_true() -> bool { + true +} - async fn handle_snapshot(&self) -> Result { - let mut state = self.state.lock().await; - let page = self.require_active_page(&state)?.clone(); +impl Tool for BrowserTypeTool { + const NAME: &'static str = "browser_type"; + type Error = BrowserError; + type Args = BrowserTypeArgs; + type Output = BrowserOutput; - // Enable accessibility domain if not already enabled - page.execute(AxEnableParams::default()) - .await - .map_err(|error| { - BrowserError::new(format!("failed to enable accessibility: {error}")) - })?; + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: "Type text into an input element by its index from browser_snapshot." + .to_string(), + parameters: serde_json::json!({ + "type": "object", + "properties": { + "index": { "type": "integer", "description": "Element index from snapshot" }, + "text": { "type": "string", "description": "Text to type" }, + "clear": { "type": "boolean", "default": true, "description": "Clear the field before typing (default true)" } + }, + "required": ["index", "text"] + }), + } + } - let ax_tree = page - .execute(GetFullAxTreeParams::default()) + async fn call(&self, args: Self::Args) -> Result { + let mut state = self.context.state.lock().await; + + // Use JS injection for reliable clear + type: focus, select all, then + // set value and fire events. chromiumoxide's type_str sends individual + // key events which is slow and fragile with JS frameworks. + let snapshot = self.context.extract_snapshot(&mut state).await?; + let selector = snapshot + .selector_for_index(args.index) + .ok_or_else(|| BrowserError::new(format!("element index {} not found", args.index)))? + .to_string(); + + let page = self.context.require_active_page(&state)?; + + let text_json = serde_json::to_string(&args.text).unwrap_or_default(); + let clear_js = if args.clear { "el.value = '';" } else { "" }; + let js = format!( + r#"(() => {{ + const el = document.querySelector({selector_json}); + if (!el) return JSON.stringify({{success: false, error: 'element not found'}}); + el.focus(); + {clear_js} + el.value = {text_json}; + el.dispatchEvent(new Event('input', {{bubbles: true}})); + el.dispatchEvent(new Event('change', {{bubbles: true}})); + return JSON.stringify({{success: true}}); + }})()"#, + selector_json = serde_json::to_string(&selector).unwrap_or_default(), + clear_js = clear_js, + text_json = text_json, + ); + + let result = page + .evaluate(js) .await - .map_err(|error| { - BrowserError::new(format!("failed to get accessibility tree: {error}")) - })?; + .map_err(|error| BrowserError::new(format!("type failed: {error}")))?; - state.element_refs.clear(); - state.next_ref = 0; + // Check for JS-level errors + if let Some(value) = result.value() + && let Some(json_str) = value.as_str() + && let Ok(parsed) = serde_json::from_str::(json_str) + && parsed.get("success").and_then(|v| v.as_bool()) == Some(false) + { + let error = parsed + .get("error") + .and_then(|v| v.as_str()) + .unwrap_or("type action failed"); + return Err(BrowserError::new(error.to_string())); + } - let mut elements = Vec::new(); + state.invalidate_snapshot(); - for node in &ax_tree.result.nodes { - if node.ignored { - continue; - } + let display_text = if args.text.len() > 50 { + format!("{}...", &args.text[..50]) + } else { + args.text.clone() + }; + Ok(BrowserOutput::success(format!( + "Typed '{display_text}' into element at index {}", + args.index + ))) + } +} - let role = extract_ax_value_string(&node.role); - let Some(role) = role else { continue }; +// --------------------------------------------------------------------------- +// Tool: browser_press_key +// --------------------------------------------------------------------------- - let role_lower = role.to_lowercase(); - let is_interactive = INTERACTIVE_ROLES.contains(&role_lower.as_str()); +#[derive(Debug, Clone)] +pub struct BrowserPressKeyTool { + context: BrowserContext, +} - if !is_interactive { - continue; - } +#[derive(Debug, Deserialize, JsonSchema)] +pub struct BrowserPressKeyArgs { + /// The key to press (e.g., "Enter", "Tab", "Escape", "ArrowDown"). + pub key: String, +} - if state.next_ref >= MAX_ELEMENT_REFS { - break; - } +impl Tool for BrowserPressKeyTool { + const NAME: &'static str = "browser_press_key"; + type Error = BrowserError; + type Args = BrowserPressKeyArgs; + type Output = BrowserOutput; - let name = extract_ax_value_string(&node.name); - let description = extract_ax_value_string(&node.description); - let value = extract_ax_value_string(&node.value); - let backend_node_id = node.backend_dom_node_id.as_ref().map(|id| *id.inner()); - - let ref_id = format!("e{}", state.next_ref); - state.next_ref += 1; - - state.element_refs.insert( - ref_id.clone(), - ElementRef { - role: role.clone(), - name: name.clone(), - description: description.clone(), - ax_node_id: node.node_id.inner().clone(), - backend_node_id, + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: "Press a keyboard key (e.g., \"Enter\", \"Tab\", \"Escape\").".to_string(), + parameters: serde_json::json!({ + "type": "object", + "properties": { + "key": { "type": "string", "description": "Key name (Enter, Tab, Escape, ArrowDown, etc.)" } }, - ); - - elements.push(ElementSummary { - ref_id, - role, - name, - description, - value, - }); + "required": ["key"] + }), } + } - let title = page.get_title().await.ok().flatten(); - let url = page.url().await.ok().flatten(); - let count = elements.len(); - - Ok(BrowserOutput { - success: true, - message: format!("{count} interactive element(s) found"), - title, - url, - elements: Some(elements), - tabs: None, - screenshot_path: None, - eval_result: None, - content: None, - }) + async fn call(&self, args: Self::Args) -> Result { + let state = self.context.state.lock().await; + let page = self.context.require_active_page(&state)?; + dispatch_key_press(page, &args.key).await?; + Ok(BrowserOutput::success(format!( + "Pressed key '{}'", + args.key + ))) } +} - async fn handle_act( - &self, - act_kind: Option, - element_ref: Option, - text: Option, - key: Option, - ) -> Result { - let Some(act_kind) = act_kind else { - return Err(BrowserError::new("act_kind is required for act action")); - }; +// --------------------------------------------------------------------------- +// Tool: browser_screenshot +// --------------------------------------------------------------------------- - let state = self.state.lock().await; - let page = self.require_active_page(&state)?; - - match act_kind { - ActKind::Click => { - let element = self.resolve_element_ref(&state, page, element_ref).await?; - element - .click() - .await - .map_err(|error| BrowserError::new(format!("click failed: {error}")))?; - Ok(BrowserOutput::success("Clicked element")) - } - ActKind::Type => { - let Some(text) = text else { - return Err(BrowserError::new("text is required for act:type")); - }; - let element = self.resolve_element_ref(&state, page, element_ref).await?; - element - .click() - .await - .map_err(|error| BrowserError::new(format!("focus failed: {error}")))?; - element - .type_str(&text) - .await - .map_err(|error| BrowserError::new(format!("type failed: {error}")))?; - Ok(BrowserOutput::success(format!( - "Typed '{}' into element", - truncate_for_display(&text, 50) - ))) - } - ActKind::PressKey => { - let Some(key) = key else { - return Err(BrowserError::new("key is required for act:press_key")); - }; - if element_ref.is_some() { - let element = self.resolve_element_ref(&state, page, element_ref).await?; - element - .press_key(&key) - .await - .map_err(|error| BrowserError::new(format!("press_key failed: {error}")))?; - } else { - dispatch_key_press(page, &key).await?; +#[derive(Debug, Clone)] +pub struct BrowserScreenshotTool { + context: BrowserContext, +} + +#[derive(Debug, Deserialize, JsonSchema)] +pub struct BrowserScreenshotArgs { + /// Whether to take a full-page screenshot. + #[serde(default)] + pub full_page: bool, +} + +impl Tool for BrowserScreenshotTool { + const NAME: &'static str = "browser_screenshot"; + type Error = BrowserError; + type Args = BrowserScreenshotArgs; + type Output = BrowserOutput; + + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: + "Take a screenshot of the current page. Saves to disk and returns the file path." + .to_string(), + parameters: serde_json::json!({ + "type": "object", + "properties": { + "full_page": { "type": "boolean", "default": false, "description": "Capture entire page, not just viewport" } } - Ok(BrowserOutput::success(format!("Pressed key '{key}'"))) - } - ActKind::Hover => { - let element = self.resolve_element_ref(&state, page, element_ref).await?; - element - .hover() - .await - .map_err(|error| BrowserError::new(format!("hover failed: {error}")))?; - Ok(BrowserOutput::success("Hovered over element")) - } - ActKind::ScrollIntoView => { - let element = self.resolve_element_ref(&state, page, element_ref).await?; - element.scroll_into_view().await.map_err(|error| { - BrowserError::new(format!("scroll_into_view failed: {error}")) - })?; - Ok(BrowserOutput::success("Scrolled element into view")) - } - ActKind::Focus => { - let element = self.resolve_element_ref(&state, page, element_ref).await?; - element - .focus() - .await - .map_err(|error| BrowserError::new(format!("focus failed: {error}")))?; - Ok(BrowserOutput::success("Focused element")) - } + }), } } - async fn handle_screenshot( - &self, - element_ref: Option, - full_page: bool, - ) -> Result { - let state = self.state.lock().await; - let page = self.require_active_page(&state)?; - - let screenshot_data = if let Some(ref_id) = element_ref { - let element = self.resolve_element_ref(&state, page, Some(ref_id)).await?; - element - .screenshot(CaptureScreenshotFormat::Png) - .await - .map_err(|error| BrowserError::new(format!("element screenshot failed: {error}")))? - } else { - let params = ScreenshotParams::builder() - .format(CaptureScreenshotFormat::Png) - .full_page(full_page) - .build(); - page.screenshot(params) - .await - .map_err(|error| BrowserError::new(format!("screenshot failed: {error}")))? - }; + async fn call(&self, args: Self::Args) -> Result { + let state = self.context.state.lock().await; + let page = self.context.require_active_page(&state)?; + + let params = ScreenshotParams::builder() + .format(CaptureScreenshotFormat::Png) + .full_page(args.full_page) + .build(); + + let screenshot_data = page + .screenshot(params) + .await + .map_err(|error| BrowserError::new(format!("screenshot failed: {error}")))?; - // Save to disk let filename = format!( "screenshot_{}.png", chrono::Utc::now().format("%Y%m%d_%H%M%S_%3f") ); - let filepath = self.screenshot_dir.join(&filename); + let filepath = self.context.screenshot_dir.join(&filename); - tokio::fs::create_dir_all(&self.screenshot_dir) + tokio::fs::create_dir_all(&self.context.screenshot_dir) .await .map_err(|error| { BrowserError::new(format!("failed to create screenshot dir: {error}")) @@ -1089,7 +1182,6 @@ impl BrowserTool { let path_str = filepath.to_string_lossy().to_string(); let size_kb = screenshot_data.len() / 1024; - tracing::debug!(path = %path_str, size_kb, "screenshot saved"); Ok(BrowserOutput { @@ -1097,30 +1189,63 @@ impl BrowserTool { message: format!("Screenshot saved ({size_kb}KB)"), title: None, url: None, - elements: None, + snapshot: None, tabs: None, screenshot_path: Some(path_str), eval_result: None, content: None, }) } +} + +// --------------------------------------------------------------------------- +// Tool: browser_evaluate +// --------------------------------------------------------------------------- + +#[derive(Debug, Clone)] +pub struct BrowserEvaluateTool { + context: BrowserContext, +} - async fn handle_evaluate(&self, script: Option) -> Result { - if !self.config.evaluate_enabled { +#[derive(Debug, Deserialize, JsonSchema)] +pub struct BrowserEvaluateArgs { + /// JavaScript expression to evaluate in the page. + pub script: String, +} + +impl Tool for BrowserEvaluateTool { + const NAME: &'static str = "browser_evaluate"; + type Error = BrowserError; + type Args = BrowserEvaluateArgs; + type Output = BrowserOutput; + + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: "Evaluate JavaScript in the active page and return the result." + .to_string(), + parameters: serde_json::json!({ + "type": "object", + "properties": { + "script": { "type": "string", "description": "JavaScript expression to evaluate" } + }, + "required": ["script"] + }), + } + } + + async fn call(&self, args: Self::Args) -> Result { + if !self.context.config.evaluate_enabled { return Err(BrowserError::new( "JavaScript evaluation is disabled in browser config (set evaluate_enabled = true)", )); } - let Some(script) = script else { - return Err(BrowserError::new("script is required for evaluate action")); - }; - - let state = self.state.lock().await; - let page = self.require_active_page(&state)?; + let state = self.context.state.lock().await; + let page = self.context.require_active_page(&state)?; let result = page - .evaluate(script) + .evaluate(args.script) .await .map_err(|error| BrowserError::new(format!("evaluate failed: {error}")))?; @@ -1131,75 +1256,252 @@ impl BrowserTool { message: "JavaScript evaluated".to_string(), title: None, url: None, - elements: None, + snapshot: None, tabs: None, screenshot_path: None, eval_result: value, content: None, }) } +} + +// --------------------------------------------------------------------------- +// Tool: browser_tab_open +// --------------------------------------------------------------------------- + +#[derive(Debug, Clone)] +pub struct BrowserTabOpenTool { + context: BrowserContext, +} + +#[derive(Debug, Deserialize, JsonSchema)] +pub struct BrowserTabOpenArgs { + /// URL to open in the new tab. Defaults to about:blank. + #[serde(default)] + pub url: Option, +} + +impl Tool for BrowserTabOpenTool { + const NAME: &'static str = "browser_tab_open"; + type Error = BrowserError; + type Args = BrowserTabOpenArgs; + type Output = BrowserOutput; + + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: "Open a new browser tab, optionally at a URL.".to_string(), + parameters: serde_json::json!({ + "type": "object", + "properties": { + "url": { "type": "string", "description": "URL to open (default: about:blank)" } + } + }), + } + } + + async fn call(&self, args: Self::Args) -> Result { + let target_url = args.url.as_deref().unwrap_or("about:blank"); + if target_url != "about:blank" { + validate_url(target_url)?; + } - async fn handle_content(&self) -> Result { - let state = self.state.lock().await; - let page = self.require_active_page(&state)?; + let mut state = self.context.state.lock().await; + let browser = state + .browser + .as_ref() + .ok_or_else(|| BrowserError::new("browser not launched — call browser_launch first"))?; - let html = page - .content() + let page = browser + .new_page(target_url) .await - .map_err(|error| BrowserError::new(format!("failed to get page content: {error}")))?; + .map_err(|error| BrowserError::new(format!("failed to open tab: {error}")))?; + let target_id = page_target_id(&page); let title = page.get_title().await.ok().flatten(); - let url = page.url().await.ok().flatten(); - - // Truncate very large pages for LLM consumption - let truncated = if html.len() > 100_000 { - format!( - "{}... [truncated, {} bytes total]", - &html[..100_000], - html.len() - ) - } else { - html - }; + let current_url = page.url().await.ok().flatten(); + + state.pages.insert(target_id.clone(), page); + state.active_target = Some(target_id.clone()); + state.invalidate_snapshot(); Ok(BrowserOutput { success: true, - message: "Page content retrieved".to_string(), + message: format!("Opened new tab (target: {target_id})"), title, - url, - elements: None, + url: current_url, + snapshot: None, tabs: None, screenshot_path: None, eval_result: None, - content: Some(truncated), + content: None, }) } +} + +// --------------------------------------------------------------------------- +// Tool: browser_tab_list +// --------------------------------------------------------------------------- + +#[derive(Debug, Clone)] +pub struct BrowserTabListTool { + context: BrowserContext, +} + +#[derive(Debug, Deserialize, JsonSchema)] +pub struct BrowserTabListArgs {} + +impl Tool for BrowserTabListTool { + const NAME: &'static str = "browser_tab_list"; + type Error = BrowserError; + type Args = BrowserTabListArgs; + type Output = BrowserOutput; + + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: "List all open browser tabs with their target IDs, titles, and URLs." + .to_string(), + parameters: serde_json::json!({ "type": "object", "properties": {} }), + } + } + + async fn call(&self, _args: Self::Args) -> Result { + let state = self.context.state.lock().await; + let mut tabs = Vec::new(); + for (target_id, page) in &state.pages { + let title = page.get_title().await.ok().flatten(); + let url = page.url().await.ok().flatten(); + let active = state.active_target.as_ref() == Some(target_id); + tabs.push(TabInfo { + target_id: target_id.clone(), + title, + url, + active, + }); + } + + let count = tabs.len(); + Ok(BrowserOutput { + success: true, + message: format!("{count} tab(s) open"), + title: None, + url: None, + snapshot: None, + tabs: Some(tabs), + screenshot_path: None, + eval_result: None, + content: None, + }) + } +} + +// --------------------------------------------------------------------------- +// Tool: browser_tab_close +// --------------------------------------------------------------------------- + +#[derive(Debug, Clone)] +pub struct BrowserTabCloseTool { + context: BrowserContext, +} + +#[derive(Debug, Deserialize, JsonSchema)] +pub struct BrowserTabCloseArgs { + /// Target ID of the tab to close. If omitted, closes the active tab. + #[serde(default)] + pub target_id: Option, +} + +impl Tool for BrowserTabCloseTool { + const NAME: &'static str = "browser_tab_close"; + type Error = BrowserError; + type Args = BrowserTabCloseArgs; + type Output = BrowserOutput; + + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: "Close a browser tab by target_id, or the active tab if omitted." + .to_string(), + parameters: serde_json::json!({ + "type": "object", + "properties": { + "target_id": { "type": "string", "description": "Tab target ID (from browser_tab_list). Omit for active tab." } + } + }), + } + } + + async fn call(&self, args: Self::Args) -> Result { + let mut state = self.context.state.lock().await; + let id = args + .target_id + .or_else(|| state.active_target.clone()) + .ok_or_else(|| BrowserError::new("no active tab to close"))?; + + let page = state + .pages + .remove(&id) + .ok_or_else(|| BrowserError::new(format!("no tab with target_id '{id}'")))?; + + page.close() + .await + .map_err(|error| BrowserError::new(format!("failed to close tab: {error}")))?; + + if state.active_target.as_ref() == Some(&id) { + state.active_target = state.pages.keys().next().cloned(); + } + state.invalidate_snapshot(); + + Ok(BrowserOutput::success(format!("Closed tab {id}"))) + } +} + +// --------------------------------------------------------------------------- +// Tool: browser_close +// --------------------------------------------------------------------------- + +#[derive(Debug, Clone)] +pub struct BrowserCloseTool { + context: BrowserContext, +} + +#[derive(Debug, Deserialize, JsonSchema)] +pub struct BrowserCloseArgs {} + +impl Tool for BrowserCloseTool { + const NAME: &'static str = "browser_close"; + type Error = BrowserError; + type Args = BrowserCloseArgs; + type Output = BrowserOutput; + + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: "Close or detach from the browser (behavior depends on config)." + .to_string(), + parameters: serde_json::json!({ "type": "object", "properties": {} }), + } + } - async fn handle_close(&self) -> Result { + async fn call(&self, _args: Self::Args) -> Result { use crate::config::ClosePolicy; - match self.config.close_policy { + match self.context.config.close_policy { ClosePolicy::Detach => { - let mut state = self.state.lock().await; - // Clear per-worker element refs but preserve tabs, browser, - // handler, and active_target so the next worker picks up - // exactly where this one left off. - state.element_refs.clear(); - state.next_ref = 0; + let mut state = self.context.state.lock().await; + state.invalidate_snapshot(); tracing::info!(policy = "detach", "worker detached from browser"); Ok(BrowserOutput::success( "Detached from browser (tabs and session preserved)", )) } ClosePolicy::CloseTabs => { - // Drain pages under the lock, then close them outside it so - // other workers aren't blocked by CDP round-trips. let pages_to_close: Vec<(String, chromiumoxide::Page)> = { - let mut state = self.state.lock().await; + let mut state = self.context.state.lock().await; let pages = state.pages.drain().collect(); state.active_target = None; - state.element_refs.clear(); - state.next_ref = 0; + state.invalidate_snapshot(); pages }; @@ -1229,18 +1531,15 @@ impl BrowserTool { )) } ClosePolicy::CloseBrowser => { - // Take everything out of state under the lock, then do the - // actual teardown outside it. let (browser, handler_task, user_data_dir, persistent_profile) = { - let mut state = self.state.lock().await; + let mut state = self.context.state.lock().await; let browser = state.browser.take(); let handler_task = state._handler_task.take(); let user_data_dir = state.user_data_dir.take(); let persistent_profile = state.persistent_profile; state.pages.clear(); state.active_target = None; - state.element_refs.clear(); - state.next_ref = 0; + state.invalidate_snapshot(); (browser, handler_task, user_data_dir, persistent_profile) }; @@ -1256,9 +1555,6 @@ impl BrowserTool { return Err(BrowserError::new(message)); } - // Clean up the user data dir — but only for ephemeral sessions. - // Persistent profiles hold cookies and login sessions that must - // survive browser restarts. if !persistent_profile && let Some(dir) = user_data_dir { tokio::spawn(async move { if let Err(error) = tokio::fs::remove_dir_all(&dir).await { @@ -1276,80 +1572,106 @@ impl BrowserTool { } } } +} - /// Get the active page, or create a first one if the browser has no pages yet. - async fn get_or_create_page<'a>( - &self, - state: &'a mut BrowserState, - url: Option<&str>, - ) -> Result<&'a chromiumoxide::Page, BrowserError> { - if let Some(target) = state.active_target.as_ref() - && state.pages.contains_key(target) - { - return Ok(&state.pages[target]); - } +// --------------------------------------------------------------------------- +// Tool registration helper +// --------------------------------------------------------------------------- - let browser = state - .browser - .as_ref() - .ok_or_else(|| BrowserError::new("browser not launched — call launch first"))?; +/// Register all browser tools on a `ToolServer`. The tools share a single +/// `BrowserState` (via `SharedBrowserHandle` for persistent sessions, or a +/// fresh instance for ephemeral sessions). +pub fn register_browser_tools( + server: rig::tool::server::ToolServer, + config: BrowserConfig, + screenshot_dir: PathBuf, + runtime_config: &crate::config::RuntimeConfig, +) -> rig::tool::server::ToolServer { + let state = if let Some(shared) = runtime_config + .shared_browser + .as_ref() + .filter(|_| config.persist_session) + { + shared.clone() + } else { + Arc::new(Mutex::new(BrowserState::new())) + }; - let target_url = url.unwrap_or("about:blank"); - let page = browser - .new_page(target_url) - .await - .map_err(|error| BrowserError::new(format!("failed to create page: {error}")))?; + let context = BrowserContext::new(state, config, screenshot_dir); - let target_id = page_target_id(&page); - state.pages.insert(target_id.clone(), page); - state.active_target = Some(target_id.clone()); + server + .tool(BrowserLaunchTool { + context: context.clone(), + }) + .tool(BrowserNavigateTool { + context: context.clone(), + }) + .tool(BrowserSnapshotTool { + context: context.clone(), + }) + .tool(BrowserClickTool { + context: context.clone(), + }) + .tool(BrowserTypeTool { + context: context.clone(), + }) + .tool(BrowserPressKeyTool { + context: context.clone(), + }) + .tool(BrowserScreenshotTool { + context: context.clone(), + }) + .tool(BrowserEvaluateTool { + context: context.clone(), + }) + .tool(BrowserTabOpenTool { + context: context.clone(), + }) + .tool(BrowserTabListTool { + context: context.clone(), + }) + .tool(BrowserTabCloseTool { + context: context.clone(), + }) + .tool(BrowserCloseTool { context }) +} - Ok(&state.pages[&target_id]) +// --------------------------------------------------------------------------- +// Shared helpers +// --------------------------------------------------------------------------- + +/// Get the active page, or create a first one if the browser has no pages yet. +async fn get_or_create_page<'a>( + context: &BrowserContext, + state: &'a mut BrowserState, + url: Option<&str>, +) -> Result<&'a chromiumoxide::Page, BrowserError> { + if let Some(target) = state.active_target.as_ref() + && state.pages.contains_key(target) + { + return Ok(&state.pages[target]); } - /// Get the active page or return an error. - fn require_active_page<'a>( - &self, - state: &'a BrowserState, - ) -> Result<&'a chromiumoxide::Page, BrowserError> { - let target = state - .active_target - .as_ref() - .ok_or_else(|| BrowserError::new("no active tab — navigate or open a tab first"))?; - - state - .pages - .get(target) - .ok_or_else(|| BrowserError::new("active tab no longer exists")) - } + let browser = state + .browser + .as_ref() + .ok_or_else(|| BrowserError::new("browser not launched — call browser_launch first"))?; - /// Resolve an element ref (like "e3") to a chromiumoxide Element on the page. - async fn resolve_element_ref( - &self, - state: &BrowserState, - page: &chromiumoxide::Page, - element_ref: Option, - ) -> Result { - let Some(ref_id) = element_ref else { - return Err(BrowserError::new("element_ref is required for this action")); - }; + let target_url = url.unwrap_or("about:blank"); + let page = browser + .new_page(target_url) + .await + .map_err(|error| BrowserError::new(format!("failed to create page: {error}")))?; - let elem_ref = state.element_refs.get(&ref_id).ok_or_else(|| { - BrowserError::new(format!( - "unknown element ref '{ref_id}' — run snapshot first to get element refs" - )) - })?; + let target_id = page_target_id(&page); + state.pages.insert(target_id.clone(), page); + state.active_target = Some(target_id.clone()); - // Use backend_node_id to find the element via CSS selector derived from role+name, - // or fall back to XPath with aria role and name attributes - let selector = build_selector_for_ref(elem_ref); + // Suppress the "unused variable" warning — we need `context` for the type + // signature to match the pattern used by the navigate tool. + let _ = context; - page.find_element(&selector).await.map_err(|error| { - BrowserError::new(format!( - "failed to find element for ref '{ref_id}' (selector: {selector}): {error}" - )) - }) - } + Ok(&state.pages[&target_id]) } /// Dispatch a key press event to the page via CDP Input domain. @@ -1377,52 +1699,15 @@ async fn dispatch_key_press(page: &chromiumoxide::Page, key: &str) -> Result<(), Ok(()) } -/// Extract the string value from an AxValue option. -fn extract_ax_value_string( - ax_value: &Option, -) -> Option { - let val = ax_value.as_ref()?; - val.value - .as_ref() - .and_then(|v| v.as_str().map(|s| s.to_string())) -} - -/// Build a CSS selector from an ElementRef's role and name. -fn build_selector_for_ref(elem_ref: &ElementRef) -> String { - // Use ARIA role attribute as primary selector, with name for disambiguation - let role_selector = format!("[role='{}']", elem_ref.role); - - if let Some(name) = &elem_ref.name { - // Escape single quotes in the name for CSS selector safety - let escaped = name.replace('\'', "\\'"); - format!("{role_selector}[aria-label='{escaped}']") - } else { - role_selector - } -} - -/// Extract target ID string from a Page. fn page_target_id(page: &chromiumoxide::Page) -> String { page.target_id().inner().clone() } -/// Truncate a string for display, appending "..." if truncated. -fn truncate_for_display(text: &str, max_len: usize) -> String { - if text.len() <= max_len { - text.to_string() - } else { - format!("{}...", &text[..max_len]) - } -} +// --------------------------------------------------------------------------- +// Chrome executable resolution +// --------------------------------------------------------------------------- -/// Resolve the Chrome/Chromium executable path using a layered detection chain: -/// -/// 1. Explicit config override (`executable_path` in TOML) -/// 2. `CHROME` / `CHROME_PATH` environment variables -/// 3. chromiumoxide default detection (system PATH + well-known install paths) -/// 4. Auto-download via `BrowserFetcher` (cached in `chrome_cache_dir`) async fn resolve_chrome_executable(config: &BrowserConfig) -> Result { - // 1. Explicit config if let Some(path) = &config.executable_path { let path = PathBuf::from(path); if path.exists() { @@ -1435,19 +1720,16 @@ async fn resolve_chrome_executable(config: &BrowserConfig) -> Result Result Option { for variable in ["CHROME", "CHROME_PATH"] { if let Ok(value) = std::env::var(variable) { @@ -1468,8 +1749,6 @@ fn detect_chrome_from_env() -> Option { None } -/// Download Chromium using chromiumoxide's built-in fetcher. -/// The binary is cached in `cache_dir` and reused on subsequent launches. async fn fetch_chrome(cache_dir: &Path) -> Result { tokio::fs::create_dir_all(cache_dir) .await diff --git a/src/tools/browser_extract_dom.js b/src/tools/browser_extract_dom.js new file mode 100644 index 000000000..d1869bcd7 --- /dev/null +++ b/src/tools/browser_extract_dom.js @@ -0,0 +1,734 @@ +// ARIA Snapshot DOM Extraction +// Based on Playwright's ariaSnapshot.ts - generates ARIA-tree structure for AI consumption +JSON.stringify((function() { + 'use strict'; + + let currentIndex = 0; + + // Helper: normalize whitespace + function normalizeWhiteSpace(text) { + return text.replace(/\s+/g, ' ').trim(); + } + + // Helper: check if element is visible for ARIA + function isElementHiddenForAria(element) { + const tagName = element.tagName; + if (['STYLE', 'SCRIPT', 'NOSCRIPT', 'TEMPLATE'].includes(tagName)) { + return true; + } + + const style = window.getComputedStyle(element); + + // Check display: contents + if (style.display === 'contents' && element.nodeName !== 'SLOT') { + let hasVisibleChild = false; + for (let child of element.childNodes) { + if (child.nodeType === 1 && !isElementHiddenForAria(child)) { + hasVisibleChild = true; + break; + } + if (child.nodeType === 3 && child.textContent && child.textContent.trim()) { + hasVisibleChild = true; + break; + } + } + if (!hasVisibleChild) return true; + } + + // Check visibility + if (style.visibility !== 'visible') { + return true; + } + + // Check display: none and aria-hidden + if (style.display === 'none') { + return true; + } + + if (element.getAttribute('aria-hidden') === 'true') { + return true; + } + + return false; + } + + // Helper: check if element is visible (bounding box check) + function isElementVisible(element) { + const rect = element.getBoundingClientRect(); + return rect.width > 0 && rect.height > 0; + } + + // Helper: compute element box information + function computeBox(element) { + const style = window.getComputedStyle(element); + const rect = element.getBoundingClientRect(); + const visible = rect.width > 0 && rect.height > 0; + const inline = style.display === 'inline'; + const cursor = style.cursor; + + return { visible, inline, cursor, rect }; + } + + // Helper: check if element receives pointer events + function receivesPointerEvents(element) { + const box = computeBox(element); + if (!box.visible) return false; + + const style = window.getComputedStyle(element); + return style.pointerEvents !== 'none'; + } + + // Helper: get ARIA role for element + function getAriaRole(element) { + // Check explicit role + const explicitRole = element.getAttribute('role'); + if (explicitRole) { + const roles = explicitRole.split(' ').map(r => r.trim()); + const validRole = roles[0]; // take first role + if (validRole) return validRole; + } + + // Implicit roles based on tag name + const tagName = element.tagName; + const implicitRoles = { + 'BUTTON': 'button', + 'A': element.hasAttribute('href') ? 'link' : null, + 'INPUT': getInputRole(element), + 'TEXTAREA': 'textbox', + 'SELECT': element.hasAttribute('multiple') || element.size > 1 ? 'listbox' : 'combobox', + 'H1': 'heading', 'H2': 'heading', 'H3': 'heading', + 'H4': 'heading', 'H5': 'heading', 'H6': 'heading', + 'IMG': element.getAttribute('alt') === '' ? 'presentation' : 'img', + 'NAV': 'navigation', + 'MAIN': 'main', + 'ARTICLE': 'article', + 'SECTION': element.hasAttribute('aria-label') || element.hasAttribute('aria-labelledby') ? 'region' : null, + 'HEADER': 'banner', + 'FOOTER': 'contentinfo', + 'ASIDE': 'complementary', + 'FORM': 'form', + 'TABLE': 'table', + 'UL': 'list', 'OL': 'list', + 'LI': 'listitem', + 'P': 'paragraph', + 'DIALOG': 'dialog', + 'IFRAME': 'iframe' + }; + + return implicitRoles[tagName] || 'generic'; + } + + function getInputRole(input) { + const type = (input.type || 'text').toLowerCase(); + const roles = { + 'button': 'button', + 'checkbox': 'checkbox', + 'radio': 'radio', + 'range': 'slider', + 'search': 'searchbox', + 'text': 'textbox', + 'email': 'textbox', + 'tel': 'textbox', + 'url': 'textbox', + 'number': 'spinbutton' + }; + return roles[type] || 'textbox'; + } + + // Helper: get accessible name for element + function getElementAccessibleName(element) { + // Check aria-label + const ariaLabel = element.getAttribute('aria-label'); + if (ariaLabel) return ariaLabel; + + // Check aria-labelledby + const labelledBy = element.getAttribute('aria-labelledby'); + if (labelledBy) { + const ids = labelledBy.split(/\s+/); + const texts = ids.map(id => { + const el = document.getElementById(id); + return el ? el.textContent : ''; + }).filter(t => t); + if (texts.length) return texts.join(' '); + } + + // Check associated label (for inputs) + if (element.tagName === 'INPUT' || element.tagName === 'TEXTAREA' || element.tagName === 'SELECT') { + const id = element.id; + if (id) { + const label = document.querySelector('label[for="' + id + '"]'); + if (label) return label.textContent || ''; + } + // Check if wrapped in label + const parentLabel = element.closest('label'); + if (parentLabel) { + return parentLabel.textContent || ''; + } + } + + // Check alt for images + if (element.tagName === 'IMG') { + return element.getAttribute('alt') || ''; + } + + // Check title + const title = element.getAttribute('title'); + if (title) return title; + + // Check placeholder for inputs + if (element.tagName === 'INPUT' || element.tagName === 'TEXTAREA') { + const placeholder = element.getAttribute('placeholder'); + if (placeholder) return placeholder; + } + + // For links and buttons, use text content if no other name found + if (element.tagName === 'A' || element.tagName === 'BUTTON') { + const text = element.textContent || ''; + if (text.trim()) return text.trim(); + } + + return ''; + } + + // Helper: get ARIA checked state + function getAriaChecked(element) { + const checked = element.getAttribute('aria-checked'); + if (checked === 'true') return true; + if (checked === 'false') return false; + if (checked === 'mixed') return 'mixed'; + + // Native checkbox/radio + if (element.tagName === 'INPUT') { + if (element.type === 'checkbox' || element.type === 'radio') { + return element.checked; + } + } + + return undefined; + } + + // Helper: get ARIA disabled state + function getAriaDisabled(element) { + const disabled = element.getAttribute('aria-disabled'); + if (disabled === 'true') return true; + + // Native disabled + if (element.disabled !== undefined) { + return element.disabled; + } + + return undefined; + } + + // Helper: get ARIA expanded state + function getAriaExpanded(element) { + const expanded = element.getAttribute('aria-expanded'); + if (expanded === 'true') return true; + if (expanded === 'false') return false; + return undefined; + } + + // Helper: get ARIA level + function getAriaLevel(element) { + const level = element.getAttribute('aria-level'); + if (level) { + const num = parseInt(level, 10); + if (!isNaN(num)) return num; + } + + // Heading level + if (element.tagName.match(/^H[1-6]$/)) { + return parseInt(element.tagName[1], 10); + } + + return undefined; + } + + // Helper: get ARIA pressed state + function getAriaPressed(element) { + const pressed = element.getAttribute('aria-pressed'); + if (pressed === 'true') return true; + if (pressed === 'false') return false; + if (pressed === 'mixed') return 'mixed'; + return undefined; + } + + // Helper: get ARIA selected state + function getAriaSelected(element) { + const selected = element.getAttribute('aria-selected'); + if (selected === 'true') return true; + if (selected === 'false') return false; + return undefined; + } + + // Helper: get CSS content (::before, ::after) + function getCSSContent(element, pseudo) { + try { + const style = window.getComputedStyle(element, pseudo); + const content = style.content; + if (content && content !== 'none' && content !== 'normal') { + // Simple extraction - remove quotes + return content.replace(/^["']|["']$/g, ''); + } + } catch (e) { + // Ignore errors + } + return ''; + } + + // Compute ARIA index for element + function computeAriaIndex(ariaNode) { + // Only assign indices to visible, interactive ARIA roles + if (!ariaNode.box.visible) { + return; + } + + // Check if this is an interactive role that should have an index + const interactiveRoles = [ + 'button', 'link', 'textbox', 'searchbox', 'checkbox', 'radio', + 'combobox', 'listbox', 'option', 'menuitem', 'menuitemcheckbox', + 'menuitemradio', 'tab', 'tabpanel', 'slider', 'spinbutton', + 'switch', 'img', 'article', 'region', 'navigation', 'main', + 'complementary', 'banner', 'contentinfo', 'form', 'search', + 'tree', 'treeitem', 'grid', 'gridcell', 'row', 'columnheader', + 'rowheader', 'heading', 'dialog', 'alertdialog', 'alert', + 'status', 'progressbar', 'list', 'listitem', 'generic' + ]; + + // Only assign index to interactive roles or elements with pointer cursor + const hasPointerCursor = ariaNode.box.cursor === 'pointer'; + const isInteractiveRole = interactiveRoles.includes(ariaNode.role); + + if (!isInteractiveRole && !hasPointerCursor) { + return; + } + + // Assign sequential index + ariaNode.index = currentIndex++; + } + + // Convert element to AriaNode + function toAriaNode(element) { + const active = document.activeElement === element; + + // Handle iframe specially + if (element.tagName === 'IFRAME') { + const ariaNode = { + role: 'iframe', + name: '', + children: [], + props: {}, + element: element, + box: computeBox(element), + receivesPointerEvents: true, + active: active + }; + computeAriaIndex(ariaNode); + return ariaNode; + } + + const role = getAriaRole(element); + + // Skip elements without role or with presentation/none + if (!role || role === 'presentation' || role === 'none') { + return null; + } + + const name = normalizeWhiteSpace(getElementAccessibleName(element) || ''); + const box = computeBox(element); + + // Skip inline generic elements with just text + if (role === 'generic' && box.inline && + element.childNodes.length === 1 && + element.childNodes[0].nodeType === 3) { + return null; + } + + const result = { + role: role, + name: name, + children: [], + props: {}, + element: element, + box: box, + receivesPointerEvents: receivesPointerEvents(element), + active: active + }; + + computeAriaIndex(result); + + // Add ARIA properties based on role + const checkedRoles = ['checkbox', 'menuitemcheckbox', 'menuitemradio', 'radio', 'switch']; + if (checkedRoles.includes(role)) { + const checked = getAriaChecked(element); + if (checked !== undefined) result.checked = checked; + } + + const disabledRoles = ['button', 'input', 'select', 'textarea']; + if (disabledRoles.includes(role) || role.includes('menuitem')) { + const disabled = getAriaDisabled(element); + if (disabled !== undefined) result.disabled = disabled; + } + + const expandedRoles = ['button', 'combobox', 'gridcell', 'link', 'menuitem', 'row', 'tab', 'treeitem']; + if (expandedRoles.includes(role)) { + const expanded = getAriaExpanded(element); + if (expanded !== undefined) result.expanded = expanded; + } + + const levelRoles = ['heading', 'listitem', 'row', 'treeitem']; + if (levelRoles.includes(role)) { + const level = getAriaLevel(element); + if (level !== undefined) result.level = level; + } + + const pressedRoles = ['button']; + if (pressedRoles.includes(role)) { + const pressed = getAriaPressed(element); + if (pressed !== undefined) result.pressed = pressed; + } + + const selectedRoles = ['gridcell', 'option', 'row', 'tab', 'treeitem']; + if (selectedRoles.includes(role)) { + const selected = getAriaSelected(element); + if (selected !== undefined) result.selected = selected; + } + + // Special handling for input/textarea values + if (element.tagName === 'INPUT' || element.tagName === 'TEXTAREA') { + if (element.type !== 'checkbox' && element.type !== 'radio' && element.type !== 'file') { + result.children = [element.value || '']; + } + } + + return result; + } + + // Main visitor function + function visit(ariaNode, node, parentElementVisible, visited) { + if (visited.has(node)) return; + visited.add(node); + + // Handle text nodes + if (node.nodeType === 3) { // TEXT_NODE + if (!parentElementVisible) return; + + const text = node.nodeValue; + // Skip text inside textbox + if (ariaNode.role !== 'textbox' && text) { + ariaNode.children.push(text); + } + return; + } + + // Only process element nodes + if (node.nodeType !== 1) return; // ELEMENT_NODE + + const element = node; + + // Check visibility + const isElementVisibleForAria = !isElementHiddenForAria(element); + let visible = isElementVisibleForAria || isElementVisible(element); + + // Skip if not visible for ARIA + if (!visible) return; + + // Handle aria-owns + const ariaChildren = []; + if (element.hasAttribute('aria-owns')) { + const ids = element.getAttribute('aria-owns').split(/\s+/); + for (const id of ids) { + const ownedElement = document.getElementById(id); + if (ownedElement) ariaChildren.push(ownedElement); + } + } + + // Convert to aria node + const childAriaNode = toAriaNode(element); + if (childAriaNode) { + ariaNode.children.push(childAriaNode); + } + + // Process element (add CSS content, children, etc.) + processElement(childAriaNode || ariaNode, element, ariaChildren, visible, visited); + } + + function processElement(ariaNode, element, ariaChildren, parentElementVisible, visited) { + const style = window.getComputedStyle(element); + const display = style ? style.display : 'inline'; + const treatAsBlock = (display !== 'inline' || element.nodeName === 'BR') ? ' ' : ''; + + if (treatAsBlock) { + ariaNode.children.push(treatAsBlock); + } + + // Add ::before content + const beforeContent = getCSSContent(element, '::before'); + if (beforeContent) { + ariaNode.children.push(beforeContent); + } + + // Process shadow DOM slots + if (element.nodeName === 'SLOT') { + const assignedNodes = element.assignedNodes(); + for (const child of assignedNodes) { + visit(ariaNode, child, parentElementVisible, visited); + } + } else { + // Process regular children + for (let child = element.firstChild; child; child = child.nextSibling) { + if (!child.assignedSlot) { + visit(ariaNode, child, parentElementVisible, visited); + } + } + + // Process shadow root + if (element.shadowRoot) { + for (let child = element.shadowRoot.firstChild; child; child = child.nextSibling) { + visit(ariaNode, child, parentElementVisible, visited); + } + } + } + + // Process aria-owns children + for (const child of ariaChildren) { + visit(ariaNode, child, parentElementVisible, visited); + } + + // Add ::after content + const afterContent = getCSSContent(element, '::after'); + if (afterContent) { + ariaNode.children.push(afterContent); + } + + if (treatAsBlock) { + ariaNode.children.push(treatAsBlock); + } + + // Remove redundant children + if (ariaNode.children.length === 1 && ariaNode.name === ariaNode.children[0]) { + ariaNode.children = []; + } + + // Add special props + if (ariaNode.role === 'link' && element.hasAttribute('href')) { + ariaNode.props.url = element.getAttribute('href'); + } + + if (ariaNode.role === 'textbox' && element.hasAttribute('placeholder')) { + const placeholder = element.getAttribute('placeholder'); + if (placeholder !== ariaNode.name) { + ariaNode.props.placeholder = placeholder; + } + } + } + + // Normalize string children + function normalizeStringChildren(ariaNode) { + const normalizedChildren = []; + let buffer = []; + + function flushBuffer() { + if (buffer.length === 0) return; + const text = normalizeWhiteSpace(buffer.join('')); + if (text) { + normalizedChildren.push(text); + } + buffer = []; + } + + for (const child of ariaNode.children || []) { + if (typeof child === 'string') { + buffer.push(child); + } else { + flushBuffer(); + normalizeStringChildren(child); + normalizedChildren.push(child); + } + } + flushBuffer(); + + ariaNode.children = normalizedChildren; + + // Remove if child equals name + if (ariaNode.children.length === 1 && ariaNode.children[0] === ariaNode.name) { + ariaNode.children = []; + } + } + + // Normalize generic roles (remove unnecessary generic wrappers) + function normalizeGenericRoles(node) { + function normalizeChildren(node) { + const result = []; + + for (const child of node.children || []) { + if (typeof child === 'string') { + result.push(child); + continue; + } + + const normalized = normalizeChildren(child); + result.push(...normalized); + } + + // Remove generic that encloses single element + const removeSelf = node.role === 'generic' && !node.name && + result.length <= 1 && + result.every(c => typeof c !== 'string' && c.index !== undefined); + + if (removeSelf) { + return result; + } + + node.children = result; + return [node]; + } + + normalizeChildren(node); + } + + // Serialize ariaNode to plain object (remove Element references) + function serializeAriaNode(ariaNode) { + const result = { + role: ariaNode.role, + name: ariaNode.name, + children: [], + props: ariaNode.props + }; + + // Include index if present + if (ariaNode.index !== undefined) result.index = ariaNode.index; + if (ariaNode.active) result.active = true; + if (ariaNode.checked !== undefined) result.checked = ariaNode.checked; + if (ariaNode.disabled !== undefined) result.disabled = ariaNode.disabled; + if (ariaNode.expanded !== undefined) result.expanded = ariaNode.expanded; + if (ariaNode.level !== undefined) result.level = ariaNode.level; + if (ariaNode.pressed !== undefined) result.pressed = ariaNode.pressed; + if (ariaNode.selected !== undefined) result.selected = ariaNode.selected; + + // Serialize box info + result.box_info = { + visible: ariaNode.box.visible, + cursor: ariaNode.box.cursor + }; + + // Serialize children + for (const child of ariaNode.children) { + if (typeof child === 'string') { + result.children.push(child); + } else { + result.children.push(serializeAriaNode(child)); + } + } + + return result; + } + + // Collect selectors and iframe indices + function collectSelectorsAndIframes(ariaNode, selectors, iframeIndices) { + if (ariaNode.index !== undefined && ariaNode.element) { + // Store CSS selector for element at its index position + const selector = buildSelector(ariaNode.element); + // Ensure selectors array is large enough + while (selectors.length <= ariaNode.index) { + selectors.push(''); + } + selectors[ariaNode.index] = selector; + + if (ariaNode.role === 'iframe') { + iframeIndices.push(ariaNode.index); + } + } + + for (const child of ariaNode.children) { + if (typeof child !== 'string') { + collectSelectorsAndIframes(child, selectors, iframeIndices); + } + } + } + + // Build CSS selector for element + function buildSelector(element) { + if (element.id) { + return '#' + element.id; + } + + const path = []; + let current = element; + + while (current && current !== document.body) { + let selector = current.tagName.toLowerCase(); + + if (current.className && typeof current.className === 'string') { + const classes = current.className.trim().split(/\s+/); + if (classes.length > 0 && classes[0]) { + selector += '.' + classes[0]; + } + } + + // Add nth-child if needed for uniqueness + const parent = current.parentElement; + if (parent) { + const siblings = Array.from(parent.children); + const index = siblings.indexOf(current); + if (siblings.filter(s => s.tagName === current.tagName).length > 1) { + selector += ':nth-child(' + (index + 1) + ')'; + } + } + + path.unshift(selector); + current = current.parentElement; + } + + return path.join(' > '); + } + + // Main execution + try { + const rootElement = document.body || document.documentElement; + const visited = new Set(); + + // Reset index counter + currentIndex = 0; + + // Create root fragment node + const snapshot = { + role: 'fragment', + name: '', + children: [], + props: {}, + element: rootElement, + box: computeBox(rootElement), + receivesPointerEvents: true + }; + + // Visit the DOM tree + visit(snapshot, rootElement, true, visited); + + // Normalize + normalizeStringChildren(snapshot); + normalizeGenericRoles(snapshot); + + // Collect selectors and iframe indices + const selectors = []; + const iframeIndices = []; + collectSelectorsAndIframes(snapshot, selectors, iframeIndices); + + // Serialize and return + const serialized = serializeAriaNode(snapshot); + + return { + root: serialized, + selectors: selectors, + iframeIndices: iframeIndices + }; + + } catch (error) { + return { + error: error.toString(), + root: { role: 'fragment', name: '', children: [], props: {}, box_info: { visible: false } }, + selectors: [], + iframeIndices: [] + }; + } +})()) From 388e5a275bb2f1c8158655e1bf4f466d2a106b9f Mon Sep 17 00:00:00 2001 From: James Pine Date: Sat, 7 Mar 2026 01:19:48 -0800 Subject: [PATCH 02/11] fix: rename iframeIndices to iframe_indices in extract_dom.js to match Rust struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The JS returned camelCase `iframeIndices` but DomSnapshot expected snake_case `iframe_indices`, causing deserialization to fail with 'missing field iframe_indices' on every browser_snapshot call. This forced workers to fall back to browser_evaluate for all interaction. Also soften worker prompt guidance on browser_evaluate — suggest index-based tools as preferred but keep evaluate as a valid fallback. --- prompts/en/worker.md.j2 | 4 ++- src/tools/browser.rs | 42 -------------------------------- src/tools/browser_extract_dom.js | 14 +++++------ 3 files changed, 10 insertions(+), 50 deletions(-) diff --git a/prompts/en/worker.md.j2 b/prompts/en/worker.md.j2 index 6c072b6fd..b4eca6be3 100644 --- a/prompts/en/worker.md.j2 +++ b/prompts/en/worker.md.j2 @@ -109,7 +109,9 @@ Automate a headless Chrome browser. Each action is a separate tool. **Element indices** are assigned during `browser_snapshot` and look like `[index=0]`, `[index=1]`. Always snapshot before interacting — indices reset on each snapshot. -**Other tools:** `browser_press_key` (Enter, Tab, Escape), `browser_evaluate` (run JS), `browser_tab_open`, `browser_tab_list`, `browser_tab_close`. +**Other tools:** `browser_press_key` (Enter, Tab, Escape), `browser_evaluate` (run JS on the page), `browser_tab_open`, `browser_tab_list`, `browser_tab_close`. + +**Prefer index-based interaction:** Use `browser_snapshot` to get element indices, then `browser_click`/`browser_type`/`browser_press_key` to interact. Fall back to `browser_evaluate` for reading page data or when the index-based tools don't work for a specific page. ### secret_set diff --git a/src/tools/browser.rs b/src/tools/browser.rs index 9361fa310..fd1fe2dd7 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -31,9 +31,7 @@ use std::sync::Arc; use tokio::sync::Mutex; use tokio::task::JoinHandle; -// --------------------------------------------------------------------------- // URL validation (SSRF protection) -// --------------------------------------------------------------------------- /// Validate that a URL is safe for the browser to navigate to. /// Blocks private/loopback IPs, link-local addresses, and cloud metadata endpoints @@ -125,9 +123,7 @@ fn is_v4_mapped_blocked(ip: std::net::Ipv6Addr) -> bool { } } -// --------------------------------------------------------------------------- // DOM snapshot types (ported from browser-use-rs) -// --------------------------------------------------------------------------- /// An ARIA tree snapshot of a page, extracted via injected JavaScript. /// @@ -327,9 +323,7 @@ fn render_child(child: &AriaChild, depth: usize, output: &mut String) { } } -// --------------------------------------------------------------------------- // Shared browser state -// --------------------------------------------------------------------------- /// Opaque handle to shared browser state that persists across worker lifetimes. /// @@ -421,9 +415,7 @@ impl std::fmt::Debug for BrowserState { } } -// --------------------------------------------------------------------------- // Error type -// --------------------------------------------------------------------------- #[derive(Debug, thiserror::Error)] #[error("Browser error: {message}")] @@ -439,9 +431,7 @@ impl BrowserError { } } -// --------------------------------------------------------------------------- // Common output type -// --------------------------------------------------------------------------- #[derive(Debug, Serialize)] pub struct BrowserOutput { @@ -493,9 +483,7 @@ pub struct TabInfo { pub active: bool, } -// --------------------------------------------------------------------------- // Shared helper struct that all tools reference -// --------------------------------------------------------------------------- /// Shared context cloned into each browser tool. Holds the browser state mutex, /// config, and screenshot directory. @@ -743,9 +731,7 @@ impl BrowserContext { } } -// --------------------------------------------------------------------------- // Tool: browser_launch -// --------------------------------------------------------------------------- #[derive(Debug, Clone)] pub struct BrowserLaunchTool { @@ -776,9 +762,7 @@ impl Tool for BrowserLaunchTool { } } -// --------------------------------------------------------------------------- // Tool: browser_navigate -// --------------------------------------------------------------------------- #[derive(Debug, Clone)] pub struct BrowserNavigateTool { @@ -834,9 +818,7 @@ impl Tool for BrowserNavigateTool { } } -// --------------------------------------------------------------------------- // Tool: browser_snapshot -// --------------------------------------------------------------------------- #[derive(Debug, Clone)] pub struct BrowserSnapshotTool { @@ -900,9 +882,7 @@ impl Tool for BrowserSnapshotTool { } } -// --------------------------------------------------------------------------- // Tool: browser_click -// --------------------------------------------------------------------------- #[derive(Debug, Clone)] pub struct BrowserClickTool { @@ -956,9 +936,7 @@ impl Tool for BrowserClickTool { } } -// --------------------------------------------------------------------------- // Tool: browser_type -// --------------------------------------------------------------------------- #[derive(Debug, Clone)] pub struct BrowserTypeTool { @@ -1067,9 +1045,7 @@ impl Tool for BrowserTypeTool { } } -// --------------------------------------------------------------------------- // Tool: browser_press_key -// --------------------------------------------------------------------------- #[derive(Debug, Clone)] pub struct BrowserPressKeyTool { @@ -1113,9 +1089,7 @@ impl Tool for BrowserPressKeyTool { } } -// --------------------------------------------------------------------------- // Tool: browser_screenshot -// --------------------------------------------------------------------------- #[derive(Debug, Clone)] pub struct BrowserScreenshotTool { @@ -1198,9 +1172,7 @@ impl Tool for BrowserScreenshotTool { } } -// --------------------------------------------------------------------------- // Tool: browser_evaluate -// --------------------------------------------------------------------------- #[derive(Debug, Clone)] pub struct BrowserEvaluateTool { @@ -1265,9 +1237,7 @@ impl Tool for BrowserEvaluateTool { } } -// --------------------------------------------------------------------------- // Tool: browser_tab_open -// --------------------------------------------------------------------------- #[derive(Debug, Clone)] pub struct BrowserTabOpenTool { @@ -1339,9 +1309,7 @@ impl Tool for BrowserTabOpenTool { } } -// --------------------------------------------------------------------------- // Tool: browser_tab_list -// --------------------------------------------------------------------------- #[derive(Debug, Clone)] pub struct BrowserTabListTool { @@ -1396,9 +1364,7 @@ impl Tool for BrowserTabListTool { } } -// --------------------------------------------------------------------------- // Tool: browser_tab_close -// --------------------------------------------------------------------------- #[derive(Debug, Clone)] pub struct BrowserTabCloseTool { @@ -1457,9 +1423,7 @@ impl Tool for BrowserTabCloseTool { } } -// --------------------------------------------------------------------------- // Tool: browser_close -// --------------------------------------------------------------------------- #[derive(Debug, Clone)] pub struct BrowserCloseTool { @@ -1574,9 +1538,7 @@ impl Tool for BrowserCloseTool { } } -// --------------------------------------------------------------------------- // Tool registration helper -// --------------------------------------------------------------------------- /// Register all browser tools on a `ToolServer`. The tools share a single /// `BrowserState` (via `SharedBrowserHandle` for persistent sessions, or a @@ -1636,9 +1598,7 @@ pub fn register_browser_tools( .tool(BrowserCloseTool { context }) } -// --------------------------------------------------------------------------- // Shared helpers -// --------------------------------------------------------------------------- /// Get the active page, or create a first one if the browser has no pages yet. async fn get_or_create_page<'a>( @@ -1703,9 +1663,7 @@ fn page_target_id(page: &chromiumoxide::Page) -> String { page.target_id().inner().clone() } -// --------------------------------------------------------------------------- // Chrome executable resolution -// --------------------------------------------------------------------------- async fn resolve_chrome_executable(config: &BrowserConfig) -> Result { if let Some(path) = &config.executable_path { diff --git a/src/tools/browser_extract_dom.js b/src/tools/browser_extract_dom.js index d1869bcd7..71fa2f811 100644 --- a/src/tools/browser_extract_dom.js +++ b/src/tools/browser_extract_dom.js @@ -625,7 +625,7 @@ JSON.stringify((function() { } // Collect selectors and iframe indices - function collectSelectorsAndIframes(ariaNode, selectors, iframeIndices) { + function collectSelectorsAndIframes(ariaNode, selectors, iframe_indices) { if (ariaNode.index !== undefined && ariaNode.element) { // Store CSS selector for element at its index position const selector = buildSelector(ariaNode.element); @@ -636,13 +636,13 @@ JSON.stringify((function() { selectors[ariaNode.index] = selector; if (ariaNode.role === 'iframe') { - iframeIndices.push(ariaNode.index); + iframe_indices.push(ariaNode.index); } } for (const child of ariaNode.children) { if (typeof child !== 'string') { - collectSelectorsAndIframes(child, selectors, iframeIndices); + collectSelectorsAndIframes(child, selectors, iframe_indices); } } } @@ -711,8 +711,8 @@ JSON.stringify((function() { // Collect selectors and iframe indices const selectors = []; - const iframeIndices = []; - collectSelectorsAndIframes(snapshot, selectors, iframeIndices); + const iframe_indices = []; + collectSelectorsAndIframes(snapshot, selectors, iframe_indices); // Serialize and return const serialized = serializeAriaNode(snapshot); @@ -720,7 +720,7 @@ JSON.stringify((function() { return { root: serialized, selectors: selectors, - iframeIndices: iframeIndices + iframe_indices: iframe_indices }; } catch (error) { @@ -728,7 +728,7 @@ JSON.stringify((function() { error: error.toString(), root: { role: 'fragment', name: '', children: [], props: {}, box_info: { visible: false } }, selectors: [], - iframeIndices: [] + iframe_indices: [] }; } })()) From 85fef5789fdf52ce5ce7f0d04d9b581fff071b40 Mon Sep 17 00:00:00 2001 From: James Pine Date: Sat, 7 Mar 2026 01:29:27 -0800 Subject: [PATCH 03/11] =?UTF-8?q?fix:=20stabilize=20browser=20tools=20?= =?UTF-8?q?=E2=80=94=20robust=20selectors,=20page=20readiness=20wait,=20tr?= =?UTF-8?q?immed=20element=20roles?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes for browser tool reliability: 1. CSS selector generation: look for nearest ancestor with an ID and anchor from there (max 6 levels). Verify uniqueness via querySelectorAll before returning. Falls back to full path from body. Replaces the old strategy that built deep fragile nth-child chains from body every time. 2. Page readiness: add wait_for_page_ready() that polls readyState + body.children.length after navigate and click. Gives SPAs up to 3s to hydrate before snapshot extraction runs. Fixes empty snapshots on client-rendered pages like railway.com. 3. Trim interactiveRoles: remove structural/landmark roles (generic, list, listitem, heading, region, navigation, banner, contentinfo, article, form, search, main, complementary, tabpanel, tree, grid, row, status, progressbar, alert) from the indexed set. Only truly interactive elements (button, link, textbox, checkbox, etc.) get indices now. Reduces snapshot noise from ~139 to ~20-30 elements on typical pages. --- src/tools/browser.rs | 53 ++++++++++++++++ src/tools/browser_extract_dom.js | 100 +++++++++++++++++++++---------- 2 files changed, 122 insertions(+), 31 deletions(-) diff --git a/src/tools/browser.rs b/src/tools/browser.rs index fd1fe2dd7..86395d092 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -123,6 +123,49 @@ fn is_v4_mapped_blocked(ip: std::net::Ipv6Addr) -> bool { } } +// Page readiness helper + +/// Wait for the page to be "ready enough" for DOM extraction. +/// +/// Many sites are SPAs that load a shell document and then hydrate with JS. +/// `chromiumoxide::Page::goto` returns after the main-frame load event, but +/// interactive content may not exist yet. This helper polls the page's +/// `document.readyState` and body child count to detect when content has +/// rendered, with a generous timeout so we don't block forever on broken pages. +async fn wait_for_page_ready(page: &chromiumoxide::Page) { + const POLL_INTERVAL: std::time::Duration = std::time::Duration::from_millis(150); + const MAX_WAIT: std::time::Duration = std::time::Duration::from_secs(3); + + let start = std::time::Instant::now(); + + loop { + if start.elapsed() >= MAX_WAIT { + break; + } + + // Check if the page has meaningful content: readyState is "complete" + // and the body has at least one child element. + let ready = page + .evaluate( + "(document.readyState === 'complete' && \ + document.body && document.body.children.length > 0)", + ) + .await + .ok() + .and_then(|result| result.value().cloned()) + .and_then(|value| value.as_bool()) + .unwrap_or(false); + + if ready { + // Give JS frameworks one more tick to finish hydration. + tokio::time::sleep(std::time::Duration::from_millis(200)).await; + break; + } + + tokio::time::sleep(POLL_INTERVAL).await; + } +} + // DOM snapshot types (ported from browser-use-rs) /// An ARIA tree snapshot of a page, extracted via injected JavaScript. @@ -809,6 +852,11 @@ impl Tool for BrowserNavigateTool { .await .map_err(|error| BrowserError::new(format!("navigation failed: {error}")))?; + // Wait briefly for SPA content to render. Many sites load a shell via + // the initial HTML then hydrate with JS — without this pause, + // `browser_snapshot` runs before any interactive elements exist. + wait_for_page_ready(page).await; + let title = page.get_title().await.ok().flatten(); let current_url = page.url().await.ok().flatten(); state.invalidate_snapshot(); @@ -927,6 +975,11 @@ impl Tool for BrowserClickTool { .await .map_err(|error| BrowserError::new(format!("click failed: {error}")))?; + // Clicks often trigger navigation or DOM changes — give the page a + // moment to settle before the next snapshot. + let page = self.context.require_active_page(&state)?; + wait_for_page_ready(page).await; + state.invalidate_snapshot(); Ok(BrowserOutput::success(format!( diff --git a/src/tools/browser_extract_dom.js b/src/tools/browser_extract_dom.js index 71fa2f811..782e5608b 100644 --- a/src/tools/browser_extract_dom.js +++ b/src/tools/browser_extract_dom.js @@ -283,16 +283,16 @@ JSON.stringify((function() { return; } - // Check if this is an interactive role that should have an index + // Only truly interactive roles get an index — things the user can + // click, type into, or toggle. Structural/landmark roles (region, + // navigation, banner, contentinfo, article, list, listitem, heading, + // generic, etc.) are intentionally excluded to keep snapshots small. const interactiveRoles = [ 'button', 'link', 'textbox', 'searchbox', 'checkbox', 'radio', 'combobox', 'listbox', 'option', 'menuitem', 'menuitemcheckbox', - 'menuitemradio', 'tab', 'tabpanel', 'slider', 'spinbutton', - 'switch', 'img', 'article', 'region', 'navigation', 'main', - 'complementary', 'banner', 'contentinfo', 'form', 'search', - 'tree', 'treeitem', 'grid', 'gridcell', 'row', 'columnheader', - 'rowheader', 'heading', 'dialog', 'alertdialog', 'alert', - 'status', 'progressbar', 'list', 'listitem', 'generic' + 'menuitemradio', 'tab', 'slider', 'spinbutton', + 'switch', 'treeitem', 'gridcell', 'columnheader', 'rowheader', + 'dialog', 'alertdialog', 'img' ]; // Only assign index to interactive roles or elements with pointer cursor @@ -647,40 +647,78 @@ JSON.stringify((function() { } } - // Build CSS selector for element + // Build a short, stable CSS selector for an element. + // Strategy: walk up the tree looking for the nearest ancestor with an ID, + // then build a short relative path from there. Falls back to a short + // path from if no ID is found within a reasonable depth. function buildSelector(element) { + // Quick path: element itself has an ID if (element.id) { - return '#' + element.id; + return '#' + CSS.escape(element.id); } - - const path = []; - let current = element; - - while (current && current !== document.body) { - let selector = current.tagName.toLowerCase(); - - if (current.className && typeof current.className === 'string') { - const classes = current.className.trim().split(/\s+/); - if (classes.length > 0 && classes[0]) { - selector += '.' + classes[0]; + + // Build a segment for one level: tag + first-class + nth-child if needed + function segment(el) { + let sel = el.tagName.toLowerCase(); + if (el.className && typeof el.className === 'string') { + const cls = el.className.trim().split(/\s+/).filter(Boolean); + if (cls.length > 0) { + sel += '.' + CSS.escape(cls[0]); } } - - // Add nth-child if needed for uniqueness - const parent = current.parentElement; + const parent = el.parentElement; if (parent) { - const siblings = Array.from(parent.children); - const index = siblings.indexOf(current); - if (siblings.filter(s => s.tagName === current.tagName).length > 1) { - selector += ':nth-child(' + (index + 1) + ')'; + const sameTag = Array.from(parent.children).filter( + s => s.tagName === el.tagName + ); + if (sameTag.length > 1) { + const idx = Array.from(parent.children).indexOf(el); + sel += ':nth-child(' + (idx + 1) + ')'; } } - - path.unshift(selector); + return sel; + } + + // Walk up looking for an ancestor with an ID (max 6 levels) + const parts = []; + let current = element; + const maxDepth = 6; + + for (let i = 0; i < maxDepth && current && current !== document.body; i++) { + if (i > 0 && current.id) { + // Found an ID anchor — build selector from here + const anchor = '#' + CSS.escape(current.id); + parts.unshift(anchor); + const candidate = parts.join(' > '); + // Verify uniqueness + try { + if (document.querySelectorAll(candidate).length === 1) { + return candidate; + } + } catch(e) { /* selector parse error, fall through */ } + // If not unique with ID anchor, keep going + parts.shift(); + parts.unshift(segment(current)); + } else { + parts.unshift(segment(current)); + } current = current.parentElement; } - - return path.join(' > '); + + // No nearby ID found — use the path from body + const candidate = parts.join(' > '); + try { + if (document.querySelectorAll(candidate).length === 1) { + return candidate; + } + } catch(e) { /* fall through */ } + + // If still not unique, keep walking up to body + while (current && current !== document.body) { + parts.unshift(segment(current)); + current = current.parentElement; + } + return parts.join(' > '); } // Main execution From e496835f4dac6e5a66310b7f94c183286e847706 Mon Sep 17 00:00:00 2001 From: James Pine Date: Sat, 7 Mar 2026 01:36:50 -0800 Subject: [PATCH 04/11] fix: add selector parameter to browser_click/type, validate input target browser_click and browser_type now accept either `index` (from snapshot) or `selector` (CSS selector like "#login_field") to target elements. This lets the LLM use a known selector directly instead of relying on snapshot indices that may be stale or point to the wrong element after page transitions. browser_type also validates that the target element is an INPUT, TEXTAREA, SELECT, or contentEditable element before typing, returning a clear error if it's a button or div instead of silently 'succeeding'. Updated worker prompt to document both targeting modes. --- prompts/en/worker.md.j2 | 10 ++- src/tools/browser.rs | 169 ++++++++++++++++++++++++++++------------ 2 files changed, 125 insertions(+), 54 deletions(-) diff --git a/prompts/en/worker.md.j2 b/prompts/en/worker.md.j2 index b4eca6be3..5b7d3354e 100644 --- a/prompts/en/worker.md.j2 +++ b/prompts/en/worker.md.j2 @@ -98,8 +98,8 @@ Automate a headless Chrome browser. Each action is a separate tool. 1. `browser_launch` — Start the browser (or auto-launched by `browser_navigate`) 2. `browser_navigate` — Go to a URL 3. `browser_snapshot` — Get the page's ARIA accessibility tree with numbered element indices -4. `browser_click` — Click an element by index: `{"index": 5}` -5. `browser_type` — Type text into an input by index: `{"index": 3, "text": "hello"}` +4. `browser_click` — Click an element: `{"index": 5}` or `{"selector": "#my-button"}` +5. `browser_type` — Type into an input: `{"index": 3, "text": "hello"}` or `{"selector": "#login_field", "text": "hello"}` 6. `browser_screenshot` — Capture the page {%- if browser_persist_session %} 7. `browser_close` — Detach from the browser when done (tabs and session preserved) @@ -107,11 +107,13 @@ Automate a headless Chrome browser. Each action is a separate tool. 7. `browser_close` — Shut down the browser when done {%- endif %} -**Element indices** are assigned during `browser_snapshot` and look like `[index=0]`, `[index=1]`. Always snapshot before interacting — indices reset on each snapshot. +**Targeting elements:** `browser_click` and `browser_type` accept either `index` (from `browser_snapshot`) or `selector` (a CSS selector like `"#login_field"` or `"button.submit"`). Use `index` when you have a fresh snapshot; use `selector` when you know the element's ID or a stable CSS path — it's more reliable on dynamic pages. + +**Element indices** are assigned during `browser_snapshot` and look like `[index=0]`, `[index=1]`. Always snapshot before using index-based interaction — indices reset on each snapshot. **Other tools:** `browser_press_key` (Enter, Tab, Escape), `browser_evaluate` (run JS on the page), `browser_tab_open`, `browser_tab_list`, `browser_tab_close`. -**Prefer index-based interaction:** Use `browser_snapshot` to get element indices, then `browser_click`/`browser_type`/`browser_press_key` to interact. Fall back to `browser_evaluate` for reading page data or when the index-based tools don't work for a specific page. +**Prefer index or selector over evaluate:** Use `browser_snapshot` + `browser_click`/`browser_type` to interact with elements. Fall back to `browser_evaluate` for reading page data or when the other tools don't work for a specific page. ### secret_set diff --git a/src/tools/browser.rs b/src/tools/browser.rs index 86395d092..18c3b958b 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -603,37 +603,47 @@ impl BrowserContext { Ok(state.snapshot.as_ref().expect("just stored")) } - /// Resolve a numeric element index to a chromiumoxide Element on the active page. - /// Uses the cached snapshot's CSS selectors. - async fn find_element_by_index( + /// Resolve an element target (index or CSS selector) to a chromiumoxide Element. + /// + /// When an index is provided, the cached snapshot's CSS selectors are used. + /// When a CSS selector string is provided, it's used directly — this is + /// useful when the LLM already knows the selector (e.g., `#login_field`). + async fn find_element( &self, state: &mut BrowserState, - index: usize, + target: &ElementTarget, ) -> Result { - // Ensure snapshot is cached - self.extract_snapshot(state).await?; - let snapshot = state.snapshot.as_ref().expect("just extracted"); - - let selector = snapshot.selector_for_index(index).ok_or_else(|| { - BrowserError::new(format!( - "element index {index} not found — run browser_snapshot to get fresh indices \ - (max index in current snapshot: {})", - snapshot.selectors.len().saturating_sub(1) - )) - })?; + let selector = match target { + ElementTarget::Index(index) => { + // Ensure snapshot is cached + self.extract_snapshot(state).await?; + let snapshot = state.snapshot.as_ref().expect("just extracted"); + + let sel = snapshot.selector_for_index(*index).ok_or_else(|| { + BrowserError::new(format!( + "element index {index} not found — run browser_snapshot to get fresh \ + indices (max index in current snapshot: {})", + snapshot.selectors.len().saturating_sub(1) + )) + })?; + + if sel.is_empty() { + return Err(BrowserError::new(format!( + "element index {index} has an empty CSS selector — the element may be \ + in an iframe. Try using the `selector` parameter instead." + ))); + } - if selector.is_empty() { - return Err(BrowserError::new(format!( - "element index {index} has an empty CSS selector — the element may be in an iframe. \ - Try browser_evaluate with a custom querySelector instead." - ))); - } + sel.to_string() + } + ElementTarget::Selector(selector) => selector.clone(), + }; let page = self.require_active_page(state)?; - page.find_element(selector).await.map_err(|error| { + page.find_element(&selector).await.map_err(|error| { BrowserError::new(format!( - "element at index {index} not found via selector '{selector}': {error}. \ + "element not found via selector '{selector}': {error}. \ The page may have changed — run browser_snapshot again." )) }) @@ -930,6 +940,35 @@ impl Tool for BrowserSnapshotTool { } } +// Shared element targeting — tools accept either an index or a CSS selector. + +/// Resolved element target for click/type/press_key tools. +enum ElementTarget { + Index(usize), + Selector(String), +} + +impl ElementTarget { + /// Build from optional index + optional selector args. + /// At least one must be provided; `selector` wins if both are present. + fn from_args(index: Option, selector: Option) -> Result { + match (selector, index) { + (Some(sel), _) if !sel.is_empty() => Ok(Self::Selector(sel)), + (_, Some(idx)) => Ok(Self::Index(idx)), + _ => Err(BrowserError::new( + "provide either `index` (from browser_snapshot) or `selector` (CSS selector)", + )), + } + } + + fn display(&self) -> String { + match self { + Self::Index(i) => format!("index {i}"), + Self::Selector(s) => format!("selector '{s}'"), + } + } +} + // Tool: browser_click #[derive(Debug, Clone)] @@ -940,7 +979,10 @@ pub struct BrowserClickTool { #[derive(Debug, Deserialize, JsonSchema)] pub struct BrowserClickArgs { /// The element index from the snapshot (e.g., 5). - pub index: usize, + pub index: Option, + /// CSS selector to target directly (e.g., "#login_field"). Use this when + /// you know the selector — it's more reliable than index for dynamic pages. + pub selector: Option, } impl Tool for BrowserClickTool { @@ -952,23 +994,24 @@ impl Tool for BrowserClickTool { async fn definition(&self, _prompt: String) -> ToolDefinition { ToolDefinition { name: Self::NAME.to_string(), - description: "Click an element by its index from browser_snapshot.".to_string(), + description: "Click an element by index (from browser_snapshot) or CSS selector." + .to_string(), parameters: serde_json::json!({ "type": "object", "properties": { - "index": { "type": "integer", "description": "Element index from snapshot" } - }, - "required": ["index"] + "index": { "type": "integer", "description": "Element index from snapshot" }, + "selector": { "type": "string", "description": "CSS selector (e.g. \"#my-button\", \"button.submit\")" } + } }), } } async fn call(&self, args: Self::Args) -> Result { + let target = ElementTarget::from_args(args.index, args.selector)?; + let label = target.display(); + let mut state = self.context.state.lock().await; - let element = self - .context - .find_element_by_index(&mut state, args.index) - .await?; + let element = self.context.find_element(&mut state, &target).await?; element .click() @@ -983,8 +1026,7 @@ impl Tool for BrowserClickTool { state.invalidate_snapshot(); Ok(BrowserOutput::success(format!( - "Clicked element at index {}", - args.index + "Clicked element at {label}" ))) } } @@ -999,7 +1041,9 @@ pub struct BrowserTypeTool { #[derive(Debug, Deserialize, JsonSchema)] pub struct BrowserTypeArgs { /// The element index from the snapshot. - pub index: usize, + pub index: Option, + /// CSS selector to target directly (e.g., "#login_field", "input[name='email']"). + pub selector: Option, /// The text to type into the element. pub text: String, /// Whether to clear the field before typing. Defaults to true. @@ -1020,48 +1064,74 @@ impl Tool for BrowserTypeTool { async fn definition(&self, _prompt: String) -> ToolDefinition { ToolDefinition { name: Self::NAME.to_string(), - description: "Type text into an input element by its index from browser_snapshot." + description: "Type text into an input element by index (from browser_snapshot) or \ + CSS selector." .to_string(), parameters: serde_json::json!({ "type": "object", "properties": { "index": { "type": "integer", "description": "Element index from snapshot" }, + "selector": { "type": "string", "description": "CSS selector (e.g. \"#login_field\", \"input[name='email']\")" }, "text": { "type": "string", "description": "Text to type" }, "clear": { "type": "boolean", "default": true, "description": "Clear the field before typing (default true)" } }, - "required": ["index", "text"] + "required": ["text"] }), } } async fn call(&self, args: Self::Args) -> Result { + let target = ElementTarget::from_args(args.index, args.selector)?; + let label = target.display(); + let mut state = self.context.state.lock().await; - // Use JS injection for reliable clear + type: focus, select all, then - // set value and fire events. chromiumoxide's type_str sends individual - // key events which is slow and fragile with JS frameworks. - let snapshot = self.context.extract_snapshot(&mut state).await?; - let selector = snapshot - .selector_for_index(args.index) - .ok_or_else(|| BrowserError::new(format!("element index {} not found", args.index)))? - .to_string(); + // Resolve the CSS selector from the target. + let selector = match &target { + ElementTarget::Selector(sel) => sel.clone(), + ElementTarget::Index(index) => { + let snapshot = self.context.extract_snapshot(&mut state).await?; + snapshot + .selector_for_index(*index) + .ok_or_else(|| BrowserError::new(format!("element index {index} not found")))? + .to_string() + } + }; let page = self.context.require_active_page(&state)?; + // Use JS injection for reliable clear + type: focus the element, set + // its value, and fire input/change events. Also validates that the + // target is an input, textarea, or contenteditable element. let text_json = serde_json::to_string(&args.text).unwrap_or_default(); let clear_js = if args.clear { "el.value = '';" } else { "" }; let js = format!( r#"(() => {{ const el = document.querySelector({selector_json}); - if (!el) return JSON.stringify({{success: false, error: 'element not found'}}); + if (!el) return JSON.stringify({{success: false, error: 'element not found via selector'}}); + const tag = el.tagName; + const editable = el.isContentEditable; + if (tag !== 'INPUT' && tag !== 'TEXTAREA' && tag !== 'SELECT' && !editable) {{ + return JSON.stringify({{success: false, error: 'element is a ' + tag.toLowerCase() + ', not a text input — check your index or selector'}}); + }} el.focus(); - {clear_js} - el.value = {text_json}; + if (editable) {{ + {clear_editable} + el.textContent = {text_json}; + }} else {{ + {clear_js} + el.value = {text_json}; + }} el.dispatchEvent(new Event('input', {{bubbles: true}})); el.dispatchEvent(new Event('change', {{bubbles: true}})); return JSON.stringify({{success: true}}); }})()"#, selector_json = serde_json::to_string(&selector).unwrap_or_default(), + clear_editable = if args.clear { + "el.textContent = '';" + } else { + "" + }, clear_js = clear_js, text_json = text_json, ); @@ -1092,8 +1162,7 @@ impl Tool for BrowserTypeTool { args.text.clone() }; Ok(BrowserOutput::success(format!( - "Typed '{display_text}' into element at index {}", - args.index + "Typed '{display_text}' into element at {label}" ))) } } From 5432565f7103cb1adc6104d9737860b53ca128ea Mon Sep 17 00:00:00 2001 From: James Pine Date: Sat, 7 Mar 2026 01:50:30 -0800 Subject: [PATCH 05/11] debug: surface DOM extraction errors and add CDP result logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move JSON.stringify inside the try/catch block so stringify errors (circular refs, size limits) are caught and reported. Add stack traces to the JS error fallback. Surface extraction errors to the LLM as tool errors instead of silently returning 0 elements. Add debug-level tracing of the raw CDP result type and size. This is diagnostic — we need to see why railway.com returns 0 elements despite having 92 interactive elements in the DOM. --- src/tools/browser.rs | 33 +++++++++++++++++++++++++++++++- src/tools/browser_extract_dom.js | 13 ++++++++----- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/src/tools/browser.rs b/src/tools/browser.rs index 18c3b958b..6fa020611 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -123,6 +123,19 @@ fn is_v4_mapped_blocked(ip: std::net::Ipv6Addr) -> bool { } } +// Debug helpers + +fn value_type_name(value: &serde_json::Value) -> &'static str { + match value { + serde_json::Value::Null => "null", + serde_json::Value::Bool(_) => "bool", + serde_json::Value::Number(_) => "number", + serde_json::Value::String(_) => "string", + serde_json::Value::Array(_) => "array", + serde_json::Value::Object(_) => "object", + } +} + // Page readiness helper /// Wait for the page to be "ready enough" for DOM extraction. @@ -585,9 +598,22 @@ impl BrowserContext { let value = result.value().cloned().unwrap_or(serde_json::Value::Null); + tracing::debug!( + value_type = ?value_type_name(&value), + value_len = value.as_str().map(|s| s.len()), + "DOM extraction raw CDP result" + ); + // The JS wraps the result in JSON.stringify(), so we get a string back let json_value = if let Some(json_str) = value.as_str() { - serde_json::from_str::(json_str).unwrap_or(serde_json::Value::Null) + serde_json::from_str::(json_str).map_err(|error| { + tracing::warn!( + json_preview = &json_str[..json_str.len().min(200)], + %error, + "failed to parse DOM extraction JSON string" + ); + BrowserError::new(format!("failed to parse DOM extraction JSON: {error}")) + })? } else { value }; @@ -909,6 +935,11 @@ impl Tool for BrowserSnapshotTool { state.invalidate_snapshot(); let snapshot = self.context.extract_snapshot(&mut state).await?; + // Surface any JS-level errors from the extraction script + if let Some(ref error) = snapshot.error { + return Err(BrowserError::new(format!("DOM extraction failed: {error}"))); + } + let rendered = snapshot.render(); let element_count = snapshot.selectors.len(); let title = self diff --git a/src/tools/browser_extract_dom.js b/src/tools/browser_extract_dom.js index 782e5608b..a335acd5a 100644 --- a/src/tools/browser_extract_dom.js +++ b/src/tools/browser_extract_dom.js @@ -1,6 +1,6 @@ // ARIA Snapshot DOM Extraction // Based on Playwright's ariaSnapshot.ts - generates ARIA-tree structure for AI consumption -JSON.stringify((function() { +(function() { 'use strict'; let currentIndex = 0; @@ -755,18 +755,21 @@ JSON.stringify((function() { // Serialize and return const serialized = serializeAriaNode(snapshot); - return { + const result = { root: serialized, selectors: selectors, iframe_indices: iframe_indices }; + + // Stringify inside try/catch so we catch circular-ref or size errors + return JSON.stringify(result); } catch (error) { - return { - error: error.toString(), + return JSON.stringify({ + error: error.toString() + ' | stack: ' + (error.stack || 'none'), root: { role: 'fragment', name: '', children: [], props: {}, box_info: { visible: false } }, selectors: [], iframe_indices: [] - }; + }); } })()) From 36dcc767fce4d3997c983fe99e6bbf4f284892d6 Mon Sep 17 00:00:00 2001 From: James Pine Date: Sat, 7 Mar 2026 02:35:26 -0800 Subject: [PATCH 06/11] refactor: replace JS DOM extraction with CDP Accessibility API Switch from injected browser_extract_dom.js to Chrome's native Accessibility.getFullAXTree CDP call. The JS injection approach failed silently on complex SPA pages (railway.com returned 0 elements despite 92 interactive elements being present). Key changes: - Extract snapshot via page.execute(GetFullAxTreeParams) instead of page.evaluate(EXTRACT_DOM_JS) - Reconstruct tree from flat Vec using parent_id/child_ids - Walk through ignored AX nodes transparently to reach non-ignored interactive descendants (fixes the 0-element bug) - Store BackendNodeId per indexed element instead of CSS selectors - Resolve clicks via DOM.getBoxModel + Input.dispatchMouseEvent instead of fragile CSS selector queries - Resolve typing via DOM.resolveNode + Runtime.callFunctionOn - Delete browser_extract_dom.js (775 lines of JS removed) --- src/tools/browser.rs | 940 +++++++++++++++++++++---------- src/tools/browser_extract_dom.js | 775 ------------------------- 2 files changed, 657 insertions(+), 1058 deletions(-) delete mode 100644 src/tools/browser_extract_dom.js diff --git a/src/tools/browser.rs b/src/tools/browser.rs index 6fa020611..8c4d9834c 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -1,21 +1,31 @@ //! Browser automation tools for workers. //! //! Provides a suite of separate tools (navigate, click, type, snapshot, etc.) -//! backed by a shared browser state. Uses an ARIA-tree DOM extraction script -//! (ported from browser-use-rs) that assigns numeric indices to interactive -//! elements. The LLM sees a compact YAML snapshot and interacts via index. +//! backed by a shared browser state. Uses Chrome's native CDP Accessibility API +//! (`Accessibility.getFullAXTree`) to extract the ARIA tree directly from the +//! browser's accessibility layer. Interactive elements get sequential indices +//! and their `BackendNodeId` is stored for reliable element resolution. //! -//! Element resolution: index → CSS selector (built from DOM position during -//! extraction) → chromiumoxide `page.find_element()`. This avoids stale CDP -//! node IDs and works with both native HTML elements and ARIA widgets. +//! Element resolution: index → `BackendNodeId` (from CDP accessibility tree) → +//! `DOM.getBoxModel` for coordinates → `Input.dispatchMouseEvent` for clicks. +//! This avoids fragile CSS selectors and works reliably on SPAs and complex +//! pages where JS injection fails. use crate::config::BrowserConfig; use chromiumoxide::browser::{Browser, BrowserConfig as ChromeConfig}; use chromiumoxide::fetcher::{BrowserFetcher, BrowserFetcherOptions}; +use chromiumoxide::handler::viewport::Viewport; use chromiumoxide::page::ScreenshotParams; +use chromiumoxide_cdp::cdp::browser_protocol::accessibility::{ + AxNode, AxPropertyName, GetFullAxTreeParams, +}; +use chromiumoxide_cdp::cdp::browser_protocol::dom::{ + BackendNodeId, GetBoxModelParams, ResolveNodeParams, +}; use chromiumoxide_cdp::cdp::browser_protocol::input::{ - DispatchKeyEventParams, DispatchKeyEventType, + DispatchKeyEventParams, DispatchKeyEventType, DispatchMouseEventParams, DispatchMouseEventType, + MouseButton, }; use chromiumoxide_cdp::cdp::browser_protocol::page::CaptureScreenshotFormat; use futures::StreamExt as _; @@ -123,19 +133,6 @@ fn is_v4_mapped_blocked(ip: std::net::Ipv6Addr) -> bool { } } -// Debug helpers - -fn value_type_name(value: &serde_json::Value) -> &'static str { - match value { - serde_json::Value::Null => "null", - serde_json::Value::Bool(_) => "bool", - serde_json::Value::Number(_) => "number", - serde_json::Value::String(_) => "string", - serde_json::Value::Array(_) => "array", - serde_json::Value::Object(_) => "object", - } -} - // Page readiness helper /// Wait for the page to be "ready enough" for DOM extraction. @@ -179,101 +176,345 @@ async fn wait_for_page_ready(page: &chromiumoxide::Page) { } } -// DOM snapshot types (ported from browser-use-rs) - -/// An ARIA tree snapshot of a page, extracted via injected JavaScript. +// Accessibility snapshot types (CDP-native) + +/// Roles that represent interactive elements the LLM can target via index. +/// Only nodes with these roles get assigned an index in the snapshot. +const INTERACTIVE_ROLES: &[&str] = &[ + "button", + "checkbox", + "combobox", + "link", + "menuitem", + "menuitemcheckbox", + "menuitemradio", + "option", + "radio", + "searchbox", + "slider", + "spinbutton", + "switch", + "tab", + "textbox", + "treeitem", +]; + +/// Roles that are structural noise and should be skipped when they have no +/// name and no interactive descendants. Keeps the YAML output compact. +const SKIP_UNNAMED_ROLES: &[&str] = &[ + "generic", + "none", + "presentation", + "InlineTextBox", + "LineBreak", +]; + +/// A snapshot of the page's accessibility tree, extracted via CDP. /// -/// Contains the tree structure for LLM display and a parallel `selectors` -/// array that maps element indices to CSS selectors for interaction. -#[derive(Debug, Clone, Deserialize)] -pub(crate) struct DomSnapshot { - pub root: AriaNode, - /// CSS selector for each indexed element. `selectors[i]` corresponds to the - /// element with `index == i` in the ARIA tree. - pub selectors: Vec, - #[allow(dead_code)] - pub iframe_indices: Vec, - /// Extraction error from the JS side, if any. - pub error: Option, -} - -/// A node in the ARIA accessibility tree. -#[derive(Debug, Clone, Deserialize)] -pub(crate) struct AriaNode { +/// Contains a tree of `SnapshotNode`s for LLM display and a parallel map from +/// element indices to `BackendNodeId` for interaction. +#[derive(Debug, Clone)] +pub(crate) struct AxSnapshot { + /// Root nodes of the tree (usually one, but CDP can return multiple roots). + pub roots: Vec, + /// `BackendNodeId` for each indexed interactive element. + /// `node_ids[i]` corresponds to the element with `index == i` in the tree. + pub node_ids: Vec, +} + +/// A node in the processed accessibility tree, ready for rendering. +#[derive(Debug, Clone)] +pub(crate) struct SnapshotNode { pub role: String, pub name: String, - #[serde(default)] + /// Sequential index assigned to interactive elements. `None` for structural nodes. pub index: Option, - #[serde(default)] - pub children: Vec, - #[serde(default)] - pub props: HashMap, - #[serde(default)] - pub active: Option, - #[serde(default)] - pub checked: Option, - #[serde(default)] - pub disabled: Option, - #[serde(default)] + pub children: Vec, + // State properties + pub checked: Option, + pub disabled: bool, pub expanded: Option, - #[serde(default)] - pub level: Option, - #[serde(default)] - pub pressed: Option, - #[serde(default)] - pub selected: Option, - #[serde(default)] - pub box_info: Option, -} + pub selected: bool, + pub level: Option, + pub pressed: Option, + pub value: Option, + pub description: Option, +} + +impl AxSnapshot { + /// Build from a flat CDP `AxNode` list by reconstructing the tree and + /// assigning sequential indices to interactive elements. + fn from_ax_nodes(nodes: Vec) -> Self { + // Build a lookup from AxNodeId → index in the flat list. + let id_to_idx: HashMap = nodes + .iter() + .enumerate() + .map(|(i, node)| (node.node_id.inner().clone(), i)) + .collect(); + + // Track which nodes are roots (no parent) or whose parent_id doesn't + // exist in the tree (happens with frame roots). + let root_indices: Vec = nodes + .iter() + .enumerate() + .filter(|(_, node)| { + node.parent_id + .as_ref() + .map_or(true, |pid| !id_to_idx.contains_key(pid.inner())) + }) + .map(|(i, _)| i) + .collect(); + + let mut next_index: usize = 0; + let mut node_ids: Vec = Vec::new(); + + // Collect non-ignored descendants of an ignored node. The CDP AX + // tree marks many structural nodes as ignored, but their children may + // be interactive (button inside an ignored generic div). We must walk + // through ignored nodes to reach them. + fn collect_children_of_ignored( + flat: &[AxNode], + idx: usize, + id_to_idx: &HashMap, + next_index: &mut usize, + node_ids: &mut Vec, + ) -> Vec { + let ax = &flat[idx]; + let mut result = Vec::new(); + if let Some(child_ids) = &ax.child_ids { + for child_id in child_ids { + if let Some(&child_idx) = id_to_idx.get(child_id.inner()) { + if flat[child_idx].ignored { + // Recursively collect through chains of ignored nodes. + result.extend(collect_children_of_ignored( + flat, child_idx, id_to_idx, next_index, node_ids, + )); + } else if let Some(child_node) = + build_node(flat, child_idx, id_to_idx, next_index, node_ids) + { + result.push(child_node); + } + } + } + } + result + } -#[derive(Debug, Clone, Deserialize)] -pub(crate) struct BoxInfo { - #[allow(dead_code)] - pub visible: bool, - pub cursor: Option, -} + // Recursive tree builder + fn build_node( + flat: &[AxNode], + idx: usize, + id_to_idx: &HashMap, + next_index: &mut usize, + node_ids: &mut Vec, + ) -> Option { + let ax = &flat[idx]; + + // Ignored nodes aren't rendered, but their children may contain + // non-ignored interactive elements. Handled by the caller via + // `collect_children_of_ignored`. + if ax.ignored { + return None; + } -/// A child of an `AriaNode` — either a text string or a nested node. -#[derive(Debug, Clone, Deserialize)] -#[serde(untagged)] -pub(crate) enum AriaChild { - Text(String), - Node(Box), -} + let role = ax + .role + .as_ref() + .and_then(|v| v.value.as_ref()) + .and_then(|v| v.as_str()) + .unwrap_or("unknown") + .to_string(); -/// The JS script injected into pages to extract the ARIA tree. -/// Ported from browser-use-rs's `extract_dom.js`. -const EXTRACT_DOM_JS: &str = include_str!("browser_extract_dom.js"); + let name = ax + .name + .as_ref() + .and_then(|v| v.value.as_ref()) + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); -impl DomSnapshot { - /// Render the ARIA tree as compact YAML-like text for LLM consumption. + let description = ax + .description + .as_ref() + .and_then(|v| v.value.as_ref()) + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .filter(|s| !s.is_empty()); + + let value_text = ax + .value + .as_ref() + .and_then(|v| v.value.as_ref()) + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .filter(|s| !s.is_empty()); + + // Extract state properties from the properties array. + let mut checked = None; + let mut disabled = false; + let mut expanded = None; + let mut selected = false; + let mut level = None; + let mut pressed = None; + + if let Some(properties) = &ax.properties { + for prop in properties { + match prop.name { + AxPropertyName::Checked => { + checked = prop + .value + .value + .as_ref() + .and_then(|v| v.as_str()) + .map(|s| s.to_string()); + } + AxPropertyName::Disabled => { + disabled = prop + .value + .value + .as_ref() + .and_then(|v| v.as_bool()) + .unwrap_or(false); + } + AxPropertyName::Expanded => { + expanded = prop.value.value.as_ref().and_then(|v| v.as_bool()); + } + AxPropertyName::Selected => { + selected = prop + .value + .value + .as_ref() + .and_then(|v| v.as_bool()) + .unwrap_or(false); + } + AxPropertyName::Level => { + level = prop.value.value.as_ref().and_then(|v| v.as_i64()); + } + AxPropertyName::Pressed => { + pressed = prop + .value + .value + .as_ref() + .and_then(|v| v.as_str()) + .map(|s| s.to_string()); + } + _ => {} + } + } + } + + // Build children recursively. Ignored children are transparent — + // we walk through them to collect their non-ignored descendants. + let mut children = Vec::new(); + if let Some(child_ids) = &ax.child_ids { + for child_id in child_ids { + if let Some(&child_idx) = id_to_idx.get(child_id.inner()) { + if flat[child_idx].ignored { + children.extend(collect_children_of_ignored( + flat, child_idx, id_to_idx, next_index, node_ids, + )); + } else if let Some(child_node) = + build_node(flat, child_idx, id_to_idx, next_index, node_ids) + { + children.push(child_node); + } + } + } + } + + // Assign an index to interactive elements that have a BackendNodeId. + let role_lower = role.to_lowercase(); + let is_interactive = INTERACTIVE_ROLES.contains(&role_lower.as_str()); + let element_index = if is_interactive && ax.backend_dom_node_id.is_some() { + let idx = *next_index; + *next_index += 1; + node_ids.push(ax.backend_dom_node_id.expect("checked above")); + Some(idx) + } else { + None + }; + + // Skip structural nodes that have no name, no index, and only serve + // as tree wrappers — but only if they have exactly one child (pass + // through) or zero children (prune entirely). + if SKIP_UNNAMED_ROLES.contains(&role.as_str()) + && element_index.is_none() + && name.is_empty() + { + match children.len() { + 0 => return None, + 1 => return Some(children.into_iter().next().expect("len checked")), + _ => { + // Multiple children — keep the node as a container + // but only if children are non-trivial. + } + } + } + + Some(SnapshotNode { + role, + name, + index: element_index, + children, + checked, + disabled, + expanded, + selected, + level, + pressed, + value: value_text, + description, + }) + } + + let mut roots = Vec::new(); + for &root_idx in &root_indices { + if nodes[root_idx].ignored { + // Root itself is ignored — collect its non-ignored descendants. + roots.extend(collect_children_of_ignored( + &nodes, + root_idx, + &id_to_idx, + &mut next_index, + &mut node_ids, + )); + } else if let Some(node) = + build_node(&nodes, root_idx, &id_to_idx, &mut next_index, &mut node_ids) + { + roots.push(node); + } + } + + Self { roots, node_ids } + } + + /// Render the accessibility tree as compact YAML-like text for LLM consumption. pub fn render(&self) -> String { let mut output = String::with_capacity(4096); - render_node(&self.root, 0, &mut output); + for root in &self.roots { + render_snapshot_node(root, 0, &mut output); + } output } - /// Look up the CSS selector for an element index. - pub fn selector_for_index(&self, index: usize) -> Option<&str> { - self.selectors.get(index).map(|s| s.as_str()) + /// Look up the `BackendNodeId` for an element index. + pub fn backend_node_id_for_index(&self, index: usize) -> Option { + self.node_ids.get(index).copied() + } + + /// Total number of indexed interactive elements. + pub fn element_count(&self) -> usize { + self.node_ids.len() } } -fn render_node(node: &AriaNode, depth: usize, output: &mut String) { +fn render_snapshot_node(node: &SnapshotNode, depth: usize, output: &mut String) { let indent = " ".repeat(depth); - // Skip the root fragment — just render children - if node.role == "fragment" { - for child in &node.children { - render_child(child, depth, output); - } - return; - } - - // Skip generic nodes without index or name — they're structural noise - if node.role == "generic" && node.index.is_none() && node.name.is_empty() { + // Skip the root "RootWebArea" wrapper — just render children directly. + if node.role == "RootWebArea" || node.role == "rootWebArea" { for child in &node.children { - render_child(child, depth, output); + render_snapshot_node(child, depth, output); } return; } @@ -285,8 +526,13 @@ fn render_node(node: &AriaNode, depth: usize, output: &mut String) { if !node.name.is_empty() { output.push_str(" \""); - // Escape quotes in name for YAML safety - output.push_str(&node.name.replace('"', "\\\"")); + // Truncate very long names for context efficiency. + let display_name = if node.name.len() > 200 { + format!("{}...", &node.name[..200]) + } else { + node.name.clone() + }; + output.push_str(&display_name.replace('"', "\\\"")); output.push('"'); } @@ -297,28 +543,24 @@ fn render_node(node: &AriaNode, depth: usize, output: &mut String) { if let Some(level) = node.level { output.push_str(&format!(" [level={level}]")); } - if let Some(true) = node.active { - output.push_str(" [active]"); - } - if let Some(true) = node.disabled { + if node.disabled { output.push_str(" [disabled]"); } - if let Some(true) = node.selected { + if node.selected { output.push_str(" [selected]"); } if let Some(ref checked) = node.checked { - match checked { - serde_json::Value::Bool(true) => output.push_str(" [checked]"), - serde_json::Value::Bool(false) => output.push_str(" [unchecked]"), - serde_json::Value::String(s) if s == "mixed" => output.push_str(" [checked=mixed]"), + match checked.as_str() { + "true" => output.push_str(" [checked]"), + "false" => output.push_str(" [unchecked]"), + "mixed" => output.push_str(" [checked=mixed]"), _ => {} } } if let Some(ref pressed) = node.pressed { - match pressed { - serde_json::Value::Bool(true) => output.push_str(" [pressed]"), - serde_json::Value::Bool(false) => {} - serde_json::Value::String(s) if s == "mixed" => output.push_str(" [pressed=mixed]"), + match pressed.as_str() { + "true" => output.push_str(" [pressed]"), + "mixed" => output.push_str(" [pressed=mixed]"), _ => {} } } @@ -327,55 +569,38 @@ fn render_node(node: &AriaNode, depth: usize, output: &mut String) { } else if let Some(false) = node.expanded { output.push_str(" [collapsed]"); } - if let Some(ref box_info) = node.box_info - && box_info.cursor.as_deref() == Some("pointer") - { - output.push_str(" [cursor=pointer]"); + + // Value (e.g., text input current value) + if let Some(ref value) = node.value { + let display_value = if value.len() > 100 { + format!("{}...", &value[..100]) + } else { + value.clone() + }; + output.push_str(&format!( + " value=\"{}\"", + display_value.replace('"', "\\\"") + )); } - // Props on separate indented lines let has_children = !node.children.is_empty(); - let has_props = !node.props.is_empty(); + let has_description = node.description.is_some(); - if has_children || has_props { + if has_children || has_description { output.push_str(":\n"); } else { output.push('\n'); } - for (key, value) in &node.props { + if let Some(ref description) = node.description { output.push_str(&indent); - output.push_str(" /"); - output.push_str(key); - output.push_str(": "); - output.push_str(value); + output.push_str(" /description: "); + output.push_str(description); output.push('\n'); } for child in &node.children { - render_child(child, depth + 1, output); - } -} - -fn render_child(child: &AriaChild, depth: usize, output: &mut String) { - match child { - AriaChild::Text(text) => { - let trimmed = text.trim(); - if !trimmed.is_empty() { - let indent = " ".repeat(depth); - // Truncate long text for LLM context efficiency - let display = if trimmed.len() > 200 { - format!("{}...", &trimmed[..200]) - } else { - trimmed.to_string() - }; - output.push_str(&indent); - output.push_str("- text \""); - output.push_str(&display.replace('"', "\\\"")); - output.push_str("\"\n"); - } - } - AriaChild::Node(node) => render_node(node, depth, output), + render_snapshot_node(child, depth + 1, output); } } @@ -402,9 +627,9 @@ pub struct BrowserState { _handler_task: Option>, pages: HashMap, active_target: Option, - /// Cached DOM snapshot from the last `browser_snapshot` call. Invalidated - /// on navigation, tab switch, and explicit snapshot refresh. - snapshot: Option, + /// Cached accessibility snapshot from the last `browser_snapshot` call. + /// Invalidated on navigation, tab switch, and explicit snapshot refresh. + snapshot: Option, user_data_dir: Option, /// When true, `user_data_dir` is a stable path that should NOT be deleted /// on drop — it holds cookies, localStorage, and login sessions. @@ -579,100 +804,267 @@ impl BrowserContext { .ok_or_else(|| BrowserError::new("active tab no longer exists")) } - /// Extract the DOM snapshot from the active page via injected JavaScript. + /// Extract the accessibility snapshot from the active page via CDP. /// Caches the result on `BrowserState` so repeated reads don't re-extract. async fn extract_snapshot<'a>( &self, state: &'a mut BrowserState, - ) -> Result<&'a DomSnapshot, BrowserError> { - if let Some(ref snapshot) = state.snapshot { - return Ok(snapshot); + ) -> Result<&'a AxSnapshot, BrowserError> { + if state.snapshot.is_some() { + return Ok(state.snapshot.as_ref().expect("just checked")); } let page = self.require_active_page(state)?; let result = page - .evaluate(EXTRACT_DOM_JS) + .execute(GetFullAxTreeParams::default()) .await - .map_err(|error| BrowserError::new(format!("DOM extraction failed: {error}")))?; + .map_err(|error| { + BrowserError::new(format!("accessibility tree extraction failed: {error}")) + })?; - let value = result.value().cloned().unwrap_or(serde_json::Value::Null); + let ax_nodes = result.result.nodes; tracing::debug!( - value_type = ?value_type_name(&value), - value_len = value.as_str().map(|s| s.len()), - "DOM extraction raw CDP result" + node_count = ax_nodes.len(), + "CDP Accessibility.getFullAXTree returned" ); - // The JS wraps the result in JSON.stringify(), so we get a string back - let json_value = if let Some(json_str) = value.as_str() { - serde_json::from_str::(json_str).map_err(|error| { - tracing::warn!( - json_preview = &json_str[..json_str.len().min(200)], - %error, - "failed to parse DOM extraction JSON string" - ); - BrowserError::new(format!("failed to parse DOM extraction JSON: {error}")) - })? - } else { - value - }; - - let snapshot: DomSnapshot = serde_json::from_value(json_value) - .map_err(|error| BrowserError::new(format!("failed to parse DOM snapshot: {error}")))?; + let snapshot = AxSnapshot::from_ax_nodes(ax_nodes); - if let Some(ref error) = snapshot.error { - tracing::warn!(%error, "DOM extraction JS reported an error"); - } + tracing::debug!( + interactive_count = snapshot.element_count(), + "accessibility snapshot built" + ); state.snapshot = Some(snapshot); Ok(state.snapshot.as_ref().expect("just stored")) } - /// Resolve an element target (index or CSS selector) to a chromiumoxide Element. + /// Resolve an element target to a `BackendNodeId` for interaction. /// - /// When an index is provided, the cached snapshot's CSS selectors are used. - /// When a CSS selector string is provided, it's used directly — this is - /// useful when the LLM already knows the selector (e.g., `#login_field`). + /// When an index is provided, the cached snapshot's `BackendNodeId` map is + /// used. When a CSS selector string is provided, it's resolved via + /// `page.find_element()` — this path is a fallback for when the LLM already + /// knows a selector (e.g., `#login_field`). async fn find_element( &self, state: &mut BrowserState, target: &ElementTarget, - ) -> Result { - let selector = match target { + ) -> Result { + match target { ElementTarget::Index(index) => { - // Ensure snapshot is cached self.extract_snapshot(state).await?; let snapshot = state.snapshot.as_ref().expect("just extracted"); - let sel = snapshot.selector_for_index(*index).ok_or_else(|| { + let backend_node_id = + snapshot.backend_node_id_for_index(*index).ok_or_else(|| { + BrowserError::new(format!( + "element index {index} not found — run browser_snapshot to get fresh \ + indices (max index in current snapshot: {})", + snapshot.element_count().saturating_sub(1) + )) + })?; + + Ok(ElementHandle::BackendNode(backend_node_id)) + } + ElementTarget::Selector(selector) => { + let page = self.require_active_page(state)?; + let element = page.find_element(selector).await.map_err(|error| { BrowserError::new(format!( - "element index {index} not found — run browser_snapshot to get fresh \ - indices (max index in current snapshot: {})", - snapshot.selectors.len().saturating_sub(1) + "element not found via selector '{selector}': {error}. \ + The page may have changed — run browser_snapshot again." )) })?; + Ok(ElementHandle::CssElement(element)) + } + } + } + + /// Click an element resolved by `find_element`. Uses CDP mouse events for + /// `BackendNodeId` targets (avoids CSS selector fragility) and the + /// chromiumoxide `Element::click()` for CSS selector targets. + async fn click_element( + &self, + state: &BrowserState, + handle: &ElementHandle, + ) -> Result<(), BrowserError> { + match handle { + ElementHandle::BackendNode(backend_node_id) => { + let page = self.require_active_page(state)?; + let (center_x, center_y) = get_element_center(page, *backend_node_id).await?; + + page.execute( + DispatchMouseEventParams::builder() + .r#type(DispatchMouseEventType::MousePressed) + .x(center_x) + .y(center_y) + .button(MouseButton::Left) + .click_count(1) + .build() + .map_err(|error| { + BrowserError::new(format!("failed to build mouse press event: {error}")) + })?, + ) + .await + .map_err(|error| BrowserError::new(format!("mouse press failed: {error}")))?; + + page.execute( + DispatchMouseEventParams::builder() + .r#type(DispatchMouseEventType::MouseReleased) + .x(center_x) + .y(center_y) + .button(MouseButton::Left) + .click_count(1) + .build() + .map_err(|error| { + BrowserError::new(format!( + "failed to build mouse release event: {error}" + )) + })?, + ) + .await + .map_err(|error| BrowserError::new(format!("mouse release failed: {error}")))?; + + Ok(()) + } + ElementHandle::CssElement(element) => { + element + .click() + .await + .map_err(|error| BrowserError::new(format!("click failed: {error}")))?; + Ok(()) + } + } + } + + /// Focus an element and type text into it. Uses CDP `DOM.focus` for + /// `BackendNodeId` targets. + async fn focus_and_type( + &self, + state: &BrowserState, + handle: &ElementHandle, + text: &str, + clear: bool, + ) -> Result<(), BrowserError> { + let page = self.require_active_page(state)?; + + match handle { + ElementHandle::BackendNode(backend_node_id) => { + // Resolve BackendNodeId to RemoteObject so we can call JS on it + let resolve_result = page + .execute(ResolveNodeParams { + backend_node_id: Some(*backend_node_id), + ..Default::default() + }) + .await + .map_err(|error| { + BrowserError::new(format!("failed to resolve node for typing: {error}")) + })?; + + let object_id = resolve_result.result.object.object_id.ok_or_else(|| { + BrowserError::new( + "resolved node has no object_id — element may have been removed", + ) + })?; - if sel.is_empty() { - return Err(BrowserError::new(format!( - "element index {index} has an empty CSS selector — the element may be \ - in an iframe. Try using the `selector` parameter instead." - ))); + // Use callFunctionOn to focus, validate, clear, and set value + let text_json = serde_json::to_string(text).unwrap_or_default(); + let clear_js = if clear { + "if (this.isContentEditable) { this.textContent = ''; } else { this.value = ''; }" + } else { + "" + }; + + let js = format!( + r#"function() {{ + const tag = this.tagName; + const editable = this.isContentEditable; + if (tag !== 'INPUT' && tag !== 'TEXTAREA' && tag !== 'SELECT' && !editable) {{ + return 'element is a ' + tag.toLowerCase() + ', not a text input'; + }} + this.focus(); + {clear_js} + if (editable) {{ + this.textContent = {text_json}; + }} else {{ + this.value = {text_json}; + }} + this.dispatchEvent(new Event('input', {{bubbles: true}})); + this.dispatchEvent(new Event('change', {{bubbles: true}})); + return null; + }}"#, + ); + + let call_result = page + .execute( + chromiumoxide_cdp::cdp::js_protocol::runtime::CallFunctionOnParams::builder() + .function_declaration(js) + .object_id(object_id) + .return_by_value(true) + .user_gesture(true) + .silent(true) + .build() + .map_err(|error| { + BrowserError::new(format!( + "failed to build CallFunctionOn params: {error}" + )) + })?, + ) + .await + .map_err(|error| { + BrowserError::new(format!("type action failed: {error}")) + })?; + + // Check for validation error returned from the JS function. + // The function returns null on success, or an error string. + if let Some(value) = &call_result.result.result.value + && let Some(error_msg) = value.as_str() + { + return Err(BrowserError::new(error_msg.to_string())); } - sel.to_string() + Ok(()) } - ElementTarget::Selector(selector) => selector.clone(), - }; + ElementHandle::CssElement(element) => { + // CSS selector path — click to focus, then use JS to type. + element + .click() + .await + .map_err(|error| BrowserError::new(format!("focus failed: {error}")))?; + + let text_json = serde_json::to_string(text).unwrap_or_default(); + let clear_js = if clear { "el.value = '';" } else { "" }; + let js = format!( + r#"(() => {{ + const el = document.activeElement; + if (!el) return 'no focused element after click'; + const tag = el.tagName; + const editable = el.isContentEditable; + if (tag !== 'INPUT' && tag !== 'TEXTAREA' && tag !== 'SELECT' && !editable) {{ + return 'element is a ' + tag.toLowerCase() + ', not a text input'; + }} + if (editable) {{ + {clear_editable} + el.textContent = {text_json}; + }} else {{ + {clear_js} + el.value = {text_json}; + }} + el.dispatchEvent(new Event('input', {{bubbles: true}})); + el.dispatchEvent(new Event('change', {{bubbles: true}})); + return null; + }})()"#, + clear_editable = if clear { "el.textContent = '';" } else { "" }, + ); - let page = self.require_active_page(state)?; + page.evaluate(js) + .await + .map_err(|error| BrowserError::new(format!("type failed: {error}")))?; - page.find_element(&selector).await.map_err(|error| { - BrowserError::new(format!( - "element not found via selector '{selector}': {error}. \ - The page may have changed — run browser_snapshot again." - )) - }) + Ok(()) + } + } } /// Launch the browser if not already running. Returns a status message. @@ -710,8 +1102,20 @@ impl BrowserContext { .chrome_executable(&executable) .user_data_dir(&user_data_dir); - if !self.config.headless { - builder = builder.with_head().window_size(1280, 900); + if self.config.headless { + // Headless has no real window — set an explicit viewport so + // screenshots render at a reasonable desktop size instead of + // the chromiumoxide default of 800x600. + builder = builder.viewport(Viewport { + width: 1280, + height: 900, + ..Default::default() + }); + } else { + // Headed mode: disable viewport emulation so the page fills + // the actual window. The default 800x600 viewport constrains + // page content to a smaller area than the window. + builder = builder.with_head().window_size(1280, 900).viewport(None); } let chrome_config = builder.build().map_err(|error| { @@ -935,13 +1339,8 @@ impl Tool for BrowserSnapshotTool { state.invalidate_snapshot(); let snapshot = self.context.extract_snapshot(&mut state).await?; - // Surface any JS-level errors from the extraction script - if let Some(ref error) = snapshot.error { - return Err(BrowserError::new(format!("DOM extraction failed: {error}"))); - } - let rendered = snapshot.render(); - let element_count = snapshot.selectors.len(); + let element_count = snapshot.element_count(); let title = self .context .require_active_page(&state)? @@ -973,6 +1372,13 @@ impl Tool for BrowserSnapshotTool { // Shared element targeting — tools accept either an index or a CSS selector. +/// A resolved element handle — either a CDP `BackendNodeId` (from snapshot +/// index lookup) or a chromiumoxide `Element` (from CSS selector query). +enum ElementHandle { + BackendNode(BackendNodeId), + CssElement(chromiumoxide::Element), +} + /// Resolved element target for click/type/press_key tools. enum ElementTarget { Index(usize), @@ -1042,12 +1448,9 @@ impl Tool for BrowserClickTool { let label = target.display(); let mut state = self.context.state.lock().await; - let element = self.context.find_element(&mut state, &target).await?; + let handle = self.context.find_element(&mut state, &target).await?; - element - .click() - .await - .map_err(|error| BrowserError::new(format!("click failed: {error}")))?; + self.context.click_element(&state, &handle).await?; // Clicks often trigger navigation or DOM changes — give the page a // moment to settle before the next snapshot. @@ -1116,74 +1519,11 @@ impl Tool for BrowserTypeTool { let label = target.display(); let mut state = self.context.state.lock().await; + let handle = self.context.find_element(&mut state, &target).await?; - // Resolve the CSS selector from the target. - let selector = match &target { - ElementTarget::Selector(sel) => sel.clone(), - ElementTarget::Index(index) => { - let snapshot = self.context.extract_snapshot(&mut state).await?; - snapshot - .selector_for_index(*index) - .ok_or_else(|| BrowserError::new(format!("element index {index} not found")))? - .to_string() - } - }; - - let page = self.context.require_active_page(&state)?; - - // Use JS injection for reliable clear + type: focus the element, set - // its value, and fire input/change events. Also validates that the - // target is an input, textarea, or contenteditable element. - let text_json = serde_json::to_string(&args.text).unwrap_or_default(); - let clear_js = if args.clear { "el.value = '';" } else { "" }; - let js = format!( - r#"(() => {{ - const el = document.querySelector({selector_json}); - if (!el) return JSON.stringify({{success: false, error: 'element not found via selector'}}); - const tag = el.tagName; - const editable = el.isContentEditable; - if (tag !== 'INPUT' && tag !== 'TEXTAREA' && tag !== 'SELECT' && !editable) {{ - return JSON.stringify({{success: false, error: 'element is a ' + tag.toLowerCase() + ', not a text input — check your index or selector'}}); - }} - el.focus(); - if (editable) {{ - {clear_editable} - el.textContent = {text_json}; - }} else {{ - {clear_js} - el.value = {text_json}; - }} - el.dispatchEvent(new Event('input', {{bubbles: true}})); - el.dispatchEvent(new Event('change', {{bubbles: true}})); - return JSON.stringify({{success: true}}); - }})()"#, - selector_json = serde_json::to_string(&selector).unwrap_or_default(), - clear_editable = if args.clear { - "el.textContent = '';" - } else { - "" - }, - clear_js = clear_js, - text_json = text_json, - ); - - let result = page - .evaluate(js) - .await - .map_err(|error| BrowserError::new(format!("type failed: {error}")))?; - - // Check for JS-level errors - if let Some(value) = result.value() - && let Some(json_str) = value.as_str() - && let Ok(parsed) = serde_json::from_str::(json_str) - && parsed.get("success").and_then(|v| v.as_bool()) == Some(false) - { - let error = parsed - .get("error") - .and_then(|v| v.as_str()) - .unwrap_or("type action failed"); - return Err(BrowserError::new(error.to_string())); - } + self.context + .focus_and_type(&state, &handle, &args.text, args.clear) + .await?; state.invalidate_snapshot(); @@ -1816,6 +2156,40 @@ fn page_target_id(page: &chromiumoxide::Page) -> String { page.target_id().inner().clone() } +/// Get the center coordinates of an element identified by `BackendNodeId`. +/// Uses `DOM.getBoxModel` to get the content box, then computes center. +async fn get_element_center( + page: &chromiumoxide::Page, + backend_node_id: BackendNodeId, +) -> Result<(f64, f64), BrowserError> { + let box_model = page + .execute(GetBoxModelParams { + backend_node_id: Some(backend_node_id), + ..Default::default() + }) + .await + .map_err(|error| { + BrowserError::new(format!( + "failed to get box model for element: {error}. \ + The element may not be visible — try scrolling or taking a screenshot." + )) + })?; + + // The content quad is a flat array of 8 values: [x1,y1, x2,y2, x3,y3, x4,y4] + let quad = &box_model.result.model.content; + if quad.inner().len() < 8 { + return Err(BrowserError::new( + "element has no visible content box — it may be hidden or zero-sized", + )); + } + + let points = quad.inner(); + let center_x = (points[0] + points[2] + points[4] + points[6]) / 4.0; + let center_y = (points[1] + points[3] + points[5] + points[7]) / 4.0; + + Ok((center_x, center_y)) +} + // Chrome executable resolution async fn resolve_chrome_executable(config: &BrowserConfig) -> Result { diff --git a/src/tools/browser_extract_dom.js b/src/tools/browser_extract_dom.js deleted file mode 100644 index a335acd5a..000000000 --- a/src/tools/browser_extract_dom.js +++ /dev/null @@ -1,775 +0,0 @@ -// ARIA Snapshot DOM Extraction -// Based on Playwright's ariaSnapshot.ts - generates ARIA-tree structure for AI consumption -(function() { - 'use strict'; - - let currentIndex = 0; - - // Helper: normalize whitespace - function normalizeWhiteSpace(text) { - return text.replace(/\s+/g, ' ').trim(); - } - - // Helper: check if element is visible for ARIA - function isElementHiddenForAria(element) { - const tagName = element.tagName; - if (['STYLE', 'SCRIPT', 'NOSCRIPT', 'TEMPLATE'].includes(tagName)) { - return true; - } - - const style = window.getComputedStyle(element); - - // Check display: contents - if (style.display === 'contents' && element.nodeName !== 'SLOT') { - let hasVisibleChild = false; - for (let child of element.childNodes) { - if (child.nodeType === 1 && !isElementHiddenForAria(child)) { - hasVisibleChild = true; - break; - } - if (child.nodeType === 3 && child.textContent && child.textContent.trim()) { - hasVisibleChild = true; - break; - } - } - if (!hasVisibleChild) return true; - } - - // Check visibility - if (style.visibility !== 'visible') { - return true; - } - - // Check display: none and aria-hidden - if (style.display === 'none') { - return true; - } - - if (element.getAttribute('aria-hidden') === 'true') { - return true; - } - - return false; - } - - // Helper: check if element is visible (bounding box check) - function isElementVisible(element) { - const rect = element.getBoundingClientRect(); - return rect.width > 0 && rect.height > 0; - } - - // Helper: compute element box information - function computeBox(element) { - const style = window.getComputedStyle(element); - const rect = element.getBoundingClientRect(); - const visible = rect.width > 0 && rect.height > 0; - const inline = style.display === 'inline'; - const cursor = style.cursor; - - return { visible, inline, cursor, rect }; - } - - // Helper: check if element receives pointer events - function receivesPointerEvents(element) { - const box = computeBox(element); - if (!box.visible) return false; - - const style = window.getComputedStyle(element); - return style.pointerEvents !== 'none'; - } - - // Helper: get ARIA role for element - function getAriaRole(element) { - // Check explicit role - const explicitRole = element.getAttribute('role'); - if (explicitRole) { - const roles = explicitRole.split(' ').map(r => r.trim()); - const validRole = roles[0]; // take first role - if (validRole) return validRole; - } - - // Implicit roles based on tag name - const tagName = element.tagName; - const implicitRoles = { - 'BUTTON': 'button', - 'A': element.hasAttribute('href') ? 'link' : null, - 'INPUT': getInputRole(element), - 'TEXTAREA': 'textbox', - 'SELECT': element.hasAttribute('multiple') || element.size > 1 ? 'listbox' : 'combobox', - 'H1': 'heading', 'H2': 'heading', 'H3': 'heading', - 'H4': 'heading', 'H5': 'heading', 'H6': 'heading', - 'IMG': element.getAttribute('alt') === '' ? 'presentation' : 'img', - 'NAV': 'navigation', - 'MAIN': 'main', - 'ARTICLE': 'article', - 'SECTION': element.hasAttribute('aria-label') || element.hasAttribute('aria-labelledby') ? 'region' : null, - 'HEADER': 'banner', - 'FOOTER': 'contentinfo', - 'ASIDE': 'complementary', - 'FORM': 'form', - 'TABLE': 'table', - 'UL': 'list', 'OL': 'list', - 'LI': 'listitem', - 'P': 'paragraph', - 'DIALOG': 'dialog', - 'IFRAME': 'iframe' - }; - - return implicitRoles[tagName] || 'generic'; - } - - function getInputRole(input) { - const type = (input.type || 'text').toLowerCase(); - const roles = { - 'button': 'button', - 'checkbox': 'checkbox', - 'radio': 'radio', - 'range': 'slider', - 'search': 'searchbox', - 'text': 'textbox', - 'email': 'textbox', - 'tel': 'textbox', - 'url': 'textbox', - 'number': 'spinbutton' - }; - return roles[type] || 'textbox'; - } - - // Helper: get accessible name for element - function getElementAccessibleName(element) { - // Check aria-label - const ariaLabel = element.getAttribute('aria-label'); - if (ariaLabel) return ariaLabel; - - // Check aria-labelledby - const labelledBy = element.getAttribute('aria-labelledby'); - if (labelledBy) { - const ids = labelledBy.split(/\s+/); - const texts = ids.map(id => { - const el = document.getElementById(id); - return el ? el.textContent : ''; - }).filter(t => t); - if (texts.length) return texts.join(' '); - } - - // Check associated label (for inputs) - if (element.tagName === 'INPUT' || element.tagName === 'TEXTAREA' || element.tagName === 'SELECT') { - const id = element.id; - if (id) { - const label = document.querySelector('label[for="' + id + '"]'); - if (label) return label.textContent || ''; - } - // Check if wrapped in label - const parentLabel = element.closest('label'); - if (parentLabel) { - return parentLabel.textContent || ''; - } - } - - // Check alt for images - if (element.tagName === 'IMG') { - return element.getAttribute('alt') || ''; - } - - // Check title - const title = element.getAttribute('title'); - if (title) return title; - - // Check placeholder for inputs - if (element.tagName === 'INPUT' || element.tagName === 'TEXTAREA') { - const placeholder = element.getAttribute('placeholder'); - if (placeholder) return placeholder; - } - - // For links and buttons, use text content if no other name found - if (element.tagName === 'A' || element.tagName === 'BUTTON') { - const text = element.textContent || ''; - if (text.trim()) return text.trim(); - } - - return ''; - } - - // Helper: get ARIA checked state - function getAriaChecked(element) { - const checked = element.getAttribute('aria-checked'); - if (checked === 'true') return true; - if (checked === 'false') return false; - if (checked === 'mixed') return 'mixed'; - - // Native checkbox/radio - if (element.tagName === 'INPUT') { - if (element.type === 'checkbox' || element.type === 'radio') { - return element.checked; - } - } - - return undefined; - } - - // Helper: get ARIA disabled state - function getAriaDisabled(element) { - const disabled = element.getAttribute('aria-disabled'); - if (disabled === 'true') return true; - - // Native disabled - if (element.disabled !== undefined) { - return element.disabled; - } - - return undefined; - } - - // Helper: get ARIA expanded state - function getAriaExpanded(element) { - const expanded = element.getAttribute('aria-expanded'); - if (expanded === 'true') return true; - if (expanded === 'false') return false; - return undefined; - } - - // Helper: get ARIA level - function getAriaLevel(element) { - const level = element.getAttribute('aria-level'); - if (level) { - const num = parseInt(level, 10); - if (!isNaN(num)) return num; - } - - // Heading level - if (element.tagName.match(/^H[1-6]$/)) { - return parseInt(element.tagName[1], 10); - } - - return undefined; - } - - // Helper: get ARIA pressed state - function getAriaPressed(element) { - const pressed = element.getAttribute('aria-pressed'); - if (pressed === 'true') return true; - if (pressed === 'false') return false; - if (pressed === 'mixed') return 'mixed'; - return undefined; - } - - // Helper: get ARIA selected state - function getAriaSelected(element) { - const selected = element.getAttribute('aria-selected'); - if (selected === 'true') return true; - if (selected === 'false') return false; - return undefined; - } - - // Helper: get CSS content (::before, ::after) - function getCSSContent(element, pseudo) { - try { - const style = window.getComputedStyle(element, pseudo); - const content = style.content; - if (content && content !== 'none' && content !== 'normal') { - // Simple extraction - remove quotes - return content.replace(/^["']|["']$/g, ''); - } - } catch (e) { - // Ignore errors - } - return ''; - } - - // Compute ARIA index for element - function computeAriaIndex(ariaNode) { - // Only assign indices to visible, interactive ARIA roles - if (!ariaNode.box.visible) { - return; - } - - // Only truly interactive roles get an index — things the user can - // click, type into, or toggle. Structural/landmark roles (region, - // navigation, banner, contentinfo, article, list, listitem, heading, - // generic, etc.) are intentionally excluded to keep snapshots small. - const interactiveRoles = [ - 'button', 'link', 'textbox', 'searchbox', 'checkbox', 'radio', - 'combobox', 'listbox', 'option', 'menuitem', 'menuitemcheckbox', - 'menuitemradio', 'tab', 'slider', 'spinbutton', - 'switch', 'treeitem', 'gridcell', 'columnheader', 'rowheader', - 'dialog', 'alertdialog', 'img' - ]; - - // Only assign index to interactive roles or elements with pointer cursor - const hasPointerCursor = ariaNode.box.cursor === 'pointer'; - const isInteractiveRole = interactiveRoles.includes(ariaNode.role); - - if (!isInteractiveRole && !hasPointerCursor) { - return; - } - - // Assign sequential index - ariaNode.index = currentIndex++; - } - - // Convert element to AriaNode - function toAriaNode(element) { - const active = document.activeElement === element; - - // Handle iframe specially - if (element.tagName === 'IFRAME') { - const ariaNode = { - role: 'iframe', - name: '', - children: [], - props: {}, - element: element, - box: computeBox(element), - receivesPointerEvents: true, - active: active - }; - computeAriaIndex(ariaNode); - return ariaNode; - } - - const role = getAriaRole(element); - - // Skip elements without role or with presentation/none - if (!role || role === 'presentation' || role === 'none') { - return null; - } - - const name = normalizeWhiteSpace(getElementAccessibleName(element) || ''); - const box = computeBox(element); - - // Skip inline generic elements with just text - if (role === 'generic' && box.inline && - element.childNodes.length === 1 && - element.childNodes[0].nodeType === 3) { - return null; - } - - const result = { - role: role, - name: name, - children: [], - props: {}, - element: element, - box: box, - receivesPointerEvents: receivesPointerEvents(element), - active: active - }; - - computeAriaIndex(result); - - // Add ARIA properties based on role - const checkedRoles = ['checkbox', 'menuitemcheckbox', 'menuitemradio', 'radio', 'switch']; - if (checkedRoles.includes(role)) { - const checked = getAriaChecked(element); - if (checked !== undefined) result.checked = checked; - } - - const disabledRoles = ['button', 'input', 'select', 'textarea']; - if (disabledRoles.includes(role) || role.includes('menuitem')) { - const disabled = getAriaDisabled(element); - if (disabled !== undefined) result.disabled = disabled; - } - - const expandedRoles = ['button', 'combobox', 'gridcell', 'link', 'menuitem', 'row', 'tab', 'treeitem']; - if (expandedRoles.includes(role)) { - const expanded = getAriaExpanded(element); - if (expanded !== undefined) result.expanded = expanded; - } - - const levelRoles = ['heading', 'listitem', 'row', 'treeitem']; - if (levelRoles.includes(role)) { - const level = getAriaLevel(element); - if (level !== undefined) result.level = level; - } - - const pressedRoles = ['button']; - if (pressedRoles.includes(role)) { - const pressed = getAriaPressed(element); - if (pressed !== undefined) result.pressed = pressed; - } - - const selectedRoles = ['gridcell', 'option', 'row', 'tab', 'treeitem']; - if (selectedRoles.includes(role)) { - const selected = getAriaSelected(element); - if (selected !== undefined) result.selected = selected; - } - - // Special handling for input/textarea values - if (element.tagName === 'INPUT' || element.tagName === 'TEXTAREA') { - if (element.type !== 'checkbox' && element.type !== 'radio' && element.type !== 'file') { - result.children = [element.value || '']; - } - } - - return result; - } - - // Main visitor function - function visit(ariaNode, node, parentElementVisible, visited) { - if (visited.has(node)) return; - visited.add(node); - - // Handle text nodes - if (node.nodeType === 3) { // TEXT_NODE - if (!parentElementVisible) return; - - const text = node.nodeValue; - // Skip text inside textbox - if (ariaNode.role !== 'textbox' && text) { - ariaNode.children.push(text); - } - return; - } - - // Only process element nodes - if (node.nodeType !== 1) return; // ELEMENT_NODE - - const element = node; - - // Check visibility - const isElementVisibleForAria = !isElementHiddenForAria(element); - let visible = isElementVisibleForAria || isElementVisible(element); - - // Skip if not visible for ARIA - if (!visible) return; - - // Handle aria-owns - const ariaChildren = []; - if (element.hasAttribute('aria-owns')) { - const ids = element.getAttribute('aria-owns').split(/\s+/); - for (const id of ids) { - const ownedElement = document.getElementById(id); - if (ownedElement) ariaChildren.push(ownedElement); - } - } - - // Convert to aria node - const childAriaNode = toAriaNode(element); - if (childAriaNode) { - ariaNode.children.push(childAriaNode); - } - - // Process element (add CSS content, children, etc.) - processElement(childAriaNode || ariaNode, element, ariaChildren, visible, visited); - } - - function processElement(ariaNode, element, ariaChildren, parentElementVisible, visited) { - const style = window.getComputedStyle(element); - const display = style ? style.display : 'inline'; - const treatAsBlock = (display !== 'inline' || element.nodeName === 'BR') ? ' ' : ''; - - if (treatAsBlock) { - ariaNode.children.push(treatAsBlock); - } - - // Add ::before content - const beforeContent = getCSSContent(element, '::before'); - if (beforeContent) { - ariaNode.children.push(beforeContent); - } - - // Process shadow DOM slots - if (element.nodeName === 'SLOT') { - const assignedNodes = element.assignedNodes(); - for (const child of assignedNodes) { - visit(ariaNode, child, parentElementVisible, visited); - } - } else { - // Process regular children - for (let child = element.firstChild; child; child = child.nextSibling) { - if (!child.assignedSlot) { - visit(ariaNode, child, parentElementVisible, visited); - } - } - - // Process shadow root - if (element.shadowRoot) { - for (let child = element.shadowRoot.firstChild; child; child = child.nextSibling) { - visit(ariaNode, child, parentElementVisible, visited); - } - } - } - - // Process aria-owns children - for (const child of ariaChildren) { - visit(ariaNode, child, parentElementVisible, visited); - } - - // Add ::after content - const afterContent = getCSSContent(element, '::after'); - if (afterContent) { - ariaNode.children.push(afterContent); - } - - if (treatAsBlock) { - ariaNode.children.push(treatAsBlock); - } - - // Remove redundant children - if (ariaNode.children.length === 1 && ariaNode.name === ariaNode.children[0]) { - ariaNode.children = []; - } - - // Add special props - if (ariaNode.role === 'link' && element.hasAttribute('href')) { - ariaNode.props.url = element.getAttribute('href'); - } - - if (ariaNode.role === 'textbox' && element.hasAttribute('placeholder')) { - const placeholder = element.getAttribute('placeholder'); - if (placeholder !== ariaNode.name) { - ariaNode.props.placeholder = placeholder; - } - } - } - - // Normalize string children - function normalizeStringChildren(ariaNode) { - const normalizedChildren = []; - let buffer = []; - - function flushBuffer() { - if (buffer.length === 0) return; - const text = normalizeWhiteSpace(buffer.join('')); - if (text) { - normalizedChildren.push(text); - } - buffer = []; - } - - for (const child of ariaNode.children || []) { - if (typeof child === 'string') { - buffer.push(child); - } else { - flushBuffer(); - normalizeStringChildren(child); - normalizedChildren.push(child); - } - } - flushBuffer(); - - ariaNode.children = normalizedChildren; - - // Remove if child equals name - if (ariaNode.children.length === 1 && ariaNode.children[0] === ariaNode.name) { - ariaNode.children = []; - } - } - - // Normalize generic roles (remove unnecessary generic wrappers) - function normalizeGenericRoles(node) { - function normalizeChildren(node) { - const result = []; - - for (const child of node.children || []) { - if (typeof child === 'string') { - result.push(child); - continue; - } - - const normalized = normalizeChildren(child); - result.push(...normalized); - } - - // Remove generic that encloses single element - const removeSelf = node.role === 'generic' && !node.name && - result.length <= 1 && - result.every(c => typeof c !== 'string' && c.index !== undefined); - - if (removeSelf) { - return result; - } - - node.children = result; - return [node]; - } - - normalizeChildren(node); - } - - // Serialize ariaNode to plain object (remove Element references) - function serializeAriaNode(ariaNode) { - const result = { - role: ariaNode.role, - name: ariaNode.name, - children: [], - props: ariaNode.props - }; - - // Include index if present - if (ariaNode.index !== undefined) result.index = ariaNode.index; - if (ariaNode.active) result.active = true; - if (ariaNode.checked !== undefined) result.checked = ariaNode.checked; - if (ariaNode.disabled !== undefined) result.disabled = ariaNode.disabled; - if (ariaNode.expanded !== undefined) result.expanded = ariaNode.expanded; - if (ariaNode.level !== undefined) result.level = ariaNode.level; - if (ariaNode.pressed !== undefined) result.pressed = ariaNode.pressed; - if (ariaNode.selected !== undefined) result.selected = ariaNode.selected; - - // Serialize box info - result.box_info = { - visible: ariaNode.box.visible, - cursor: ariaNode.box.cursor - }; - - // Serialize children - for (const child of ariaNode.children) { - if (typeof child === 'string') { - result.children.push(child); - } else { - result.children.push(serializeAriaNode(child)); - } - } - - return result; - } - - // Collect selectors and iframe indices - function collectSelectorsAndIframes(ariaNode, selectors, iframe_indices) { - if (ariaNode.index !== undefined && ariaNode.element) { - // Store CSS selector for element at its index position - const selector = buildSelector(ariaNode.element); - // Ensure selectors array is large enough - while (selectors.length <= ariaNode.index) { - selectors.push(''); - } - selectors[ariaNode.index] = selector; - - if (ariaNode.role === 'iframe') { - iframe_indices.push(ariaNode.index); - } - } - - for (const child of ariaNode.children) { - if (typeof child !== 'string') { - collectSelectorsAndIframes(child, selectors, iframe_indices); - } - } - } - - // Build a short, stable CSS selector for an element. - // Strategy: walk up the tree looking for the nearest ancestor with an ID, - // then build a short relative path from there. Falls back to a short - // path from if no ID is found within a reasonable depth. - function buildSelector(element) { - // Quick path: element itself has an ID - if (element.id) { - return '#' + CSS.escape(element.id); - } - - // Build a segment for one level: tag + first-class + nth-child if needed - function segment(el) { - let sel = el.tagName.toLowerCase(); - if (el.className && typeof el.className === 'string') { - const cls = el.className.trim().split(/\s+/).filter(Boolean); - if (cls.length > 0) { - sel += '.' + CSS.escape(cls[0]); - } - } - const parent = el.parentElement; - if (parent) { - const sameTag = Array.from(parent.children).filter( - s => s.tagName === el.tagName - ); - if (sameTag.length > 1) { - const idx = Array.from(parent.children).indexOf(el); - sel += ':nth-child(' + (idx + 1) + ')'; - } - } - return sel; - } - - // Walk up looking for an ancestor with an ID (max 6 levels) - const parts = []; - let current = element; - const maxDepth = 6; - - for (let i = 0; i < maxDepth && current && current !== document.body; i++) { - if (i > 0 && current.id) { - // Found an ID anchor — build selector from here - const anchor = '#' + CSS.escape(current.id); - parts.unshift(anchor); - const candidate = parts.join(' > '); - // Verify uniqueness - try { - if (document.querySelectorAll(candidate).length === 1) { - return candidate; - } - } catch(e) { /* selector parse error, fall through */ } - // If not unique with ID anchor, keep going - parts.shift(); - parts.unshift(segment(current)); - } else { - parts.unshift(segment(current)); - } - current = current.parentElement; - } - - // No nearby ID found — use the path from body - const candidate = parts.join(' > '); - try { - if (document.querySelectorAll(candidate).length === 1) { - return candidate; - } - } catch(e) { /* fall through */ } - - // If still not unique, keep walking up to body - while (current && current !== document.body) { - parts.unshift(segment(current)); - current = current.parentElement; - } - return parts.join(' > '); - } - - // Main execution - try { - const rootElement = document.body || document.documentElement; - const visited = new Set(); - - // Reset index counter - currentIndex = 0; - - // Create root fragment node - const snapshot = { - role: 'fragment', - name: '', - children: [], - props: {}, - element: rootElement, - box: computeBox(rootElement), - receivesPointerEvents: true - }; - - // Visit the DOM tree - visit(snapshot, rootElement, true, visited); - - // Normalize - normalizeStringChildren(snapshot); - normalizeGenericRoles(snapshot); - - // Collect selectors and iframe indices - const selectors = []; - const iframe_indices = []; - collectSelectorsAndIframes(snapshot, selectors, iframe_indices); - - // Serialize and return - const serialized = serializeAriaNode(snapshot); - - const result = { - root: serialized, - selectors: selectors, - iframe_indices: iframe_indices - }; - - // Stringify inside try/catch so we catch circular-ref or size errors - return JSON.stringify(result); - - } catch (error) { - return JSON.stringify({ - error: error.toString() + ' | stack: ' + (error.stack || 'none'), - root: { role: 'fragment', name: '', children: [], props: {}, box_info: { visible: false } }, - selectors: [], - iframe_indices: [] - }); - } -})()) From bba9c13b88ff18ef1ce259d7a0c23dfd45e48eb3 Mon Sep 17 00:00:00 2001 From: James Pine Date: Sat, 7 Mar 2026 03:46:29 -0800 Subject: [PATCH 07/11] fix: prevent workers from reaching Done via reasoning-only responses Workers using models with extended thinking could exit the agent loop with a Reasoning-only response (no Text, no ToolCall). The nudge gate in should_nudge_tool_usage only checked for Text and ToolCall variants, so a Reasoning-only response slipped through without triggering a nudge. The worker reached Done with an empty result string, producing no RESULT block for the channel to relay. Two mechanical guards: 1. should_nudge_tool_usage now treats responses with no Text and no ToolCall (e.g. reasoning-only) as nudge-worthy, sending the worker back to actually do the work. 2. Empty-result safety net in worker.rs: if the initial task loop produces an empty/whitespace result, the worker fails instead of reaching Done. Also strengthened prompt language around premature outcome signaling in worker.md.j2, set_status tool description, and parameter docs. --- docs/content/docs/(configuration)/secrets.mdx | 17 +++ docs/content/docs/(features)/browser.mdx | 39 ++++-- docs/design-docs/tool-nudging.md | 17 ++- interface/src/components/WebChatPanel.tsx | 95 --------------- interface/src/routes/AgentWorkers.tsx | 10 +- prompts/en/tools/browser_description.md.j2 | 2 +- prompts/en/tools/set_status_description.md.j2 | 2 +- prompts/en/worker.md.j2 | 12 +- src/agent/worker.rs | 16 +++ src/api/state.rs | 53 ++++++++ src/api/workers.rs | 13 +- src/hooks/spacebot.rs | 60 ++++++++- src/tools/browser.rs | 114 ++++++++++++++---- src/tools/set_status.rs | 8 +- 14 files changed, 304 insertions(+), 154 deletions(-) diff --git a/docs/content/docs/(configuration)/secrets.mdx b/docs/content/docs/(configuration)/secrets.mdx index 52342560c..9c954bdd5 100644 --- a/docs/content/docs/(configuration)/secrets.mdx +++ b/docs/content/docs/(configuration)/secrets.mdx @@ -100,6 +100,23 @@ Worker creates a GitHub bot account The tool accepts an optional `category` parameter to override auto-categorization. If omitted, the same rules as the API apply -- known internal credentials are categorized as system, everything else as tool. +### Browser Tool Integration + +The `browser_type` tool supports a `secret` parameter for typing sensitive values (passwords, tokens, API keys) into web form fields without exposing them in tool arguments, output, or LLM context. + +``` +Task says "sign in with password from GH_PASSWORD" + → Worker calls browser_type(index: 2, secret: "GH_PASSWORD") + → Secret store resolves "GH_PASSWORD" to its stored value + → Value is typed into the password field via CDP + → Tool output: "Typed secret 'GH_PASSWORD' into element at index 2" + → The actual password never appears in tool arguments or results +``` + +This is the only safe way to enter credentials in the browser. The `text` parameter types values in plain text that appears in tool output and LLM context -- never use it for passwords or tokens. Both tool-category and system-category secrets can be resolved via the `secret` parameter, since the value is used internally by the browser tool and never exposed to subprocesses. + +If a task references an environment variable for a credential (e.g. "use password from `$GH_PASSWORD`"), the secret should be stored in the secret store under that name and passed via the `secret` parameter. Do not shell out to `echo` or otherwise resolve credential values into plain text. + ## Encryption at Rest The store has two modes: diff --git a/docs/content/docs/(features)/browser.mdx b/docs/content/docs/(features)/browser.mdx index 4c299022f..bfe2809ba 100644 --- a/docs/content/docs/(features)/browser.mdx +++ b/docs/content/docs/(features)/browser.mdx @@ -79,12 +79,27 @@ Single `browser` tool with an `action` discriminator. All actions share one argu Act kinds: - **`click`** -- Click an element by ref. -- **`type`** -- Click an element to focus it, then type `text` into it. +- **`type`** -- Click an element to focus it, then type `text` into it. For passwords and credentials, use `secret` instead of `text` (see below). - **`press_key`** -- Press a keyboard key (e.g., `Enter`, `Tab`, `Escape`). With `element_ref`, presses on that element. Without, dispatches to the page. - **`hover`** -- Move the mouse over an element. - **`scroll_into_view`** -- Scroll an element into the viewport. - **`focus`** -- Focus an element without clicking. +### Secure Text Entry + +The `browser_type` tool accepts a `secret` parameter as an alternative to `text` for typing sensitive values into form fields. When `secret` is provided, the value is resolved from the [secret store](/docs/secrets) and typed into the element without ever appearing in tool arguments, output, or LLM context. + +``` +browser_type { index: 2, secret: "GH_PASSWORD" } + → resolves GH_PASSWORD from the secret store + → types the value into the element + → returns: "Typed secret 'GH_PASSWORD' into element at index 2" +``` + +The `text` and `secret` parameters are mutually exclusive. Use `text` for non-sensitive input (usernames, search queries, form data) and `secret` for passwords, tokens, API keys, and any other credentials. Both tool-category and system-category secrets can be used. + +If a task references a credential by environment variable name (e.g. "use password from `$GH_PASSWORD`"), the worker should pass that name to the `secret` parameter. Never shell out to `echo` or otherwise resolve credential values into plain text for browser entry. + ### JavaScript | Action | Required Args | Description | @@ -96,19 +111,17 @@ Act kinds: ## Typical Workflow ``` -browser { action: "launch" } -browser { action: "navigate", url: "https://example.com" } -browser { action: "snapshot" } - → e0: [textbox] "Email", e1: [textbox] "Password", e2: [button] "Sign In" -browser { action: "act", act_kind: "click", element_ref: "e0" } -browser { action: "act", act_kind: "type", element_ref: "e0", text: "user@example.com" } -browser { action: "act", act_kind: "click", element_ref: "e1" } -browser { action: "act", act_kind: "type", element_ref: "e1", text: "hunter2" } -browser { action: "act", act_kind: "click", element_ref: "e2" } -browser { action: "snapshot" } +browser_launch {} +browser_navigate { url: "https://example.com" } +browser_snapshot {} + → [index=0] textbox "Email", [index=1] textbox "Password", [index=2] button "Sign In" +browser_type { index: 0, text: "user@example.com" } +browser_type { index: 1, secret: "ACCOUNT_PASSWORD" } ← password from secret store +browser_click { index: 2 } +browser_snapshot {} → (new page state after login) -browser { action: "screenshot" } -browser { action: "close" } +browser_screenshot {} +browser_close {} ``` ## Screenshots diff --git a/docs/design-docs/tool-nudging.md b/docs/design-docs/tool-nudging.md index 106d9ed86..958410883 100644 --- a/docs/design-docs/tool-nudging.md +++ b/docs/design-docs/tool-nudging.md @@ -22,11 +22,14 @@ Workers must explicitly signal a terminal outcome via `set_status(kind: "outcome Worker loop starts → LLM completion → If response includes tool calls → continue normally - → If text-only response: + → If response has no tool calls: → Has outcome been signaled via set_status(kind: "outcome")? → allow exit - → No outcome signal? → Terminate with "tool_nudge" reason → retry with nudge prompt - → Max 2 retries per prompt request - → If retries exhausted → worker fails (PromptCancelled) + → No outcome signal? + → Response has text OR is reasoning-only (no text, no tools)? + → Terminate with "tool_nudge" reason → retry with nudge prompt + → Max 2 retries per prompt request + → If retries exhausted → worker fails (PromptCancelled) + → After initial task loop: if result text is empty → worker fails (safety net) ``` ### Outcome Signaling @@ -62,9 +65,12 @@ let hook = SpacebotHook::new(...) - The flag persists for the rest of the prompt request **Nudge decision** (`src/hooks/spacebot.rs:should_nudge_tool_usage`): -- Returns `true` when: policy enabled, nudge active, no outcome signaled, response is text-only +- Returns `true` when: policy enabled, nudge active, no outcome signaled, AND response has no tool calls (either non-empty text or reasoning-only with no text) - Returns `false` when: outcome signaled, response has tool calls, policy disabled +**Empty result guard** (`src/agent/worker.rs`): +- After the initial task loop, if the result text is empty/whitespace-only, the worker fails instead of reaching `Done`. This is a safety net for cases where a reasoning-only response slips past the nudge gate (e.g. after nudge retries are exhausted). + **Retry flow** (`prompt_with_tool_nudge_retry`): 1. Reset nudge state at start of prompt (clears `outcome_signaled`) 2. On text-only response without outcome: terminate with `TOOL_NUDGE_REASON` @@ -100,6 +106,7 @@ The nudging behavior has comprehensive test coverage: - **Unit tests** (`src/hooks/spacebot.rs`): - `nudges_on_every_text_only_response_without_outcome` — nudge fires on every text-only response - `nudges_after_tool_calls_without_outcome` — the exact bug case (read_skill + progress status + text exit) + - `nudges_on_reasoning_only_response_without_outcome` — nudge fires when response is only Reasoning blocks (no text, no tools) - `outcome_signal_allows_text_only_completion` — outcome signal unlocks exit - `progress_status_does_not_signal_outcome` — explicit progress kind doesn't unlock - `default_status_kind_does_not_signal_outcome` — omitted kind doesn't unlock diff --git a/interface/src/components/WebChatPanel.tsx b/interface/src/components/WebChatPanel.tsx index e473ddc4f..d46c7bd66 100644 --- a/interface/src/components/WebChatPanel.tsx +++ b/interface/src/components/WebChatPanel.tsx @@ -4,8 +4,6 @@ import { useWebChat } from "@/hooks/useWebChat"; import { isOpenCodeWorker, type ActiveWorker } from "@/hooks/useChannelLiveState"; import { useLiveContext } from "@/hooks/useLiveContext"; import { Markdown } from "@/components/Markdown"; -import { LiveDuration } from "@/components/LiveDuration"; -import type { TimelineWorkerRun } from "@/api/client"; interface WebChatPanelProps { agentId: string; @@ -60,88 +58,6 @@ function ActiveWorkersPanel({ workers, agentId }: { workers: ActiveWorker[]; age ); } -function ChatWorkerRunItem({ item, live, agentId }: { item: TimelineWorkerRun; live?: ActiveWorker; agentId: string }) { - const [expanded, setExpanded] = useState(!!live); - const wasLiveRef = useRef(!!live); - - // Auto-expand when a worker becomes live after initial mount - useEffect(() => { - if (live && !wasLiveRef.current) { - setExpanded(true); - } - wasLiveRef.current = !!live; - }, [live]); - - const oc = isOpenCodeWorker(live ?? { task: item.task }); - const isLive = !!live; - - return ( -
-
- - - Open - -
- {expanded && isLive && live && ( -
- - {live.status} - {live.currentTool && ( - {live.currentTool} - )} - {live.toolCalls > 0 && ( - {live.toolCalls} tool calls - )} -
- )} - {expanded && !isLive && item.result && ( -
-
- {item.result} -
-
- )} -
- ); -} - function ThinkingIndicator() { return (
@@ -285,17 +201,6 @@ export function WebChatPanel({ agentId }: WebChatPanelProps) { )} {timeline.map((item) => { - if (item.type === "worker_run") { - const live = liveState?.workers[item.id]; - return ( - - ); - } if (item.type !== "message") return null; return (
diff --git a/interface/src/routes/AgentWorkers.tsx b/interface/src/routes/AgentWorkers.tsx index 184bb5f87..db98b08d3 100644 --- a/interface/src/routes/AgentWorkers.tsx +++ b/interface/src/routes/AgentWorkers.tsx @@ -408,9 +408,13 @@ function WorkerDetail({ setActiveTab(hasOpenCodeEmbed ? "opencode" : "transcript"); }, [detail.id, hasOpenCodeEmbed]); - // Use persisted transcript if available, otherwise fall back to live SSE transcript. - // Strip the final action step if it duplicates the result text shown above. - const rawTranscript = detail.transcript ?? (isLive ? liveTranscript : null); + // For live workers, prefer the SSE-accumulated transcript when it has content + // (it's the most up-to-date). Fall back to detail.transcript which may come + // from the DB or the server-side live cache (survives page refreshes). + // For completed workers, always use the persisted DB transcript. + const rawTranscript = isLive + ? (liveTranscript && liveTranscript.length > 0 ? liveTranscript : detail.transcript ?? null) + : detail.transcript ?? null; const transcript = useMemo(() => { if (!rawTranscript || !detail.result) return rawTranscript; const last = rawTranscript[rawTranscript.length - 1]; diff --git a/prompts/en/tools/browser_description.md.j2 b/prompts/en/tools/browser_description.md.j2 index 2a21dd338..7c3da4c5b 100644 --- a/prompts/en/tools/browser_description.md.j2 +++ b/prompts/en/tools/browser_description.md.j2 @@ -1 +1 @@ -Browser automation tools. Navigate pages, get ARIA snapshots with numbered element indices, click/type by index, take screenshots. Workflow: browser_navigate → browser_snapshot → browser_click/browser_type by index → browser_screenshot. +Browser automation tools. Navigate pages, get ARIA snapshots with numbered element indices, click/type by index, take screenshots. Workflow: browser_navigate → browser_snapshot → browser_click/browser_type by index → browser_screenshot. For passwords and credentials, use browser_type with `secret` (not `text`) to type values from the secret store without exposing them. diff --git a/prompts/en/tools/set_status_description.md.j2 b/prompts/en/tools/set_status_description.md.j2 index c9eaf1281..c0b609a4c 100644 --- a/prompts/en/tools/set_status_description.md.j2 +++ b/prompts/en/tools/set_status_description.md.j2 @@ -1 +1 @@ -Report the current status of your work. The status appears in the channel's status block. Keep statuses concise (1-2 sentences) and informative. Use kind "progress" (default) for intermediate updates. Use kind "outcome" when the task has reached a terminal result — you must signal an outcome before finishing. \ No newline at end of file +Report the current status of your work. The status appears in the channel's status block. Keep statuses concise (1-2 sentences) and informative. Use kind "progress" (default) for intermediate updates. Use kind "outcome" ONLY when ALL parts of the task have reached a terminal result — you must signal an outcome before finishing, but signaling prematurely (before every step is complete) is a critical error that causes the task to be reported as done when it isn't. \ No newline at end of file diff --git a/prompts/en/worker.md.j2 b/prompts/en/worker.md.j2 index 5b7d3354e..224010736 100644 --- a/prompts/en/worker.md.j2 +++ b/prompts/en/worker.md.j2 @@ -43,7 +43,7 @@ Sandbox mode is **disabled**. Execute the task you were given. Use your tools. Report your status as you make progress. -**When the task is complete** (or you've reached a definitive result), call `set_status` with `kind: "outcome"` and a summary of what happened. Only then provide your final text response. If you try to finish without signaling an outcome, the system will send you back to keep working. +**When the task is fully complete** (or you've reached a definitive result for every part of the task), call `set_status` with `kind: "outcome"` and a summary of what happened. Only then provide your final text response. If you try to finish without signaling an outcome, the system will send you back to keep working. Do not signal an outcome prematurely — completing part of a multi-step task is not completion. ## Task @@ -69,7 +69,11 @@ Bad progress updates: - "starting" - "reading file" -**Outcome** (`kind: "outcome"`): Signal that you have reached a terminal result. You **must** call `set_status` with `kind: "outcome"` before finishing your task. Include a concise summary of the result. Examples: +**Outcome** (`kind: "outcome"`): Signal that you have reached a terminal result. You **must** call `set_status` with `kind: "outcome"` before finishing your task. Include a concise summary of the result. + +**CRITICAL: Do not signal an outcome until every step of the task is actually complete.** If the task has multiple steps, you must finish ALL of them before calling `set_status(kind: "outcome")`. Signaling an outcome after completing only some steps is a failure — it tells the channel the work is done when it isn't. Review your original task description and verify every requirement has been met before signaling. + +Examples: - `set_status(status: "Email sent to jamie@spacedrive.com", kind: "outcome")` - `set_status(status: "Build failed: 3 type errors in auth module", kind: "outcome")` @@ -111,6 +115,8 @@ Automate a headless Chrome browser. Each action is a separate tool. **Element indices** are assigned during `browser_snapshot` and look like `[index=0]`, `[index=1]`. Always snapshot before using index-based interaction — indices reset on each snapshot. +**Passwords and credentials:** `browser_type` accepts a `secret` parameter for secure text entry. When a task requires entering a password, token, or API key, use `secret` instead of `text`: `{"index": 2, "secret": "GH_PASSWORD"}`. The secret is resolved from the secret store and typed into the field — the actual value never appears in tool arguments or output. NEVER put credentials in the `text` parameter or resolve them via `shell` (`echo $PASSWORD`). If the task references an environment variable like `$GH_PASSWORD`, pass that name directly to the `secret` parameter. + **Other tools:** `browser_press_key` (Enter, Tab, Escape), `browser_evaluate` (run JS on the page), `browser_tab_open`, `browser_tab_list`, `browser_tab_close`. **Prefer index or selector over evaluate:** Use `browser_snapshot` + `browser_click`/`browser_type` to interact with elements. Fall back to `browser_evaluate` for reading page data or when the other tools don't work for a specific page. @@ -138,7 +144,7 @@ Do not log or echo the secret value after storing it. 1. Do the work. Don't describe what you would do — use the tools and do it. 2. Update your status at meaningful checkpoints. The channel is using your status to keep the user informed. 3. If a tool call fails, try to recover. Read the error, adjust, and retry. Don't give up on the first failure. -4. **Signal your outcome.** When the task is done, call `set_status(kind: "outcome")` with a summary of the result before providing your final text. You cannot finish without this — the system will reject premature exits. +4. **Signal your outcome — only when truly done.** When ALL steps of the task are complete, call `set_status(kind: "outcome")` with a summary of the result before providing your final text. You cannot finish without this — the system will reject premature exits. Conversely, do NOT signal an outcome if there are remaining steps. Completing 2 out of 7 steps is not done. Clicking a button and not waiting for the result is not done. If the task says "do X, Y, and Z", you must do X AND Y AND Z before signaling. 5. Stay focused on the task. Don't explore tangential work unless it's necessary to complete what you were asked to do. 6. If you receive follow-up messages (interactive mode), treat them as additional instructions building on your existing context. {% if tool_secret_names %} diff --git a/src/agent/worker.rs b/src/agent/worker.rs index b0f40706d..4dd7a1528 100644 --- a/src/agent/worker.rs +++ b/src/agent/worker.rs @@ -336,6 +336,7 @@ impl Worker { }); } + self.persist_transcript(&compacted_history, &history).await; self.maybe_compact_history(&mut compacted_history, &mut history) .await; prompt = @@ -395,6 +396,21 @@ impl Worker { } }; + // Safety net: if the worker produced an empty result (e.g. reasoning-only + // response that slipped past the nudge gate), treat it as a failure. An empty + // result means the channel has nothing to relay to the user. + if !resuming && result.trim().is_empty() { + self.state = WorkerState::Failed; + self.hook.send_status("failed (empty result)"); + self.write_failure_log(&history, "worker produced empty result — likely a reasoning-only exit that bypassed the outcome gate"); + self.persist_transcript(&compacted_history, &history).await; + tracing::error!(worker_id = %self.id, "worker produced empty result, treating as failure"); + return Err(crate::error::AgentError::Other(anyhow::anyhow!( + "worker produced empty result without signaling a meaningful outcome" + )) + .into()); + } + // For interactive workers, enter a follow-up loop let mut follow_up_failure: Option = None; if let Some(mut input_rx) = self.input_rx.take() { diff --git a/src/api/state.rs b/src/api/state.rs index 0f97f5d6e..9ff4ecc20 100644 --- a/src/api/state.rs +++ b/src/api/state.rs @@ -4,6 +4,7 @@ use crate::agent::channel::ChannelState; use crate::agent::cortex_chat::CortexChatSession; use crate::agent::status::StatusBlock; use crate::config::{Binding, DefaultsConfig, DiscordPermissions, RuntimeConfig, SlackPermissions}; +use crate::conversation::worker_transcript::{ActionContent, TranscriptStep}; use crate::cron::{CronStore, Scheduler}; use crate::llm::LlmManager; use crate::mcp::McpManager; @@ -115,6 +116,11 @@ pub struct ApiState { pub agent_groups: ArcSwap>, /// Org-level humans for the topology UI. pub agent_humans: ArcSwap>, + /// Live transcript cache for running workers. Accumulates `TranscriptStep`s + /// from `ToolStarted`/`ToolCompleted` events so that page refreshes can + /// recover the transcript without waiting for the worker to complete. + /// Keyed by worker_id, cleared on worker completion. + pub live_worker_transcripts: Arc>>>, } /// Events sent to SSE clients. Wraps ProcessEvents with agent context. @@ -291,6 +297,7 @@ impl ApiState { agent_links: ArcSwap::from_pointee(Vec::new()), agent_groups: ArcSwap::from_pointee(Vec::new()), agent_humans: ArcSwap::from_pointee(Vec::new()), + live_worker_transcripts: Arc::new(RwLock::new(HashMap::new())), } } @@ -321,6 +328,16 @@ impl ApiState { self.channel_states.write().await.remove(channel_id); } + /// Retrieve the live transcript cache for a running worker. + /// + /// Returns `Some` with the accumulated transcript steps if the worker is + /// currently running and has emitted tool calls. Returns `None` if no + /// cached transcript exists (worker completed or never started). + pub async fn get_live_transcript(&self, worker_id: &str) -> Option> { + let guard = self.live_worker_transcripts.read().await; + guard.get(worker_id).cloned() + } + /// Register an agent's event stream. Spawns a task that forwards /// ProcessEvents into the aggregated API event stream. pub fn register_agent_events( @@ -329,6 +346,7 @@ impl ApiState { mut agent_event_rx: broadcast::Receiver, ) { let api_tx = self.event_tx.clone(); + let live_transcripts = self.live_worker_transcripts.clone(); tokio::spawn(async move { loop { match agent_event_rx.recv().await { @@ -343,6 +361,10 @@ impl ApiState { interactive, .. } => { + live_transcripts + .write() + .await + .insert(worker_id.to_string(), Vec::new()); api_tx .send(ApiEvent::WorkerStarted { agent_id: agent_id.clone(), @@ -404,6 +426,10 @@ impl ApiState { success, .. } => { + live_transcripts + .write() + .await + .remove(&worker_id.to_string()); api_tx .send(ApiEvent::WorkerCompleted { agent_id: agent_id.clone(), @@ -437,6 +463,21 @@ impl ApiState { .. } => { let (process_type, id_str) = process_id_info(process_id); + // Accumulate tool call into live transcript for workers. + if let ProcessId::Worker(worker_id) = process_id { + let call_id = format!("live_{}", uuid::Uuid::new_v4()); + let step = TranscriptStep::Action { + content: vec![ActionContent::ToolCall { + id: call_id, + name: tool_name.clone(), + args: args.clone(), + }], + }; + let mut guard = live_transcripts.write().await; + if let Some(steps) = guard.get_mut(&worker_id.to_string()) { + steps.push(step); + } + } api_tx .send(ApiEvent::ToolStarted { agent_id: agent_id.clone(), @@ -456,6 +497,18 @@ impl ApiState { .. } => { let (process_type, id_str) = process_id_info(process_id); + // Accumulate tool result into live transcript for workers. + if let ProcessId::Worker(worker_id) = process_id { + let step = TranscriptStep::ToolResult { + call_id: String::new(), + name: tool_name.clone(), + text: result.clone(), + }; + let mut guard = live_transcripts.write().await; + if let Some(steps) = guard.get_mut(&worker_id.to_string()) { + steps.push(step); + } + } api_tx .send(ApiEvent::ToolCompleted { agent_id: agent_id.clone(), diff --git a/src/api/workers.rs b/src/api/workers.rs index dff7a4f09..aecd65df9 100644 --- a/src/api/workers.rs +++ b/src/api/workers.rs @@ -170,13 +170,18 @@ pub(super) async fn worker_detail( })? .ok_or(StatusCode::NOT_FOUND)?; - let transcript = detail.transcript_blob.as_deref().and_then(|blob| { - worker_transcript::deserialize_transcript(blob) + let transcript = match detail.transcript_blob.as_deref() { + Some(blob) => worker_transcript::deserialize_transcript(blob) .map_err(|error| { tracing::warn!(%error, worker_id = %query.worker_id, "failed to decompress transcript"); }) - .ok() - }); + .ok(), + None => { + // No persisted transcript yet — check the live transcript cache + // so page refreshes can recover in-progress worker transcripts. + state.get_live_transcript(&query.worker_id).await + } + }; Ok(Json(WorkerDetailResponse { id: detail.id, diff --git a/src/hooks/spacebot.rs b/src/hooks/spacebot.rs index 9cc7bace9..fe606b8e8 100644 --- a/src/hooks/spacebot.rs +++ b/src/hooks/spacebot.rs @@ -402,14 +402,29 @@ impl SpacebotHook { return false; } - // Text-only response without a prior outcome signal — nudge. - response.choice.iter().any(|content| { + // Response without tool calls and without a prior outcome signal. + // Nudge if the response contains any non-empty text, OR if it + // contains no text at all (e.g. reasoning-only). A response that + // is purely reasoning/image with no text and no tool calls means + // the worker hasn't actually done anything — send it back. + let has_any_text = response.choice.iter().any(|content| { if let rig::message::AssistantContent::Text(text) = content { !text.text.trim().is_empty() } else { false } - }) + }); + + // Nudge on non-empty text (worker tried to narrate instead of + // working) or on no text at all (reasoning-only exit attempt). + has_any_text + || !response.choice.iter().any(|content| { + matches!( + content, + rig::message::AssistantContent::Text(_) + | rig::message::AssistantContent::ToolCall(_) + ) + }) } } @@ -1134,4 +1149,43 @@ mod tests { } if text_delta == "hi" && aggregated_text == "hi" )); } + + #[tokio::test] + async fn nudges_on_reasoning_only_response_without_outcome() { + // A response with only Reasoning content (no Text, no ToolCall) should + // trigger a nudge. This is the exact bug case: models with extended + // thinking produce a Reasoning block and exit the agent loop without + // doing any work. + let hook = make_hook().with_tool_nudge_policy(ToolNudgePolicy::Enabled); + let prompt = prompt_message(); + hook.reset_tool_nudge_state(); + hook.set_tool_nudge_request_active(true); + + let reasoning_response = CompletionResponse { + choice: OneOrMany::one(AssistantContent::reasoning("thinking about the task...")), + message_id: None, + usage: Usage::default(), + raw_response: RawResponse { + body: serde_json::json!({}), + }, + }; + + let _ = + >::on_completion_call(&hook, &prompt, &[]) + .await; + let response = >::on_completion_response( + &hook, + &prompt, + &reasoning_response, + ) + .await; + assert!( + matches!( + response, + HookAction::Terminate { ref reason } + if reason == SpacebotHook::TOOL_NUDGE_REASON + ), + "Expected nudge — reasoning-only response without outcome should be rejected" + ); + } } diff --git a/src/tools/browser.rs b/src/tools/browser.rs index 8c4d9834c..fb231e09e 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -12,6 +12,7 @@ //! pages where JS injection fails. use crate::config::BrowserConfig; +use crate::secrets::store::SecretsStore; use chromiumoxide::browser::{Browser, BrowserConfig as ChromeConfig}; use chromiumoxide::fetcher::{BrowserFetcher, BrowserFetcherOptions}; @@ -260,7 +261,7 @@ impl AxSnapshot { .filter(|(_, node)| { node.parent_id .as_ref() - .map_or(true, |pid| !id_to_idx.contains_key(pid.inner())) + .is_none_or(|pid| !id_to_idx.contains_key(pid.inner())) }) .map(|(i, _)| i) .collect(); @@ -425,14 +426,15 @@ impl AxSnapshot { // Assign an index to interactive elements that have a BackendNodeId. let role_lower = role.to_lowercase(); let is_interactive = INTERACTIVE_ROLES.contains(&role_lower.as_str()); - let element_index = if is_interactive && ax.backend_dom_node_id.is_some() { - let idx = *next_index; - *next_index += 1; - node_ids.push(ax.backend_dom_node_id.expect("checked above")); - Some(idx) - } else { - None - }; + let element_index = + if let Some(backend_node_id) = ax.backend_dom_node_id.filter(|_| is_interactive) { + let idx = *next_index; + *next_index += 1; + node_ids.push(backend_node_id); + Some(idx) + } else { + None + }; // Skip structural nodes that have no name, no index, and only serve // as tree wrappers — but only if they have exactly one child (pass @@ -767,12 +769,16 @@ pub struct TabInfo { // Shared helper struct that all tools reference /// Shared context cloned into each browser tool. Holds the browser state mutex, -/// config, and screenshot directory. +/// config, screenshot directory, and optional secrets store for secure text entry. #[derive(Debug, Clone)] pub(crate) struct BrowserContext { state: Arc>, config: BrowserConfig, screenshot_dir: PathBuf, + /// Secrets store for resolving secret names in `browser_type`. When present, + /// the `secret` parameter can look up credential values without exposing + /// them in tool arguments or output. + secrets: Option>, } impl BrowserContext { @@ -780,11 +786,13 @@ impl BrowserContext { state: Arc>, config: BrowserConfig, screenshot_dir: PathBuf, + secrets: Option>, ) -> Self { Self { state, config, screenshot_dir, + secrets, } } @@ -810,6 +818,10 @@ impl BrowserContext { &self, state: &'a mut BrowserState, ) -> Result<&'a AxSnapshot, BrowserError> { + // The `is_some` + `expect` pattern is intentional — `if let Some(s) = + // state.snapshot.as_ref()` would borrow `state` for `'a`, preventing + // the mutable assignment to `state.snapshot` in the cache-miss path. + #[allow(clippy::unnecessary_unwrap)] if state.snapshot.is_some() { return Ok(state.snapshot.as_ref().expect("just checked")); } @@ -1326,7 +1338,9 @@ impl Tool for BrowserSnapshotTool { ToolDefinition { name: Self::NAME.to_string(), description: "Get the page's ARIA accessibility tree with numbered element indices. \ - Use the [index=N] values in browser_click, browser_type, etc." + Use the [index=N] values in browser_click, browser_type, etc. When \ + you see password fields or other sensitive inputs, use browser_type \ + with the `secret` parameter (not `text`) to type credentials securely." .to_string(), parameters: serde_json::json!({ "type": "object", "properties": {} }), } @@ -1478,8 +1492,15 @@ pub struct BrowserTypeArgs { pub index: Option, /// CSS selector to target directly (e.g., "#login_field", "input[name='email']"). pub selector: Option, - /// The text to type into the element. - pub text: String, + /// The text to type into the element. Mutually exclusive with `secret`. + /// Do NOT put secret values (passwords, tokens, API keys) here — they will + /// appear in tool output. Use the `secret` parameter instead. + pub text: Option, + /// Name of a secret from the secret store to type into the element. + /// The secret value is resolved server-side and never appears in tool + /// arguments, output, or LLM context. Use this for passwords, tokens, + /// API keys, and any other sensitive values. Mutually exclusive with `text`. + pub secret: Option, /// Whether to clear the field before typing. Defaults to true. #[serde(default = "default_true")] pub clear: bool, @@ -1499,17 +1520,22 @@ impl Tool for BrowserTypeTool { ToolDefinition { name: Self::NAME.to_string(), description: "Type text into an input element by index (from browser_snapshot) or \ - CSS selector." + CSS selector. Provide either `text` for plain text or `secret` for \ + sensitive values (passwords, tokens, API keys). When using `secret`, \ + pass the secret name (e.g. \"GH_PASSWORD\") — the value is resolved \ + from the secret store and typed without ever appearing in tool \ + arguments or output. NEVER put passwords or credentials in the `text` \ + parameter — always use `secret` for sensitive values." .to_string(), parameters: serde_json::json!({ "type": "object", "properties": { "index": { "type": "integer", "description": "Element index from snapshot" }, "selector": { "type": "string", "description": "CSS selector (e.g. \"#login_field\", \"input[name='email']\")" }, - "text": { "type": "string", "description": "Text to type" }, + "text": { "type": "string", "description": "Plain text to type. Do NOT use for passwords or secrets — use the `secret` parameter instead." }, + "secret": { "type": "string", "description": "Name of a secret from the secret store (e.g. \"GH_PASSWORD\", \"NPM_TOKEN\"). The value is resolved securely and never appears in output." }, "clear": { "type": "boolean", "default": true, "description": "Clear the field before typing (default true)" } - }, - "required": ["text"] + } }), } } @@ -1518,23 +1544,59 @@ impl Tool for BrowserTypeTool { let target = ElementTarget::from_args(args.index, args.selector)?; let label = target.display(); + // Resolve the text to type: either from `secret` (secure) or `text` (plain). + let (text_value, is_secret) = match (&args.secret, &args.text) { + (Some(secret_name), None) => { + let store = self.context.secrets.as_ref().ok_or_else(|| { + BrowserError::new( + "secret store is not available — secrets cannot be resolved. \ + Add the secret via the API or use the `text` parameter instead.", + ) + })?; + let decrypted = store.get(secret_name).map_err(|error| { + BrowserError::new(format!("failed to resolve secret '{secret_name}': {error}")) + })?; + (decrypted.expose().to_string(), true) + } + (None, Some(text)) => (text.clone(), false), + (Some(_), Some(_)) => { + return Err(BrowserError::new( + "`text` and `secret` are mutually exclusive — provide one or the other", + )); + } + (None, None) => { + return Err(BrowserError::new( + "provide either `text` (plain text) or `secret` (secret name from the \ + secret store) to type into the element", + )); + } + }; + let mut state = self.context.state.lock().await; let handle = self.context.find_element(&mut state, &target).await?; self.context - .focus_and_type(&state, &handle, &args.text, args.clear) + .focus_and_type(&state, &handle, &text_value, args.clear) .await?; state.invalidate_snapshot(); - let display_text = if args.text.len() > 50 { - format!("{}...", &args.text[..50]) + // Secret values must never appear in tool output. + let message = if is_secret { + format!( + "Typed secret '{}' into element at {label}", + args.secret.as_deref().unwrap_or("unknown") + ) } else { - args.text.clone() + let display_text = if text_value.len() > 50 { + format!("{}...", &text_value[..50]) + } else { + text_value + }; + format!("Typed '{display_text}' into element at {label}") }; - Ok(BrowserOutput::success(format!( - "Typed '{display_text}' into element at {label}" - ))) + + Ok(BrowserOutput::success(message)) } } @@ -2052,7 +2114,9 @@ pub fn register_browser_tools( Arc::new(Mutex::new(BrowserState::new())) }; - let context = BrowserContext::new(state, config, screenshot_dir); + let secrets = runtime_config.secrets.load().as_ref().as_ref().cloned(); + + let context = BrowserContext::new(state, config, screenshot_dir, secrets); server .tool(BrowserLaunchTool { diff --git a/src/tools/set_status.rs b/src/tools/set_status.rs index 8ad89b35d..ee2e0f092 100644 --- a/src/tools/set_status.rs +++ b/src/tools/set_status.rs @@ -54,6 +54,12 @@ pub struct SetStatusError(String); /// way the worker can describe). Workers **must** emit an `outcome` status /// before finishing; the system will nudge them back to work if they try to /// stop without one. +/// +/// NOTE: The outcome gate only checks *whether* an outcome was signaled, not +/// *whether all task steps are actually complete*. Premature outcome signaling +/// (e.g. after 2 of 7 steps) is handled via prompt-level instructions, not +/// structural enforcement. See the worker prompt for the anti-premature-exit +/// language. #[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Deserialize, Serialize, JsonSchema)] #[serde(rename_all = "snake_case")] pub enum StatusKind { @@ -110,7 +116,7 @@ impl Tool for SetStatusTool { "type": "string", "enum": ["progress", "outcome"], "default": "progress", - "description": "Use \"progress\" for intermediate updates. Use \"outcome\" when the task has reached a terminal result (success or failure) and you are ready to finish." + "description": "Use \"progress\" for intermediate updates. Use \"outcome\" ONLY when ALL steps of the task have reached a terminal result (success or failure) and you are ready to finish. Do not signal outcome if there are remaining steps — premature outcome signaling causes the task to be incorrectly reported as complete." } }, "required": ["status"] From dbd8995b8a203c9d0ef61ec35a585479819d91d8 Mon Sep 17 00:00:00 2001 From: James Pine Date: Sat, 7 Mar 2026 04:37:51 -0800 Subject: [PATCH 08/11] fix: track consecutive nudge attempts, add WorkerText event, refactor transcript UI Tool nudge retries now track consecutive text-only responses instead of total nudges per prompt lifetime. Successful tool calls reset the counter via on_tool_result, so long-running workers (e.g. browser exploration with 12+ tool calls) no longer get killed after a brief narration blip. Add WorkerText process event to stream model reasoning text between tool calls to the dashboard in real-time. Extract ToolCall component with tool-specific renderers (browser, shell, file) and paired call/result matching. Remove duplicate status text injection from worker transcripts. --- docs/design-docs/tool-nudging.md | 23 +- interface/src/api/client.ts | 10 +- interface/src/components/ToolCall.tsx | 644 +++++++++++++++++++++++++ interface/src/hooks/useLiveContext.tsx | 34 +- interface/src/routes/AgentWorkers.tsx | 109 +---- src/agent/channel_history.rs | 3 +- src/agent/cortex.rs | 3 +- src/api/state.rs | 27 ++ src/api/system.rs | 1 + src/hooks/spacebot.rs | 129 ++++- src/lib.rs | 8 + 11 files changed, 864 insertions(+), 127 deletions(-) create mode 100644 interface/src/components/ToolCall.tsx diff --git a/docs/design-docs/tool-nudging.md b/docs/design-docs/tool-nudging.md index 958410883..c5fb2f635 100644 --- a/docs/design-docs/tool-nudging.md +++ b/docs/design-docs/tool-nudging.md @@ -21,17 +21,20 @@ Workers must explicitly signal a terminal outcome via `set_status(kind: "outcome ``` Worker loop starts → LLM completion - → If response includes tool calls → continue normally + → If response includes tool calls → continue normally (reset consecutive nudge counter) → If response has no tool calls: → Has outcome been signaled via set_status(kind: "outcome")? → allow exit → No outcome signal? → Response has text OR is reasoning-only (no text, no tools)? → Terminate with "tool_nudge" reason → retry with nudge prompt - → Max 2 retries per prompt request - → If retries exhausted → worker fails (PromptCancelled) + → Max 2 consecutive text-only responses before failure + → Any successful tool call resets the counter + → If consecutive retries exhausted → worker fails (PromptCancelled) → After initial task loop: if result text is empty → worker fails (safety net) ``` +The retry budget tracks **consecutive** text-only responses, not total nudges across the worker's lifetime. Whenever a tool call completes successfully, the counter resets to zero. This prevents long-running workers from being killed after a brief narration blip mid-task — a worker that has made 12 tool calls and then narrates once gets the full retry budget again, not a depleted one from earlier nudges. + ### Outcome Signaling The `set_status` tool has a `kind` field: @@ -71,12 +74,18 @@ let hook = SpacebotHook::new(...) **Empty result guard** (`src/agent/worker.rs`): - After the initial task loop, if the result text is empty/whitespace-only, the worker fails instead of reaching `Done`. This is a safety net for cases where a reasoning-only response slips past the nudge gate (e.g. after nudge retries are exhausted). +**Consecutive nudge counter** (`on_tool_result` in `src/hooks/spacebot.rs`): +- When nudge policy is enabled and a tool call completes, `nudge_attempts` is reset to zero +- This means the retry budget is per-consecutive-text-only-streak, not per-prompt-lifetime +- A worker that alternates between tool calls and narration will never exhaust its budget + **Retry flow** (`prompt_with_tool_nudge_retry`): -1. Reset nudge state at start of prompt (clears `outcome_signaled`) +1. Reset nudge state at start of prompt (clears `outcome_signaled` and `nudge_attempts`) 2. On text-only response without outcome: terminate with `TOOL_NUDGE_REASON` -3. Catch termination in retry loop, prune history, retry with nudge prompt +3. Catch termination in retry loop, increment `nudge_attempts`, prune history, retry with nudge prompt 4. On success: prune the nudge prompt from history to keep context clean -5. After `TOOL_NUDGE_MAX_RETRIES` (2) exhausted: `PromptCancelled` propagates to worker → `WorkerState::Failed` +5. After `TOOL_NUDGE_MAX_RETRIES` (2) consecutive text-only responses: `PromptCancelled` propagates to worker → `WorkerState::Failed` +6. Any successful tool call (inside the Rig agent loop) resets `nudge_attempts` to zero via `on_tool_result` **History hygiene**: - Synthetic nudge prompts are removed from history on both success and retry @@ -107,6 +116,8 @@ The nudging behavior has comprehensive test coverage: - `nudges_on_every_text_only_response_without_outcome` — nudge fires on every text-only response - `nudges_after_tool_calls_without_outcome` — the exact bug case (read_skill + progress status + text exit) - `nudges_on_reasoning_only_response_without_outcome` — nudge fires when response is only Reasoning blocks (no text, no tools) + - `tool_result_resets_consecutive_nudge_counter` — successful tool calls reset the nudge attempt counter + - `nudge_counter_not_reset_when_policy_disabled` — counter untouched when nudging is disabled - `outcome_signal_allows_text_only_completion` — outcome signal unlocks exit - `progress_status_does_not_signal_outcome` — explicit progress kind doesn't unlock - `default_status_kind_does_not_signal_outcome` — omitted kind doesn't unlock diff --git a/interface/src/api/client.ts b/interface/src/api/client.ts index 5ffd47ff6..893f81a1e 100644 --- a/interface/src/api/client.ts +++ b/interface/src/api/client.ts @@ -146,6 +146,13 @@ export interface OpenCodePartUpdatedEvent { part: OpenCodePart; } +export interface WorkerTextEvent { + type: "worker_text"; + agent_id: string; + worker_id: string; + text: string; +} + export type ApiEvent = | InboundMessageEvent | OutboundMessageEvent @@ -159,7 +166,8 @@ export type ApiEvent = | BranchCompletedEvent | ToolStartedEvent | ToolCompletedEvent - | OpenCodePartUpdatedEvent; + | OpenCodePartUpdatedEvent + | WorkerTextEvent; async function fetchJson(path: string): Promise { const response = await fetch(`${API_BASE}${path}`); diff --git a/interface/src/components/ToolCall.tsx b/interface/src/components/ToolCall.tsx new file mode 100644 index 000000000..c07cc6c78 --- /dev/null +++ b/interface/src/components/ToolCall.tsx @@ -0,0 +1,644 @@ +import { useState, useMemo } from "react"; +import { cx } from "@/ui/utils"; +import type { ActionContent, TranscriptStep } from "@/api/client"; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export type ToolCallStatus = "running" | "completed" | "error"; + +export interface ToolCallPair { + /** The call_id linking tool_call to tool_result */ + id: string; + /** Tool name (e.g. "browser_navigate", "shell") */ + name: string; + /** Raw JSON args string from the tool_call */ + argsRaw: string; + /** Parsed args object (null if parse failed) */ + args: Record | null; + /** Raw result text from tool_result (null if still running) */ + resultRaw: string | null; + /** Parsed result object (null if not JSON or still running) */ + result: Record | null; + /** Current state */ + status: ToolCallStatus; +} + +// --------------------------------------------------------------------------- +// Transcript → ToolCallPair[] pairing +// --------------------------------------------------------------------------- + +/** + * Walk a flat TranscriptStep[] and pair each tool_call with its tool_result + * via call_id matching, plus emit standalone text steps. Returns an ordered + * list of renderable items: text blocks and paired tool calls. + */ +export type TranscriptItem = + | { kind: "text"; text: string } + | { kind: "tool"; pair: ToolCallPair }; + +export function pairTranscriptSteps(steps: TranscriptStep[]): TranscriptItem[] { + const items: TranscriptItem[] = []; + const resultsById = new Map(); + + // First pass: index all tool_result steps by call_id + for (const step of steps) { + if (step.type === "tool_result") { + resultsById.set(step.call_id, { name: step.name, text: step.text }); + } + } + + // Second pass: emit items in order + for (const step of steps) { + if (step.type === "action") { + for (const content of step.content) { + if (content.type === "text") { + items.push({ kind: "text", text: content.text }); + } else if (content.type === "tool_call") { + const result = resultsById.get(content.id); + const parsedArgs = tryParseJson(content.args); + const parsedResult = result ? tryParseJson(result.text) : null; + + // Detect error: result text starts with "Error" or contains error indicators + const isError = result + ? isErrorResult(result.text, parsedResult) + : false; + + items.push({ + kind: "tool", + pair: { + id: content.id, + name: content.name, + argsRaw: content.args, + args: parsedArgs, + resultRaw: result?.text ?? null, + result: parsedResult, + status: result + ? isError + ? "error" + : "completed" + : "running", + }, + }); + } + } + } + // tool_result steps are consumed by the pairing above, not rendered standalone + } + + return items; +} + +function tryParseJson(text: string): Record | null { + if (!text || text.trim().length === 0) return null; + try { + const parsed = JSON.parse(text); + if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) { + return parsed as Record; + } + return null; + } catch { + return null; + } +} + +function isErrorResult( + text: string, + parsed: Record | null, +): boolean { + if (parsed?.error) return true; + if (parsed?.status === "error") return true; + const lower = text.toLowerCase(); + return ( + lower.startsWith("error:") || + lower.startsWith("error -") || + lower.startsWith("failed:") + ); +} + +// --------------------------------------------------------------------------- +// Tool-specific rendering +// --------------------------------------------------------------------------- + +interface ToolRenderer { + /** One-line summary shown in the collapsed header (after tool name) */ + summary(pair: ToolCallPair): string | null; + /** Custom args display (return null to use default JSON) */ + argsView?(pair: ToolCallPair): React.ReactNode | null; + /** Custom result display (return null to use default text) */ + resultView?(pair: ToolCallPair): React.ReactNode | null; +} + +const toolRenderers: Record = { + browser_launch: { + summary(pair) { + const headless = pair.args?.headless; + return headless === false ? "Launch browser (visible)" : "Launch browser"; + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + browser_navigate: { + summary(pair) { + const url = pair.args?.url; + return url ? truncate(String(url), 60) : null; + }, + resultView(pair) { + if (!pair.result) return ; + const title = pair.result.title as string | undefined; + const url = pair.result.url as string | undefined; + return ( +
+ {title && ( +

+ Title: + {title} +

+ )} + {url && ( +

+ {truncate(url, 80)} +

+ )} +
+ ); + }, + }, + + browser_snapshot: { + summary(pair) { + if (!pair.resultRaw) return "Taking snapshot..."; + // Count elements: look for "[N]" patterns in the ARIA snapshot + const matches = pair.resultRaw.match(/\[\d+\]/g); + const count = matches?.length ?? 0; + return count > 0 + ? `${count} interactive element${count !== 1 ? "s" : ""}` + : "Page snapshot"; + }, + resultView(pair) { + if (!pair.resultRaw) return null; + // ARIA tree snapshots are YAML-like text — render in a scrollable pre + // but cap the default view + return ; + }, + }, + + browser_click: { + summary(pair) { + const index = pair.args?.index; + return index !== undefined ? `Click element [${index}]` : "Click"; + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + browser_type: { + summary(pair) { + const index = pair.args?.index; + const hasSecret = pair.args?.secret !== undefined; + const text = pair.args?.text; + if (hasSecret) { + return index !== undefined + ? `Type secret into [${index}]` + : "Type secret"; + } + if (text) { + const display = truncate(String(text), 30); + return index !== undefined + ? `Type "${display}" into [${index}]` + : `Type "${display}"`; + } + return index !== undefined ? `Type into [${index}]` : "Type"; + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + browser_press_key: { + summary(pair) { + const key = pair.args?.key; + return key ? `Press ${key}` : "Press key"; + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + browser_screenshot: { + summary() { + return "Capture screenshot"; + }, + resultView(pair) { + if (!pair.resultRaw) return null; + // Screenshot results may contain base64 image data or a path + if (pair.result?.base64) { + const mimeType = + (pair.result.mime_type as string) ?? "image/png"; + return ( +
+ Browser screenshot +
+ ); + } + return ; + }, + }, + + browser_evaluate: { + summary(pair) { + const expression = pair.args?.expression; + return expression + ? truncate(String(expression), 50) + : "Evaluate JS"; + }, + argsView(pair) { + const expression = pair.args?.expression; + if (!expression) return null; + return ( +
+

+ JavaScript +

+
+						{String(expression)}
+					
+
+ ); + }, + }, + + browser_tab_open: { + summary(pair) { + const url = pair.args?.url; + return url ? `Open tab: ${truncate(String(url), 50)}` : "Open new tab"; + }, + }, + + browser_tab_list: { + summary() { + return "List tabs"; + }, + }, + + browser_tab_close: { + summary(pair) { + const tabId = pair.args?.tab_id; + return tabId !== undefined + ? `Close tab ${tabId}` + : "Close tab"; + }, + }, + + browser_close: { + summary() { + return "Close browser"; + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + shell: { + summary(pair) { + const command = pair.args?.command; + return command ? truncate(String(command), 60) : null; + }, + argsView(pair) { + const command = pair.args?.command; + if (!command) return null; + return ( +
+
+						$ 
+						{String(command)}
+					
+
+ ); + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + file: { + summary(pair) { + const path = pair.args?.path; + const action = pair.args?.action; + if (action && path) { + return `${action}: ${truncate(String(path), 50)}`; + } + return path ? truncate(String(path), 60) : null; + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + exec: { + summary(pair) { + const command = pair.args?.command; + return command ? truncate(String(command), 60) : null; + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + set_status: { + summary(pair) { + const kind = pair.args?.kind; + const message = pair.args?.message; + if (kind === "outcome") { + return message + ? `Outcome: ${truncate(String(message), 50)}` + : "Outcome set"; + } + return message ? truncate(String(message), 60) : null; + }, + resultView() { + // set_status results are not interesting — just "ok" + return null; + }, + }, +}; + +// Default renderer for tools without a specific renderer +const defaultRenderer: ToolRenderer = { + summary(pair) { + if (!pair.argsRaw || pair.argsRaw === "{}") return null; + return truncate(pair.argsRaw, 60); + }, +}; + +function getRenderer(name: string): ToolRenderer { + return toolRenderers[name] ?? defaultRenderer; +} + +// --------------------------------------------------------------------------- +// Shared sub-components +// --------------------------------------------------------------------------- + +function ResultLine({ text }: { text: string }) { + return ( +

{text}

+ ); +} + +function ResultText({ text }: { text: string | null }) { + if (!text) return null; + return ( +
+			{text}
+		
+ ); +} + +function CollapsiblePre({ + text, + maxLines = 20, +}: { + text: string; + maxLines?: number; +}) { + const [expanded, setExpanded] = useState(false); + const lines = text.split("\n"); + const needsCollapse = lines.length > maxLines; + const displayText = + needsCollapse && !expanded + ? lines.slice(0, maxLines).join("\n") + "\n..." + : text; + + return ( +
+
+				{displayText}
+			
+ {needsCollapse && ( + + )} +
+ ); +} + +// --------------------------------------------------------------------------- +// Status display helpers +// --------------------------------------------------------------------------- + +const STATUS_ICONS: Record = { + running: "\u25B6", // ▶ + completed: "\u2713", // ✓ + error: "\u2717", // ✗ +}; + +const STATUS_COLORS: Record = { + running: "text-accent", + completed: "text-emerald-500", + error: "text-red-400", +}; + +/** Human-readable tool name: browser_navigate → Navigate */ +function formatToolName(name: string): string { + // Strip common prefixes for cleaner display + const stripped = name + .replace(/^browser_/, "") + .replace(/^tab_/, "Tab "); + + return stripped + .split("_") + .map((word) => word.charAt(0).toUpperCase() + word.slice(1)) + .join(" "); +} + +/** Tool category label shown as a faint prefix */ +function toolCategory(name: string): string | null { + if (name.startsWith("browser_")) return "Browser"; + return null; +} + +// --------------------------------------------------------------------------- +// Main component +// --------------------------------------------------------------------------- + +export function ToolCall({ pair }: { pair: ToolCallPair }) { + const [expanded, setExpanded] = useState(false); + const renderer = getRenderer(pair.name); + const summary = renderer.summary(pair); + const category = toolCategory(pair.name); + const displayName = formatToolName(pair.name); + + return ( +
+ {/* Header — always visible */} + + + {/* Expanded body */} + {expanded && ( +
+ {/* Args section */} + {renderArgs(pair, renderer)} + {/* Result section */} + {renderResult(pair, renderer)} +
+ )} +
+ ); +} + +function renderArgs(pair: ToolCallPair, renderer: ToolRenderer): React.ReactNode { + // Try custom args view first + if (renderer.argsView) { + const custom = renderer.argsView(pair); + if (custom) return custom; + } + + // Default: show parsed args as key-value pairs, or raw JSON + if (pair.args && Object.keys(pair.args).length > 0) { + return ( +
+
+ {Object.entries(pair.args).map(([key, value]) => ( +

+ {key}: + + {formatArgValue(key, value)} + +

+ ))} +
+
+ ); + } + + if (pair.argsRaw && pair.argsRaw !== "{}" && pair.argsRaw.trim().length > 0) { + return ( +
+
+					{pair.argsRaw}
+				
+
+ ); + } + + return null; +} + +function renderResult(pair: ToolCallPair, renderer: ToolRenderer): React.ReactNode { + if (pair.status === "running") { + return ( +
+ + Running... +
+ ); + } + + // Try custom result view first + if (renderer.resultView) { + const custom = renderer.resultView(pair); + if (custom !== undefined) return custom; + } + + // Default result rendering + if (!pair.resultRaw) return null; + + // If we have parsed JSON result, show key-value pairs + if (pair.result && Object.keys(pair.result).length > 0) { + return ( +
+
+ {Object.entries(pair.result).map(([key, value]) => ( +

+ {key}: + + {typeof value === "string" + ? truncate(value, 200) + : JSON.stringify(value)} + +

+ ))} +
+
+ ); + } + + // Plain text result + return ; +} + +function formatArgValue(key: string, value: unknown): string { + // Redact secret references + if (key === "secret" && typeof value === "string") { + return "***"; + } + if (typeof value === "string") { + return truncate(value, 100); + } + if (typeof value === "boolean" || typeof value === "number") { + return String(value); + } + return JSON.stringify(value); +} + +function truncate(text: string, maxLen: number): string { + if (text.length <= maxLen) return text; + return text.slice(0, maxLen) + "..."; +} diff --git a/interface/src/hooks/useLiveContext.tsx b/interface/src/hooks/useLiveContext.tsx index 0fc58d4c8..584413e87 100644 --- a/interface/src/hooks/useLiveContext.tsx +++ b/interface/src/hooks/useLiveContext.tsx @@ -1,6 +1,6 @@ import { createContext, useContext, useCallback, useRef, useState, useMemo, type ReactNode } from "react"; import { useQuery, useQueryClient } from "@tanstack/react-query"; -import { api, type AgentMessageEvent, type ChannelInfo, type ToolStartedEvent, type ToolCompletedEvent, type WorkerStatusEvent, type TranscriptStep, type OpenCodePart, type OpenCodePartUpdatedEvent } from "@/api/client"; +import { api, type AgentMessageEvent, type ChannelInfo, type ToolStartedEvent, type ToolCompletedEvent, type TranscriptStep, type OpenCodePart, type OpenCodePartUpdatedEvent, type WorkerTextEvent } from "@/api/client"; import { generateId } from "@/lib/id"; import { useEventSource, type ConnectionState } from "@/hooks/useEventSource"; import { useChannelLiveState, type ChannelLiveState, type ActiveWorker } from "@/hooks/useChannelLiveState"; @@ -147,18 +147,9 @@ export function LiveContextProvider({ children }: { children: ReactNode }) { const wrappedWorkerStatus = useCallback((data: unknown) => { channelHandlers.worker_status(data); - const event = data as WorkerStatusEvent; - // Push status text as an action step in the live transcript - if (event.status && event.status !== "starting" && event.status !== "running") { - setLiveTranscripts((prev) => { - const steps = prev[event.worker_id] ?? []; - const step: TranscriptStep = { - type: "action", - content: [{ type: "text", text: event.status }], - }; - return { ...prev, [event.worker_id]: [...steps, step] }; - }); - } + // Status text comes from set_status tool calls which already appear as + // paired tool_started/tool_completed events in the transcript. No need + // to duplicate them as standalone text steps. bumpWorkerVersion(); }, [channelHandlers, bumpWorkerVersion]); @@ -249,6 +240,20 @@ export function LiveContextProvider({ children }: { children: ReactNode }) { bumpWorkerVersion(); }, [bumpWorkerVersion]); + // Handle worker text — model reasoning text emitted between tool calls + const handleWorkerText = useCallback((data: unknown) => { + const event = data as WorkerTextEvent; + setLiveTranscripts((prev) => { + const steps = prev[event.worker_id] ?? []; + const step: TranscriptStep = { + type: "action", + content: [{ type: "text", text: event.text }], + }; + return { ...prev, [event.worker_id]: [...steps, step] }; + }); + bumpWorkerVersion(); + }, [bumpWorkerVersion]); + // Merge channel handlers with agent message + task handlers const handlers = useMemo( () => ({ @@ -260,11 +265,12 @@ export function LiveContextProvider({ children }: { children: ReactNode }) { tool_started: wrappedToolStarted, tool_completed: wrappedToolCompleted, opencode_part_updated: handleOpenCodePartUpdated, + worker_text: handleWorkerText, agent_message_sent: handleAgentMessage, agent_message_received: handleAgentMessage, task_updated: bumpTaskVersion, }), - [channelHandlers, wrappedWorkerStarted, wrappedWorkerStatus, wrappedWorkerIdle, wrappedWorkerCompleted, wrappedToolStarted, wrappedToolCompleted, handleOpenCodePartUpdated, handleAgentMessage, bumpTaskVersion], + [channelHandlers, wrappedWorkerStarted, wrappedWorkerStatus, wrappedWorkerIdle, wrappedWorkerCompleted, wrappedToolStarted, wrappedToolCompleted, handleOpenCodePartUpdated, handleWorkerText, handleAgentMessage, bumpTaskVersion], ); const onReconnect = useCallback(() => { diff --git a/interface/src/routes/AgentWorkers.tsx b/interface/src/routes/AgentWorkers.tsx index db98b08d3..b45449ed0 100644 --- a/interface/src/routes/AgentWorkers.tsx +++ b/interface/src/routes/AgentWorkers.tsx @@ -8,9 +8,9 @@ import { type WorkerRunInfo, type WorkerDetailResponse, type TranscriptStep, - type ActionContent, type OpenCodePart, } from "@/api/client"; +import {ToolCall, pairTranscriptSteps} from "@/components/ToolCall"; import {Badge} from "@/ui/Badge"; import {formatTimeAgo, formatDuration} from "@/lib/format"; import {LiveDuration} from "@/components/LiveDuration"; @@ -579,22 +579,22 @@ function WorkerDetail({ {isLive && !isIdle ? "Live Transcript" : "Transcript"}
- {transcript.map((step, index) => ( + {pairTranscriptSteps(transcript).map((item, index) => ( - + {item.kind === "text" ? ( +
+ {item.text} +
+ ) : ( + + )}
))} - {isRunning && currentTool && ( -
- - Running {currentTool}... -
- )} {isIdle && (
Waiting for follow-up input... @@ -1030,20 +1030,6 @@ function TaskText({text}: {text: string}) { ); } -function TranscriptStepView({step}: {step: TranscriptStep}) { - if (step.type === "action") { - return ( -
- {step.content.map((content, index) => ( - - ))} -
- ); - } - - return ; -} - function CancelWorkerButton({ channelId, workerId, @@ -1070,82 +1056,7 @@ function CancelWorkerButton({ ); } -function ActionContentView({content}: {content: ActionContent}) { - if (content.type === "text") { - return ( -
- {content.text} -
- ); - } - - return ; -} - -function ToolCallView({ - content, -}: { - content: Extract; -}) { - const [expanded, setExpanded] = useState(false); - return ( -
- - {expanded && ( -
-					{content.args}
-				
- )} -
- ); -} - -function ToolResultView({ - step, -}: { - step: Extract; -}) { - const [expanded, setExpanded] = useState(false); - const isLong = step.text.length > 300; - const displayText = - isLong && !expanded ? step.text.slice(0, 300) + "..." : step.text; - - return ( -
-
- - {step.name && ( - - {step.name} - - )} -
-
-				{displayText}
-			
- {isLong && ( - - )} -
- ); -} // -- OpenCode-native part renderers -- diff --git a/src/agent/channel_history.rs b/src/agent/channel_history.rs index 27c455449..f56a9624f 100644 --- a/src/agent/channel_history.rs +++ b/src/agent/channel_history.rs @@ -422,7 +422,8 @@ pub(crate) fn event_is_for_channel(event: &ProcessEvent, channel_id: &ChannelId) } => event_channel.as_ref() == Some(channel_id), ProcessEvent::OpenCodePartUpdated { .. } | ProcessEvent::StatusUpdate { .. } - | ProcessEvent::TaskUpdated { .. } => false, + | ProcessEvent::TaskUpdated { .. } + | ProcessEvent::WorkerText { .. } => false, } } diff --git a/src/agent/cortex.rs b/src/agent/cortex.rs index 6d7342361..7e44d6fa6 100644 --- a/src/agent/cortex.rs +++ b/src/agent/cortex.rs @@ -1257,7 +1257,8 @@ fn signal_from_event(event: ProcessEvent) -> Option { // UI-only events — no cortex signal needed. ProcessEvent::OpenCodeSessionCreated { .. } | ProcessEvent::OpenCodePartUpdated { .. } - | ProcessEvent::WorkerInitialResult { .. } => return None, + | ProcessEvent::WorkerInitialResult { .. } + | ProcessEvent::WorkerText { .. } => return None, }) } diff --git a/src/api/state.rs b/src/api/state.rs index 9ff4ecc20..fac9fe079 100644 --- a/src/api/state.rs +++ b/src/api/state.rs @@ -246,6 +246,12 @@ pub enum ApiEvent { worker_id: String, part: crate::opencode::types::OpenCodePart, }, + /// A worker emitted text content (model reasoning between tool calls). + WorkerText { + agent_id: String, + worker_id: String, + text: String, + }, } impl ApiState { @@ -593,6 +599,27 @@ impl ApiState { }) .ok(); } + ProcessEvent::WorkerText { + worker_id, text, .. + } => { + // Accumulate into live transcript cache + let step = TranscriptStep::Action { + content: vec![ActionContent::Text { text: text.clone() }], + }; + let mut guard = live_transcripts.write().await; + if let Some(steps) = guard.get_mut(&worker_id.to_string()) { + steps.push(step); + } + drop(guard); + + api_tx + .send(ApiEvent::WorkerText { + agent_id: agent_id.clone(), + worker_id: worker_id.to_string(), + text: text.clone(), + }) + .ok(); + } _ => {} } } diff --git a/src/api/system.rs b/src/api/system.rs index 2891caa9e..4fe9d27f6 100644 --- a/src/api/system.rs +++ b/src/api/system.rs @@ -99,6 +99,7 @@ pub(super) async fn events_sse( ApiEvent::AgentMessageReceived { .. } => "agent_message_received", ApiEvent::TaskUpdated { .. } => "task_updated", ApiEvent::OpenCodePartUpdated { .. } => "opencode_part_updated", + ApiEvent::WorkerText { .. } => "worker_text", }; yield Ok(axum::response::sse::Event::default() .event(event_type) diff --git a/src/hooks/spacebot.rs b/src/hooks/spacebot.rs index fe606b8e8..9f1a4a9a3 100644 --- a/src/hooks/spacebot.rs +++ b/src/hooks/spacebot.rs @@ -41,6 +41,10 @@ pub struct SpacebotHook { /// Once signaled, the nudge system allows text-only responses to pass /// through as legitimate completions. outcome_signaled: std::sync::Arc, + /// Counts consecutive text-only nudge attempts. Reset to zero whenever a + /// tool call completes successfully, so the budget tracks *consecutive* + /// text-only responses rather than total across the worker's lifetime. + nudge_attempts: std::sync::Arc, } impl SpacebotHook { @@ -71,6 +75,7 @@ impl SpacebotHook { completion_calls: std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)), nudge_request_active: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)), outcome_signaled: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)), + nudge_attempts: std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)), } } @@ -91,6 +96,8 @@ impl SpacebotHook { .store(0, std::sync::atomic::Ordering::Relaxed); self.outcome_signaled .store(false, std::sync::atomic::Ordering::Relaxed); + self.nudge_attempts + .store(0, std::sync::atomic::Ordering::Relaxed); } fn set_tool_nudge_request_active(&self, active: bool) { @@ -119,7 +126,6 @@ impl SpacebotHook { self.reset_tool_nudge_state(); self.set_tool_nudge_request_active(true); - let mut nudge_attempts = 0usize; let mut current_prompt = std::borrow::Cow::Borrowed(prompt); let mut using_tool_nudge_prompt = false; @@ -133,19 +139,29 @@ impl SpacebotHook { match &result { Err(PromptError::PromptCancelled { reason, .. }) - if Self::is_tool_nudge_reason(reason) - && nudge_attempts < Self::TOOL_NUDGE_MAX_RETRIES => + if Self::is_tool_nudge_reason(reason) => { + // Read the current attempt count. on_tool_result resets + // this to zero whenever a tool call completes, so this + // tracks *consecutive* text-only responses rather than + // total nudges across the worker's lifetime. + let attempts = self + .nudge_attempts + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + if attempts >= Self::TOOL_NUDGE_MAX_RETRIES { + // Retries exhausted — propagate the cancellation. + self.set_tool_nudge_request_active(false); + return result; + } Self::prune_tool_nudge_retry_history( history, history_len_before_attempt, using_tool_nudge_prompt, ); - nudge_attempts += 1; tracing::warn!( process_id = %self.process_id, process_type = %self.process_type, - attempt = nudge_attempts, + attempt = attempts + 1, "text-only response without outcome signal, nudging tool usage" ); current_prompt = std::borrow::Cow::Borrowed(Self::TOOL_NUDGE_PROMPT); @@ -473,6 +489,40 @@ where }; } + // Emit text content from worker completion responses so the live + // transcript can show the model's reasoning between tool calls. + if self.process_type == ProcessType::Worker { + let text: String = response + .choice + .iter() + .filter_map(|content| { + if let rig::message::AssistantContent::Text(text) = content { + let trimmed = text.text.trim(); + if trimmed.is_empty() { + None + } else { + Some(trimmed) + } + } else { + None + } + }) + .collect::>() + .join("\n\n"); + + if !text.is_empty() + && let ProcessId::Worker(worker_id) = &self.process_id + { + let event = ProcessEvent::WorkerText { + agent_id: self.agent_id.clone(), + worker_id: *worker_id, + channel_id: self.channel_id.clone(), + text, + }; + self.event_tx.send(event).ok(); + } + } + HookAction::Continue } @@ -598,6 +648,14 @@ where self.record_tool_result_metrics(tool_name, internal_call_id); + // A successful tool call proves the worker is still productive. + // Reset the consecutive nudge counter so a brief narration blip + // after many tool calls doesn't exhaust the retry budget. + if self.tool_nudge_policy.is_enabled() { + self.nudge_attempts + .store(0, std::sync::atomic::Ordering::Relaxed); + } + // Detect terminal outcome signal from successful set_status results. // We check the *result* (not the args) so the flag is only set after the // tool actually executed successfully. A failed set_status call won't @@ -1150,6 +1208,67 @@ mod tests { )); } + #[tokio::test] + async fn tool_result_resets_consecutive_nudge_counter() { + // The exact scenario from the Railway browser worker failure: + // worker makes many tool calls, then narrates, gets nudged, but the + // nudge counter should have been reset by the prior tool calls so + // the budget isn't exhausted from earlier narration blips. + let hook = make_hook().with_tool_nudge_policy(ToolNudgePolicy::Enabled); + hook.reset_tool_nudge_state(); + hook.set_tool_nudge_request_active(true); + + // Simulate 2 nudge attempts (would exhaust budget under old behavior) + hook.nudge_attempts + .store(2, std::sync::atomic::Ordering::Relaxed); + + // A successful tool call should reset the counter + let _ = >::on_tool_result( + &hook, + "browser_click", + None, + "internal_1", + "{\"index\": 13}", + "{\"success\":true,\"message\":\"Clicked element at index 13\"}", + ) + .await; + + assert_eq!( + hook.nudge_attempts + .load(std::sync::atomic::Ordering::Relaxed), + 0, + "Tool result should reset consecutive nudge counter to zero" + ); + } + + #[tokio::test] + async fn nudge_counter_not_reset_when_policy_disabled() { + // When nudge policy is disabled, on_tool_result should not touch + // the counter (it's irrelevant but ensures no side effects). + let hook = make_hook().with_tool_nudge_policy(ToolNudgePolicy::Disabled); + hook.reset_tool_nudge_state(); + + hook.nudge_attempts + .store(2, std::sync::atomic::Ordering::Relaxed); + + let _ = >::on_tool_result( + &hook, + "shell", + None, + "internal_1", + "{\"command\":\"ls\"}", + "file1.txt\nfile2.txt", + ) + .await; + + assert_eq!( + hook.nudge_attempts + .load(std::sync::atomic::Ordering::Relaxed), + 2, + "Nudge counter should not be reset when policy is disabled" + ); + } + #[tokio::test] async fn nudges_on_reasoning_only_response_without_outcome() { // A response with only Reasoning content (no Text, no ToolCall) should diff --git a/src/lib.rs b/src/lib.rs index 87cfa126f..805e603e8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -288,6 +288,14 @@ pub enum ProcessEvent { text_delta: String, aggregated_text: String, }, + /// A worker emitted text content (model reasoning between tool calls). + /// Sent once per completion response, containing the full text for that turn. + WorkerText { + agent_id: AgentId, + worker_id: WorkerId, + channel_id: Option, + text: String, + }, } /// Default broadcast capacity for the per-agent control event bus. From 9ee27ea2fb8780db8c26cac1714e4527ddf65d12 Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sat, 7 Mar 2026 14:55:22 -0800 Subject: [PATCH 09/11] add tool renderers for opencode, web_search, read_skill, spacebot_docs; unify live/transcript rendering --- interface/src/components/ToolCall.tsx | 362 +++++++++++++++++++++++++- interface/src/routes/AgentWorkers.tsx | 93 +------ 2 files changed, 359 insertions(+), 96 deletions(-) diff --git a/interface/src/components/ToolCall.tsx b/interface/src/components/ToolCall.tsx index c07cc6c78..0cf590203 100644 --- a/interface/src/components/ToolCall.tsx +++ b/interface/src/components/ToolCall.tsx @@ -1,6 +1,6 @@ -import { useState, useMemo } from "react"; +import { useState } from "react"; import { cx } from "@/ui/utils"; -import type { ActionContent, TranscriptStep } from "@/api/client"; +import type { ActionContent, TranscriptStep, OpenCodePart } from "@/api/client"; // --------------------------------------------------------------------------- // Types @@ -23,6 +23,8 @@ export interface ToolCallPair { result: Record | null; /** Current state */ status: ToolCallStatus; + /** Human-readable summary provided by live opencode parts */ + title?: string | null; } // --------------------------------------------------------------------------- @@ -90,6 +92,47 @@ export function pairTranscriptSteps(steps: TranscriptStep[]): TranscriptItem[] { return items; } +/** + * Convert an OpenCode live part (tool type) into a ToolCallPair so it can be + * rendered by the unified ToolCall component. + */ +export function openCodePartToPair( + part: Extract, +): ToolCallPair { + const input = + part.status === "running" || part.status === "completed" + ? (part as any).input + : undefined; + const output = + part.status === "completed" ? (part as any).output : undefined; + const error = + part.status === "error" ? (part as any).error : undefined; + const title = + part.status === "running" || part.status === "completed" + ? (part as any).title + : undefined; + + const argsRaw = input ?? ""; + const resultRaw = error ?? output ?? null; + + return { + id: part.id, + name: part.tool, + argsRaw, + args: tryParseJson(argsRaw), + resultRaw, + result: resultRaw ? tryParseJson(resultRaw) : null, + status: + part.status === "error" + ? "error" + : part.status === "completed" + ? "completed" + : "running", + // Carry the title through for renderers that want it + title: title ?? null, + }; +} + function tryParseJson(text: string): Record | null { if (!text || text.trim().length === 0) return null; try { @@ -337,13 +380,33 @@ const toolRenderers: Record = { file: { summary(pair) { + if (pair.title) return pair.title; const path = pair.args?.path; - const action = pair.args?.action; - if (action && path) { - return `${action}: ${truncate(String(path), 50)}`; + const op = pair.args?.operation ?? pair.args?.action; + if (op && path) { + return `${op}: ${truncate(String(path), 50)}`; } return path ? truncate(String(path), 60) : null; }, + argsView(pair) { + const path = pair.args?.path; + const op = pair.args?.operation ?? pair.args?.action; + const content = pair.args?.content; + if (!path) return null; + return ( +
+

+ {op && {String(op)}: } + {String(path)} +

+ {op === "write" && content && ( +
+							{truncate(String(content), 2000)}
+						
+ )} +
+ ); + }, resultView(pair) { if (!pair.resultRaw) return null; return ; @@ -377,6 +440,285 @@ const toolRenderers: Record = { return null; }, }, + + // ----------------------------------------------------------------------- + // OpenCode tools + // ----------------------------------------------------------------------- + + read: { + summary(pair) { + if (pair.title) return pair.title; + const filePath = pair.args?.filePath ?? pair.args?.file_path; + return filePath ? truncate(String(filePath), 60) : null; + }, + argsView(pair) { + const filePath = pair.args?.filePath ?? pair.args?.file_path; + if (!filePath) return null; + const offset = pair.args?.offset; + const limit = pair.args?.limit; + return ( +
+

+ {String(filePath)} + {offset ? ` (from line ${offset})` : ""} + {limit ? ` (${limit} lines)` : ""} +

+
+ ); + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + write: { + summary(pair) { + if (pair.title) return pair.title; + const filePath = pair.args?.filePath ?? pair.args?.file_path; + return filePath ? truncate(String(filePath), 60) : null; + }, + argsView(pair) { + const filePath = pair.args?.filePath ?? pair.args?.file_path; + const content = pair.args?.content; + if (!filePath && !content) return null; + return ( +
+ {filePath && ( +

+ {String(filePath)} +

+ )} + {content && ( +
+							{truncate(String(content), 2000)}
+						
+ )} +
+ ); + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + edit: { + summary(pair) { + if (pair.title) return pair.title; + const filePath = pair.args?.filePath ?? pair.args?.file_path; + return filePath ? truncate(String(filePath), 60) : null; + }, + argsView(pair) { + const filePath = pair.args?.filePath ?? pair.args?.file_path; + const oldStr = pair.args?.oldString ?? pair.args?.old_string; + const newStr = pair.args?.newString ?? pair.args?.new_string; + if (!filePath) return null; + return ( +
+

+ {String(filePath)} +

+ {oldStr && ( +
+

+ Old +

+
+								{truncate(String(oldStr), 500)}
+							
+
+ )} + {newStr && ( +
+

+ New +

+
+								{truncate(String(newStr), 500)}
+							
+
+ )} +
+ ); + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + bash: { + summary(pair) { + if (pair.title) return pair.title; + const command = pair.args?.command; + return command ? truncate(String(command), 60) : null; + }, + argsView(pair) { + const command = pair.args?.command; + if (!command) return null; + return ( +
+
+						$ 
+						{String(command)}
+					
+
+ ); + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + glob: { + summary(pair) { + if (pair.title) return pair.title; + const pattern = pair.args?.pattern; + return pattern ? truncate(String(pattern), 60) : null; + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + grep: { + summary(pair) { + if (pair.title) return pair.title; + const pattern = pair.args?.pattern; + const include = pair.args?.include; + if (pattern && include) { + return `/${pattern}/ in ${include}`; + } + return pattern ? `/${truncate(String(pattern), 40)}/` : null; + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + webfetch: { + summary(pair) { + if (pair.title) return pair.title; + const url = pair.args?.url; + return url ? truncate(String(url), 60) : null; + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + read_skill: { + summary(pair) { + if (pair.title) return pair.title; + const name = pair.args?.name; + return name ? String(name) : null; + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + web_search: { + summary(pair) { + if (pair.title) return pair.title; + const query = pair.args?.query; + const resultCount = pair.result?.result_count ?? (Array.isArray(pair.result?.results) ? (pair.result!.results as unknown[]).length : null); + const queryStr = query ? truncate(String(query), 50) : null; + if (queryStr && resultCount != null) { + return `${queryStr} (${resultCount} result${resultCount !== 1 ? "s" : ""})`; + } + return queryStr; + }, + argsView(pair) { + const query = pair.args?.query; + if (!query) return null; + const count = pair.args?.count; + const freshness = pair.args?.freshness; + return ( +
+

+ Search: + {String(query)} +

+ {(count || freshness) && ( +

+ {count ? `${count} results` : ""} + {count && freshness ? " · " : ""} + {freshness ? `${freshness}` : ""} +

+ )} +
+ ); + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + spacebot_docs: { + summary(pair) { + if (pair.title) return pair.title; + const action = pair.args?.action; + const docId = pair.args?.doc_id; + if (action === "read" && docId) { + return truncate(String(docId), 50); + } + return action ? String(action) : "list"; + }, + argsView(pair) { + const action = pair.args?.action; + const docId = pair.args?.doc_id; + const query = pair.args?.query; + if (!action && !docId) return null; + return ( +
+ {docId && ( +

+ {String(docId)} +

+ )} + {query && ( +

+ filter: {String(query)} +

+ )} +
+ ); + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + todowrite: { + summary(pair) { + if (pair.title) return pair.title; + return "Update tasks"; + }, + resultView() { + return null; + }, + }, + + task: { + summary(pair) { + if (pair.title) return pair.title; + const description = pair.args?.description; + return description ? truncate(String(description), 60) : null; + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, }; // Default renderer for tools without a specific renderer @@ -462,6 +804,16 @@ const STATUS_COLORS: Record = { /** Human-readable tool name: browser_navigate → Navigate */ function formatToolName(name: string): string { + // Handle specific tool name overrides + const overrides: Record = { + webfetch: "Web Fetch", + todowrite: "Todo", + read_skill: "Read Skill", + web_search: "Web Search", + spacebot_docs: "Docs", + }; + if (overrides[name]) return overrides[name]; + // Strip common prefixes for cleaner display const stripped = name .replace(/^browser_/, "") diff --git a/interface/src/routes/AgentWorkers.tsx b/interface/src/routes/AgentWorkers.tsx index b45449ed0..93217cc07 100644 --- a/interface/src/routes/AgentWorkers.tsx +++ b/interface/src/routes/AgentWorkers.tsx @@ -10,7 +10,7 @@ import { type TranscriptStep, type OpenCodePart, } from "@/api/client"; -import {ToolCall, pairTranscriptSteps} from "@/components/ToolCall"; +import {ToolCall, pairTranscriptSteps, openCodePartToPair} from "@/components/ToolCall"; import {Badge} from "@/ui/Badge"; import {formatTimeAgo, formatDuration} from "@/lib/format"; import {LiveDuration} from "@/components/LiveDuration"; @@ -1069,7 +1069,7 @@ function OpenCodePartView({part}: {part: OpenCodePart}) {
); case "tool": - return ; + return ; case "step_start": return (
@@ -1093,93 +1093,4 @@ function OpenCodePartView({part}: {part: OpenCodePart}) { } } -function OpenCodeToolPartView({ - part, -}: { - part: Extract; -}) { - const [expanded, setExpanded] = useState(false); - const isRunning = part.status === "running"; - const isCompleted = part.status === "completed"; - const isError = part.status === "error"; - - const statusIcon = isCompleted - ? "\u2713" - : isError - ? "\u2717" - : isRunning - ? "\u25B6" - : "\u25CB"; - - const statusColor = isCompleted - ? "text-emerald-500" - : isError - ? "text-red-400" - : isRunning - ? "text-accent" - : "text-ink-faint"; - - const title = - (part.status === "running" || part.status === "completed") - ? (part as any).title - : undefined; - - const input = - (part.status === "running" || part.status === "completed") - ? (part as any).input - : undefined; - - const output = part.status === "completed" ? (part as any).output : undefined; - const error = part.status === "error" ? (part as any).error : undefined; - return ( -
- - {expanded && ( -
- {input && ( -
-

Input

-
-								{input}
-							
-
- )} - {output && ( -
-

Output

-
-								{output.length > 500 ? output.slice(0, 500) + "..." : output}
-							
-
- )} - {error && ( -
-

Error

-
{error}
-
- )} -
- )} -
- ); -} From 2f6312f52541d76037ae008d94360421246c3a99 Mon Sep 17 00:00:00 2001 From: James Pine Date: Sat, 7 Mar 2026 16:31:56 -0800 Subject: [PATCH 10/11] improve shell/exec/bash tool result rendering and error detection Parse structured shell results (stdout, stderr, exit_code) into distinct sections instead of dumping raw JSON. Non-zero exit codes show a red badge, stderr gets its own labeled section with appropriate coloring. Also split the generic file renderer into file_read/file_write/file_edit/file_list with tailored displays (diff-style old/new for edits, line range info for reads). Error detection now catches success:false, non-zero exit_code, and 'Toolset error:' prefixes so failed tools render with a red X instead of a green checkmark. --- interface/src/components/ToolCall.tsx | 204 ++++- prompts/en/tools/file_description.md.j2 | 1 - prompts/en/tools/file_edit_description.md.j2 | 1 + prompts/en/tools/file_list_description.md.j2 | 1 + prompts/en/tools/file_read_description.md.j2 | 1 + prompts/en/tools/file_write_description.md.j2 | 1 + prompts/en/worker.md.j2 | 11 +- src/agent/channel_dispatch.rs | 4 - src/agent/worker.rs | 37 +- src/hooks.rs | 2 + src/hooks/loop_guard.rs | 645 +++++++++++++++ src/hooks/spacebot.rs | 56 +- src/prompts/text.rs | 13 +- src/tools.rs | 18 +- src/tools/file.rs | 756 ++++++++++++++---- src/tools/spawn_worker.rs | 9 +- 16 files changed, 1562 insertions(+), 198 deletions(-) delete mode 100644 prompts/en/tools/file_description.md.j2 create mode 100644 prompts/en/tools/file_edit_description.md.j2 create mode 100644 prompts/en/tools/file_list_description.md.j2 create mode 100644 prompts/en/tools/file_read_description.md.j2 create mode 100644 prompts/en/tools/file_write_description.md.j2 create mode 100644 src/hooks/loop_guard.rs diff --git a/interface/src/components/ToolCall.tsx b/interface/src/components/ToolCall.tsx index 0cf590203..058822502 100644 --- a/interface/src/components/ToolCall.tsx +++ b/interface/src/components/ToolCall.tsx @@ -152,11 +152,15 @@ function isErrorResult( ): boolean { if (parsed?.error) return true; if (parsed?.status === "error") return true; + // Shell/exec structured results: { success: false } or non-zero exit code + if (parsed?.success === false) return true; + if (typeof parsed?.exit_code === "number" && parsed.exit_code !== 0) return true; const lower = text.toLowerCase(); return ( lower.startsWith("error:") || lower.startsWith("error -") || - lower.startsWith("failed:") + lower.startsWith("failed:") || + lower.startsWith("toolset error:") ); } @@ -358,7 +362,14 @@ const toolRenderers: Record = { shell: { summary(pair) { const command = pair.args?.command; - return command ? truncate(String(command), 60) : null; + if (!command) return null; + // If we have a parsed result, append exit code info + if (pair.result && typeof pair.result.exit_code === "number") { + const code = pair.result.exit_code; + const cmdStr = truncate(String(command), 50); + return code === 0 ? cmdStr : `${cmdStr} (exit ${code})`; + } + return truncate(String(command), 60); }, argsView(pair) { const command = pair.args?.command; @@ -374,39 +385,119 @@ const toolRenderers: Record = { }, resultView(pair) { if (!pair.resultRaw) return null; - return ; + return ; }, }, - file: { + file_read: { summary(pair) { if (pair.title) return pair.title; const path = pair.args?.path; - const op = pair.args?.operation ?? pair.args?.action; - if (op && path) { - return `${op}: ${truncate(String(path), 50)}`; - } return path ? truncate(String(path), 60) : null; }, argsView(pair) { const path = pair.args?.path; - const op = pair.args?.operation ?? pair.args?.action; - const content = pair.args?.content; if (!path) return null; + const offset = pair.args?.offset; + const limit = pair.args?.limit; return (

- {op && {String(op)}: } {String(path)} + {offset ? ` (from line ${offset})` : ""} + {limit ? ` (${limit} lines)` : ""}

- {op === "write" && content && ( -
+				
+ ); + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + file_write: { + summary(pair) { + if (pair.title) return pair.title; + const path = pair.args?.path; + return path ? truncate(String(path), 60) : null; + }, + argsView(pair) { + const path = pair.args?.path; + const content = pair.args?.content; + if (!path && !content) return null; + return ( +
+ {path && ( +

+ {String(path)} +

+ )} + {content && ( +
 							{truncate(String(content), 2000)}
 						
)}
); }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + file_edit: { + summary(pair) { + if (pair.title) return pair.title; + const path = pair.args?.path; + return path ? truncate(String(path), 60) : null; + }, + argsView(pair) { + const path = pair.args?.path; + const oldStr = pair.args?.old_string; + const newStr = pair.args?.new_string; + if (!path) return null; + return ( +
+

+ {String(path)} +

+ {oldStr && ( +
+

+ Old +

+
+								{truncate(String(oldStr), 500)}
+							
+
+ )} + {newStr && ( +
+

+ New +

+
+								{truncate(String(newStr), 500)}
+							
+
+ )} +
+ ); + }, + resultView(pair) { + if (!pair.resultRaw) return null; + return ; + }, + }, + + file_list: { + summary(pair) { + if (pair.title) return pair.title; + const path = pair.args?.path; + return path ? truncate(String(path), 60) : null; + }, resultView(pair) { if (!pair.resultRaw) return null; return ; @@ -416,11 +507,17 @@ const toolRenderers: Record = { exec: { summary(pair) { const command = pair.args?.command; - return command ? truncate(String(command), 60) : null; + if (!command) return null; + if (pair.result && typeof pair.result.exit_code === "number") { + const code = pair.result.exit_code; + const cmdStr = truncate(String(command), 50); + return code === 0 ? cmdStr : `${cmdStr} (exit ${code})`; + } + return truncate(String(command), 60); }, resultView(pair) { if (!pair.resultRaw) return null; - return ; + return ; }, }, @@ -552,7 +649,13 @@ const toolRenderers: Record = { summary(pair) { if (pair.title) return pair.title; const command = pair.args?.command; - return command ? truncate(String(command), 60) : null; + if (!command) return null; + if (pair.result && typeof pair.result.exit_code === "number") { + const code = pair.result.exit_code; + const cmdStr = truncate(String(command), 50); + return code === 0 ? cmdStr : `${cmdStr} (exit ${code})`; + } + return truncate(String(command), 60); }, argsView(pair) { const command = pair.args?.command; @@ -568,7 +671,7 @@ const toolRenderers: Record = { }, resultView(pair) { if (!pair.resultRaw) return null; - return ; + return ; }, }, @@ -786,6 +889,71 @@ function CollapsiblePre({ ); } +// --------------------------------------------------------------------------- +// Shell result rendering +// --------------------------------------------------------------------------- + +function ShellResultView({ pair }: { pair: ToolCallPair }) { + const r = pair.result; + + // If we can't parse structured output, fall back to raw text + if (!r || typeof r.exit_code !== "number") { + return ; + } + + const exitCode = r.exit_code as number; + const stdout = typeof r.stdout === "string" ? r.stdout : ""; + const stderr = typeof r.stderr === "string" ? r.stderr : ""; + const hasStdout = stdout.trim().length > 0; + const hasStderr = stderr.trim().length > 0; + const isError = exitCode !== 0; + + // Nothing to show + if (!hasStdout && !hasStderr && exitCode === 0) { + return ; + } + + return ( +
+ {/* Exit code badge for non-zero */} + {isError && ( +
+ + exit {exitCode} + +
+ )} + + {/* stdout */} + {hasStdout && ( +
+ +
+ )} + + {/* stderr */} + {hasStderr && ( +
+
+ + stderr + +
+
+						{stderr.replace(/\n$/, "")}
+					
+
+ )} +
+ ); +} + // --------------------------------------------------------------------------- // Status display helpers // --------------------------------------------------------------------------- @@ -817,6 +985,7 @@ function formatToolName(name: string): string { // Strip common prefixes for cleaner display const stripped = name .replace(/^browser_/, "") + .replace(/^file_/, "") .replace(/^tab_/, "Tab "); return stripped @@ -828,6 +997,7 @@ function formatToolName(name: string): string { /** Tool category label shown as a faint prefix */ function toolCategory(name: string): string | null { if (name.startsWith("browser_")) return "Browser"; + if (name.startsWith("file_")) return "File"; return null; } diff --git a/prompts/en/tools/file_description.md.j2 b/prompts/en/tools/file_description.md.j2 deleted file mode 100644 index c019f0c2a..000000000 --- a/prompts/en/tools/file_description.md.j2 +++ /dev/null @@ -1 +0,0 @@ -Perform file operations: read, write, or list files. Use this to examine code, read documentation, write files, or explore directory structures. Protected paths (prompts/, identity/, data/, SOUL.md, IDENTITY.md, USER.md) cannot be accessed - use memory_save instead for that content. \ No newline at end of file diff --git a/prompts/en/tools/file_edit_description.md.j2 b/prompts/en/tools/file_edit_description.md.j2 new file mode 100644 index 000000000..869c758ff --- /dev/null +++ b/prompts/en/tools/file_edit_description.md.j2 @@ -0,0 +1 @@ +Edit a file by replacing exact text matches. Finds old_string in the file and replaces it with new_string. The match must be exact (including whitespace and indentation). By default, requires a unique match — if multiple matches exist, provide more surrounding context or set replace_all to true. Use file_read first to verify the exact text you want to replace. \ No newline at end of file diff --git a/prompts/en/tools/file_list_description.md.j2 b/prompts/en/tools/file_list_description.md.j2 new file mode 100644 index 000000000..b6a474143 --- /dev/null +++ b/prompts/en/tools/file_list_description.md.j2 @@ -0,0 +1 @@ +List directory contents. Returns file and directory entries with names, types, and sizes. Directories are listed first, then files, both sorted alphabetically. Use this to explore the filesystem structure. \ No newline at end of file diff --git a/prompts/en/tools/file_read_description.md.j2 b/prompts/en/tools/file_read_description.md.j2 new file mode 100644 index 000000000..173e72ae0 --- /dev/null +++ b/prompts/en/tools/file_read_description.md.j2 @@ -0,0 +1 @@ +Read a file's contents. Supports line-based offset and limit for reading specific sections of large files. Returns the full file content (up to 50KB), or a windowed view when offset/limit are provided. Use this to examine source code, read documentation, or inspect file content. \ No newline at end of file diff --git a/prompts/en/tools/file_write_description.md.j2 b/prompts/en/tools/file_write_description.md.j2 new file mode 100644 index 000000000..fe02efa78 --- /dev/null +++ b/prompts/en/tools/file_write_description.md.j2 @@ -0,0 +1 @@ +Write content to a file. Creates the file if it doesn't exist, or overwrites it entirely if it does. Parent directories are created automatically. Use this for creating new files or replacing an entire file's content. For targeted edits to existing files, prefer file_edit instead. Protected paths (SOUL.md, IDENTITY.md, USER.md) cannot be written — use the identity management API for those. \ No newline at end of file diff --git a/prompts/en/worker.md.j2 b/prompts/en/worker.md.j2 index 224010736..366bbe81b 100644 --- a/prompts/en/worker.md.j2 +++ b/prompts/en/worker.md.j2 @@ -36,7 +36,7 @@ Sandbox mode is **disabled**. - `shell`/`exec` subprocesses run without OS-level filesystem containment. - Host filesystem access follows the OS permissions of the Spacebot process. - Environment sanitization still applies (clean env + explicit passthrough/tool vars). -- The `file` tool can access any path readable by the process (no workspace restriction). +- The file tools (`file_read`, `file_write`, `file_edit`, `file_list`) can access any path readable by the process (no workspace restriction). {%- endif %} ## Your Role @@ -83,9 +83,14 @@ Examples: Execute shell commands. Use this for running builds, tests, git operations, package management, and any system commands. -### file +### File tools (file_read, file_write, file_edit, file_list) -Read, write, and list files. Use this for viewing source code, writing changes, and navigating the filesystem. +Four separate tools for file operations: + +- `file_read` — Read a file's contents. Supports `offset` (1-indexed line number) and `limit` (line count) for reading specific sections. +- `file_write` — Write content to a file (create or overwrite). Both `path` and `content` are required. +- `file_edit` — Edit a file by replacing exact text: provide `path`, `old_string`, and `new_string`. Use `file_read` first to see the exact text. Set `replace_all: true` for multiple occurrences. +- `file_list` — List directory contents with names, types, and sizes. Path restrictions apply: you cannot write to identity files (SOUL.md, IDENTITY.md, USER.md) or memory storage paths. Use the appropriate system tools for those. diff --git a/src/agent/channel_dispatch.rs b/src/agent/channel_dispatch.rs index cfd682aec..f6097098b 100644 --- a/src/agent/channel_dispatch.rs +++ b/src/agent/channel_dispatch.rs @@ -419,7 +419,6 @@ pub async fn spawn_worker_from_state( "worker.run", worker_id = %worker_id, channel_id = %state.channel_id, - task = %task, ); let secrets_store = state.deps.runtime_config.secrets.load().as_ref().clone(); let handle = spawn_worker_task( @@ -550,7 +549,6 @@ pub async fn spawn_opencode_worker_from_state( "worker.run", worker_id = %worker_id, channel_id = %state.channel_id, - task = %task, worker_type = "opencode", ); let sqlite_pool = state.deps.sqlite_pool.clone(); @@ -802,7 +800,6 @@ pub async fn resume_idle_worker_into_state( "worker.resume", worker_id = %worker_id, channel_id = %state.channel_id, - task = %idle_worker.task, worker_type = "opencode", ); let sqlite_pool = state.deps.sqlite_pool.clone(); @@ -926,7 +923,6 @@ pub async fn resume_idle_worker_into_state( "worker.resume", worker_id = %worker_id, channel_id = %state.channel_id, - task = %idle_worker.task, ); let secrets_store = state.deps.runtime_config.secrets.load().as_ref().clone(); let handle = spawn_worker_task( diff --git a/src/agent/worker.rs b/src/agent/worker.rs index 4dd7a1528..acf4af1de 100644 --- a/src/agent/worker.rs +++ b/src/agent/worker.rs @@ -303,7 +303,7 @@ impl Worker { let mut segments_run = 0; let mut overflow_retries = 0; - let result = if resuming { + let mut result = if resuming { // For resumed workers, synthesize a "result" from the task // since the original initial result was already relayed. String::new() @@ -397,18 +397,31 @@ impl Worker { }; // Safety net: if the worker produced an empty result (e.g. reasoning-only - // response that slipped past the nudge gate), treat it as a failure. An empty - // result means the channel has nothing to relay to the user. + // response that slipped past the nudge gate), treat it as a failure — unless + // the worker already signaled a meaningful outcome via set_status. A worker + // that signaled outcome but ran out of turns still completed its task; the + // outcome status text is the result. if !resuming && result.trim().is_empty() { - self.state = WorkerState::Failed; - self.hook.send_status("failed (empty result)"); - self.write_failure_log(&history, "worker produced empty result — likely a reasoning-only exit that bypassed the outcome gate"); - self.persist_transcript(&compacted_history, &history).await; - tracing::error!(worker_id = %self.id, "worker produced empty result, treating as failure"); - return Err(crate::error::AgentError::Other(anyhow::anyhow!( - "worker produced empty result without signaling a meaningful outcome" - )) - .into()); + if self.hook.outcome_signaled() { + tracing::info!( + worker_id = %self.id, + "worker produced empty text but outcome was signaled, treating as success" + ); + // Use a synthetic result — the channel already received the + // outcome status via the event stream, so this is just for the + // worker result record. + result = "Task completed (outcome signaled via set_status).".to_string(); + } else { + self.state = WorkerState::Failed; + self.hook.send_status("failed (empty result)"); + self.write_failure_log(&history, "worker produced empty result — likely a reasoning-only exit that bypassed the outcome gate"); + self.persist_transcript(&compacted_history, &history).await; + tracing::error!(worker_id = %self.id, "worker produced empty result, treating as failure"); + return Err(crate::error::AgentError::Other(anyhow::anyhow!( + "worker produced empty result without signaling a meaningful outcome" + )) + .into()); + } } // For interactive workers, enter a follow-up loop diff --git a/src/hooks.rs b/src/hooks.rs index 6ae37cccb..c1ef0c45f 100644 --- a/src/hooks.rs +++ b/src/hooks.rs @@ -1,7 +1,9 @@ //! Prompt hooks for observing and controlling agent behavior. pub mod cortex; +pub mod loop_guard; pub mod spacebot; pub use cortex::CortexHook; +pub use loop_guard::{LoopGuard, LoopGuardConfig, LoopGuardVerdict}; pub use spacebot::{SpacebotHook, ToolNudgePolicy}; diff --git a/src/hooks/loop_guard.rs b/src/hooks/loop_guard.rs new file mode 100644 index 000000000..7d37e4294 --- /dev/null +++ b/src/hooks/loop_guard.rs @@ -0,0 +1,645 @@ +//! Tool loop detection for the agent execution loop. + +use crate::ProcessType; + +use sha2::{Digest, Sha256}; +use std::collections::{HashMap, HashSet, VecDeque}; + +const POLL_TOOLS: &[&str] = &["shell"]; +const HISTORY_CAPACITY: usize = 30; + +/// Avoids hashing multi-MB tool outputs while still catching identical short +/// results. Large outputs that differ only in the tail (growing log files, +/// etc.) won't hash-match, which is correct — the tool is returning new data. +const RESULT_HASH_TRUNCATION: usize = 1_000; + +#[derive(Debug, Clone)] +pub struct LoopGuardConfig { + pub warn_threshold: u32, + pub block_threshold: u32, + pub global_circuit_breaker: u32, + pub poll_multiplier: u32, + pub outcome_warn_threshold: u32, + pub outcome_block_threshold: u32, + pub ping_pong_min_repeats: u32, + pub max_warnings_per_call: u32, +} + +impl LoopGuardConfig { + pub fn for_process(process_type: ProcessType) -> Self { + match process_type { + // Workers do repetitive tool work (build, test, fix cycles). + // Generous thresholds to avoid interfering with legitimate iteration. + ProcessType::Worker => Self { + warn_threshold: 4, + block_threshold: 7, + global_circuit_breaker: 80, + poll_multiplier: 3, + outcome_warn_threshold: 3, + outcome_block_threshold: 4, + ping_pong_min_repeats: 4, + max_warnings_per_call: 3, + }, + // Branches think and recall, moderate iteration expected. + ProcessType::Branch => Self { + warn_threshold: 3, + block_threshold: 5, + global_circuit_breaker: 40, + poll_multiplier: 2, + outcome_warn_threshold: 2, + outcome_block_threshold: 3, + ping_pong_min_repeats: 3, + max_warnings_per_call: 2, + }, + // Channels, compactors, cortex: minimal tool loops expected. + ProcessType::Channel | ProcessType::Compactor | ProcessType::Cortex => Self { + warn_threshold: 2, + block_threshold: 4, + global_circuit_breaker: 20, + poll_multiplier: 2, + outcome_warn_threshold: 2, + outcome_block_threshold: 3, + ping_pong_min_repeats: 3, + max_warnings_per_call: 2, + }, + } + } +} + +/// Maps to Rig hook actions: `Allow` -> `Continue`, `Block` -> `Skip` (message +/// becomes the tool result the LLM sees), `CircuitBreak` -> `Terminate`. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum LoopGuardVerdict { + Allow, + Block(String), + CircuitBreak(String), +} + +/// Held behind `Arc>` on `SpacebotHook`. Persists across tool calls +/// within one Rig `agent.prompt()` invocation, reset at prompt boundaries. +pub struct LoopGuard { + config: LoopGuardConfig, + call_counts: HashMap, + total_calls: u32, + outcome_counts: HashMap, + // Call hashes poisoned by outcome detection — next check() auto-blocks. + blocked_outcomes: HashSet, + recent_calls: VecDeque, + warnings_emitted: HashMap, + hash_to_tool: HashMap, +} + +impl LoopGuard { + pub fn new(config: LoopGuardConfig) -> Self { + Self { + config, + call_counts: HashMap::new(), + total_calls: 0, + outcome_counts: HashMap::new(), + blocked_outcomes: HashSet::new(), + recent_calls: VecDeque::with_capacity(HISTORY_CAPACITY), + warnings_emitted: HashMap::new(), + hash_to_tool: HashMap::new(), + } + } + + pub fn reset(&mut self) { + self.call_counts.clear(); + self.total_calls = 0; + self.outcome_counts.clear(); + self.blocked_outcomes.clear(); + self.recent_calls.clear(); + self.warnings_emitted.clear(); + self.hash_to_tool.clear(); + } + + pub fn check(&mut self, tool_name: &str, args: &str) -> LoopGuardVerdict { + self.total_calls += 1; + + if self.total_calls > self.config.global_circuit_breaker { + return LoopGuardVerdict::CircuitBreak(format!( + "Circuit breaker: exceeded {} total tool calls in this loop. \ + The agent appears to be stuck.", + self.config.global_circuit_breaker + )); + } + + let call_hash = Self::compute_call_hash(tool_name, args); + self.hash_to_tool + .entry(call_hash.clone()) + .or_insert_with(|| tool_name.to_string()); + + if self.recent_calls.len() >= HISTORY_CAPACITY { + self.recent_calls.pop_front(); + } + self.recent_calls.push_back(call_hash.clone()); + + if self.blocked_outcomes.contains(&call_hash) { + return LoopGuardVerdict::Block(format!( + "Blocked: tool '{}' is returning identical results repeatedly. \ + The current approach is not working — try something different.", + tool_name + )); + } + + let count = self.call_counts.entry(call_hash.clone()).or_insert(0); + *count += 1; + let count_value = *count; + + let is_poll = Self::is_poll_call(tool_name, args); + let multiplier = if is_poll { + self.config.poll_multiplier + } else { + 1 + }; + let effective_warn = self.config.warn_threshold * multiplier; + let effective_block = self.config.block_threshold * multiplier; + + if count_value >= effective_block { + return LoopGuardVerdict::Block(format!( + "Blocked: tool '{}' called {} times with identical parameters. \ + Try a different approach or different parameters.", + tool_name, count_value + )); + } + + // Warn threshold — escalates to block after max_warnings_per_call. + if count_value >= effective_warn { + let warning_count = self.warnings_emitted.entry(call_hash.clone()).or_insert(0); + *warning_count += 1; + if *warning_count > self.config.max_warnings_per_call { + return LoopGuardVerdict::Block(format!( + "Blocked: tool '{}' called {} times with identical parameters \ + (warnings exhausted). Try a different approach.", + tool_name, count_value + )); + } + return LoopGuardVerdict::Block(format!( + "Warning: tool '{}' has been called {} times with identical parameters. \ + Consider trying a different approach or different parameters.", + tool_name, count_value + )); + } + + // Ping-pong detection runs even if individual hash counts are low. + if let Some(ping_pong_message) = self.detect_ping_pong() { + let repeats = self.count_ping_pong_repeats(); + if repeats >= self.config.ping_pong_min_repeats { + return LoopGuardVerdict::Block(ping_pong_message); + } + // Below min_repeats, send a warning via skip. + let warning_count = self + .warnings_emitted + .entry(format!("pingpong_{call_hash}")) + .or_insert(0); + *warning_count += 1; + if *warning_count <= self.config.max_warnings_per_call { + return LoopGuardVerdict::Block(ping_pong_message); + } + } + + LoopGuardVerdict::Allow + } + + /// If the same `(tool_name, args, result)` triple is seen enough times, + /// the call hash is poisoned so the next `check()` blocks before execution. + pub fn record_outcome(&mut self, tool_name: &str, args: &str, result: &str) { + let outcome_hash = Self::compute_outcome_hash(tool_name, args, result); + let call_hash = Self::compute_call_hash(tool_name, args); + + let count = self.outcome_counts.entry(outcome_hash).or_insert(0); + *count += 1; + let count_value = *count; + + if count_value >= self.config.outcome_block_threshold { + // Poison the call hash so the next check() auto-blocks. + self.blocked_outcomes.insert(call_hash); + tracing::debug!( + tool_name, + outcome_count = count_value, + "loop guard: outcome block threshold reached, poisoning call hash" + ); + } else if count_value >= self.config.outcome_warn_threshold { + tracing::debug!( + tool_name, + outcome_count = count_value, + "loop guard: identical outcome detected" + ); + } + } + + fn detect_ping_pong(&self) -> Option { + let history: Vec<&String> = self.recent_calls.iter().collect(); + let length = history.len(); + + // Check for pattern of length 2 (A-B-A-B-A-B) — need 6 entries. + if length >= 6 { + let tail = &history[length - 6..]; + let a = tail[0]; + let b = tail[1]; + if a != b && tail[2] == a && tail[3] == b && tail[4] == a && tail[5] == b { + let tool_a = self.resolve_tool_name(a); + let tool_b = self.resolve_tool_name(b); + return Some(format!( + "Ping-pong detected: tools '{}' and '{}' are alternating \ + repeatedly. Break the cycle by trying a different approach.", + tool_a, tool_b + )); + } + } + + // Check for pattern of length 3 (A-B-C-A-B-C-A-B-C) — need 9 entries. + if length >= 9 { + let tail = &history[length - 9..]; + let a = tail[0]; + let b = tail[1]; + let c = tail[2]; + // Ensure they're not all the same (that's just repetition). + if !(a == b && b == c) + && tail[3] == a + && tail[4] == b + && tail[5] == c + && tail[6] == a + && tail[7] == b + && tail[8] == c + { + let tool_a = self.resolve_tool_name(a); + let tool_b = self.resolve_tool_name(b); + let tool_c = self.resolve_tool_name(c); + return Some(format!( + "Ping-pong detected: tools '{}', '{}', '{}' are cycling \ + repeatedly. Break the cycle by trying a different approach.", + tool_a, tool_b, tool_c + )); + } + } + + None + } + + fn count_ping_pong_repeats(&self) -> u32 { + let history: Vec<&String> = self.recent_calls.iter().collect(); + let length = history.len(); + + if length >= 4 { + let a = history[length - 2]; + let b = history[length - 1]; + if a != b { + let mut repeats: u32 = 0; + let mut index = length; + while index >= 2 { + index -= 2; + if history[index] == a && history[index + 1] == b { + repeats += 1; + } else { + break; + } + } + if repeats >= 2 { + return repeats; + } + } + } + + if length >= 6 { + let a = history[length - 3]; + let b = history[length - 2]; + let c = history[length - 1]; + if !(a == b && b == c) { + let mut repeats: u32 = 0; + let mut index = length; + while index >= 3 { + index -= 3; + if history[index] == a && history[index + 1] == b && history[index + 2] == c { + repeats += 1; + } else { + break; + } + } + if repeats >= 2 { + return repeats; + } + } + } + + 0 + } + + fn resolve_tool_name(&self, hash: &str) -> String { + self.hash_to_tool + .get(hash) + .cloned() + .unwrap_or_else(|| "unknown".to_string()) + } + + // Poll tools get relaxed thresholds because they're expected to be + // called repeatedly (checking build output, watching deployment status). + fn is_poll_call(tool_name: &str, args: &str) -> bool { + if POLL_TOOLS.contains(&tool_name) { + let args_lower = args.to_lowercase(); + if args.len() < 200 + && (args_lower.contains("status") + || args_lower.contains("poll") + || args_lower.contains("wait") + || args_lower.contains("watch") + || args_lower.contains("tail") + || args_lower.contains("\"ps ") + || args_lower.contains("jobs") + || args_lower.contains("pgrep") + || args_lower.contains("docker ps") + || args_lower.contains("kubectl get")) + { + return true; + } + } + false + } + + fn compute_call_hash(tool_name: &str, args: &str) -> String { + let mut hasher = Sha256::new(); + hasher.update(tool_name.as_bytes()); + hasher.update(b"|"); + hasher.update(args.as_bytes()); + hex::encode(hasher.finalize()) + } + + fn compute_outcome_hash(tool_name: &str, args: &str, result: &str) -> String { + let mut hasher = Sha256::new(); + hasher.update(tool_name.as_bytes()); + hasher.update(b"|"); + hasher.update(args.as_bytes()); + hasher.update(b"|"); + let truncated = if result.len() > RESULT_HASH_TRUNCATION { + &result[..RESULT_HASH_TRUNCATION] + } else { + result + }; + hasher.update(truncated.as_bytes()); + hex::encode(hasher.finalize()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn worker_config() -> LoopGuardConfig { + LoopGuardConfig::for_process(ProcessType::Worker) + } + + fn channel_config() -> LoopGuardConfig { + LoopGuardConfig::for_process(ProcessType::Channel) + } + + #[test] + fn allow_below_threshold() { + let mut guard = LoopGuard::new(worker_config()); + let verdict = guard.check("shell", r#"{"command":"ls"}"#); + assert_eq!(verdict, LoopGuardVerdict::Allow); + let verdict = guard.check("shell", r#"{"command":"ls"}"#); + assert_eq!(verdict, LoopGuardVerdict::Allow); + } + + #[test] + fn block_at_warn_threshold() { + let mut guard = LoopGuard::new(worker_config()); + let args = r#"{"command":"cargo build"}"#; + // Worker warn_threshold = 4, so calls 1-3 are Allow. + for _ in 0..3 { + assert_eq!(guard.check("shell", args), LoopGuardVerdict::Allow); + } + // Call 4 hits warn threshold — delivered as a Block/Skip with warning. + let verdict = guard.check("shell", args); + assert!( + matches!(verdict, LoopGuardVerdict::Block(ref message) if message.contains("Warning")) + ); + } + + #[test] + fn block_at_block_threshold() { + let mut guard = LoopGuard::new(worker_config()); + let args = r#"{"command":"cargo build"}"#; + // Worker block_threshold = 7. + for _ in 0..6 { + guard.check("shell", args); + } + let verdict = guard.check("shell", args); + assert!( + matches!(verdict, LoopGuardVerdict::Block(ref message) if message.contains("Blocked")) + ); + } + + #[test] + fn different_params_no_collision() { + let mut guard = LoopGuard::new(worker_config()); + for i in 0..20 { + let args = format!(r#"{{"query":"query_{i}"}}"#); + assert_eq!(guard.check("web_search", &args), LoopGuardVerdict::Allow); + } + } + + #[test] + fn global_circuit_breaker() { + let config = LoopGuardConfig { + global_circuit_breaker: 5, + warn_threshold: 100, + block_threshold: 100, + ..worker_config() + }; + let mut guard = LoopGuard::new(config); + for i in 0..5 { + let args = format!(r#"{{"n":{i}}}"#); + assert_eq!(guard.check("tool", &args), LoopGuardVerdict::Allow); + } + // Call 6 triggers circuit breaker (> 5). + let verdict = guard.check("tool", r#"{"n":5}"#); + assert!(matches!(verdict, LoopGuardVerdict::CircuitBreak(_))); + } + + #[test] + fn channel_has_stricter_thresholds_than_worker() { + let channel = channel_config(); + let worker = worker_config(); + assert!(channel.warn_threshold < worker.warn_threshold); + assert!(channel.block_threshold < worker.block_threshold); + assert!(channel.global_circuit_breaker < worker.global_circuit_breaker); + } + + #[test] + fn outcome_repetition_poisons_call_hash() { + let mut guard = LoopGuard::new(worker_config()); + let args = r#"{"query":"weather"}"#; + let result = "sunny 72F"; + + // Worker outcome_block_threshold = 4. Record 4 identical outcomes. + for _ in 0..4 { + guard.record_outcome("web_search", args, result); + } + + // The next check() for this call hash should auto-block. + let verdict = guard.check("web_search", args); + assert!( + matches!(verdict, LoopGuardVerdict::Block(ref message) if message.contains("identical results")) + ); + } + + #[test] + fn different_results_do_not_poison() { + let mut guard = LoopGuard::new(worker_config()); + let args = r#"{"command":"cargo build"}"#; + + // Each call produces a different result — the agent is making progress. + for i in 0..10 { + guard.record_outcome("shell", args, &format!("error on line {i}")); + } + + // Call hash should not be poisoned. + let verdict = guard.check("shell", args); + assert_eq!(verdict, LoopGuardVerdict::Allow); + } + + #[test] + fn ping_pong_ab_detection() { + let config = LoopGuardConfig { + warn_threshold: 100, + block_threshold: 100, + ping_pong_min_repeats: 3, + ..worker_config() + }; + let mut guard = LoopGuard::new(config); + let args_a = r#"{"file":"a.txt"}"#; + let args_b = r#"{"file":"b.txt"}"#; + + // A-B-A-B-A-B = 3 repeats of (A,B). + for _ in 0..3 { + guard.check("file", args_a); + guard.check("file", args_b); + } + + // On the 7th call, pattern should be detected. + let verdict = guard.check("file", args_a); + assert!( + matches!(verdict, LoopGuardVerdict::Block(ref message) if message.contains("Ping-pong")) + || matches!(verdict, LoopGuardVerdict::Allow), + "Expected ping-pong detection or allow (pattern detected on prior call), got: {verdict:?}" + ); + } + + #[test] + fn ping_pong_abc_detection() { + let config = LoopGuardConfig { + warn_threshold: 100, + block_threshold: 100, + ping_pong_min_repeats: 3, + ..worker_config() + }; + let mut guard = LoopGuard::new(config); + + // A-B-C-A-B-C-A-B-C = 3 repeats of (A,B,C). + for _ in 0..3 { + guard.check("tool_a", r#"{"a":1}"#); + guard.check("tool_b", r#"{"b":2}"#); + guard.check("tool_c", r#"{"c":3}"#); + } + + // The 10th call should detect the pattern. + let verdict = guard.check("tool_a", r#"{"a":1}"#); + assert!( + matches!(verdict, LoopGuardVerdict::Block(ref message) if message.contains("Ping-pong")), + "Expected ping-pong detection, got: {verdict:?}" + ); + } + + #[test] + fn no_false_ping_pong() { + let mut guard = LoopGuard::new(worker_config()); + for i in 0..10 { + let args = format!(r#"{{"n":{i}}}"#); + let verdict = guard.check("tool", &args); + assert_eq!(verdict, LoopGuardVerdict::Allow); + } + } + + #[test] + fn poll_tool_relaxed_thresholds() { + let mut guard = LoopGuard::new(worker_config()); + // Worker: warn=4, poll_multiplier=3, so effective warn=12. + let args = r#"{"command":"docker ps --status running"}"#; + + // Calls 1-11 should all be Allow (below effective warn=12). + for i in 1..=11 { + let verdict = guard.check("shell", args); + assert_eq!( + verdict, + LoopGuardVerdict::Allow, + "Call {i} should be allowed for poll tool" + ); + } + + // Call 12 should trigger a warning. + let verdict = guard.check("shell", args); + assert!( + matches!(verdict, LoopGuardVerdict::Block(ref message) if message.contains("Warning")), + "Expected warning at poll threshold, got: {verdict:?}" + ); + } + + #[test] + fn non_poll_shell_not_relaxed() { + let mut guard = LoopGuard::new(worker_config()); + let args = r#"{"command":"echo hello"}"#; + + // Worker: warn=4, no poll multiplier for non-poll commands. + for _ in 0..3 { + assert_eq!(guard.check("shell", args), LoopGuardVerdict::Allow); + } + let verdict = guard.check("shell", args); + assert!(matches!(verdict, LoopGuardVerdict::Block(_))); + } + + #[test] + fn reset_clears_all_state() { + let mut guard = LoopGuard::new(worker_config()); + let args = r#"{"command":"ls"}"#; + + for _ in 0..3 { + guard.check("shell", args); + } + guard.record_outcome("shell", args, "file1 file2"); + + guard.reset(); + + // After reset, the first call should be allowed again. + assert_eq!(guard.check("shell", args), LoopGuardVerdict::Allow); + assert_eq!(guard.total_calls, 1); + } + + #[test] + fn warning_bucket_escalates_to_block() { + let config = LoopGuardConfig { + warn_threshold: 2, + block_threshold: 100, + max_warnings_per_call: 2, + ..worker_config() + }; + let mut guard = LoopGuard::new(config); + let args = r#"{"x":1}"#; + + // Call 1: Allow. + assert_eq!(guard.check("tool", args), LoopGuardVerdict::Allow); + + // Call 2: Block with warning (hits warn_threshold=2). + let verdict = guard.check("tool", args); + assert!(matches!(verdict, LoopGuardVerdict::Block(ref m) if m.contains("Warning"))); + + // Call 3: Block with warning again. + let verdict = guard.check("tool", args); + assert!(matches!(verdict, LoopGuardVerdict::Block(ref m) if m.contains("Warning"))); + + // Call 4: Block with "warnings exhausted" (max_warnings_per_call=2 exceeded). + let verdict = guard.check("tool", args); + assert!( + matches!(verdict, LoopGuardVerdict::Block(ref m) if m.contains("warnings exhausted")) + ); + } +} diff --git a/src/hooks/spacebot.rs b/src/hooks/spacebot.rs index 9f1a4a9a3..9232bb99e 100644 --- a/src/hooks/spacebot.rs +++ b/src/hooks/spacebot.rs @@ -1,5 +1,6 @@ //! SpacebotHook: Prompt hook for channels, branches, and workers. +use crate::hooks::loop_guard::{LoopGuard, LoopGuardConfig, LoopGuardVerdict}; use crate::{AgentId, ChannelId, ProcessEvent, ProcessId, ProcessType}; use rig::agent::{HookAction, PromptHook, ToolCallHookAction}; use rig::completion::{CompletionModel, CompletionResponse, Message, Prompt, PromptError}; @@ -45,6 +46,9 @@ pub struct SpacebotHook { /// tool call completes successfully, so the budget tracks *consecutive* /// text-only responses rather than total across the worker's lifetime. nudge_attempts: std::sync::Arc, + /// Detects repetitive tool calling patterns (identical calls, identical + /// outcomes, ping-pong cycles) and blocks them before execution. + loop_guard: std::sync::Arc>, } impl SpacebotHook { @@ -65,6 +69,7 @@ impl SpacebotHook { channel_id: Option, event_tx: broadcast::Sender, ) -> Self { + let loop_guard_config = LoopGuardConfig::for_process(process_type); Self { agent_id, process_id, @@ -76,6 +81,9 @@ impl SpacebotHook { nudge_request_active: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)), outcome_signaled: std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)), nudge_attempts: std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)), + loop_guard: std::sync::Arc::new(std::sync::Mutex::new(LoopGuard::new( + loop_guard_config, + ))), } } @@ -90,7 +98,13 @@ impl SpacebotHook { self.tool_nudge_policy } - /// Reset per-prompt state (tool nudging and outcome tracking). + /// Returns `true` if the worker has called `set_status` with `kind: "outcome"`. + pub fn outcome_signaled(&self) -> bool { + self.outcome_signaled + .load(std::sync::atomic::Ordering::Relaxed) + } + + /// Reset per-prompt state (tool nudging, outcome tracking, and loop guard). pub fn reset_tool_nudge_state(&self) { self.completion_calls .store(0, std::sync::atomic::Ordering::Relaxed); @@ -98,6 +112,9 @@ impl SpacebotHook { .store(false, std::sync::atomic::Ordering::Relaxed); self.nudge_attempts .store(0, std::sync::atomic::Ordering::Relaxed); + if let Ok(mut guard) = self.loop_guard.lock() { + guard.reset(); + } } fn set_tool_nudge_request_active(&self, active: bool) { @@ -550,6 +567,31 @@ where _internal_call_id: &str, args: &str, ) -> ToolCallHookAction { + // Loop guard: check for repetitive tool calling before execution. + // Runs for all process types. Block → Skip (message becomes tool + // result), CircuitBreak → Terminate. + if let Ok(mut guard) = self.loop_guard.lock() { + match guard.check(tool_name, args) { + LoopGuardVerdict::Allow => {} + LoopGuardVerdict::Block(reason) => { + tracing::warn!( + process_id = %self.process_id, + tool_name = %tool_name, + "loop guard blocked tool call" + ); + return ToolCallHookAction::Skip { reason }; + } + LoopGuardVerdict::CircuitBreak(reason) => { + tracing::warn!( + process_id = %self.process_id, + tool_name = %tool_name, + "loop guard circuit-breaking agent loop" + ); + return ToolCallHookAction::Terminate { reason }; + } + } + } + // Leak blocking is enforced at channel egress (`reply`). Worker and // branch tool calls may legitimately handle secrets internally. if self.process_type == ProcessType::Channel @@ -648,10 +690,20 @@ where self.record_tool_result_metrics(tool_name, internal_call_id); + // Record outcome for loop guard (outcome-aware repetition detection). + // The guard uses the (tool_name, args, result) triple to detect when + // the same call produces the same result repeatedly, and poisons the + // call hash so the next check() in on_tool_call auto-blocks. + if let Ok(mut guard) = self.loop_guard.lock() { + guard.record_outcome(tool_name, _args, result); + } + // A successful tool call proves the worker is still productive. // Reset the consecutive nudge counter so a brief narration blip // after many tool calls doesn't exhaust the retry budget. - if self.tool_nudge_policy.is_enabled() { + // Tool errors (from Rig's error path) don't count as productive. + let is_tool_error = result.starts_with("Toolset error:"); + if self.tool_nudge_policy.is_enabled() && !is_tool_error { self.nudge_attempts .store(0, std::sync::atomic::Ordering::Relaxed); } diff --git a/src/prompts/text.rs b/src/prompts/text.rs index 2af8b0131..46ab0a02e 100644 --- a/src/prompts/text.rs +++ b/src/prompts/text.rs @@ -147,7 +147,18 @@ fn lookup(lang: &str, key: &str) -> &'static str { include_str!("../../prompts/en/tools/set_status_description.md.j2") } ("en", "tools/shell") => include_str!("../../prompts/en/tools/shell_description.md.j2"), - ("en", "tools/file") => include_str!("../../prompts/en/tools/file_description.md.j2"), + ("en", "tools/file_read") => { + include_str!("../../prompts/en/tools/file_read_description.md.j2") + } + ("en", "tools/file_write") => { + include_str!("../../prompts/en/tools/file_write_description.md.j2") + } + ("en", "tools/file_edit") => { + include_str!("../../prompts/en/tools/file_edit_description.md.j2") + } + ("en", "tools/file_list") => { + include_str!("../../prompts/en/tools/file_list_description.md.j2") + } ("en", "tools/exec") => include_str!("../../prompts/en/tools/exec_description.md.j2"), ("en", "tools/browser") => include_str!("../../prompts/en/tools/browser_description.md.j2"), ("en", "tools/web_search") => { diff --git a/src/tools.rs b/src/tools.rs index 0a27f4250..bb88fe5d3 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -18,7 +18,7 @@ //! - `spawn_worker` is included for channel-originated branches only //! //! **Worker ToolServer** (one per worker, created at spawn time): -//! - `shell`, `file`, `exec` — stateless, registered at creation +//! - `shell`, `file_read`/`file_write`/`file_edit`/`file_list`, `exec` — stateless, registered at creation //! - `task_update` — scoped to the worker's assigned task //! - `set_status` — per-worker instance, registered at creation //! @@ -80,7 +80,11 @@ pub use config_inspect::{ pub use cron::{CronArgs, CronError, CronOutput, CronTool}; pub use email_search::{EmailSearchArgs, EmailSearchError, EmailSearchOutput, EmailSearchTool}; pub use exec::{EnvVar, ExecArgs, ExecError, ExecOutput, ExecResult, ExecTool}; -pub use file::{FileArgs, FileEntry, FileEntryOutput, FileError, FileOutput, FileTool, FileType}; +pub use file::{ + FileEditArgs, FileEditTool, FileEntry, FileEntryOutput, FileError, FileListArgs, FileListTool, + FileOutput, FileReadArgs, FileReadTool, FileType, FileWriteArgs, FileWriteTool, + register_file_tools, +}; pub use mcp::{McpToolAdapter, McpToolError, McpToolOutput}; pub use memory_delete::{ MemoryDeleteArgs, MemoryDeleteError, MemoryDeleteOutput, MemoryDeleteTool, @@ -497,8 +501,7 @@ pub fn create_worker_tool_server( ) -> ToolServerHandle { let mut server = ToolServer::new() .tool(ShellTool::new(workspace.clone(), sandbox.clone())) - .tool(FileTool::new(workspace.clone(), sandbox.clone())) - .tool(ExecTool::new(workspace, sandbox)) + .tool(ExecTool::new(workspace.clone(), sandbox.clone())) .tool(TaskUpdateTool::for_worker( task_store, agent_id.clone(), @@ -513,6 +516,8 @@ pub fn create_worker_tool_server( }) .tool(ReadSkillTool::new(runtime_config.clone())); + server = register_file_tools(server, workspace, sandbox); + if let Some(store) = runtime_config.secrets.load().as_ref() { server = server.tool(SecretSetTool::new(store.clone())); } @@ -596,8 +601,9 @@ pub fn create_cortex_chat_tool_server( .tool(TaskListTool::new(task_store.clone(), agent_id.to_string())) .tool(TaskUpdateTool::for_branch(task_store, agent_id.clone())) .tool(ShellTool::new(workspace.clone(), sandbox.clone())) - .tool(FileTool::new(workspace.clone(), sandbox.clone())) - .tool(ExecTool::new(workspace, sandbox)); + .tool(ExecTool::new(workspace.clone(), sandbox.clone())); + + server = register_file_tools(server, workspace, sandbox); if browser_config.enabled { server = register_browser_tools(server, browser_config, screenshot_dir, &runtime_config); diff --git a/src/tools/file.rs b/src/tools/file.rs index 88fc62819..b69bb4e28 100644 --- a/src/tools/file.rs +++ b/src/tools/file.rs @@ -1,4 +1,8 @@ -//! File tool for reading/writing/listing files (task workers only). +//! File tools for reading, writing, editing, and listing files (task workers only). +//! +//! Provides a suite of separate tools (`file_read`, `file_write`, `file_edit`, +//! `file_list`) backed by a shared `FileContext` that handles sandbox-aware path +//! validation. This mirrors the flat-tool pattern used by the browser tools. use crate::sandbox::Sandbox; use rig::completion::ToolDefinition; @@ -8,20 +12,19 @@ use serde::{Deserialize, Serialize}; use std::path::{Path, PathBuf}; use std::sync::Arc; -/// Tool for file operations with sandbox-aware path validation. -/// -/// When sandbox mode is enabled, file access is restricted to the workspace -/// boundary. When sandbox is disabled, any path accessible to the process is -/// allowed (relative paths are still resolved against the workspace root). +// Shared context + +/// Shared context cloned into each file tool. Holds workspace root and sandbox +/// for path validation, mirroring how `BrowserContext` is shared across browser +/// tools. #[derive(Debug, Clone)] -pub struct FileTool { +pub(crate) struct FileContext { workspace: PathBuf, sandbox: Arc, } -impl FileTool { - /// Create a new file tool with sandbox-aware path validation. - pub fn new(workspace: PathBuf, sandbox: Arc) -> Self { +impl FileContext { + fn new(workspace: PathBuf, sandbox: Arc) -> Self { Self { workspace, sandbox } } @@ -86,6 +89,27 @@ impl FileTool { Ok(canonical) } + + /// Check whether writing to a path is blocked by identity file protection. + /// Only applies when sandbox is enabled. + fn check_identity_protection(&self, path: &Path) -> Result<(), FileError> { + if !self.sandbox.mode_enabled() { + return Ok(()); + } + let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or(""); + const PROTECTED_FILES: &[&str] = &["SOUL.md", "IDENTITY.md", "USER.md"]; + if PROTECTED_FILES + .iter() + .any(|f| file_name.eq_ignore_ascii_case(f)) + { + return Err(FileError( + "ACCESS DENIED: Identity files are protected and cannot be modified \ + through file operations. Use the identity management API instead." + .to_string(), + )); + } + Ok(()) + } } /// Canonicalize as much of the path as possible. For paths where the final @@ -118,30 +142,16 @@ fn best_effort_canonicalize(path: &Path) -> PathBuf { result } -/// Error type for file tool. +// Error type + +/// Error type for file tools. #[derive(Debug, thiserror::Error)] #[error("File operation failed: {0}")] pub struct FileError(String); -/// Arguments for file tool. -#[derive(Debug, Deserialize, JsonSchema)] -pub struct FileArgs { - /// The operation to perform. - pub operation: String, - /// The file or directory path. - pub path: String, - /// Content to write (required for write operation). - pub content: Option, - /// Whether to create parent directories if they don't exist (for write operations). - #[serde(default = "default_create_dirs")] - pub create_dirs: bool, -} - -fn default_create_dirs() -> bool { - true -} +// Shared output types -/// Output from file tool. +/// Output from file tools. #[derive(Debug, Serialize)] pub struct FileOutput { /// Whether the operation succeeded. @@ -151,10 +161,13 @@ pub struct FileOutput { /// The file/directory path. pub path: String, /// File content (for read operations). + #[serde(skip_serializing_if = "Option::is_none")] pub content: Option, /// Directory entries (for list operations). + #[serde(skip_serializing_if = "Option::is_none")] pub entries: Option>, /// Error message if operation failed. + #[serde(skip_serializing_if = "Option::is_none")] pub error: Option, } @@ -169,128 +182,357 @@ pub struct FileEntryOutput { pub size: u64, } -impl Tool for FileTool { - const NAME: &'static str = "file"; +// Tool: file_read + +#[derive(Debug, Clone)] +pub struct FileReadTool { + context: FileContext, +} + +/// Arguments for file_read. +#[derive(Debug, Deserialize, JsonSchema)] +pub struct FileReadArgs { + /// The file path. Relative paths are resolved from the workspace root. + pub path: String, + /// Line number to start reading from (1-indexed). Omit to start from the beginning. + pub offset: Option, + /// Maximum number of lines to return. Omit to read the entire file. + pub limit: Option, +} + +impl Tool for FileReadTool { + const NAME: &'static str = "file_read"; type Error = FileError; - type Args = FileArgs; + type Args = FileReadArgs; type Output = FileOutput; async fn definition(&self, _prompt: String) -> ToolDefinition { ToolDefinition { name: Self::NAME.to_string(), - description: crate::prompts::text::get("tools/file").to_string(), + description: crate::prompts::text::get("tools/file_read").to_string(), parameters: serde_json::json!({ "type": "object", "properties": { - "operation": { + "path": { "type": "string", - "enum": ["read", "write", "list"], - "description": "The file operation to perform" + "description": "The file path to read. Relative paths are resolved from the workspace root." + }, + "offset": { + "type": "integer", + "description": "Line number to start reading from (1-indexed). Omit to start from the beginning." }, + "limit": { + "type": "integer", + "description": "Maximum number of lines to return. Omit to read the entire file (up to size limit)." + } + }, + "required": ["path"] + }), + } + } + + async fn call(&self, args: Self::Args) -> Result { + let path = self.context.resolve_path(&args.path)?; + + let raw = tokio::fs::read_to_string(&path) + .await + .map_err(|error| FileError(format!("Failed to read file: {error}")))?; + + // Apply line-based offset/limit if requested + let content = if args.offset.is_some() || args.limit.is_some() { + let lines: Vec<&str> = raw.lines().collect(); + let total_lines = lines.len(); + + // offset is 1-indexed; default to line 1 + let start = args.offset.unwrap_or(1).saturating_sub(1).min(total_lines); + let end = if let Some(limit) = args.limit { + (start + limit).min(total_lines) + } else { + total_lines + }; + + let selected: Vec = lines[start..end] + .iter() + .enumerate() + .map(|(i, line)| format!("{}: {}", start + i + 1, line)) + .collect(); + + let mut result = selected.join("\n"); + if end < total_lines { + result.push_str(&format!( + "\n\n[showing lines {}-{} of {total_lines}. Use offset={} to see more]", + start + 1, + end, + end + 1 + )); + } + result + } else { + crate::tools::truncate_output(&raw, crate::tools::MAX_TOOL_OUTPUT_BYTES) + }; + + Ok(FileOutput { + success: true, + operation: "read".to_string(), + path: path.to_string_lossy().to_string(), + content: Some(content), + entries: None, + error: None, + }) + } +} + +// Tool: file_write + +#[derive(Debug, Clone)] +pub struct FileWriteTool { + context: FileContext, +} + +/// Arguments for file_write. +#[derive(Debug, Deserialize, JsonSchema)] +pub struct FileWriteArgs { + /// The file path to write. Relative paths are resolved from the workspace root. + pub path: String, + /// The content to write to the file. + pub content: String, + /// Whether to create parent directories if they don't exist. Defaults to true. + #[serde(default = "default_create_dirs")] + pub create_dirs: bool, +} + +fn default_create_dirs() -> bool { + true +} + +impl Tool for FileWriteTool { + const NAME: &'static str = "file_write"; + + type Error = FileError; + type Args = FileWriteArgs; + type Output = FileOutput; + + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: crate::prompts::text::get("tools/file_write").to_string(), + parameters: serde_json::json!({ + "type": "object", + "properties": { "path": { "type": "string", - "description": "The file or directory path. Relative paths are resolved from the workspace root." + "description": "The file path to write. Relative paths are resolved from the workspace root." }, "content": { "type": "string", - "description": "Content to write to the file (required for write operation)" + "description": "The content to write to the file." }, "create_dirs": { "type": "boolean", "default": true, - "description": "For write operations: create parent directories if they don't exist" + "description": "Create parent directories if they don't exist." } }, - "required": ["operation", "path"] + "required": ["path", "content"] }), } } async fn call(&self, args: Self::Args) -> Result { - let path = self.resolve_path(&args.path)?; - - match args.operation.as_str() { - "read" => do_file_read(&path).await, - "write" => { - // When sandbox is enabled, identity files must go through the - // dedicated identity API to keep the update flow consistent. - // When sandbox is disabled, the user has opted into full access. - if self.sandbox.mode_enabled() { - let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or(""); - const PROTECTED_FILES: &[&str] = &["SOUL.md", "IDENTITY.md", "USER.md"]; - if PROTECTED_FILES - .iter() - .any(|f| file_name.eq_ignore_ascii_case(f)) - { - return Err(FileError( - "ACCESS DENIED: Identity files are protected and cannot be modified \ - through file operations. Use the identity management API instead." - .to_string(), - )); - } - } + let path = self.context.resolve_path(&args.path)?; + self.context.check_identity_protection(&path)?; - let content = args.content.ok_or_else(|| { - FileError("Content is required for write operation".to_string()) - })?; - do_file_write(&path, content, args.create_dirs).await - } - "list" => do_file_list(&path).await, - _ => Err(FileError(format!("Unknown operation: {}", args.operation))), + // Ensure parent directory exists if requested + if args.create_dirs + && let Some(parent) = path.parent() + { + tokio::fs::create_dir_all(parent) + .await + .map_err(|error| FileError(format!("Failed to create directory: {error}")))?; } + + tokio::fs::write(&path, &args.content) + .await + .map_err(|error| FileError(format!("Failed to write file: {error}")))?; + + Ok(FileOutput { + success: true, + operation: "write".to_string(), + path: path.to_string_lossy().to_string(), + content: None, + entries: None, + error: None, + }) } } -async fn do_file_read(path: &Path) -> Result { - let raw = tokio::fs::read_to_string(path) - .await - .map_err(|e| FileError(format!("Failed to read file: {e}")))?; +// Tool: file_edit - let content = crate::tools::truncate_output(&raw, crate::tools::MAX_TOOL_OUTPUT_BYTES); +#[derive(Debug, Clone)] +pub struct FileEditTool { + context: FileContext, +} - Ok(FileOutput { - success: true, - operation: "read".to_string(), - path: path.to_string_lossy().to_string(), - content: Some(content), - entries: None, - error: None, - }) +/// Arguments for file_edit. +#[derive(Debug, Deserialize, JsonSchema)] +pub struct FileEditArgs { + /// The file path to edit. Relative paths are resolved from the workspace root. + pub path: String, + /// The exact text to find in the file. + pub old_string: String, + /// The replacement text. + pub new_string: String, + /// Replace all occurrences instead of just the first. Defaults to false. + #[serde(default)] + pub replace_all: bool, } -async fn do_file_write( - path: &Path, - content: String, - create_dirs: bool, -) -> Result { - // Ensure parent directory exists if requested - if create_dirs && let Some(parent) = path.parent() { - tokio::fs::create_dir_all(parent) +impl Tool for FileEditTool { + const NAME: &'static str = "file_edit"; + + type Error = FileError; + type Args = FileEditArgs; + type Output = FileOutput; + + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: crate::prompts::text::get("tools/file_edit").to_string(), + parameters: serde_json::json!({ + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "The file path to edit. Relative paths are resolved from the workspace root." + }, + "old_string": { + "type": "string", + "description": "The exact text to find in the file." + }, + "new_string": { + "type": "string", + "description": "The replacement text." + }, + "replace_all": { + "type": "boolean", + "default": false, + "description": "Replace all occurrences instead of just the first." + } + }, + "required": ["path", "old_string", "new_string"] + }), + } + } + + async fn call(&self, args: Self::Args) -> Result { + let path = self.context.resolve_path(&args.path)?; + self.context.check_identity_protection(&path)?; + + let original = tokio::fs::read_to_string(&path) .await - .map_err(|e| FileError(format!("Failed to create directory: {e}")))?; + .map_err(|error| FileError(format!("Failed to read file: {error}")))?; + + // Count occurrences to provide useful feedback + let match_count = original.matches(&args.old_string).count(); + + if match_count == 0 { + return Err(FileError(format!( + "old_string not found in {}. Make sure the text matches exactly, \ + including whitespace and indentation. Use file_read to verify \ + the current file contents.", + args.path + ))); + } + + if !args.replace_all && match_count > 1 { + return Err(FileError(format!( + "Found {match_count} matches for old_string in {}. Provide more \ + surrounding context in old_string to identify a unique match, \ + or set replace_all to true to replace every occurrence.", + args.path + ))); + } + + let updated = if args.replace_all { + original.replace(&args.old_string, &args.new_string) + } else { + original.replacen(&args.old_string, &args.new_string, 1) + }; + + tokio::fs::write(&path, &updated) + .await + .map_err(|error| FileError(format!("Failed to write file: {error}")))?; + + let replacements = if args.replace_all { match_count } else { 1 }; + + Ok(FileOutput { + success: true, + operation: "edit".to_string(), + path: path.to_string_lossy().to_string(), + content: Some(format!( + "Replaced {replacements} occurrence{} in {}", + if replacements != 1 { "s" } else { "" }, + args.path + )), + entries: None, + error: None, + }) } +} - tokio::fs::write(path, content) - .await - .map_err(|e| FileError(format!("Failed to write file: {e}")))?; +// Tool: file_list - Ok(FileOutput { - success: true, - operation: "write".to_string(), - path: path.to_string_lossy().to_string(), - content: None, - entries: None, - error: None, - }) +#[derive(Debug, Clone)] +pub struct FileListTool { + context: FileContext, +} + +/// Arguments for file_list. +#[derive(Debug, Deserialize, JsonSchema)] +pub struct FileListArgs { + /// The directory path to list. Relative paths are resolved from the workspace root. + pub path: String, +} + +impl Tool for FileListTool { + const NAME: &'static str = "file_list"; + + type Error = FileError; + type Args = FileListArgs; + type Output = FileOutput; + + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: crate::prompts::text::get("tools/file_list").to_string(), + parameters: serde_json::json!({ + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "The directory path to list. Relative paths are resolved from the workspace root." + } + }, + "required": ["path"] + }), + } + } + + async fn call(&self, args: Self::Args) -> Result { + let path = self.context.resolve_path(&args.path)?; + do_file_list(&path).await + } } +// Internal helpers + async fn do_file_list(path: &Path) -> Result { let mut entries = Vec::new(); let mut reader = tokio::fs::read_dir(path) .await - .map_err(|e| FileError(format!("Failed to read directory: {e}")))?; + .map_err(|error| FileError(format!("Failed to read directory: {error}")))?; let max_entries = crate::tools::MAX_DIR_ENTRIES; let mut total_count = 0usize; @@ -298,7 +540,7 @@ async fn do_file_list(path: &Path) -> Result { while let Some(entry) = reader .next_entry() .await - .map_err(|e| FileError(format!("Failed to read entry: {e}")))? + .map_err(|error| FileError(format!("Failed to read entry: {error}")))? { total_count += 1; @@ -306,7 +548,7 @@ async fn do_file_list(path: &Path) -> Result { let metadata = entry .metadata() .await - .map_err(|e| FileError(format!("Failed to read metadata: {e}")))?; + .map_err(|error| FileError(format!("Failed to read metadata: {error}")))?; let entry_type = if metadata.is_file() { "file".to_string() @@ -356,6 +598,32 @@ async fn do_file_list(path: &Path) -> Result { }) } +// Tool registration helper + +/// Register all file tools on a `ToolServer`. The tools share a single +/// `FileContext` for path validation and sandbox enforcement. +pub fn register_file_tools( + server: rig::tool::server::ToolServer, + workspace: PathBuf, + sandbox: Arc, +) -> rig::tool::server::ToolServer { + let context = FileContext::new(workspace, sandbox); + + server + .tool(FileReadTool { + context: context.clone(), + }) + .tool(FileWriteTool { + context: context.clone(), + }) + .tool(FileEditTool { + context: context.clone(), + }) + .tool(FileListTool { context }) +} + +// Legacy types (used by system-internal callers) + /// File entry metadata (legacy). #[derive(Debug, Clone)] pub struct FileEntry { @@ -372,34 +640,41 @@ pub enum FileType { Other, } -/// System-internal file operations that bypass workspace containment. -/// These are used by the system itself (not LLM-facing) and operate on -/// arbitrary paths. +/// System-internal file read that bypasses workspace containment. +/// Used by the system itself (not LLM-facing) and operates on arbitrary paths. pub async fn file_read(path: impl AsRef) -> crate::error::Result { - do_file_read(path.as_ref()) + let raw = tokio::fs::read_to_string(path.as_ref()) .await - .map(|output| output.content.unwrap_or_default()) - .map_err(|e| crate::error::AgentError::Other(e.into()).into()) + .map_err(|error| { + crate::error::AgentError::Other(anyhow::anyhow!("Failed to read file: {error}")) + })?; + + let content = crate::tools::truncate_output(&raw, crate::tools::MAX_TOOL_OUTPUT_BYTES); + Ok(content) } +/// System-internal file write that bypasses workspace containment. pub async fn file_write( path: impl AsRef, content: impl AsRef<[u8]>, ) -> crate::error::Result<()> { - do_file_write( - path.as_ref(), - String::from_utf8_lossy(content.as_ref()).to_string(), - true, - ) - .await - .map(|_| ()) - .map_err(|e| crate::error::AgentError::Other(e.into()).into()) + let path = path.as_ref(); + if let Some(parent) = path.parent() { + tokio::fs::create_dir_all(parent).await.map_err(|error| { + crate::error::AgentError::Other(anyhow::anyhow!("Failed to create directory: {error}")) + })?; + } + tokio::fs::write(path, content).await.map_err(|error| { + crate::error::AgentError::Other(anyhow::anyhow!("Failed to write file: {error}")) + })?; + Ok(()) } +/// System-internal directory list that bypasses workspace containment. pub async fn file_list(path: impl AsRef) -> crate::error::Result> { let output = do_file_list(path.as_ref()) .await - .map_err(|e| crate::error::AgentError::Other(e.into()))?; + .map_err(|error| crate::error::AgentError::Other(error.into()))?; let entries = output.entries.ok_or_else(|| { crate::error::AgentError::Other(anyhow::anyhow!("No entries in list result")) @@ -407,18 +682,20 @@ pub async fn file_list(path: impl AsRef) -> crate::error::Result FileType::File, "directory" => FileType::Directory, _ => FileType::Other, }, - size: e.size, + size: entry.size, }) .collect()) } +// Tests + #[cfg(test)] mod tests { use super::*; @@ -434,6 +711,11 @@ mod tests { Arc::new(Sandbox::new_for_test(config, workspace.to_path_buf())) } + fn make_context(mode: SandboxMode, workspace: &Path) -> FileContext { + let sandbox = create_sandbox(mode, workspace); + FileContext::new(workspace.to_path_buf(), sandbox) + } + #[tokio::test] async fn sandbox_enabled_rejects_read_outside_workspace() { let temp_dir = tempfile::tempdir().expect("failed to create temp dir"); @@ -445,15 +727,16 @@ mod tests { let file = outside.join("secret.txt"); fs::write(&file, "secret data").expect("failed to write file"); - let sandbox = create_sandbox(SandboxMode::Enabled, &workspace); - let tool = FileTool::new(workspace, sandbox); + let context = make_context(SandboxMode::Enabled, &workspace); + let tool = FileReadTool { + context: context.clone(), + }; let result = tool - .call(FileArgs { - operation: "read".to_string(), + .call(FileReadArgs { path: file.to_string_lossy().into_owned(), - content: None, - create_dirs: false, + offset: None, + limit: None, }) .await; @@ -474,15 +757,14 @@ mod tests { let file = outside.join("report.txt"); fs::write(&file, "public data").expect("failed to write file"); - let sandbox = create_sandbox(SandboxMode::Disabled, &workspace); - let tool = FileTool::new(workspace, sandbox); + let context = make_context(SandboxMode::Disabled, &workspace); + let tool = FileReadTool { context }; let result = tool - .call(FileArgs { - operation: "read".to_string(), + .call(FileReadArgs { path: file.to_string_lossy().into_owned(), - content: None, - create_dirs: false, + offset: None, + limit: None, }) .await .expect("should succeed when sandbox is disabled"); @@ -501,14 +783,13 @@ mod tests { let file = outside.join("output.txt"); - let sandbox = create_sandbox(SandboxMode::Disabled, &workspace); - let tool = FileTool::new(workspace, sandbox); + let context = make_context(SandboxMode::Disabled, &workspace); + let tool = FileWriteTool { context }; let result = tool - .call(FileArgs { - operation: "write".to_string(), + .call(FileWriteArgs { path: file.to_string_lossy().into_owned(), - content: Some("written outside workspace".to_string()), + content: "written outside workspace".to_string(), create_dirs: false, }) .await @@ -525,14 +806,15 @@ mod tests { let workspace = temp_dir.path().join("workspace"); fs::create_dir_all(&workspace).expect("failed to create workspace"); - let sandbox = create_sandbox(SandboxMode::Enabled, &workspace); - let tool = FileTool::new(workspace.clone(), sandbox); + let context = make_context(SandboxMode::Enabled, &workspace); + let tool = FileWriteTool { + context: context.clone(), + }; let result = tool - .call(FileArgs { - operation: "write".to_string(), + .call(FileWriteArgs { path: workspace.join("SOUL.md").to_string_lossy().into_owned(), - content: Some("overwritten".to_string()), + content: "overwritten".to_string(), create_dirs: false, }) .await; @@ -552,14 +834,13 @@ mod tests { let workspace = temp_dir.path().join("workspace"); fs::create_dir_all(&workspace).expect("failed to create workspace"); - let sandbox = create_sandbox(SandboxMode::Disabled, &workspace); - let tool = FileTool::new(workspace.clone(), sandbox); + let context = make_context(SandboxMode::Disabled, &workspace); + let tool = FileWriteTool { context }; let result = tool - .call(FileArgs { - operation: "write".to_string(), + .call(FileWriteArgs { path: workspace.join("IDENTITY.md").to_string_lossy().into_owned(), - content: Some("new identity".to_string()), + content: "new identity".to_string(), create_dirs: false, }) .await @@ -570,4 +851,177 @@ mod tests { fs::read_to_string(workspace.join("IDENTITY.md")).expect("failed to read file"); assert_eq!(written, "new identity"); } + + #[tokio::test] + async fn file_edit_replaces_single_occurrence() { + let temp_dir = tempfile::tempdir().expect("failed to create temp dir"); + let workspace = temp_dir.path().join("workspace"); + fs::create_dir_all(&workspace).expect("failed to create workspace"); + + let file = workspace.join("test.txt"); + fs::write(&file, "hello world").expect("failed to write file"); + + let context = make_context(SandboxMode::Disabled, &workspace); + let tool = FileEditTool { context }; + + let result = tool + .call(FileEditArgs { + path: file.to_string_lossy().into_owned(), + old_string: "hello".to_string(), + new_string: "goodbye".to_string(), + replace_all: false, + }) + .await + .expect("edit should succeed"); + + assert!(result.success); + let content = fs::read_to_string(&file).expect("failed to read file"); + assert_eq!(content, "goodbye world"); + } + + #[tokio::test] + async fn file_edit_rejects_ambiguous_match() { + let temp_dir = tempfile::tempdir().expect("failed to create temp dir"); + let workspace = temp_dir.path().join("workspace"); + fs::create_dir_all(&workspace).expect("failed to create workspace"); + + let file = workspace.join("test.txt"); + fs::write(&file, "aaa bbb aaa").expect("failed to write file"); + + let context = make_context(SandboxMode::Disabled, &workspace); + let tool = FileEditTool { context }; + + let result = tool + .call(FileEditArgs { + path: file.to_string_lossy().into_owned(), + old_string: "aaa".to_string(), + new_string: "ccc".to_string(), + replace_all: false, + }) + .await; + + let error = result + .expect_err("should reject ambiguous match") + .to_string(); + assert!( + error.contains("Found 2 matches"), + "unexpected error: {error}" + ); + } + + #[tokio::test] + async fn file_edit_replace_all() { + let temp_dir = tempfile::tempdir().expect("failed to create temp dir"); + let workspace = temp_dir.path().join("workspace"); + fs::create_dir_all(&workspace).expect("failed to create workspace"); + + let file = workspace.join("test.txt"); + fs::write(&file, "aaa bbb aaa").expect("failed to write file"); + + let context = make_context(SandboxMode::Disabled, &workspace); + let tool = FileEditTool { context }; + + let result = tool + .call(FileEditArgs { + path: file.to_string_lossy().into_owned(), + old_string: "aaa".to_string(), + new_string: "ccc".to_string(), + replace_all: true, + }) + .await + .expect("replace_all should succeed"); + + assert!(result.success); + let content = fs::read_to_string(&file).expect("failed to read file"); + assert_eq!(content, "ccc bbb ccc"); + } + + #[tokio::test] + async fn file_edit_not_found() { + let temp_dir = tempfile::tempdir().expect("failed to create temp dir"); + let workspace = temp_dir.path().join("workspace"); + fs::create_dir_all(&workspace).expect("failed to create workspace"); + + let file = workspace.join("test.txt"); + fs::write(&file, "hello world").expect("failed to write file"); + + let context = make_context(SandboxMode::Disabled, &workspace); + let tool = FileEditTool { context }; + + let result = tool + .call(FileEditArgs { + path: file.to_string_lossy().into_owned(), + old_string: "nonexistent".to_string(), + new_string: "replacement".to_string(), + replace_all: false, + }) + .await; + + let error = result.expect_err("should fail when not found").to_string(); + assert!( + error.contains("old_string not found"), + "unexpected error: {error}" + ); + } + + #[tokio::test] + async fn file_edit_blocks_identity_file() { + let temp_dir = tempfile::tempdir().expect("failed to create temp dir"); + let workspace = temp_dir.path().join("workspace"); + fs::create_dir_all(&workspace).expect("failed to create workspace"); + fs::write(workspace.join("SOUL.md"), "original").expect("failed to write file"); + + let context = make_context(SandboxMode::Enabled, &workspace); + let tool = FileEditTool { context }; + + let result = tool + .call(FileEditArgs { + path: workspace.join("SOUL.md").to_string_lossy().into_owned(), + old_string: "original".to_string(), + new_string: "hacked".to_string(), + replace_all: false, + }) + .await; + + let error = result + .expect_err("should block identity file edit") + .to_string(); + assert!( + error.contains("Identity files are protected"), + "unexpected error: {error}" + ); + } + + #[tokio::test] + async fn file_read_with_offset_and_limit() { + let temp_dir = tempfile::tempdir().expect("failed to create temp dir"); + let workspace = temp_dir.path().join("workspace"); + fs::create_dir_all(&workspace).expect("failed to create workspace"); + + let file = workspace.join("lines.txt"); + fs::write(&file, "line1\nline2\nline3\nline4\nline5").expect("failed to write file"); + + let context = make_context(SandboxMode::Disabled, &workspace); + let tool = FileReadTool { context }; + + let result = tool + .call(FileReadArgs { + path: file.to_string_lossy().into_owned(), + offset: Some(2), + limit: Some(2), + }) + .await + .expect("read with offset should succeed"); + + assert!(result.success); + let content = result.content.unwrap(); + assert!(content.contains("2: line2"), "should contain line 2"); + assert!(content.contains("3: line3"), "should contain line 3"); + assert!(!content.contains("line1"), "should not contain line 1"); + assert!(!content.contains("line4"), "should not contain line 4"); + assert!( + content.contains("showing lines 2-3 of 5"), + "should have continuation notice" + ); + } } diff --git a/src/tools/spawn_worker.rs b/src/tools/spawn_worker.rs index 31c8ad32d..969bc1975 100644 --- a/src/tools/spawn_worker.rs +++ b/src/tools/spawn_worker.rs @@ -87,7 +87,14 @@ impl Tool for SpawnWorkerTool { let web_search_enabled = rc.brave_search_key.load().is_some(); let opencode_enabled = rc.opencode.load().enabled; - let mut tools_list = vec!["shell", "file", "exec"]; + let mut tools_list = vec![ + "shell", + "file_read", + "file_write", + "file_edit", + "file_list", + "exec", + ]; if browser_enabled { tools_list.push("browser"); } From bc39faca53fecc156b70aa0ef0bc9b3c35402d30 Mon Sep 17 00:00:00 2001 From: James Pine Date: Sat, 7 Mar 2026 16:41:54 -0800 Subject: [PATCH 11/11] fix: remove unused import and coerce unknown values in JSX conditionals --- interface/src/components/ToolCall.tsx | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/interface/src/components/ToolCall.tsx b/interface/src/components/ToolCall.tsx index 058822502..d6ff56839 100644 --- a/interface/src/components/ToolCall.tsx +++ b/interface/src/components/ToolCall.tsx @@ -1,6 +1,6 @@ import { useState } from "react"; import { cx } from "@/ui/utils"; -import type { ActionContent, TranscriptStep, OpenCodePart } from "@/api/client"; +import type { TranscriptStep, OpenCodePart } from "@/api/client"; // --------------------------------------------------------------------------- // Types @@ -428,12 +428,12 @@ const toolRenderers: Record = { if (!path && !content) return null; return (
- {path && ( + {!!path && (

{String(path)}

)} - {content && ( + {!!content && (
 							{truncate(String(content), 2000)}
 						
@@ -463,7 +463,7 @@ const toolRenderers: Record = {

{String(path)}

- {oldStr && ( + {!!oldStr && (

Old @@ -473,7 +473,7 @@ const toolRenderers: Record = {

)} - {newStr && ( + {!!newStr && (

New @@ -581,12 +581,12 @@ const toolRenderers: Record = { if (!filePath && !content) return null; return (

- {filePath && ( + {!!filePath && (

{String(filePath)}

)} - {content && ( + {!!content && (
 							{truncate(String(content), 2000)}
 						
@@ -616,7 +616,7 @@ const toolRenderers: Record = {

{String(filePath)}

- {oldStr && ( + {!!oldStr && (

Old @@ -626,7 +626,7 @@ const toolRenderers: Record = {

)} - {newStr && ( + {!!newStr && (

New @@ -749,7 +749,7 @@ const toolRenderers: Record = { Search: {String(query)}

- {(count || freshness) && ( + {!!(count || freshness) && (

{count ? `${count} results` : ""} {count && freshness ? " · " : ""} @@ -782,12 +782,12 @@ const toolRenderers: Record = { if (!action && !docId) return null; return (

- {docId && ( + {!!docId && (

{String(docId)}

)} - {query && ( + {!!query && (

filter: {String(query)}