diff --git a/.changeset/odd-snakes-film.md b/.changeset/odd-snakes-film.md new file mode 100644 index 000000000..8fb327f7e --- /dev/null +++ b/.changeset/odd-snakes-film.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Add SupportedUnderstudyActions to observe system prompt diff --git a/packages/README.md b/packages/README.md index a92183d77..e019ef1d2 100644 --- a/packages/README.md +++ b/packages/README.md @@ -4,4 +4,5 @@ This directory contains the Stagehand monorepo packages: - **core** - The main Stagehand package - **evals** - Evals CLI -- **docs** - [Docs](https://docs.stagehand.dev) \ No newline at end of file +- **docs** - [Docs](https://docs.stagehand.dev) +- **server** - Fastify server wrapping the core package for different language clients \ No newline at end of file diff --git a/packages/core/lib/inference.ts b/packages/core/lib/inference.ts index 9b843e043..7d60b4ae2 100644 --- a/packages/core/lib/inference.ts +++ b/packages/core/lib/inference.ts @@ -227,6 +227,7 @@ export async function observe({ userProvidedInstructions, logger, logInferenceToFile = false, + supportedActions, }: { instruction: string; domElements: string; @@ -234,6 +235,7 @@ export async function observe({ userProvidedInstructions?: string; logger: (message: LogLine) => void; logInferenceToFile?: boolean; + supportedActions?: string[]; }) { const isGPT5 = llmClient.modelName.includes("gpt-5"); // TODO: remove this as we update support for gpt-5 configuration options @@ -271,7 +273,7 @@ export async function observe({ type ObserveResponse = z.infer; const messages: ChatMessage[] = [ - buildObserveSystemPrompt(userProvidedInstructions), + buildObserveSystemPrompt(userProvidedInstructions, supportedActions), buildObserveUserMessage(instruction, domElements), ]; diff --git a/packages/core/lib/prompt.ts b/packages/core/lib/prompt.ts index de8806af2..ad18b8a67 100644 --- a/packages/core/lib/prompt.ts +++ b/packages/core/lib/prompt.ts @@ -110,7 +110,12 @@ Extracted content: ${JSON.stringify(extractionResponse, null, 2)}`, // observe export function buildObserveSystemPrompt( userProvidedInstructions?: string, + supportedActions?: string[], ): ChatMessage { + const actionsString = supportedActions?.length + ? `\n\nSupported actions: ${supportedActions.join(", ")}` + : ""; + const observeSystemPrompt = ` You are helping the user automate the browser by finding elements based on what the user wants to observe in the page. @@ -118,7 +123,8 @@ You will be given: 1. a instruction of elements to observe 2. a hierarchical accessibility tree showing the semantic structure of the page. The tree is a hybrid of the DOM and the accessibility tree. -Return an array of elements that match the instruction if they exist, otherwise return an empty array.`; +Return an array of elements that match the instruction if they exist, otherwise return an empty array. +When returning elements, include the appropriate method from the supported actions list.${actionsString}`; const content = observeSystemPrompt.replace(/\s+/g, " "); return { diff --git a/packages/core/lib/v3/handlers/actHandler.ts b/packages/core/lib/v3/handlers/actHandler.ts index 1b6b924d1..70f2cf2a1 100644 --- a/packages/core/lib/v3/handlers/actHandler.ts +++ b/packages/core/lib/v3/handlers/actHandler.ts @@ -11,7 +11,7 @@ import { diffCombinedTrees, } from "../understudy/a11y/snapshot"; import { LLMClient } from "../llm/LLMClient"; -import { SupportedPlaywrightAction } from "../types/private"; +import { SupportedUnderstudyAction } from "../types/private"; import { EncodedId } from "../types/private/internal"; import { AvailableModel, @@ -157,7 +157,7 @@ export class ActHandler { const actInstruction = buildActPrompt( instruction, - Object.values(SupportedPlaywrightAction), + Object.values(SupportedUnderstudyAction), variables, ); @@ -218,13 +218,13 @@ export class ActHandler { const stepTwoInstructions = buildStepTwoPrompt( instruction, previousAction, - Object.values(SupportedPlaywrightAction).filter( + Object.values(SupportedUnderstudyAction).filter( ( action, ): action is Exclude< - SupportedPlaywrightAction, - SupportedPlaywrightAction.SELECT_OPTION_FROM_DROPDOWN - > => action !== SupportedPlaywrightAction.SELECT_OPTION_FROM_DROPDOWN, + SupportedUnderstudyAction, + SupportedUnderstudyAction.SELECT_OPTION_FROM_DROPDOWN + > => action !== SupportedUnderstudyAction.SELECT_OPTION_FROM_DROPDOWN, ), variables, ); @@ -364,7 +364,7 @@ export class ActHandler { const instruction = buildActPrompt( actCommand, - Object.values(SupportedPlaywrightAction), + Object.values(SupportedUnderstudyAction), {}, ); @@ -471,10 +471,47 @@ function normalizeActInferenceElement( return undefined; } + // For dragAndDrop, convert element ID in arguments to xpath (target element) + let resolvedArgs = hasArgs ? args : undefined; + if (method === "dragAndDrop" && hasArgs && args.length > 0) { + const targetArg = args[0]; + // Check if argument looks like an element ID (e.g., "1-67") + if (typeof targetArg === "string" && /^\d+-\d+$/.test(targetArg)) { + const argXpath = xpathMap[targetArg as EncodedId]; + const trimmedArgXpath = trimTrailingTextNode(argXpath); + if (trimmedArgXpath) { + resolvedArgs = [`xpath=${trimmedArgXpath}`, ...args.slice(1)]; + } else { + // Target element lookup failed, filter out this action + v3Logger({ + category: "action", + message: "dragAndDrop target element lookup failed", + level: 1, + auxiliary: { + targetElementId: { value: targetArg, type: "string" }, + sourceElementId: { value: elementId, type: "string" }, + }, + }); + return undefined; + } + } else { + v3Logger({ + category: "action", + message: "dragAndDrop target element invalid ID format", + level: 0, + auxiliary: { + targetElementId: { value: String(targetArg), type: "string" }, + sourceElementId: { value: elementId, type: "string" }, + }, + }); + return undefined; + } + } + return { description, method, - arguments: hasArgs ? args : undefined, + arguments: resolvedArgs, selector: `xpath=${trimmed}`, } as Action; } diff --git a/packages/core/lib/v3/handlers/observeHandler.ts b/packages/core/lib/v3/handlers/observeHandler.ts index b66f3f3eb..99a05ba44 100644 --- a/packages/core/lib/v3/handlers/observeHandler.ts +++ b/packages/core/lib/v3/handlers/observeHandler.ts @@ -5,7 +5,10 @@ import { v3Logger } from "../logger"; import { V3FunctionName } from "../types/public/methods"; import { captureHybridSnapshot } from "../understudy/a11y/snapshot"; import { LLMClient } from "../llm/LLMClient"; -import { ObserveHandlerParams } from "../types/private/handlers"; +import { + ObserveHandlerParams, + SupportedUnderstudyAction, +} from "../types/private/handlers"; import { EncodedId } from "../types/private/internal"; import { Action } from "../types/public/methods"; import { @@ -114,6 +117,7 @@ export class ObserveHandler { userProvidedInstructions: this.systemPrompt, logger: v3Logger, logInferenceToFile: this.logInferenceToFile, + supportedActions: Object.values(SupportedUnderstudyAction), }); const { @@ -145,8 +149,56 @@ export class ObserveHandler { const trimmedXpath = trimTrailingTextNode(xpath); if (!trimmedXpath) return undefined; + // For dragAndDrop, convert element ID in arguments to xpath (target element) + let resolvedArgs = rest.arguments; + if ( + rest.method === "dragAndDrop" && + Array.isArray(rest.arguments) && + rest.arguments.length > 0 + ) { + const targetArg = rest.arguments[0]; + // Check if argument looks like an element ID (e.g., "1-67") + if ( + typeof targetArg === "string" && + /^\d+-\d+$/.test(targetArg) + ) { + const argXpath = combinedXpathMap[targetArg as EncodedId]; + const trimmedArgXpath = trimTrailingTextNode(argXpath); + if (trimmedArgXpath) { + resolvedArgs = [ + `xpath=${trimmedArgXpath}`, + ...rest.arguments.slice(1), + ]; + } else { + // Target element lookup failed, filter out this action + v3Logger({ + category: "observation", + message: "dragAndDrop target element lookup failed", + level: 0, + auxiliary: { + targetElementId: { value: targetArg, type: "string" }, + sourceElementId: { value: elementId, type: "string" }, + }, + }); + return undefined; + } + } else { + v3Logger({ + category: "observation", + message: "dragAndDrop target element invalid ID format", + level: 0, + auxiliary: { + targetElementId: { value: targetArg, type: "string" }, + sourceElementId: { value: elementId, type: "string" }, + }, + }); + return undefined; + } + } + return { ...rest, + arguments: resolvedArgs, selector: `xpath=${trimmedXpath}`, } as { description: string; diff --git a/packages/core/lib/v3/types/private/handlers.ts b/packages/core/lib/v3/types/private/handlers.ts index 5bab7f3c0..b994cf3a9 100644 --- a/packages/core/lib/v3/types/private/handlers.ts +++ b/packages/core/lib/v3/types/private/handlers.ts @@ -27,8 +27,8 @@ export interface ObserveHandlerParams { page: Page; } -// We can use this enum to list the actions supported in performPlaywrightMethod -export enum SupportedPlaywrightAction { +// We can use this enum to list the actions supported in performUnderstudyMethod +export enum SupportedUnderstudyAction { CLICK = "click", FILL = "fill", TYPE = "type", @@ -38,4 +38,6 @@ export enum SupportedPlaywrightAction { PREV_CHUNK = "prevChunk", SELECT_OPTION_FROM_DROPDOWN = "selectOptionFromDropdown", HOVER = "hover", + DOUBLE_CLICK = "doubleClick", + DRAG_AND_DROP = "dragAndDrop", }