diff --git a/examples/page-actions/extract.ts b/examples/page-actions/extract.ts
new file mode 100644
index 0000000..62d52fc
--- /dev/null
+++ b/examples/page-actions/extract.ts
@@ -0,0 +1,79 @@
+/**
+ * # Extract Example
+ *
+ * This example demonstrates how to use HyperAgent with a defined output schema
+ * to ensure structured and validated responses from the agent.
+ *
+ * ## What This Example Does
+ *
+ * The agent performs a task with structured output that:
+ * 1. Defines a Zod schema for the expected output format
+ * 2. Performs actions to complete the specified task
+ * 3. Returns movie information in the structured format defined by that schema
+ *
+ * ## Prerequisites
+ *
+ * 1. Node.js environment
+ * 2. OpenAI API key set in your .env file (OPENAI_API_KEY)
+ *
+ * ## Running the Example
+ *
+ * ```bash
+ * yarn ts-node -r tsconfig-paths/register examples/page-actions/extract.ts
+ * ```
+ */
+
+import "dotenv/config";
+import { HyperAgent } from "@hyperbrowser/agent";
+
+import chalk from "chalk";
+import { ChatOpenAI } from "@langchain/openai";
+import { z } from "zod";
+
+/**
+ * Navigates to an IMDb title page, extracts the director(s), release year and
+ * rating via `page.extract` with a Zod schema, and prints the result.
+ *
+ * @returns The extracted data as returned by `page.extract`.
+ */
+async function runEval() {
+  const llm = new ChatOpenAI({
+    apiKey: process.env.OPENAI_API_KEY,
+    model: "gpt-4o",
+  });
+
+  const agent = new HyperAgent({
+    llm,
+    debug: true,
+  });
+
+  try {
+    const page = await agent.newPage();
+    await page.goto("https://www.imdb.com/title/tt0133093/");
+
+    const result = await page.extract(
+      "extract the director, release year, and rating",
+      z.object({
+        director: z.array(
+          z.string().describe("The name of the movie director")
+        ),
+        releaseYear: z.number().describe("The year the movie was released"),
+        rating: z.string().describe("The IMDb rating of the movie"),
+      })
+    );
+
+    console.log(chalk.green.bold("\nResult:"));
+    console.log(chalk.white(JSON.stringify(result, null, 2)));
+    return result;
+  } finally {
+    // Close the agent even when navigation or extraction throws.
+    await agent.closeAgent();
+  }
+}
+
+(async () => {
+  await runEval();
+})().catch((error) => {
+  console.error(chalk.red("Error:"), error);
+  process.exit(1);
+});
diff --git a/src/agent/actions/extract.ts b/src/agent/actions/extract.ts index 5be04ad..977e745 100644 --- a/src/agent/actions/extract.ts +++ b/src/agent/actions/extract.ts @@ -13,7 +13,7 @@ export const ExtractAction = z export type ExtractActionType = z.infer; -export const ExtractActionDefinition: AgentActionDefinition = { +export const ExtractActionDefinition: AgentActionDefinition = { type: "extract" as const, actionParams: ExtractAction, run: async ( diff --git a/src/agent/index.ts b/src/agent/index.ts index e6954c8..fb37874 100644 --- a/src/agent/index.ts +++ b/src/agent/index.ts @@ -34,6 +34,7 @@ import { runAgentTask } from "./tools/agent"; import { HyperPage, HyperVariable } from "@/types/agent/types"; import { z } from "zod"; import { ErrorEmitter } from "@/utils"; +import { PageExtractFn } from "./tools/page-actions/extract"; export class HyperAgent { private llm: BaseChatModel; @@ -570,27 +571,13 @@ export class HyperAgent { 400 ); } - if (task) { - const res = await this.executeTask( - `You have to perform an extraction on the current page. You have to perform the extraction according to the task: ${task}. Make sure your final response only contains the extracted content`, - { - maxSteps: 2, - outputSchema, - }, - page - ); - if (outputSchema) { - return JSON.parse(res.output as string); - } - return res.output as string; - } else { - const res = await this.executeTask( - "You have to perform a data extraction on the current page. 
Make sure your final response only contains the extracted content", - { maxSteps: 2, outputSchema }, - page - ); - return JSON.parse(res.output as string); - } + return await PageExtractFn({ + task, + schema: outputSchema, + page, + llm: this.llm, + tokenLimit: this.tokenLimit, + }); }; return hyperPage; }
diff --git a/src/agent/tools/page-actions/extract.ts b/src/agent/tools/page-actions/extract.ts
new file mode 100644
index 0000000..b833ea5
--- /dev/null
+++ b/src/agent/tools/page-actions/extract.ts
@@ -0,0 +1,173 @@
+import { z } from "zod";
+import { parseMarkdown } from "@/utils/html-to-markdown";
+import { Page } from "playwright";
+import { BaseChatModel } from "@langchain/core/language_models/chat_models";
+import { HumanMessage, SystemMessage } from "@langchain/core/messages";
+
+/**
+ * Options accepted by {@link PageExtractFn}.
+ *
+ * At least one of `schema` or `task` must be provided.
+ */
+export interface ExtractOptions<
+  T extends z.AnyZodObject | undefined = z.AnyZodObject,
+> {
+  /** Zod object schema passed to `llm.withStructuredOutput`. */
+  schema?: T;
+  /** Natural-language description of what to extract. */
+  task?: string;
+  /** Page whose content, metadata and screenshot are sent to the model. */
+  page: Page;
+  /** Chat model that performs the extraction. */
+  llm: BaseChatModel;
+  /** Token budget used to trim the page markdown. Defaults to 4000. */
+  tokenLimit?: number;
+}
+
+/**
+ * Captures a screenshot of the page over a CDP session.
+ *
+ * @returns The base64-encoded image data reported by `Page.captureScreenshot`.
+ */
+async function captureScreenshot(page: Page): Promise<string> {
+  const cdpSession = await page.context().newCDPSession(page);
+  try {
+    const { data } = await cdpSession.send("Page.captureScreenshot");
+    return data;
+  } finally {
+    // Await the detach (it returns a promise) and run it even when the
+    // capture fails, so the session is always released.
+    await cdpSession.detach();
+  }
+}
+
+/**
+ * Extracts information from a page by sending its markdown content, metadata
+ * and a screenshot to the chat model.
+ *
+ * @returns The object produced by `llm.withStructuredOutput(schema)` when a
+ * schema is given; otherwise the text of the model's response.
+ * @throws Error if neither `schema` nor `task` is provided.
+ */
+export async function PageExtractFn<
+  T extends z.AnyZodObject | undefined = z.AnyZodObject,
+>({
+  schema,
+  task,
+  page,
+  llm,
+  tokenLimit = 4000,
+}: ExtractOptions<T>): Promise<T extends z.AnyZodObject ? z.infer<T> : string> {
+  if (!schema && !task) {
+    throw new Error("Either schema or task must be provided");
+  }
+
+  // Get page content and convert to markdown
+  const content = await page.content();
+  const markdown = await parseMarkdown(content);
+
+  // Get page metadata
+  const metadata = await page.evaluate(() => {
+    const meta = {
+      title: document.title,
+      description:
+        document
+          .querySelector('meta[name="description"]')
+          ?.getAttribute("content") || "",
+      keywords:
+        document
+          .querySelector('meta[name="keywords"]')
+          ?.getAttribute("content") || "",
+      ogTitle:
+        document
+          .querySelector('meta[property="og:title"]')
+          ?.getAttribute("content") || "",
+      ogDescription:
+        document
+          .querySelector('meta[property="og:description"]')
+          ?.getAttribute("content") || "",
+      ogImage:
+        document
+          .querySelector('meta[property="og:image"]')
+          ?.getAttribute("content") || "",
+      canonicalUrl:
+        document.querySelector('link[rel="canonical"]')?.getAttribute("href") ||
+        "",
+    };
+    return meta;
+  });
+
+  // TODO: Maybe take fullscreen screenshots here, and then break them up into manageable chunks usable by the LLM.
+  // Take screenshot for context
+  const screenshotData = await captureScreenshot(page);
+
+  // TODO: Maybe use js-tiktoken here ?
+  // Trim markdown to stay within token limit
+  // NOTE(review): 0.75 tokens per character is a rough guess; if the real
+  // ratio is lower, this trims more than tokenLimit requires — confirm.
+  const avgTokensPerChar = 0.75;
+  const maxChars = Math.floor(tokenLimit / avgTokensPerChar);
+  const trimmedMarkdown =
+    markdown.length > maxChars
+      ? markdown.slice(0, maxChars) + "\n[Content truncated due to length]"
+      : markdown;
+
+  // Create messages
+  const messages = [
+    new SystemMessage(
+      `You are an expert at extracting structured information from web pages. Your task is to:
+1. Analyze the provided markdown content, metadata, and screenshot of a webpage
+2. Extract relevant information based on the provided task and schema (if any)
+3. Pay attention to both the text content and visual layout
+4. Handle cases where information might be split across different sections
+5. Ensure the response is complete and accurate
+6. Format the response appropriately based on the schema (if provided)
+
+Remember to:
+- Look for information in both the main content and page metadata (title, description, etc.)
+- Consider the visual hierarchy and layout of the page
+- Handle cases where information might be ambiguous or incomplete
+- Ensure the response is complete and accurate`
+    ),
+    new HumanMessage({
+      content: [
+        {
+          type: "text",
+          text: `Extract information from the page${task ? ` according to this task: ${task}` : ""}${schema ? " and format according to the schema" : ""}`,
+        },
+        { type: "text", text: "Here is the page metadata:" },
+        { type: "text", text: JSON.stringify(metadata, null, 2) },
+        { type: "text", text: "Here is the page content:" },
+        { type: "text", text: trimmedMarkdown },
+        { type: "text", text: "Here is a screenshot of the page:" },
+        {
+          type: "image_url",
+          image_url: {
+            url: `data:image/png;base64,${screenshotData}`,
+          },
+        },
+      ],
+    }),
+  ];
+
+  if (schema) {
+    // Create structured output chain
+    const chain = llm.withStructuredOutput(schema);
+    const result = await chain.invoke(messages);
+    return result as T extends z.AnyZodObject ? z.infer<T> : string;
+  }
+
+  // For task-based extraction, return the text of the raw response. Message
+  // content may be a plain string or an array of content parts, so flatten
+  // the text parts instead of handing the array back typed as a string.
+  const response = await llm.invoke(messages);
+  const text =
+    typeof response.content === "string"
+      ? response.content
+      : response.content
+          .map((part) =>
+            "text" in part && typeof part.text === "string" ? part.text : ""
+          )
+          .join("");
+  return text as T extends z.AnyZodObject ? z.infer<T> : string;
+}