Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions examples/page-actions/extract.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/**
* # Extract Example
*
* This example demonstrates how to use HyperAgent with a defined output schema
* to ensure structured and validated responses from the agent.
*
* ## What This Example Does
*
* The agent performs a task with structured output that:
* 1. Defines a Zod schema for the expected output format
* 2. Performs actions to complete the specified task
* 3. Returns movie information in the structured format specified by the schema
*
* ## Prerequisites
*
* 1. Node.js environment
* 2. OpenAI API key set in your .env file (OPENAI_API_KEY)
*
* ## Running the Example
*
* ```bash
* yarn ts-node -r tsconfig-paths/register examples/page-actions/extract.ts
* ```
*/

import "dotenv/config";
import { HyperAgent } from "@hyperbrowser/agent";

import chalk from "chalk";
import { ChatOpenAI } from "@langchain/openai";
import { z } from "zod";

/**
 * Runs the extraction example end to end.
 *
 * Creates a HyperAgent backed by an OpenAI chat model, opens the IMDb page for
 * title tt0133093, and asks the page to extract the director, release year,
 * and rating into a Zod-described shape. The result is printed and returned.
 *
 * The agent is closed in a `finally` block so that a failure while opening the
 * page, navigating, or extracting does not leave the agent running.
 *
 * @returns The value produced by `page.extract` for the schema below.
 */
async function runEval() {
  const llm = new ChatOpenAI({
    apiKey: process.env.OPENAI_API_KEY,
    model: "gpt-4o",
  });

  const agent = new HyperAgent({
    llm: llm,
    debug: true,
  });

  try {
    const page = await agent.newPage();
    await page.goto("https://www.imdb.com/title/tt0133093/");

    const result = await page.extract(
      "extract the director, release year, and rating",
      z.object({
        director: z.array(z.string().describe("The name of the movie director")),
        releaseYear: z.number().describe("The year the movie was released"),
        rating: z.string().describe("The IMDb rating of the movie"),
      })
    );

    console.log(chalk.green.bold("\nResult:"));
    console.log(chalk.white(JSON.stringify(result, null, 2)));
    return result;
  } finally {
    // Always release the agent, including when any step above throws.
    await agent.closeAgent();
  }
}

// Script entry point: run the example; on rejection, report the error in red
// and terminate the process with a non-zero exit code.
runEval().catch((error) => {
  console.error(chalk.red("Error:"), error);
  process.exit(1);
});
2 changes: 1 addition & 1 deletion src/agent/actions/extract.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ export const ExtractAction = z

export type ExtractActionType = z.infer<typeof ExtractAction>;

export const ExtractActionDefinition: AgentActionDefinition = {
export const ExtractActionDefinition: AgentActionDefinition<typeof ExtractAction> = {
type: "extract" as const,
actionParams: ExtractAction,
run: async (
Expand Down
29 changes: 8 additions & 21 deletions src/agent/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import { runAgentTask } from "./tools/agent";
import { HyperPage, HyperVariable } from "@/types/agent/types";
import { z } from "zod";
import { ErrorEmitter } from "@/utils";
import { PageExtractFn } from "./tools/page-actions/extract";

export class HyperAgent<T extends BrowserProviders = "Local"> {
private llm: BaseChatModel;
Expand Down Expand Up @@ -570,27 +571,13 @@ export class HyperAgent<T extends BrowserProviders = "Local"> {
400
);
}
if (task) {
const res = await this.executeTask(
`You have to perform an extraction on the current page. You have to perform the extraction according to the task: ${task}. Make sure your final response only contains the extracted content`,
{
maxSteps: 2,
outputSchema,
},
page
);
if (outputSchema) {
return JSON.parse(res.output as string);
}
return res.output as string;
} else {
const res = await this.executeTask(
"You have to perform a data extraction on the current page. Make sure your final response only contains the extracted content",
{ maxSteps: 2, outputSchema },
page
);
return JSON.parse(res.output as string);
}
return await PageExtractFn({
task,
schema: outputSchema,
page,
llm: this.llm,
tokenLimit: this.tokenLimit,
});
};
return hyperPage;
}
Expand Down
128 changes: 128 additions & 0 deletions src/agent/tools/page-actions/extract.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import { z } from "zod";
import { parseMarkdown } from "@/utils/html-to-markdown";
import { Page } from "playwright";
import { BaseChatModel } from "@langchain/core/language_models/chat_models";
import { HumanMessage, SystemMessage } from "@langchain/core/messages";

/**
 * Arguments accepted by `PageExtractFn`.
 *
 * @typeParam T - The Zod object schema type describing the desired output, or
 *   `undefined` when no schema is supplied.
 */
export interface ExtractOptions<
  T extends z.AnyZodObject | undefined = z.AnyZodObject,
> {
  /** Playwright page whose content, metadata, and screenshot are read. */
  page: Page;
  /** Chat model that performs the extraction. */
  llm: BaseChatModel;
  /** Natural-language description of what to extract. */
  task?: string;
  /** Zod object schema; when given, the model is asked for structured output matching it. */
  schema?: T;
  /** Token budget used to cap how much page markdown is sent to the model. */
  tokenLimit?: number;
}

/**
 * Captures a screenshot of `page` through a temporary CDP session and returns
 * the base64 payload. The session is detached even when the capture fails.
 */
async function captureScreenshotBase64(page: Page): Promise<string> {
  const cdpSession = await page.context().newCDPSession(page);
  try {
    const { data } = await cdpSession.send("Page.captureScreenshot");
    return data;
  } finally {
    // Awaited so a detach failure surfaces here instead of as an unhandled rejection.
    await cdpSession.detach();
  }
}

/**
 * Flattens chat-model message content into plain text. String content is
 * returned unchanged; array content is reduced to the concatenation of its
 * string parts and the `text` fields of its object parts.
 */
function messageContentToText(content: unknown): string {
  if (typeof content === "string") {
    return content;
  }
  if (Array.isArray(content)) {
    return content
      .map((part: unknown) => {
        if (typeof part === "string") {
          return part;
        }
        if (typeof part === "object" && part !== null) {
          const text = (part as { text?: unknown }).text;
          return typeof text === "string" ? text : "";
        }
        return "";
      })
      .join("");
  }
  return content == null ? "" : String(content);
}

/**
 * Extracts information from the current page with a single LLM call.
 *
 * The page HTML is converted to markdown and trimmed to a character budget
 * derived from `tokenLimit`. That markdown, a set of `<meta>`/`<link>` values
 * read from the document, and a screenshot are sent to `llm` together with the
 * task description.
 *
 * @param options.schema - Zod object schema. When provided, the model is
 *   invoked via `withStructuredOutput(schema)` and its result is returned.
 * @param options.task - Natural-language description of what to extract.
 * @param options.page - Playwright page to read from.
 * @param options.llm - Chat model used for the extraction.
 * @param options.tokenLimit - Token budget for the markdown content
 *   (default 4000).
 * @returns The structured result when `schema` is given, otherwise the model's
 *   reply as text.
 * @throws Error if neither `schema` nor `task` is provided.
 */
export async function PageExtractFn<
  T extends z.AnyZodObject | undefined = z.AnyZodObject,
>({
  schema,
  task,
  page,
  llm,
  tokenLimit = 4000,
}: ExtractOptions<T>): Promise<T extends z.AnyZodObject ? z.infer<T> : string> {
  if (!schema && !task) {
    throw new Error("Either schema or task must be provided");
  }

  // Get page content and convert to markdown
  const content = await page.content();
  const markdown = await parseMarkdown(content);

  // Get page metadata. The callback runs in the browser, so it must stay
  // self-contained and only touch `document`.
  const metadata = await page.evaluate(() => {
    const meta = {
      title: document.title,
      description:
        document
          .querySelector('meta[name="description"]')
          ?.getAttribute("content") || "",
      keywords:
        document
          .querySelector('meta[name="keywords"]')
          ?.getAttribute("content") || "",
      ogTitle:
        document
          .querySelector('meta[property="og:title"]')
          ?.getAttribute("content") || "",
      ogDescription:
        document
          .querySelector('meta[property="og:description"]')
          ?.getAttribute("content") || "",
      ogImage:
        document
          .querySelector('meta[property="og:image"]')
          ?.getAttribute("content") || "",
      canonicalUrl:
        document.querySelector('link[rel="canonical"]')?.getAttribute("href") ||
        "",
    };
    return meta;
  });

  // TODO: Maybe take fullscreen screenshots here, and then break them up into manageable chunks usable by the LLM.
  // Take screenshot for context
  const screenshotData = await captureScreenshotBase64(page);

  // TODO: Maybe use js-tiktoken here ?
  // Trim markdown to stay within token limit.
  // NOTE(review): 0.75 tokens per character is a rough heuristic and is likely
  // conservative for English text — confirm before tuning.
  const avgTokensPerChar = 0.75;
  // Clamp at zero: a negative end index would make slice() cut from the end
  // of the string instead of producing an empty prefix.
  const maxChars = Math.max(0, Math.floor(tokenLimit / avgTokensPerChar));
  const trimmedMarkdown =
    markdown.length > maxChars
      ? markdown.slice(0, maxChars) + "\n[Content truncated due to length]"
      : markdown;

  // Create messages
  const messages = [
    new SystemMessage(
      `You are an expert at extracting structured information from web pages. Your task is to:
1. Analyze the provided markdown content, metadata, and screenshot of a webpage
2. Extract relevant information based on the provided task and schema (if any)
3. Pay attention to both the text content and visual layout
4. Handle cases where information might be split across different sections
5. Ensure the response is complete and accurate
6. Format the response appropriately based on the schema (if provided)

Remember to:
- Look for information in both the main content and page metadata (title, description, etc.)
- Consider the visual hierarchy and layout of the page
- Handle cases where information might be ambiguous or incomplete
- Ensure the response is complete and accurate`
    ),
    new HumanMessage({
      content: [
        {
          type: "text",
          text: `Extract information from the page${task ? ` according to this task: ${task}` : ""}${schema ? " and format according to the schema" : ""}`,
        },
        { type: "text", text: "Here is the page metadata:" },
        { type: "text", text: JSON.stringify(metadata, null, 2) },
        { type: "text", text: "Here is the page content:" },
        { type: "text", text: trimmedMarkdown },
        { type: "text", text: "Here is a screenshot of the page:" },
        {
          type: "image_url",
          image_url: {
            url: `data:image/png;base64,${screenshotData}`,
          },
        },
      ],
    }),
  ];

  if (schema) {
    // Create structured output chain
    const chain = llm.withStructuredOutput(schema);
    const result = await chain.invoke(messages);
    return result as T extends z.AnyZodObject ? z.infer<T> : string;
  }

  // For task-based extraction, return the reply as text. Message content may
  // be an array of parts rather than a plain string, so normalize it first.
  const response = await llm.invoke(messages);
  return messageContentToText(
    response.content
  ) as T extends z.AnyZodObject ? z.infer<T> : string;
}