diff --git a/README.md b/README.md
index 3e317fe0..5db0d131 100644
--- a/README.md
+++ b/README.md
@@ -7,197 +7,167 @@ top of your existing test automation framework.
## Features
* **Natural Language Control**: "Search for 'Headphones' and click the first result."
-* **Multimodal Understanding**: Uses screenshots to understand the page context.
+* **Advanced Interaction**: Click, type, scroll, drag-and-drop, and more using simple commands.
+* **Multimodal Understanding**: Uses screenshots to understand page context (Visual Grounding).
* **Smart Planning**: Automatically plans, executes, and retries actions.
-* **Framework Agnostic**: Works alongside your existing Selenium or Playwright tests.
-* **Flexible Configuration**: Supports OpenAI (GPT-4o) and Google Gemini models.
-* **Visual Reports**: Generates HTML reports with execution traces and screenshots.
+* **Service Layer**: Low-level AI capabilities for locating, extracting, and describing elements.
+* **YAML Script Support**: Execute declarative test scripts defined in YAML.
+* **Caching**: Built-in caching for improved performance and reduced API costs.
+* **Framework Agnostic**: Works with Selenium and Playwright.
+* **Flexible Configuration**: Supports OpenAI (GPT-4o) and Google Gemini (1.5 Pro) models.
+* **Visual Reports**: Generates detailed HTML reports with execution traces, screenshots, and reasoning.
## Modules
-* **`midscene-core`**: The brain of the agent. Contains the `Planner`, `Executor`, and `Orchestrator`. Pure Java, no
- browser dependencies.
+* **`midscene-core`**: The brain of the agent. Contains `Agent`, `Service`, `ScriptPlayer`, and core logic.
* **`midscene-web`**: Adapters for browser automation tools (Selenium, Playwright).
* **`midscene-visualizer`**: Generates visual HTML reports from execution contexts.
## Installation
-### Build Locally
-
-Currently, Midscene Java is available as a source build. Clone this repository and install it to your local Maven
-repository:
-
-```bash
-git clone https://github.com/alstafeev/midscene-java.git
-cd midscene-java
-mvn clean install
-```
-
-### Add Dependencies
-
Add the necessary dependencies to your project's `pom.xml`:
```xml
-
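<groupId>io.github.alstafeev</groupId>
<artifactId>midscene-web</artifactId>
- <version>0.1.6</version>
+ <version>0.1.9-SNAPSHOT</version>
-<groupId>io.github.alstafeev</groupId>
-<artifactId>midscene-visualizer</artifactId>
-<version>0.1.6</version>
+ <groupId>io.github.alstafeev</groupId>
+ <artifactId>midscene-visualizer</artifactId>
+ <version>0.1.9-SNAPSHOT</version>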
```
-## Usage Example
+## Quick Start (Agent Mode)
-Here is how to use Midscene in a standard Selenium test, including report generation.
+Midscene Agent is the primary way to interact with your application. It handles planning and execution.
-### Prerequisites
+```java
+// 1. Configure
+MidsceneConfig config = MidsceneConfig.builder()
+ .provider(ModelProvider.GEMINI) // or OPENAI
+ .apiKey(System.getenv("GEMINI_API_KEY"))
+ .modelName("gemini-1.5-pro")
+ .build();
+
+// 2. Initialize (Selenium example)
+WebDriver driver = new ChromeDriver();
+SeleniumDriver pageDriver = new SeleniumDriver(driver);
+Agent agent = Agent.create(config, pageDriver);
+
+// 3. Interact
+agent.aiAction("Search for 'Headphones' and click the first result");
+agent.aiAssert("Price should be under $200");
+
+// 4. Generate Report
+Visualizer.generateReport(agent.getContext(), Paths.get("report.html"));
+```
+
+## Advanced Features
-* Java 21 (Recommended) or Java 17+
-* Maven
-* `OPENAI_API_KEY` or `GEMINI_API_KEY` environment variable set.
+### 1. Expanded API Methods
-### Code Snippet
+The `Agent` class provides specific methods for precise control:
```java
-package io.github.alstafeev.web.demo;
-
-import com.midscene.core.agent.Agent;
-import com.midscene.core.config.MidsceneConfig;
-import com.midscene.core.config.ModelProvider;
-import com.midscene.visualizer.Visualizer;
-import com.midscene.web.driver.SeleniumDriver;
-import lombok.SneakyThrows;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-import org.openqa.selenium.WebDriver;
-import org.openqa.selenium.chrome.ChromeDriver;
-import org.openqa.selenium.chrome.ChromeOptions;
-
-import java.nio.file.Paths;
-
-public class MidsceneDemoTest {
-
- private WebDriver driver;
- private Agent agent;
-
- @BeforeEach
- @SneakyThrows
- void initDriver() {
- ChromeOptions options = new ChromeOptions();
- options.addArguments("--remote-allow-origins=*");
- driver = new ChromeDriver(options);
- driver.manage().window().maximize();
- }
-
- @Test
- public void localGeminiTest() {
- driver.get("https://midscenejs.com/");
-
- MidsceneConfig config = MidsceneConfig.builder()
- .provider(ModelProvider.GEMINI)
- .apiKey(System.getenv("GEMINI_API_KEY"))
- .modelName("gemini-2.5-pro")
- .build();
-
- SeleniumDriver driverAdapter = new SeleniumDriver(driver);
- agent = Agent.create(config, driverAdapter);
-
- agent.aiAction("Search for 'MCP server' button in the left sidebar of this site and click it.");
- }
-
- @AfterEach
- void shutDownDriver() {
- if (agent != null) {
- // Generate report after test
- Visualizer.generateReport(agent.getContext(), Paths.get("midscene-report.html"));
- }
- if (driver != null) {
- driver.quit();
- }
- }
-}
+// Interactions
+agent.aiTap("Submit button");
+agent.aiInput("Username field", "admin");
+agent.aiScroll(ScrollOptions.down());
+agent.aiHover("User profile icon");
+
+// Assertions & Waits
+agent.aiAssert("The login button should be visible");
+agent.aiWaitFor("Welcome message to appear");
+
+// Data Query
+String price = agent.aiString("What is the price of the first item?");
+boolean isLoggedIn = agent.aiBoolean("Is the user logged in?");
```
-### Playwright Example
+### 2. Service Layer (Low-Level AI)
+
+Use the `Service` class for direct AI tasks without full agent planning:
```java
-package io.github.alstafeev.web.demo;
-
-import com.microsoft.playwright.Browser;
-import com.microsoft.playwright.BrowserType;
-import com.microsoft.playwright.Page;
-import com.microsoft.playwright.Playwright;
-import com.midscene.core.agent.Agent;
-import com.midscene.core.config.MidsceneConfig;
-import com.midscene.core.config.ModelProvider;
-import com.midscene.visualizer.Visualizer;
-import com.midscene.web.driver.PlaywrightDriver;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
-public class MidscenePlaywrightDemoTest {
-
- private Playwright playwright;
- private Browser browser;
- private Page page;
- private Agent agent;
-
- @BeforeEach
- void initDriver() {
- playwright = Playwright.create();
- browser = playwright.chromium().launch(new BrowserType.LaunchOptions().setHeadless(false));
- page = browser.newPage();
- }
-
- @Test
- public void localGeminiTest() {
- page.navigate("https://midscenejs.com/");
-
- MidsceneConfig config = MidsceneConfig.builder()
- .provider(ModelProvider.GEMINI)
- .apiKey(System.getenv("GEMINI_API_KEY"))
- .modelName("gemini-2.5-pro")
- .build();
-
- PlaywrightDriver driverAdapter = new PlaywrightDriver(page);
- agent = Agent.create(config, driverAdapter);
-
- agent.aiAction("Search for 'MCP server' button in the left sidebar of this site and click it.");
- }
-
- @AfterEach
- void shutDownDriver() {
- if (agent != null) {
- // Generate report after test
- Visualizer.generateReport(agent.getContext(), Paths.get("midscene-playwright-report.html"));
- }
- if (browser != null) {
- browser.close();
- }
- if (playwright != null) {
- playwright.close();
- }
- }
-}
+Service service = new Service(pageDriver, agent.getAiModel());
+
+// Locate element coordinates
+LocateResult result = service.locate("The blue checkout button");
+System.out.println("Button at: " + result.getRect());
+
+// Extract data
+ExtractResult price = service.extract("Price of the main item");
+
+// Describe element
+DescribeResult desc = service.describe(100, 200); // describe item at x=100, y=200
```
+### 3. YAML Script Support
+
+Define test flows declaratively in YAML:
+
+```yaml
+target:
+ url: "https://saucedemo.com"
+
+tasks:
+ - name: "Login Flow"
+ flow:
+ - aiAction: "Type 'standard_user' into username field"
+ - aiAction: "Type 'secret_sauce' into password field"
+ - aiAction: "Click Login"
+ - aiAssert: "User should be on the inventory page"
+ - logScreenshot: "Inventory Page"
+```
+
+Run it with Java:
+
+```java
+ScriptPlayer player = new ScriptPlayer("login_script.yaml", agent);
+ScriptResult result = player.run();
+```
+
+### 4. Caching
+
+Midscene caches planning results to speed up execution and save tokens.
+
+```java
+// Cache is enabled by default (memory + file)
+// Configure cache behavior:
+MidsceneConfig config = MidsceneConfig.builder()
+ // ...
+ .cacheId("my_test_cache") // persistent cache file
+ .build();
+```
+
+## Supported Drivers
+
+- **Selenium**: `new SeleniumDriver(webDriver)`
+- **Playwright**: `new PlaywrightDriver(page)`
+
## Configuration
-You can configure the agent using `MidsceneConfig`:
+Detailed configuration options:
```java
MidsceneConfig config = MidsceneConfig.builder()
- .provider(ModelProvider.OPENAI) // Choose OPENAI or GEMINI
- .baseUrl("https://llm-url/") // Set base URL for LLM model
- .apiKey("your-api-key") // Set API Key
- .modelName("gemini-2.5-pro") // Specific model version
- .timeoutMs(60000) // Timeout in milliseconds
+ .provider(ModelProvider.OPENAI)
+ .apiKey("sk-...")
+ .modelName("gpt-4o")
+ .baseUrl("https://api.openai.com/v1") // optional custom base URL
+ .timeoutMs(120000) // AI request timeout in milliseconds
.build();
```
+
+## Contributing
+
+Build from source:
+
+```bash
+git clone https://github.com/alstafeev/midscene-java.git
+cd midscene-java
+mvn clean install
+```
diff --git a/midscene-core/pom.xml b/midscene-core/pom.xml
index 6847952d..6302dd1a 100644
--- a/midscene-core/pom.xml
+++ b/midscene-core/pom.xml
@@ -1,5 +1,6 @@
-
+
midscene-core
@@ -15,6 +16,12 @@
<groupId>com.fasterxml.jackson.datatype</groupId>
+
+
+ <artifactId>jackson-dataformat-yaml</artifactId>
+ <groupId>com.fasterxml.jackson.dataformat</groupId>
+
+
<artifactId>okhttp</artifactId>
@@ -42,26 +49,26 @@
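- <groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-anthropic</artifactId>
+ <groupId>dev.langchain4j</groupId>
- <groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-mistral-ai</artifactId>
+ <groupId>dev.langchain4j</groupId>
- <groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-azure-open-ai</artifactId>
+ <groupId>dev.langchain4j</groupId>
- <groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-ollama</artifactId>
+ <groupId>dev.langchain4j</groupId>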
diff --git a/midscene-core/src/main/java/com/midscene/core/agent/Agent.java b/midscene-core/src/main/java/com/midscene/core/agent/Agent.java
index fb3f30d1..6777ed1e 100644
--- a/midscene-core/src/main/java/com/midscene/core/agent/Agent.java
+++ b/midscene-core/src/main/java/com/midscene/core/agent/Agent.java
@@ -1,19 +1,44 @@
package com.midscene.core.agent;
+import com.midscene.core.cache.TaskCache;
import com.midscene.core.config.MidsceneConfig;
import com.midscene.core.context.Context;
-import com.midscene.core.model.*;
+import com.midscene.core.model.AIModel;
+import com.midscene.core.model.AnthropicModel;
+import com.midscene.core.model.AzureOpenAiModel;
+import com.midscene.core.model.GeminiModel;
+import com.midscene.core.model.MistralModel;
+import com.midscene.core.model.OllamaModel;
+import com.midscene.core.model.OpenAIModel;
+import com.midscene.core.pojo.options.InputOptions;
+import com.midscene.core.pojo.options.LocateOptions;
+import com.midscene.core.pojo.options.ScrollOptions;
+import com.midscene.core.pojo.options.WaitOptions;
import com.midscene.core.service.PageDriver;
import java.util.concurrent.CompletableFuture;
import lombok.extern.log4j.Log4j2;
+/**
+ * The main entry point for AI-powered browser automation. Provides natural language methods for interacting with web
+ * pages.
+ */
@Log4j2
public class Agent {
private final Orchestrator orchestrator;
+ private final PageDriver driver;
+ private TaskCache cache;
public Agent(PageDriver driver, AIModel aiModel) {
- this.orchestrator = new Orchestrator(driver, aiModel);
+ this.driver = driver;
+ this.cache = TaskCache.disabled();
+ this.orchestrator = new Orchestrator(driver, aiModel, this.cache);
+ }
+
+ public Agent(PageDriver driver, AIModel aiModel, TaskCache cache) {
+ this.driver = driver;
+ this.cache = cache != null ? cache : TaskCache.disabled();
+ this.orchestrator = new Orchestrator(driver, aiModel, this.cache);
}
/**
@@ -37,14 +62,222 @@ public static Agent create(MidsceneConfig config, PageDriver driver) {
}
/**
- * Performs an AI-driven action on the page.
+ * Creates a new Agent instance with the given configuration, driver, and cache.
+ *
+ * @param config The configuration for the agent
+ * @param driver The page driver
+ * @param cache The task cache for storing plans
+ * @return A new Agent instance
+ */
+ public static Agent create(MidsceneConfig config, PageDriver driver, TaskCache cache) {
+ AIModel model = switch (config.getProvider()) {
+ case OPENAI -> new OpenAIModel(config.getApiKey(), config.getModelName());
+ case GEMINI -> new GeminiModel(config.getApiKey(), config.getModelName());
+ case ANTHROPIC -> new AnthropicModel(config.getApiKey(), config.getModelName(), config.getBaseUrl());
+ case MISTRAL -> new MistralModel(config.getApiKey(), config.getModelName(), config.getBaseUrl());
+ case AZURE_OPEN_AI -> new AzureOpenAiModel(config.getApiKey(), config.getBaseUrl());
+ case OLLAMA -> new OllamaModel(config.getBaseUrl(), config.getModelName());
+ };
+
+ return new Agent(driver, model, cache);
+ }
+
+ /**
+ * Performs an AI-driven action on the page using natural language.
*
* @param instruction The instruction to execute
*/
public void aiAction(String instruction) {
- orchestrator.execute(instruction);
+ this.orchestrator.execute(instruction);
+ }
+
+ /**
+ * Alias for aiAction. Performs an AI-driven action on the page.
+ *
+ * @param instruction The instruction to execute
+ */
+ public void aiAct(String instruction) {
+ aiAction(instruction);
+ }
+
+ /**
+ * Tap on an element described by natural language.
+ *
+ * @param locatePrompt Description of the element to tap
+ */
+ public void aiTap(String locatePrompt) {
+ aiTap(locatePrompt, null);
+ }
+
+ /**
+ * Tap on an element with options.
+ *
+ * @param locatePrompt Description of the element to tap
+ * @param options Locate options
+ */
+ public void aiTap(String locatePrompt, LocateOptions options) {
+ runAction("Tap", locatePrompt, options);
+ }
+
+ /**
+ * Double-click on an element described by natural language.
+ *
+ * @param locatePrompt Description of the element to double-click
+ */
+ public void aiDoubleClick(String locatePrompt) {
+ aiDoubleClick(locatePrompt, null);
+ }
+
+ /**
+ * Double-click on an element with options.
+ *
+ * @param locatePrompt Description of the element to double-click
+ * @param options Locate options
+ */
+ public void aiDoubleClick(String locatePrompt, LocateOptions options) {
+ runAction("Double Click", locatePrompt, options);
+ }
+
+ /**
+ * Right-click on an element described by natural language.
+ *
+ * @param locatePrompt Description of the element to right-click
+ */
+ public void aiRightClick(String locatePrompt) {
+ aiRightClick(locatePrompt, null);
+ }
+
+ /**
+ * Right-click on an element with options.
+ *
+ * @param locatePrompt Description of the element to right-click
+ * @param options Locate options
+ */
+ public void aiRightClick(String locatePrompt, LocateOptions options) {
+ runAction("Right Click", locatePrompt, options);
+ }
+
+ /**
+ * Hover over an element described by natural language.
+ *
+ * @param locatePrompt Description of the element to hover over
+ */
+ public void aiHover(String locatePrompt) {
+ aiHover(locatePrompt, null);
+ }
+
+ /**
+ * Hover over an element with options.
+ *
+ * @param locatePrompt Description of the element to hover over
+ * @param options Locate options
+ */
+ public void aiHover(String locatePrompt, LocateOptions options) {
+ runAction("Hover", locatePrompt, options);
+ }
+
+ /**
+ * Input text into an element described by natural language.
+ *
+ * @param locatePrompt Description of the input element
+ * @param value The value to input
+ */
+ public void aiInput(String locatePrompt, String value) {
+ aiInput(locatePrompt, InputOptions.builder().value(value).build());
+ }
+
+ /**
+ * Input text into an element with options.
+ *
+ * @param locatePrompt Description of the input element
+ * @param options Input options including value and mode
+ */
+ public void aiInput(String locatePrompt, InputOptions options) {
+ String value = options.getValue() != null ? options.getValue() : "";
+ String modeStr = "";
+ if (options.getMode() != null) {
+ switch (options.getMode()) {
+ case APPEND -> modeStr = " (append to existing text)";
+ case CLEAR -> modeStr = " (clear the field)";
+ default -> modeStr = "";
+ }
+ }
+ aiAction("Type '" + value + "' into " + locatePrompt + modeStr);
+ }
+
+ /**
+ * Press a keyboard key.
+ *
+ * @param keyName The name of the key to press (e.g., "Enter", "Escape", "Tab")
+ */
+ public void aiKeyboardPress(String keyName) {
+ aiKeyboardPress(null, keyName);
}
+ /**
+ * Press a keyboard key on a specific element.
+ *
+ * @param locatePrompt Description of the element to focus before pressing the key
+ * @param keyName The name of the key to press
+ */
+ public void aiKeyboardPress(String locatePrompt, String keyName) {
+ if (locatePrompt != null) {
+ aiAction("Press '" + keyName + "' on " + locatePrompt);
+ } else {
+ aiAction("Press '" + keyName + "'");
+ }
+ }
+
+ // ========== Scroll Actions ==========
+
+ /**
+ * Scroll on the page with the given options.
+ *
+ * @param options Scroll options including direction and type
+ */
+ public void aiScroll(ScrollOptions options) {
+ aiScroll(null, options);
+ }
+
+ /**
+  * Scroll on a specific element with options by composing a natural-language instruction for {@code aiAction}.
+  *
+  * <p>The instruction is assembled from the direction ("down" when none is set), the optional target element, the
+  * optional scroll type and the optional distance, e.g. {@code "Scroll down on the sidebar by 300 pixels"}.
+  *
+  * @param locatePrompt Description of the element to scroll (null or empty for page scroll)
+  * @param options      Scroll options; must not be null
+  * @throws NullPointerException if {@code options} is null
+  */
+ public void aiScroll(String locatePrompt, ScrollOptions options) {
+   StringBuilder instruction = new StringBuilder("Scroll ");
+
+   if (options.getDirection() != null) {
+     // Locale.ROOT keeps the lower-cased name ASCII even when the JVM default locale has special casing rules.
+     instruction.append(options.getDirection().name().toLowerCase(java.util.Locale.ROOT)).append(" ");
+   } else {
+     instruction.append("down ");
+   }
+
+   if (locatePrompt != null && !locatePrompt.isEmpty()) {
+     instruction.append("on ").append(locatePrompt).append(" ");
+   }
+
+   // Each phrase ends with a space so a following distance clause cannot fuse with it ("...the topby 100 pixels");
+   // the final trim() removes the separator when nothing follows.
+   if (options.getScrollType() != null) {
+     switch (options.getScrollType()) {
+       case SCROLL_TO_TOP -> instruction.append("until reaching the top ");
+       case SCROLL_TO_BOTTOM -> instruction.append("until reaching the bottom ");
+       case SCROLL_TO_LEFT -> instruction.append("until reaching the left edge ");
+       case SCROLL_TO_RIGHT -> instruction.append("until reaching the right edge ");
+       default -> {
+       }
+     }
+   }
+
+   if (options.getDistance() != null) {
+     instruction.append("by ").append(options.getDistance()).append(" pixels");
+   }
+
+   aiAction(instruction.toString().trim());
+ }
+
+ // ========== Query Actions ==========
+
/**
* Queries the page for information using the AI.
*
@@ -55,6 +288,102 @@ public String aiQuery(String question) {
return orchestrator.query(question);
}
+ /**
+  * Query the page and get a boolean result.
+  *
+  * <p>The prompt is suffixed with an instruction to answer only 'true' or 'false'. The answer is interpreted as
+  * {@code true} only when it contains the whole word "true" (case-insensitive) and does not also contain the whole
+  * word "false". A {@code null} answer is interpreted as {@code false}.
+  *
+  * @param prompt The question to evaluate as true/false
+  * @return true or false based on the AI's evaluation
+  */
+ public boolean aiBoolean(String prompt) {
+   String answer = aiQuery(prompt + " Answer with only 'true' or 'false'.");
+   if (answer == null) {
+     return false;
+   }
+   // Whole-word matching: a plain substring test would read "untrue" or "false, not true" as a positive answer.
+   boolean saysTrue = answer.matches("(?is).*\\btrue\\b.*");
+   boolean saysFalse = answer.matches("(?is).*\\bfalse\\b.*");
+   return saysTrue && !saysFalse;
+ }
+
+ /**
+  * Query the page and get a numeric result.
+  *
+  * <p>The prompt is suffixed with an instruction to answer only with a number. The first numeric token found in the
+  * answer is parsed (optional leading minus, comma thousands separators, decimal part and exponent are accepted).
+  * When the answer is {@code null} or contains no number, a warning is logged and {@code 0.0} is returned.
+  *
+  * @param prompt The question expecting a numeric answer
+  * @return The numeric value extracted from the AI's response, or 0.0 if none could be extracted
+  */
+ public double aiNumber(String prompt) {
+   String answer = aiQuery(prompt + " Answer with only a number.");
+   // Pick the first numeric token rather than deleting every non-numeric character: deletion fuses unrelated
+   // digits ("12 of 30" -> 1230) and fails on a sentence-ending period ("12.50.").
+   java.util.regex.Matcher number = java.util.regex.Pattern
+       .compile("-?(?:\\d[\\d,]*(?:\\.\\d+)?|\\.\\d+)(?:[eE][-+]?\\d+)?")
+       .matcher(answer == null ? "" : answer);
+   if (number.find()) {
+     // Commas are treated as thousands separators and dropped before parsing.
+     return Double.parseDouble(number.group().replace(",", ""));
+   }
+   log.warn("Failed to parse number from AI response: {}", answer);
+   return 0.0;
+ }
+
+ /**
+ * Query the page and get a string result.
+ *
+ * @param prompt The question expecting a text answer
+ * @return The text answer from the AI
+ */
+ public String aiString(String prompt) {
+ return aiQuery(prompt);
+ }
+
+ // ========== Assertion Actions ==========
+
+ /**
+ * Assert a condition on the page.
+ *
+ * @param assertion The assertion to verify
+ * @throws AssertionError if the assertion fails
+ */
+ public void aiAssert(String assertion) {
+ boolean result = aiBoolean("Is the following true? " + assertion);
+ if (!result) {
+ throw new AssertionError("AI Assertion failed: " + assertion);
+ }
+ log.info("AI Assertion passed: {}", assertion);
+ }
+
+ /**
+ * Wait for a condition to become true.
+ *
+ * @param assertion The condition to wait for
+ * @param options Wait options including timeout
+ */
+ public void aiWaitFor(String assertion, WaitOptions options) {
+ long timeoutMs = options.getTimeoutMs();
+ long checkIntervalMs = options.getCheckIntervalMs();
+ long startTime = System.currentTimeMillis();
+
+ while (System.currentTimeMillis() - startTime < timeoutMs) {
+ boolean result = aiBoolean("Is the following currently true? " + assertion);
+ if (result) {
+ log.info("Wait condition satisfied: {}", assertion);
+ return;
+ }
+
+ try {
+ Thread.sleep(checkIntervalMs);
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ throw new RuntimeException("Wait interrupted", e);
+ }
+ }
+
+ if (options.isThrowOnTimeout()) {
+ throw new RuntimeException("Wait timeout: " + assertion);
+ }
+ log.warn("Wait timed out: {}", assertion);
+ }
+
+ /**
+ * Wait for a condition with default options (30s timeout).
+ *
+ * @param assertion The condition to wait for
+ */
+ public void aiWaitFor(String assertion) {
+ aiWaitFor(assertion, WaitOptions.builder().build());
+ }
+
+ // ========== Async Methods ==========
+
/**
* Performs an AI-driven action asynchronously.
*
@@ -65,6 +394,8 @@ public CompletableFuture aiActionAsync(String instruction) {
return CompletableFuture.runAsync(() -> aiAction(instruction));
}
+ // ========== Utility Methods ==========
+
/**
* Gets the execution context.
*
@@ -73,4 +404,53 @@ public CompletableFuture aiActionAsync(String instruction) {
public Context getContext() {
return orchestrator.getContext();
}
+
+ /**
+ * Gets the underlying page driver.
+ *
+ * @return The page driver
+ */
+ public PageDriver getDriver() {
+ return driver;
+ }
+
+ /**
+ * Gets the task cache.
+ *
+ * @return The task cache
+ */
+ public TaskCache getCache() {
+ return cache;
+ }
+
+ /**
+ * Sets the task cache held by this agent; a null argument is replaced with {@code TaskCache.disabled()}.
+ *
+ * NOTE(review): this only reassigns the agent's own field (the value returned by {@code getCache()}). The
+ * orchestrator is created once in the constructor with the cache that was current at that time, and nothing here
+ * forwards the new cache to it, so planning presumably keeps using the original cache - confirm whether that is
+ * intended.
+ *
+ * @param cache The task cache to use, or null to disable caching on this field
+ */
+ public void setCache(TaskCache cache) {
+ this.cache = cache != null ? cache : TaskCache.disabled();
+ }
+
+ // ========== Private Helper Methods ==========
+
+ private void runAction(String action, String locatePrompt, LocateOptions options) {
+ String instruction = buildLocateInstruction(action, locatePrompt, options);
+ aiAction(instruction);
+ }
+
+ private String buildLocateInstruction(String action, String locatePrompt, LocateOptions options) {
+ StringBuilder instruction = new StringBuilder(action).append(" ").append(locatePrompt);
+
+ if (options != null) {
+ if (options.getSearchAreaPrompt() != null) {
+ instruction.append(" within ").append(options.getSearchAreaPrompt());
+ }
+ if (Boolean.TRUE.equals(options.getDeepThink())) {
+ instruction.append(" (use careful analysis)");
+ }
+ }
+
+ return instruction.toString();
+ }
}
diff --git a/midscene-core/src/main/java/com/midscene/core/agent/Executor.java b/midscene-core/src/main/java/com/midscene/core/agent/Executor.java
index 3380ce51..e101b825 100644
--- a/midscene-core/src/main/java/com/midscene/core/agent/Executor.java
+++ b/midscene-core/src/main/java/com/midscene/core/agent/Executor.java
@@ -5,6 +5,9 @@
import java.util.Objects;
import lombok.extern.log4j.Log4j2;
+/**
+ * Executes actions on the page driver based on the planned action items.
+ */
@Log4j2
public class Executor {
@@ -14,50 +17,182 @@ public Executor(PageDriver driver) {
this.driver = driver;
}
+ /**
+ * Executes a single action item.
+ *
+ * @param action the action to execute
+ */
public void execute(ActionsItem action) {
log.info("Executing action: {}", action.getType());
switch (action.getType()) {
- case CLICK -> {
- if (Objects.nonNull(action.getSelectorType()) && Objects.nonNull(action.getElementSelector())) {
- driver.click(action.getSelectorType(), action.getElementSelector());
- }
- if (Objects.nonNull(action.getLocate())) {
- driver.click(action.getLocate());
- }
- }
- case TYPE_TEXT -> {
- if (Objects.nonNull(action.getSelectorType()) && Objects.nonNull(action.getElementSelector())) {
- driver.type(action.getSelectorType(), action.getElementSelector(), action.getText());
- }
- if (Objects.nonNull(action.getLocate()) && Objects.nonNull(action.getText())) {
- driver.type(action.getLocate(), action.getText());
- }
- }
- case SCROLL_DOWN -> {
- if (Objects.nonNull(action.getSelectorType()) && Objects.nonNull(action.getElementSelector())) {
- driver.scrollDown(action.getSelectorType(), action.getElementSelector());
- }
- if (Objects.nonNull(action.getLocate())) {
- driver.scrollDown(action.getLocate());
- }
+ // ========== Click/Tap Actions ==========
+ case CLICK, TAP -> executeClick(action);
+ case DOUBLE_CLICK -> executeDoubleClick(action);
+ case RIGHT_CLICK -> executeRightClick(action);
+ case LONG_PRESS -> executeLongPress(action);
+
+ // ========== Text Input Actions ==========
+ case TYPE_TEXT, INPUT -> executeTypeText(action);
+ case CLEAR_INPUT -> executeClearInput(action);
+ case KEYBOARD_PRESS -> executeKeyboardPress(action);
+
+ // ========== Scroll Actions ==========
+ case SCROLL_DOWN -> executeScrollDown(action);
+ case SCROLL_UP -> executeScrollUp(action);
+ case SCROLL -> executeScroll(action);
+
+ // ========== Gesture Actions ==========
+ case HOVER -> executeHover(action);
+ case SWIPE -> executeSwipe(action);
+ case DRAG_AND_DROP -> executeDragAndDrop(action);
+
+ // ========== Navigation Actions ==========
+ case NAVIGATE -> driver.navigate(action.getUrl());
+ case RELOAD -> driver.reload();
+ case GO_BACK -> driver.goBack();
+
+ // ========== Utility Actions ==========
+ case SLEEP -> executeSleep(action);
+ case ASSERT, WAIT_FOR -> log.info("Assert/WaitFor action: {}", action.getAssertion());
+ }
+ }
+
+ private void executeClick(ActionsItem action) {
+ if (hasSelector(action)) {
+ driver.click(action.getSelectorType(), action.getElementSelector());
+ } else if (hasLocate(action)) {
+ driver.click(action.getLocate());
+ }
+ }
+
+ private void executeDoubleClick(ActionsItem action) {
+ if (hasSelector(action)) {
+ driver.doubleClick(action.getSelectorType(), action.getElementSelector());
+ } else if (hasLocate(action)) {
+ driver.doubleClick(action.getLocate());
+ }
+ }
+
+ private void executeRightClick(ActionsItem action) {
+ if (hasSelector(action)) {
+ driver.rightClick(action.getSelectorType(), action.getElementSelector());
+ } else if (hasLocate(action)) {
+ driver.rightClick(action.getLocate());
+ }
+ }
+
+ private void executeLongPress(ActionsItem action) {
+ if (hasLocate(action)) {
+ long duration = Objects.nonNull(action.getDurationMs()) ? action.getDurationMs() : 1000L;
+ driver.longPress(action.getLocate(), duration);
+ }
+ }
+
+ private void executeTypeText(ActionsItem action) {
+ String textToType = Objects.nonNull(action.getValue()) ? action.getValue() : action.getText();
+ if (hasSelector(action)) {
+ driver.type(action.getSelectorType(), action.getElementSelector(), textToType);
+ } else if (hasLocate(action) && Objects.nonNull(textToType)) {
+ driver.type(action.getLocate(), textToType);
+ }
+ }
+
+ private void executeClearInput(ActionsItem action) {
+ if (hasSelector(action)) {
+ driver.clearInput(action.getSelectorType(), action.getElementSelector());
+ } else if (hasLocate(action)) {
+ driver.clearInput(action.getLocate());
+ }
+ }
+
+ private void executeKeyboardPress(ActionsItem action) {
+ if (Objects.nonNull(action.getKeyName())) {
+ if (hasLocate(action)) {
+ driver.keyboardPress(action.getLocate(), action.getKeyName());
+ } else {
+ driver.keyboardPress(action.getKeyName());
}
- case SCROLL_UP -> {
- if (Objects.nonNull(action.getSelectorType()) && Objects.nonNull(action.getElementSelector())) {
- driver.scrollUp(action.getSelectorType(), action.getElementSelector());
- }
- if (Objects.nonNull(action.getLocate())) {
- driver.scrollUp(action.getLocate());
+ }
+ }
+
+ private void executeScrollDown(ActionsItem action) {
+ if (hasSelector(action)) {
+ driver.scrollDown(action.getSelectorType(), action.getElementSelector());
+ } else if (hasLocate(action)) {
+ driver.scrollDown(action.getLocate());
+ }
+ }
+
+ private void executeScrollUp(ActionsItem action) {
+ if (hasSelector(action)) {
+ driver.scrollUp(action.getSelectorType(), action.getElementSelector());
+ } else if (hasLocate(action)) {
+ driver.scrollUp(action.getLocate());
+ }
+ }
+
+ /**
+  * Executes a generic scroll action, dispatching on the action's direction ("down" when none is given).
+  *
+  * <p>Up and down delegate to the dedicated scroll handlers. Left and right require a locate target and scroll by
+  * the action's distance (200 when unset). Requests that cannot be carried out are logged instead of being
+  * dropped silently.
+  *
+  * @param action the scroll action to execute
+  */
+ private void executeScroll(ActionsItem action) {
+   String direction = action.getDirection();
+   if (Objects.isNull(direction)) {
+     direction = "down";
+   }
+
+   // Locale.ROOT: the default-locale lower-casing can change letters (e.g. Turkish dotless i) and miss the labels.
+   switch (direction.toLowerCase(java.util.Locale.ROOT)) {
+     case "up" -> executeScrollUp(action);
+     case "down" -> executeScrollDown(action);
+     case "left" -> {
+       int distance = Objects.nonNull(action.getDistance()) ? action.getDistance() : 200;
+       if (hasLocate(action)) {
+         driver.scrollLeft(action.getLocate(), distance);
+       } else {
+         log.warn("Skipping scroll left: action has no locate target");
+       }
+     }
+     case "right" -> {
+       int distance = Objects.nonNull(action.getDistance()) ? action.getDistance() : 200;
+       if (hasLocate(action)) {
+         driver.scrollRight(action.getLocate(), distance);
+       } else {
+         log.warn("Skipping scroll right: action has no locate target");
+       }
+     }
+     default -> log.warn("Unsupported scroll direction: {}", direction);
+   }
+ }
+
+ private void executeHover(ActionsItem action) {
+ if (hasSelector(action)) {
+ driver.hover(action.getSelectorType(), action.getElementSelector());
+ } else if (hasLocate(action)) {
+ driver.hover(action.getLocate());
+ }
+ }
+
+ private void executeSwipe(ActionsItem action) {
+ if (Objects.nonNull(action.getFrom()) && Objects.nonNull(action.getTo())) {
+ long duration = Objects.nonNull(action.getDurationMs()) ? action.getDurationMs() : 500L;
+ driver.swipe(action.getFrom(), action.getTo(), duration);
+ }
+ }
+
+ private void executeDragAndDrop(ActionsItem action) {
+ if (Objects.nonNull(action.getFrom()) && Objects.nonNull(action.getTo())) {
+ driver.dragAndDrop(action.getFrom(), action.getTo());
+ }
+ }
+
+ private void executeSleep(ActionsItem action) {
+ int sleepMs = Objects.nonNull(action.getSleepMs()) ? action.getSleepMs() : 1000;
+ try {
+ log.debug("Sleeping for {} ms", sleepMs);
+ Thread.sleep(sleepMs);
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ log.warn("Sleep interrupted", e);
+ }
+ }
+
+ private boolean hasSelector(ActionsItem action) {
+ return Objects.nonNull(action.getSelectorType()) && Objects.nonNull(action.getElementSelector());
+ }
+
+ private boolean hasLocate(ActionsItem action) {
+ return Objects.nonNull(action.getLocate());
+ }
}
diff --git a/midscene-core/src/main/java/com/midscene/core/agent/Orchestrator.java b/midscene-core/src/main/java/com/midscene/core/agent/Orchestrator.java
index 32b83187..44cc7736 100644
--- a/midscene-core/src/main/java/com/midscene/core/agent/Orchestrator.java
+++ b/midscene-core/src/main/java/com/midscene/core/agent/Orchestrator.java
@@ -1,5 +1,6 @@
package com.midscene.core.agent;
+import com.midscene.core.cache.TaskCache;
import com.midscene.core.context.Context;
import com.midscene.core.model.AIModel;
import com.midscene.core.pojo.planning.ActionsItem;
@@ -23,7 +24,11 @@ public class Orchestrator {
private final Context context;
public Orchestrator(PageDriver driver, AIModel aiModel) {
- this(driver, new Planner(aiModel), new Executor(driver));
+ this(driver, new Planner(aiModel, TaskCache.disabled()), new Executor(driver));
+ }
+
+ public Orchestrator(PageDriver driver, AIModel aiModel, TaskCache cache) {
+ this(driver, new Planner(aiModel, cache), new Executor(driver));
}
/**
@@ -72,6 +77,7 @@ public void execute(String instruction) {
List history = new ArrayList<>();
int maxRetries = 3;
boolean finished = false;
+ boolean cacheInvalidated = false;
for (int i = 0; i < maxRetries && !finished; i++) {
try {
@@ -97,6 +103,17 @@ public void execute(String instruction) {
} catch (Exception e) {
log.error("Failed to execute plan (Attempt {}) {}", i + 1, e.getMessage());
context.logError("Attempt " + (i + 1) + " failed: " + e.getMessage());
+
+ // On first failure, invalidate cache and clear history to force fresh AI call
+ if (!cacheInvalidated && i == 0) {
+ boolean wasInvalidated = planner.invalidateCache(instruction);
+ if (wasInvalidated) {
+ log.info("Invalidated stale cache for instruction: {}", instruction);
+ history.clear(); // Clear history to get fresh plan from AI
+ cacheInvalidated = true;
+ }
+ }
+
history.add(UserMessage.from("Error executing plan: " + e.getMessage()));
}
}
diff --git a/midscene-core/src/main/java/com/midscene/core/agent/Planner.java b/midscene-core/src/main/java/com/midscene/core/agent/Planner.java
index e89ab187..2d04f965 100644
--- a/midscene-core/src/main/java/com/midscene/core/agent/Planner.java
+++ b/midscene-core/src/main/java/com/midscene/core/agent/Planner.java
@@ -1,6 +1,7 @@
package com.midscene.core.agent;
import com.midscene.core.agent.promt.PromptManager;
+import com.midscene.core.cache.TaskCache;
import com.midscene.core.model.AIModel;
import com.midscene.core.pojo.planning.PlanningResponse;
import com.midscene.core.utils.ObjectMapper;
@@ -18,14 +19,29 @@
public class Planner {
private final AIModel aiModel;
+ private final TaskCache cache;
+  /**
+   * Creates a planner without caching by delegating with {@link TaskCache#disabled()}.
+   *
+   * @param aiModel the model stored for later planning calls
+   */
   public Planner(AIModel aiModel) {
+    this(aiModel, TaskCache.disabled());
+  }
+
+  /**
+   * Creates a planner that consults the given cache.
+   *
+   * @param aiModel the model stored for later planning calls
+   * @param cache   the plan cache; {@code null} is replaced by {@link TaskCache#disabled()}
+   */
+  public Planner(AIModel aiModel, TaskCache cache) {
   this.aiModel = aiModel;
+    // Normalise null so the rest of the class can call the cache unconditionally.
+    this.cache = cache != null ? cache : TaskCache.disabled();
   }
public PlanningResponse plan(String instruction, String screenshotBase64, String pageSource,
List history) {
+ // Check cache for first attempts only (empty history means fresh attempt)
+ if (history.isEmpty()) {
+ PlanningResponse cached = cache.get(instruction);
+ if (cached != null) {
+ log.info("Cache hit for instruction: {}", instruction);
+ return cached;
+ }
+ }
+
UserMessage message;
if (history.isEmpty()) {
String promptText = PromptManager.constructPlanningPrompt(instruction);
@@ -53,6 +69,13 @@ public PlanningResponse plan(String instruction, String screenshotBase64, String
PlanningResponse planningResponse = ObjectMapper.mapResponseToClass(responseJson,
PlanningResponse.class);
planningResponse.setDescription(chatResponse.metadata().tokenUsage().toString());
+
+ // Store in cache for first successful attempts
+ if (history.size() == 2) { // First attempt: 1 user message + 1 AI response
+ cache.put(instruction, planningResponse);
+ log.debug("Cached planning response for instruction: {}", instruction);
+ }
+
return planningResponse;
} catch (Exception e) {
log.error("Failed to parse plan {}", e.getMessage());
@@ -73,4 +96,15 @@ public String query(String question, String screenshotBase64) {
log.debug("AI Query Response: {}", response);
return response;
}
+
+  /**
+   * Drops the cached plan stored for an instruction, if there is one.
+   * Intended for use after a cached plan fails during execution.
+   *
+   * @param instruction the instruction whose cached plan should be dropped
+   * @return {@code true} if a cache entry was removed
+   */
+  public boolean invalidateCache(String instruction) {
+    boolean removed = cache.invalidate(instruction);
+    return removed;
+  }
}
diff --git a/midscene-core/src/main/java/com/midscene/core/agent/promt/PromptManager.java b/midscene-core/src/main/java/com/midscene/core/agent/promt/PromptManager.java
index 4a04a749..7840e97b 100644
--- a/midscene-core/src/main/java/com/midscene/core/agent/promt/PromptManager.java
+++ b/midscene-core/src/main/java/com/midscene/core/agent/promt/PromptManager.java
@@ -1,115 +1,225 @@
package com.midscene.core.agent.promt;
+/**
+ * Manages AI prompts for planning, extraction, and assertion operations. Based on the TypeScript version's prompt
+ * structure.
+ */
public class PromptManager {
- private static final String BASE_PROMPT = """
- ## Role:
- You are an expert in software testing
- You are an AI agent controlling a web browser. You have page screenshot and page source attached to this message.
- You are an expert in software page image (2D) and page element text analysis.
-
- ## Objective:
- - Always try to find XPATH or CSS selector or element with which you need to interact. If this is not possible, specify the coordinates for interacting with the page.
- - You will need to create a plan with list of actions to complete user instructions in web browser.
- - Instructions will be executed by webdriver - you need to prepare clear and understandable instructions that can only be interpreted in one way.
- - Identify elements in screenshots and text that match the user's description.
- - Return JSON data containing the element selector.
- - Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
-
- ## Skills:
- - Image analysis and recognition
+ private static final String PLANNING_PROMPT = """
+ ## Role
+ You are an expert AI agent controlling a web browser. You analyze screenshots and page source to plan and execute actions.
+
+ ## Objective
+ Plan the NEXT SINGLE ACTION to accomplish the user's instruction. Consider the current page state shown in the screenshot.
+
+ ## Skills
+ - Image analysis and element recognition
- Multilingual text understanding
- - Software UI design and testing
-
- ## Workflow:
- 1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters, indicating that the application may be non-English.
- 2. Based on the user's description, locate the target element in the list of element descriptions and the screenshot.
- 3. Found the required number of elements
- 4. Return JSON data containing the element selector.
- 5. Judge whether the user's description is order-sensitive (see below for definition and examples).
-
- ## Constraints:
- - Accurately identify element information based on the user's description and return the corresponding element from the element description information, not extracted from the image.
- - If no elements are found, the "actions" array should be empty.
- - The returned data must conform to the specified JSON format.
-
- ## Order-Sensitive Definition:
- - If the description contains phrases like "the third item in the list", "the last button", "the first input box", "the second row", etc., it is order-sensitive (isOrderSensitive = true).
- - If the description is like "confirm button", "search box", "password input", etc., it is not order-sensitive (isOrderSensitive = false).
-
- Order-sensitive means the description contains phrases like:
- - "the third item in the list"
- - "the last button"
- - "the first input box"
- - "the second row"
-
- Not order-sensitive means the description is like:
- - "confirm button"
- - "search box"
- - "password input"
-
- ## Actions:
-
- Return a JSON object with a list of actions.
- Available actions: CLICK, TYPE_TEXT, SCROLL_DOWN, SCROLL_UP, HOVER - should be one of those strictly.
- - If no element is found, the "actions" array should be empty.
- - Action format:
- elementSelector - CSS or XPATH selector.
- selectorType - Should be `BY_XPATH` or `BY_CSS` strictly.
- CLICK(locate(x, y), elementSelector, selectorType),
- TYPE_TEXT(locate(x, y), elementSelector, selectorType, text),
- SCROLL_DOWN(locate(x, y), elementSelector, selectorType),
- SCROLL_UP(locate(x, y), elementSelector, selectorType),
- HOVER(locate(x, y), elementSelector, selectorType).
-
- ## Output Format:
-
- OUTPUT JSON ONLY WITHOUT ANY STEPS, THOUGHTS OR DATA
- RETURN RESULT IN JSON ONLY AS FOLLOWING FORMAT STRICTLY:
-
- {"actions":[{"type":"CLICK","locate":{"x":110,"y":220},"elementSelector":"//button","selectorType":"BY_XPATH"},{"type":"TYPE_TEXT","locate":{"x":110,"y":220},"elementSelector":"//button","selectorType":"BY_XPATH","text":"text to type"},{"type":"SCROLL_DOWN","locate":{"x":110,"y":220},"elementSelector":"//button","selectorType":"BY_XPATH"},{"type":"SCROLL_UP","locate":{"x":110,"y":220},"elementSelector":"//button","selectorType":"BY_XPATH"},{"type":"HOVER","locate":{"x":110,"y":220},"elementSelector":"//button","selectorType":"BY_XPATH"}]}
-
- ## User instruction:
-
- User instruction: %s
+ - Software UI design and testing expertise
+
+ ## Available Actions
+ %s
+
+ ## Workflow
+ 1. Analyze the current screenshot and page state
+ 2. Determine if the instruction has been completed
+ 3. If not complete, plan the single next action
+ 4. Return structured JSON response
+
+ ## Selector Guidelines
+ - Prefer XPATH or CSS selectors when elements are identifiable
+ - Use coordinates (x, y) as fallback when selectors are not reliable
+ - selectorType must be `BY_XPATH` or `BY_CSS`
+
+ ## Output Format
+ Return a JSON object with the following structure:
+
+ ```json
+ {
+ "log": "Brief description of what you're about to do",
+ "moreActionsNeededByInstruction": true|false,
+ "actions": [
+ {
+ "type": "ACTION_TYPE",
+ "locate": {"x": 100, "y": 200},
+ "elementSelector": "//xpath/or/css",
+ "selectorType": "BY_XPATH",
+ "text": "optional text for TYPE_TEXT",
+ "keyName": "optional key for KEYBOARD_PRESS",
+ "direction": "optional for SCROLL",
+ "url": "optional for NAVIGATE"
+ }
+ ],
+ "sleep": 0,
+ "error": null
+ }
+ ```
+
+ ### Field Descriptions
+ - **log**: A brief preamble explaining what you're about to do (use same language as instruction)
+ - **moreActionsNeededByInstruction**: true if more actions needed after this one, false if instruction will be complete
+ - **actions**: Array of action objects (usually just one). Empty if task is already complete.
+ - **sleep**: Optional milliseconds to wait after action (default 0)
+ - **error**: Set this if you cannot proceed (explain why)
+
+ ## User Instruction
+
+ %s
+
+ OUTPUT JSON ONLY. NO EXPLANATIONS OR MARKDOWN OUTSIDE THE JSON.
+ """;
+ private static final String EXTRACTION_PROMPT = """
+ ## Role
+ You are a professional in software UI design and testing.
+
+ ## Task
+ Extract data satisfying the DATA_DEMAND from the screenshot.
+
+ ## DATA_DEMAND
+ %s
+
+ ## Output Format
+ Return a JSON object:
+ ```json
+ {
+ "thought": "Brief explanation of your analysis",
+ "data": <extracted data matching DATA_DEMAND>,
+ "errors": []
+ }
+ ```
+
+ ## Rules
+ - The data field should match the type expected by the demand
+ - If data cannot be found, return null for data and explain in errors
+ - Be precise and accurate
+
+ OUTPUT JSON ONLY.
+ """;
+ private static final String ASSERTION_PROMPT = """
+ ## Role
+ You are evaluating whether a condition is true based on a screenshot.
+
+ ## Assertion to Verify
+ %s
+
+ ## Output Format
+ Return a JSON object:
+ ```json
+ {
+ "pass": true|false,
+ "thought": "Brief explanation of why the assertion passed or failed"
+ }
+ ```
+
+ Be precise and evaluate ONLY based on what is visible in the screenshot.
+
+ OUTPUT JSON ONLY.
""";
+  /**
+   * Builds the planning prompt by filling the planning template with the action-space
+   * description followed by the user instruction.
+   *
+   * @param instruction the user instruction to execute
+   * @return the formatted planning prompt
+   */
+  public static String constructPlanningPrompt(String instruction) {
+    String actionSpace = getActionSpaceDescription();
+    return PLANNING_PROMPT.formatted(actionSpace, instruction);
+  }
+
+  /**
+   * Builds the follow-up prompt sent after a failed attempt, restating the instruction.
+   *
+   * @param instruction the original user instruction
+   * @return the formatted retry prompt
+   */
+  public static String constructRetryPrompt(String instruction) {
+    String template = """
+        Previous attempt failed. Analyze the new screenshot and page source carefully.
+        Consider what may have gone wrong and try an alternative approach.
+
+        User instruction: %s
+
+        Use the same output format as before.""";
+    return template.formatted(instruction);
+  }
+
+  /**
+   * Constructs a query prompt for answering questions about the page.
+   *
+   * <p>The question is substituted into a fixed template that tells the model to answer briefly
+   * from the screenshot. This method only produces text; it does not embed the screenshot.
+   *
+   * @param question the question to answer
+   * @return the formatted query prompt
+   */
   public static String constructQueryPrompt(String question) {
-    return String.format(
-        "You are an AI agent. User question: %s. " +
-            "Answer the question briefly based on the screenshot provided.",
-        question);
+    return String.format("""
+        You are an AI assistant analyzing a web page screenshot.
+
+        User question: %s
+
+        Answer the question briefly and accurately based on the screenshot provided.
+        Focus only on what is visible in the current screenshot.""", question);
   }
-// public static String constructPlanningPrompt(String instruction) {
-// return String.format(
-// "You are an AI agent controlling a web browser. You have page screenshot and page source attached to this message. "
-// +
-// "You will need to create a plan with list of actions to complete user instructions in web browser." +
-// "Instructions will be executed by webdriver - you need to prepare clear and understandable instructions that can only be interpreted in one way. "
-// +
-// "Always try to find XPATH or CSS selector or element with which you need to interact. If this is not possible, specify the coordinates for interacting with the page. "
-// +
-// "User instruction: %s. " +
-// "Return a JSON object with a list of actions. " +
-// "Available actions: CLICK, TYPE_TEXT, SCROLL_DOWN, SCROLL_UP, HOVER - should be one of those strictly. " +
-// "elementSelector - CSS or XPATH selector. " +
-// "selectorType - Should be `BY_XPATH` or `BY_CSS` strictly. " +
-// "CLICK(locate(x, y), elementSelector, selectorType), " +
-// "TYPE_TEXT(locate(x, y), elementSelector, selectorType, text), " +
-// "SCROLL_DOWN(locate(x, y), elementSelector, selectorType), " +
-// "SCROLL_UP(locate(x, y), elementSelector, selectorType), " +
-// "HOVER(locate(x, y), elementSelector, selectorType). " +
-// "Example Format: {\"actions\":[{\"type\":\"CLICK\",\"locate\":{\"x\":110,\"y\":220},\"elementSelector\":\"//button\",\"selectorType\":\"BY_XPATH\"},{\"type\":\"TYPE_TEXT\",\"locate\":{\"x\":110,\"y\":220},\"elementSelector\":\"//button\",\"selectorType\":\"BY_XPATH\",\"text\":\"text to type\"},{\"type\":\"SCROLL_DOWN\",\"locate\":{\"x\":110,\"y\":220},\"elementSelector\":\"//button\",\"selectorType\":\"BY_XPATH\"},{\"type\":\"SCROLL_UP\",\"locate\":{\"x\":110,\"y\":220},\"elementSelector\":\"//button\",\"selectorType\":\"BY_XPATH\"},{\"type\":\"HOVER\",\"locate\":{\"x\":110,\"y\":220},\"elementSelector\":\"//button\",\"selectorType\":\"BY_XPATH\"}]}",
-// instruction);
-// }
+  /**
+   * Builds the structured-extraction prompt by inserting the data demand into the template.
+   *
+   * @param dataDemand the description of data to extract
+   * @return the formatted extraction prompt
+   */
+  public static String constructExtractionPrompt(String dataDemand) {
+    String prompt = EXTRACTION_PROMPT.formatted(dataDemand);
+    return prompt;
+  }
- public static String constructPlanningPrompt(String instruction) {
- return String.format(BASE_PROMPT, instruction);
+  /**
+   * Builds the assertion-check prompt by inserting the assertion into the template.
+   *
+   * @param assertion the assertion to verify
+   * @return the formatted assertion prompt
+   */
+  public static String constructAssertionPrompt(String assertion) {
+    String prompt = ASSERTION_PROMPT.formatted(assertion);
+    return prompt;
   }
- public static String constructRetryPrompt(String instruction) {
- return String.format("Previous attempt failed. Please try again with this new screenshot and new page source. " +
- "User instruction: %s. ", instruction);
+  /**
+   * Returns the Markdown catalogue of supported actions that is inserted into the planning
+   * prompt. Actions are grouped under headings, one bullet per action with its parameters.
+   *
+   * @return formatted action space description, ending with a newline
+   */
+  private static String getActionSpaceDescription() {
+    return """
+        ### Mouse Actions
+        - **CLICK**: Click on an element. Params: locate{x,y}, elementSelector, selectorType
+        - **TAP**: Tap on an element (alias for click). Params: locate{x,y}
+        - **DOUBLE_CLICK**: Double-click on an element. Params: locate{x,y}, elementSelector, selectorType
+        - **RIGHT_CLICK**: Right-click on an element. Params: locate{x,y}, elementSelector, selectorType
+        - **HOVER**: Hover over an element. Params: locate{x,y}, elementSelector, selectorType
+        - **LONG_PRESS**: Long press on an element. Params: locate{x,y}, durationMs
+
+        ### Input Actions
+        - **TYPE_TEXT**: Type text into an element. Params: locate{x,y}, elementSelector, selectorType, text
+        - **INPUT**: Input text with mode. Params: locate{x,y}, value, inputMode(replace|append|clear)
+        - **CLEAR_INPUT**: Clear an input field. Params: locate{x,y}, elementSelector, selectorType
+        - **KEYBOARD_PRESS**: Press a keyboard key. Params: keyName (e.g., Enter, Tab, Escape)
+
+        ### Scroll Actions
+        - **SCROLL_DOWN**: Scroll down. Params: locate{x,y}
+        - **SCROLL_UP**: Scroll up. Params: locate{x,y}
+        - **SCROLL**: Scroll in a direction. Params: locate{x,y}, direction(up|down|left|right), distance
+
+        ### Gesture Actions
+        - **SWIPE**: Swipe gesture. Params: from{x,y}, to{x,y}, durationMs
+        - **DRAG_AND_DROP**: Drag and drop. Params: from{x,y}, to{x,y}
+
+        ### Navigation Actions
+        - **NAVIGATE**: Navigate to URL. Params: url
+        - **RELOAD**: Reload the page. No params required.
+        - **GO_BACK**: Go back in history. No params required.
+
+        ### Utility Actions
+        - **SLEEP**: Wait for specified time. Params: sleepMs
+        - **ASSERT**: Assert a condition. Params: assertion
+        - **WAIT_FOR**: Wait for a condition. Params: assertion, timeoutMs
+        """;
   }
}
diff --git a/midscene-core/src/main/java/com/midscene/core/cache/TaskCache.java b/midscene-core/src/main/java/com/midscene/core/cache/TaskCache.java
new file mode 100644
index 00000000..e3bafa1c
--- /dev/null
+++ b/midscene-core/src/main/java/com/midscene/core/cache/TaskCache.java
@@ -0,0 +1,300 @@
+package com.midscene.core.cache;
+
+import com.fasterxml.jackson.core.type.TypeReference;
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.MapperFeature;
+import com.fasterxml.jackson.databind.json.JsonMapper;
+import com.midscene.core.pojo.planning.PlanningResponse;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import lombok.extern.log4j.Log4j2;
+
+/**
+ * Cache system for storing and retrieving AI planning responses. Supports both in-memory and file-based caching modes.
+ */
+@Log4j2
+public class TaskCache {
+
+ private static final com.fasterxml.jackson.databind.ObjectMapper MAPPER = JsonMapper.builder()
+ .configure(MapperFeature.ACCEPT_CASE_INSENSITIVE_PROPERTIES, true)
+ .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
+ .build();
+  /** In-memory entries: hex key produced by {@code generateCacheKey} mapped to the cached plan. */
+  private final Map<String, PlanningResponse> memoryCache = new ConcurrentHashMap<>();
+ private CacheMode mode;
+ private Path cacheFilePath;
+  /**
+   * Builds a cache in the given mode, optionally backed by a file.
+   * Existing entries are loaded immediately when a path is given and the mode is not DISABLED.
+   *
+   * @param mode          the cache mode
+   * @param cacheFilePath optional path to persist cache to file (can be null for memory-only)
+   */
+  public TaskCache(CacheMode mode, Path cacheFilePath) {
+    this.mode = mode;
+    this.cacheFilePath = cacheFilePath;
+
+    boolean fileBacked = cacheFilePath != null;
+    boolean enabled = mode != CacheMode.DISABLED;
+    if (fileBacked && enabled) {
+      loadFromFile();
+    }
+  }
+
+  /**
+   * Creates a cache in {@code READ_WRITE} mode with no backing file.
+   * Delegates to the two-argument constructor with a {@code null} path.
+   */
+  public TaskCache() {
+    this(CacheMode.READ_WRITE, null);
+  }
+
+  /**
+   * Creates a {@code READ_WRITE} cache persisted at the given file path.
+   *
+   * @param cacheFilePath path to the cache file
+   * @return a new TaskCache instance
+   */
+  public static TaskCache withFile(Path cacheFilePath) {
+    return withFile(cacheFilePath, CacheMode.READ_WRITE);
+  }
+
+  /**
+   * Creates a cache persisted at the given file path and operating in the given mode.
+   *
+   * @param cacheFilePath path to the cache file
+   * @param mode          the cache mode
+   * @return a new TaskCache instance
+   */
+  public static TaskCache withFile(Path cacheFilePath, CacheMode mode) {
+    TaskCache fileBacked = new TaskCache(mode, cacheFilePath);
+    return fileBacked;
+  }
+
+  /**
+   * Creates a {@code READ_WRITE} cache that has no backing file.
+   *
+   * @return a new TaskCache instance
+   */
+  public static TaskCache memoryOnly() {
+    return new TaskCache();
+  }
+
+  /**
+   * Creates a cache in {@code DISABLED} mode with no backing file.
+   *
+   * @return a new TaskCache instance that never caches
+   */
+  public static TaskCache disabled() {
+    Path noFile = null;
+    return new TaskCache(CacheMode.DISABLED, noFile);
+  }
+
+  /**
+   * Looks up the planning response cached for a prompt.
+   *
+   * @param prompt the prompt to look up
+   * @return the cached response, or null when nothing is stored or the mode forbids reads
+   *     ({@code WRITE_ONLY} or {@code DISABLED})
+   */
+  public PlanningResponse get(String prompt) {
+    boolean readable = mode != CacheMode.WRITE_ONLY && mode != CacheMode.DISABLED;
+    if (!readable) {
+      return null;
+    }
+
+    String cacheKey = generateCacheKey(prompt);
+    PlanningResponse hit = memoryCache.get(cacheKey);
+    if (hit == null) {
+      return null;
+    }
+
+    log.debug("Cache hit for prompt key: {}", cacheKey.substring(0, 8));
+    return hit;
+  }
+
+  /**
+   * Caches a planning response under the key derived from its prompt and, when a cache file is
+   * configured, saves the cache to that file. No-op in {@code READ_ONLY} or {@code DISABLED} mode.
+   *
+   * @param prompt   the prompt used to generate the response
+   * @param response the response to cache
+   */
+  public void put(String prompt, PlanningResponse response) {
+    boolean writable = mode != CacheMode.READ_ONLY && mode != CacheMode.DISABLED;
+    if (!writable) {
+      return;
+    }
+
+    String cacheKey = generateCacheKey(prompt);
+    memoryCache.put(cacheKey, response);
+    log.debug("Cached response for prompt key: {}", cacheKey.substring(0, 8));
+
+    boolean fileBacked = cacheFilePath != null;
+    if (fileBacked) {
+      saveToFile();
+    }
+  }
+
+  /**
+   * Reports whether a readable entry exists for the prompt.
+   *
+   * @param prompt the prompt to check
+   * @return true if cached and the mode allows reads, false otherwise
+   */
+  public boolean contains(String prompt) {
+    boolean readable = mode != CacheMode.WRITE_ONLY && mode != CacheMode.DISABLED;
+    // Short-circuit keeps the key from being computed when reads are not allowed.
+    return readable && memoryCache.containsKey(generateCacheKey(prompt));
+  }
+
+  /**
+   * Removes every in-memory entry and, when a cache file is configured, saves the now-empty
+   * cache to it.
+   */
+  public void clear() {
+    memoryCache.clear();
+    log.info("Cache cleared");
+
+    boolean fileBacked = cacheFilePath != null;
+    if (!fileBacked) {
+      return;
+    }
+    saveToFile();
+  }
+
+  /**
+   * Removes the entry cached for one prompt, e.g. after a cached plan turned out to be stale.
+   * When an entry is removed and a cache file is configured, the cache is saved to that file.
+   *
+   * @param prompt the prompt to invalidate
+   * @return true if the entry was removed, false if it wasn't cached or the mode is DISABLED
+   */
+  public boolean invalidate(String prompt) {
+    if (mode == CacheMode.DISABLED) {
+      return false;
+    }
+
+    String cacheKey = generateCacheKey(prompt);
+    PlanningResponse evicted = memoryCache.remove(cacheKey);
+    if (evicted == null) {
+      return false;
+    }
+
+    log.info("Invalidated cache entry for prompt key: {}", cacheKey.substring(0, 8));
+    if (cacheFilePath != null) {
+      saveToFile();
+    }
+    return true;
+  }
+
+  /**
+   * Returns the number of entries currently held in the in-memory map.
+   * The cache mode is not consulted.
+   *
+   * @return cache size
+   */
+  public int size() {
+    return memoryCache.size();
+  }
+
+  /**
+   * Gets the current cache mode, as set by the constructor or by {@code setMode}.
+   *
+   * @return the cache mode
+   */
+  public CacheMode getMode() {
+    return mode;
+  }
+
+  /**
+   * Sets the cache mode.
+   * Only the mode field changes; this method does not load from or save to the cache file.
+   *
+   * @param mode the new cache mode
+   */
+  public void setMode(CacheMode mode) {
+    this.mode = mode;
+  }
+
+  /**
+   * Generates a cache key from a prompt: the lowercase hex SHA-256 digest of its UTF-8 bytes.
+   *
+   * <p>The charset is fixed to UTF-8 so the same prompt always yields the same key, whatever
+   * the platform default charset is.
+   *
+   * <p>If SHA-256 is unavailable, the key falls back to the prompt's {@code hashCode()} rendered
+   * as exactly eight hex digits, so callers that log {@code key.substring(0, 8)} stay in bounds.
+   *
+   * @param prompt the prompt to hash
+   * @return the hash key, always at least eight characters long
+   */
+  private String generateCacheKey(String prompt) {
+    try {
+      MessageDigest digest = MessageDigest.getInstance("SHA-256");
+      byte[] hash = digest.digest(prompt.getBytes(java.nio.charset.StandardCharsets.UTF_8));
+      StringBuilder hexString = new StringBuilder(hash.length * 2);
+      for (byte b : hash) {
+        // Two hex digits per byte, high nibble first.
+        hexString.append(Character.forDigit((b >> 4) & 0xf, 16));
+        hexString.append(Character.forDigit(b & 0xf, 16));
+      }
+      return hexString.toString();
+    } catch (NoSuchAlgorithmException e) {
+      // Fallback to simple hash, zero-padded to a fixed width of eight hex digits.
+      log.warn("SHA-256 not available, using hashCode fallback");
+      return String.format("%08x", prompt.hashCode());
+    }
+  }
+
+ /**
+ * Loads cache entries from file.
+ */
+ private void loadFromFile() {
+ if (cacheFilePath == null || !Files.exists(cacheFilePath)) {
+ return;
+ }
+
+ try {
+ String json = Files.readString(cacheFilePath);
+ Map loaded = MAPPER.readValue(
+ json, new TypeReference