Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 23 additions & 27 deletions pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ authors = [
{name = "askui GmbH", email = "info@askui.com"},
]
dependencies = [
"askui-agent-os>=26.1.1",
'askui-agent-os>=26.4.1; sys_platform == "darwin"',
'askui-agent-os>=26.5.1; sys_platform != "darwin"',
"anthropic>=0.86.0",
"fastapi>=0.115.12",
"fastmcp>=2.3.0",
Expand Down
24 changes: 23 additions & 1 deletion src/askui/computer_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ class ComputerAgent(Agent):
tools (AgentToolbox | None, optional): Custom toolbox instance. If `None`, a default one will be created with `AskUiControllerClient`.
settings (AgentSettings | None, optional): Provider-based model settings. If `None`, uses the default AskUI model stack.
retry (Retry, optional): The retry instance to use for retrying failed actions. Defaults to `ConfigurableRetry` with exponential backoff. Currently only supported for `locate()` method.
act_tools (list[Tool] | None, optional): Additional tools to make available for the `act()` method.
act_tools (list[Tool] | None, optional): Additional tools to make available for
the `act()` method on every call. The same tools can instead be passed per call
via `act(..., tools=[...])` (see example below).

Example:
```python
Expand All @@ -67,6 +69,26 @@ class ComputerAgent(Agent):
agent.type("Hello World")
agent.act("Open settings menu")
```

Example (optional tools for `act()`):
Register tools from `askui.tools.store` (or your own `Tool` implementations)
either on the agent so they apply to all `act()` calls, or only for one call.

```python
from askui import ComputerAgent
from askui.tools.store.computer import ComputerSaveScreenshotTool

with ComputerAgent(
act_tools=[ComputerSaveScreenshotTool(base_dir="/path/to/screenshots")]
) as agent:
agent.act("Take a screenshot and save it as demo/demo.png")

with ComputerAgent() as agent:
agent.act(
"Take a screenshot and save it as demo/demo.png",
tools=[ComputerSaveScreenshotTool(base_dir="/path/to/screenshots")],
)
```
"""

@telemetry.record_call(
Expand Down
43 changes: 43 additions & 0 deletions src/askui/tools/agent_os.py
Original file line number Diff line number Diff line change
Expand Up @@ -676,3 +676,46 @@ def set_window_in_focus(self, process_id: int, window_id: int) -> None:
window_id (int): The ID of the window to set as active.
"""
raise NotImplementedError

def get_file_names(self, absolute_directory_path: str) -> list[str]:
    """
    Return the names of the files inside a directory on the automation
    target (desktop Agent OS).

    This base implementation is a placeholder that always raises.

    Args:
        absolute_directory_path (str): Absolute path of the directory on the
            target system whose files should be listed.

    Returns:
        list[str]: The file names found in that directory.

    Raises:
        NotImplementedError: If the implementation does not support this
            operation.
    """
    raise NotImplementedError

def get_file(self, path: str) -> Image.Image | str:
    """
    Fetch the contents of a file from the automation target (desktop Agent OS).

    Payloads recognized as binary images are handed back as
    `PIL.Image.Image`; anything else is returned as UTF-8 text when it can
    be decoded as such.

    This base implementation is a placeholder that always raises.

    Args:
        path (str): Location of the file on the target system.

    Returns:
        Image.Image | str: The decoded contents of the file.

    Raises:
        NotImplementedError: If the implementation does not support this
            operation.
    """
    raise NotImplementedError

def remove_virtual_displays(self) -> None:
    """
    Drop every virtual display from the controller so that only real
    displays remain.

    This base implementation is a placeholder that always raises.

    Raises:
        NotImplementedError: If the implementation does not support this
            operation.
    """
    raise NotImplementedError
119 changes: 119 additions & 0 deletions src/askui/tools/askui/askui_controller.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import base64
import logging
import pathlib
import subprocess
Expand Down Expand Up @@ -43,6 +44,8 @@
DeleteRenderObjectCommand,
GetActiveProcessCommand,
GetActiveWindowCommand,
GetFileCommand,
GetFileNamesCommand,
GetMousePositionCommand,
GetSystemInfoCommand,
Guid,
Expand All @@ -51,6 +54,7 @@
Location,
Message,
Parameter3,
RemoveVirtualDisplaysCommand,
RenderImage,
RenderObjectId,
RenderObjectStyle,
Expand All @@ -66,10 +70,13 @@
GetActiveProcessResponseModel,
GetActiveWindowResponse,
GetActiveWindowResponseModel,
GetFileNamesResponse,
GetFileResponse,
GetSystemInfoResponse,
GetSystemInfoResponseModel,
)
from askui.utils.annotated_image import AnnotatedImage
from askui.utils.image_utils import base64_to_image

from ..utils import process_exists, wait_for_port
from .exceptions import (
Expand Down Expand Up @@ -217,6 +224,12 @@ def connect(self) -> None:
self._start_session()
self._start_execution()
self.set_display(self._display)
if self._settings.clean_virtual_displays:
Comment thread
mlikasam-askui marked this conversation as resolved.
logger.info(
"clean_virtual_displays is enabled. Removing all virtual displays ... "
)
self.remove_virtual_displays()
logger.info("Virtual displays removed.")

def _get_stub(self) -> controller_v1.ControllerAPIStub:
assert isinstance(self._stub, controller_v1.ControllerAPIStub), (
Expand Down Expand Up @@ -1294,3 +1307,109 @@ def set_window_in_focus(self, process_id: int, window_id: int) -> None:
_window_id = Parameter3(root=window_id)
command = SetActiveWindowCommand(parameters=[_process_id, _window_id])
self._send_command(command)

def get_file_names(self, absolute_directory_path: str) -> list[str]:
    """
    List the file names of a directory on the device under automation.

    Sends a `GetFileNamesCommand` to the controller, validates the reply and
    reports both the request and its result to the reporter.

    Args:
        absolute_directory_path (str): Absolute path of the directory whose
            file names are requested.

    Returns:
        list[str]: The file names contained in the controller's response.

    Raises:
        DesktopAgentOsError: If the reply is not a `GetFileNamesResponse`,
            carries an error, or contains neither an error nor a response.
    """
    assert isinstance(self._stub, controller_v1.ControllerAPIStub), (
        "Stub is not initialized"
    )
    self._reporter.add_message(
        "AgentOS", f"get_file_names({absolute_directory_path})"
    )

    reply = self._send_command(
        GetFileNamesCommand(parameters=[absolute_directory_path])
    ).message.command

    # Guard clauses: wrong reply type, controller-side error, empty reply.
    if not isinstance(reply, GetFileNamesResponse):
        detail = f"unexpected response type: {reply}"
        raise DesktopAgentOsError(detail)
    if reply.error is not None:
        raise DesktopAgentOsError(reply.error)
    payload = reply.response
    if payload is None:
        detail = f"{type(reply).__name__} is missing both error and response"
        raise DesktopAgentOsError(detail)

    self._reporter.add_message(
        "AgentOS", f"get_file_names({absolute_directory_path}) -> {payload}"
    )
    return payload.fileNames

def get_file(self, path: str) -> Image.Image | str:
    """
    Read a file from the device under automation and decode its contents.

    A `GetFileCommand` is sent to the controller; the content of the reply is
    passed to `_decode_file_payload`, which yields either a `PIL.Image.Image`
    or a `str`. The request and a short summary of the result are reported
    to the reporter (the image itself is attached when one was decoded).

    Args:
        path (str): The file path to read on the device under automation.

    Returns:
        Image.Image | str: The decoded file contents.

    Raises:
        DesktopAgentOsError: If the reply is not a `GetFileResponse`, carries
            an error, or contains neither an error nor a response.
    """
    assert isinstance(self._stub, controller_v1.ControllerAPIStub), (
        "Stub is not initialized"
    )
    self._reporter.add_message("AgentOS", f"get_file({path})")

    reply = self._send_command(GetFileCommand(parameters=[path])).message.command

    # Guard clauses: wrong reply type, controller-side error, empty reply.
    if not isinstance(reply, GetFileResponse):
        message = f"unexpected response type: {reply}"
        raise DesktopAgentOsError(message)
    if reply.error is not None:
        raise DesktopAgentOsError(reply.error)
    if reply.response is None:
        message = f"{type(reply).__name__} is missing both error and response"
        raise DesktopAgentOsError(message)

    contents = self._decode_file_payload(reply.response.file.content)
    if isinstance(contents, Image.Image):
        # Images are attached to the report alongside a format/size summary.
        detail = f"image ({contents.format}, {contents.size[0]}x{contents.size[1]})"
        self._reporter.add_message(
            "AgentOS", f"get_file({path}) -> {detail}", contents
        )
    else:
        detail = f"text ({len(contents)} chars)"
        self._reporter.add_message("AgentOS", f"get_file({path}) -> {detail}")
    return contents

def remove_virtual_displays(self) -> None:
    """
    Ask the controller to remove all virtual displays so that only real
    displays stay active.

    Sends a `RemoveVirtualDisplaysCommand` and reports the call before and
    after it is sent.
    """
    assert isinstance(self._stub, controller_v1.ControllerAPIStub), (
        "Stub is not initialized"
    )
    self._reporter.add_message("AgentOS", "remove_virtual_displays()")
    self._send_command(RemoveVirtualDisplaysCommand())
    self._reporter.add_message("AgentOS", "remove_virtual_displays() -> done")

@staticmethod
def _decode_file_payload(base64_data: str) -> Image.Image | str:
    """
    Decode a Base64 file payload into an image or UTF-8 text.

    The payload is first handed to `base64_to_image`; if that raises
    `ValueError` the payload is Base64-decoded (strictly validated) and,
    provided it contains no NUL byte, decoded as UTF-8 text.

    Args:
        base64_data (str): Base64-encoded file contents.

    Returns:
        Image.Image | str: The image when `base64_to_image` succeeds,
            otherwise the UTF-8 decoded text.

    Raises:
        DesktopAgentOsError: If the payload is not valid Base64, or if the
            decoded bytes are neither a supported image nor UTF-8 text.
    """
    try:
        return base64_to_image(base64_data)
    except ValueError:
        # Not an image: fall through and try to interpret it as text.
        pass
    try:
        data = base64.b64decode(base64_data, validate=True)
    except ValueError as e:
        # `binascii.Error` (raised for malformed Base64) subclasses
        # `ValueError`; surface it as the domain error callers expect.
        message = "File contents are not valid Base64"
        raise DesktopAgentOsError(message) from e
    # A NUL byte marks the payload as binary, so it is not treated as text.
    if b"\x00" not in data:
        try:
            return data.decode("utf-8")
        except UnicodeDecodeError:
            pass
    message = "File contents are neither a supported image nor UTF-8 text"
    raise DesktopAgentOsError(message)
8 changes: 8 additions & 0 deletions src/askui/tools/askui/askui_controller_client_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,13 @@ class AskUiControllerClientSettings(BaseSettings):
"Controller server. Defaults to True.",
)

# Opt-in switch for removing virtual displays; disabled by default.
# NOTE: the two description literals are concatenated implicitly, so the
# first one must end with a space to keep the sentences separated.
clean_virtual_displays: bool = Field(
    default=False,
    description=(
        "Whether to clean virtual displays after the controller is started. "
        "Default: False"
    ),
)


__all__ = ["AskUiControllerClientSettings"]
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
from __future__ import annotations

from enum import Enum
from typing import Dict, List, Literal, Optional, Union
from typing import Any, Dict, List, Literal, Optional, Union

from pydantic import BaseModel, ConfigDict, Field, RootModel, confloat, conint, constr
from pydantic import (BaseModel, ConfigDict, Field, RootModel, confloat,
conint, constr)


class ParameterEnum(Enum):
Expand Down Expand Up @@ -393,6 +394,20 @@ class LoadCharacterMapCommand(BaseModel):
None, max_length=1, min_length=1
)

# Command model named 'GetFileNames'.
class GetFileNamesCommand(BaseModel):
    # Discriminator identifying this command.
    name: Literal['GetFileNames'] = 'GetFileNames'
    # Required; must contain exactly one string (min_length == max_length == 1).
    parameters: list[str] = Field(..., max_length=1, min_length=1)


# Command model named 'GetFile'.
class GetFileCommand(BaseModel):
    # Discriminator identifying this command.
    name: Literal['GetFile'] = 'GetFile'
    # Required; must contain exactly one string (min_length == max_length == 1).
    parameters: list[str] = Field(..., max_length=1, min_length=1)


# Command model named 'RemoveVirtualDisplays'; it takes no parameters.
class RemoveVirtualDisplaysCommand(BaseModel):
    # Discriminator identifying this command.
    name: Literal['RemoveVirtualDisplays'] = 'RemoveVirtualDisplays'
    # Defaults to an empty list; max_length=0 rejects any non-empty value.
    parameters: List[str] = Field(default=[], max_length=0)

Command =Union[
GetSystemInfoCommand,
GetMousePositionCommand,
Expand All @@ -412,6 +427,9 @@ class LoadCharacterMapCommand(BaseModel):
SetActiveProcessCommand,
GetActiveWindowCommand,
SetActiveWindowCommand,
GetFileNamesCommand,
GetFileCommand,
RemoveVirtualDisplaysCommand,
]

class Message(BaseModel):
Expand Down
Loading
Loading