From 21ec19baba153f23417e50340f24a8a36fb19eaa Mon Sep 17 00:00:00 2001 From: Eduard van Valkenburg Date: Wed, 22 Apr 2026 12:31:44 +0200 Subject: [PATCH 1/4] improved parsing of tool call results and tweaks --- .../packages/core/agent_framework/__init__.py | 2 + .../packages/core/agent_framework/_tools.py | 77 ++++++- python/packages/core/tests/core/test_tools.py | 148 +++++++++++++ python/packages/hyperlight/README.md | 6 + .../_execute_code_tool.py | 197 +++++++++++------- .../hyperlight/test_hyperlight_codeact.py | 155 ++++++++++++++ 6 files changed, 500 insertions(+), 85 deletions(-) diff --git a/python/packages/core/agent_framework/__init__.py b/python/packages/core/agent_framework/__init__.py index 7475b1eb96..05f65873bc 100644 --- a/python/packages/core/agent_framework/__init__.py +++ b/python/packages/core/agent_framework/__init__.py @@ -125,6 +125,7 @@ prepend_agent_framework_to_user_agent, ) from ._tools import ( + SKIP_PARSING, FunctionInvocationConfiguration, FunctionInvocationLayer, FunctionTool, @@ -258,6 +259,7 @@ "GROUP_INDEX_KEY", "GROUP_KIND_KEY", "GROUP_TOKEN_COUNT_KEY", + "SKIP_PARSING", "SUMMARIZED_BY_SUMMARY_ID_KEY", "SUMMARY_OF_GROUP_IDS_KEY", "SUMMARY_OF_MESSAGE_IDS_KEY", diff --git a/python/packages/core/agent_framework/_tools.py b/python/packages/core/agent_framework/_tools.py index 5f5e91b656..91db646855 100644 --- a/python/packages/core/agent_framework/_tools.py +++ b/python/packages/core/agent_framework/_tools.py @@ -94,6 +94,31 @@ ChatClientT = TypeVar("ChatClientT", bound="SupportsChatGetResponse[Any]") ResponseModelBoundT = TypeVar("ResponseModelBoundT", bound=BaseModel) + +class _SkipParsingSentinel: + """Sentinel signaling that :meth:`FunctionTool.invoke` should return the raw value. + + When passed as the ``result_parser`` keyword argument to ``invoke``, the configured + ``result_parser`` (and the default :meth:`FunctionTool.parse_result`) are bypassed + and the wrapped function's return value is returned unchanged. + + Use the module-level ``SKIP_PARSING`` singleton — do not instantiate this class. + """ + + _instance: ClassVar[_SkipParsingSentinel | None] = None + + def __new__(cls) -> _SkipParsingSentinel: + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __repr__(self) -> str: + return "SKIP_PARSING" + + +SKIP_PARSING: Final[_SkipParsingSentinel] = _SkipParsingSentinel() +"""Sentinel for ``FunctionTool.invoke(result_parser=...)`` meaning "do not parse the result".""" + # region Helpers @@ -508,31 +533,63 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: self.invocation_exception_count += 1 raise + @overload async def invoke( self, *, arguments: BaseModel | Mapping[str, Any] | None = None, context: FunctionInvocationContext | None = None, tool_call_id: str | None = None, + result_parser: _SkipParsingSentinel, **kwargs: Any, - ) -> list[Content]: + ) -> Any: ... + + @overload + async def invoke( + self, + *, + arguments: BaseModel | Mapping[str, Any] | None = None, + context: FunctionInvocationContext | None = None, + tool_call_id: str | None = None, + result_parser: None = None, + **kwargs: Any, + ) -> list[Content]: ... + + async def invoke( + self, + *, + arguments: BaseModel | Mapping[str, Any] | None = None, + context: FunctionInvocationContext | None = None, + tool_call_id: str | None = None, + result_parser: _SkipParsingSentinel | None = None, + **kwargs: Any, + ) -> list[Content] | Any: """Run the AI function with the provided arguments as a Pydantic model. The raw return value of the wrapped function is automatically parsed into a ``list[Content]`` using :meth:`parse_result` or the custom ``result_parser`` - if one was provided. Every result — text, rich media, or serialized objects — - is represented uniformly as Content items. + configured on the tool. Every result — text, rich media, or serialized + objects — is represented uniformly as Content items. + + Pass the module-level :data:`SKIP_PARSING` sentinel as ``result_parser`` to + bypass parsing entirely and receive the wrapped function's raw return value. + This is intended for callers (e.g. sandboxed runtimes) that consume the value + from Python directly and would otherwise undo the ``Content`` wrapping. Keyword Args: arguments: A mapping or model instance containing the arguments for the function. context: Explicit function invocation context carrying runtime kwargs. tool_call_id: Optional tool call identifier used for telemetry and tracing. + result_parser: Per-call override. Pass :data:`SKIP_PARSING` to skip parsing + and return the wrapped function's raw value. When omitted (the default), + the tool's configured ``result_parser`` (or :meth:`parse_result`) is used. kwargs: Direct function argument values. When provided, every keyword must match a declared tool parameter. Runtime data must be passed via ``context``. Returns: - A list of Content items representing the tool output. + ``list[Content]`` by default. The raw function return value (``Any``) when + ``result_parser=SKIP_PARSING`` is supplied. Raises: TypeError: If arguments is not mapping-like or fails schema checks. @@ -544,6 +601,7 @@ async def invoke( from ._types import Content from .observability import OBSERVABILITY_SETTINGS + skip_parsing = result_parser is SKIP_PARSING parser = self.result_parser or FunctionTool.parse_result parameter_names = set(self.parameters().get("properties", {}).keys()) @@ -616,6 +674,10 @@ async def invoke( logger.debug(f"Function arguments: {observable_kwargs}") res = self.__call__(**call_kwargs) result = await res if inspect.isawaitable(res) else res + if skip_parsing: + logger.info(f"Function {self.name} succeeded.") + logger.debug(f"Function result: {type(result).__name__}") + return result try: parsed = parser(result) except Exception: @@ -671,6 +733,13 @@ async def invoke( logger.error(f"Function failed. Error: {exception}") raise else: + if skip_parsing: + logger.info(f"Function {self.name} succeeded.") + if OBSERVABILITY_SETTINGS.SENSITIVE_DATA_ENABLED: # type: ignore[name-defined] + result_str = str(result) + span.set_attribute(OtelAttr.TOOL_RESULT, result_str) + logger.debug(f"Function result: {result_str}") + return result try: parsed = parser(result) except Exception: diff --git a/python/packages/core/tests/core/test_tools.py b/python/packages/core/tests/core/test_tools.py index 6fa7172295..95c32c1f4f 100644 --- a/python/packages/core/tests/core/test_tools.py +++ b/python/packages/core/tests/core/test_tools.py @@ -8,6 +8,7 @@ from pydantic import BaseModel from agent_framework import ( + SKIP_PARSING, Content, FunctionTool, tool, @@ -1298,6 +1299,153 @@ def __len__(self) -> int: assert len(normalized) == 2 assert normalized[0] is bundled assert normalized[1] is standalone +# region SKIP_PARSING sentinel + + +async def test_invoke_skip_parsing_returns_native_value() -> None: + """invoke(result_parser=SKIP_PARSING) returns the wrapped function's raw value.""" + + @tool + def get_weather(city: str) -> dict[str, Any]: + """Get the weather.""" + return {"city": city, "temperature_c": 21.5, "conditions": "partly cloudy"} + + raw = await get_weather.invoke(arguments={"city": "Seattle"}, result_parser=SKIP_PARSING) + + assert isinstance(raw, dict) + assert raw == {"city": "Seattle", "temperature_c": 21.5, "conditions": "partly cloudy"} + + +async def test_invoke_skip_parsing_passes_through_custom_objects() -> None: + """SKIP_PARSING must not call str()/repr() on the result.""" + + class Custom: # noqa: B903 + def __init__(self, value: int) -> None: + self.value = value + + @tool + def make() -> Custom: + """Make a custom object.""" + return Custom(42) + + raw = await make.invoke(result_parser=SKIP_PARSING) + + assert isinstance(raw, Custom) + assert raw.value == 42 + + +async def test_invoke_skip_parsing_awaits_async_functions() -> None: + @tool + async def slow(x: int) -> int: + """Async tool.""" + return x * 2 + + raw = await slow.invoke(arguments={"x": 21}, result_parser=SKIP_PARSING) + assert raw == 42 + + +async def test_invoke_skip_parsing_bypasses_configured_result_parser() -> None: + """The tool's own result_parser is bypassed when SKIP_PARSING is requested.""" + parser_calls: list[Any] = [] + + def parser(value: Any) -> str: + parser_calls.append(value) + return "PARSED" + + @tool(result_parser=parser) + def make_dict() -> dict[str, int]: + """Returns a dict.""" + return {"a": 1} + + raw = await make_dict.invoke(result_parser=SKIP_PARSING) + assert raw == {"a": 1} + assert parser_calls == [] + + # Sanity: omitting result_parser still applies the configured parser. + parsed = await make_dict.invoke() + assert parsed[0].type == "text" + assert parsed[0].text == "PARSED" + + +async def test_invoke_skip_parsing_validates_arguments() -> None: + """Argument validation is shared with the default path.""" + + @tool + def adder(x: int, y: int) -> int: + """Add.""" + return x + y + + with pytest.raises(TypeError): + await adder.invoke(arguments={"x": "not-an-int", "y": 1}, result_parser=SKIP_PARSING) + + +async def test_invoke_skip_parsing_rejects_unexpected_runtime_kwargs() -> None: + @tool + async def echo(message: str) -> str: + """Echo.""" + return message + + with pytest.raises(TypeError, match="Unexpected keyword argument"): + await echo.invoke(arguments={"message": "hi"}, result_parser=SKIP_PARSING, api_token="secret") + + +async def test_invoke_skip_parsing_raises_for_declaration_only_tool() -> None: + declared = FunctionTool(name="dummy", description="declaration only") + + from agent_framework.exceptions import ToolException + + with pytest.raises(ToolException): + await declared.invoke(arguments={}, result_parser=SKIP_PARSING) + + +async def test_invoke_skip_parsing_records_telemetry(span_exporter: InMemorySpanExporter) -> None: + """SKIP_PARSING participates in OTEL spans and records str(raw) as TOOL_RESULT.""" + + @tool(name="raw_tool", description="raw tool") + def returns_dict(x: int) -> dict[str, int]: + """Returns a dict.""" + return {"value": x} + + span_exporter.clear() + raw = await returns_dict.invoke(arguments={"x": 5}, tool_call_id="raw_call", result_parser=SKIP_PARSING) + + assert raw == {"value": 5} + spans = span_exporter.get_finished_spans() + assert len(spans) == 1 + span = spans[0] + assert span.attributes[OtelAttr.TOOL_NAME] == "raw_tool" + assert span.attributes[OtelAttr.TOOL_CALL_ID] == "raw_call" + assert span.attributes[OtelAttr.TOOL_RESULT] == "{'value': 5}" + + +async def test_invoke_default_path_records_parsed_telemetry( + span_exporter: InMemorySpanExporter, +) -> None: + """Regression: omitting SKIP_PARSING still records the parsed result in telemetry.""" + + def parser(value: Any) -> str: + return f"parsed:{value}" + + @tool(name="parsed_tool", description="parsed", result_parser=parser) + def returns_int() -> int: + """Returns an int.""" + return 7 + + span_exporter.clear() + parsed = await returns_int.invoke(tool_call_id="parsed_call") + + assert parsed[0].text == "parsed:7" + spans = span_exporter.get_finished_spans() + assert len(spans) == 1 + assert spans[0].attributes[OtelAttr.TOOL_RESULT] == "parsed:7" + + +def test_skip_parsing_is_singleton() -> None: + """SKIP_PARSING is a singleton; instantiation returns the same object.""" + from agent_framework._tools import _SkipParsingSentinel + + assert _SkipParsingSentinel() is SKIP_PARSING + assert repr(SKIP_PARSING) == "SKIP_PARSING" # endregion diff --git a/python/packages/hyperlight/README.md b/python/packages/hyperlight/README.md index 1b1bc1e0ce..afc36f9365 100644 --- a/python/packages/hyperlight/README.md +++ b/python/packages/hyperlight/README.md @@ -130,3 +130,9 @@ codeact = HyperlightCodeActProvider( - `allowed_domains` accepts a single string target such as `"github.com"` to allow all backend-supported methods, an explicit `(target, method_or_methods)` tuple such as `("github.com", "GET")`, or an `AllowedDomain` named tuple. +- Tools registered with the sandbox return their native Python value + (`dict`, `list`, primitives, or custom objects) directly to the guest via the + Hyperlight FFI. Any `result_parser` configured on a `FunctionTool` is + intended for LLM-facing consumers and does not run on the sandbox path — + apply formatting inside the tool function itself if you need it for + in-sandbox consumers. diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py index a46707ac0d..1b08705076 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py +++ b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py @@ -2,42 +2,43 @@ from __future__ import annotations -import ast import asyncio -import copy import mimetypes import shutil import threading import time from collections.abc import Callable, Sequence -from dataclasses import dataclass +from concurrent.futures import Future, ThreadPoolExecutor +from dataclasses import dataclass, field from pathlib import Path, PurePosixPath from tempfile import TemporaryDirectory -from typing import Annotated, Any, Protocol, TypeGuard, cast +from typing import Any, Protocol, TypeGuard, TypeVar, cast from urllib.parse import urlparse -from agent_framework import Content, FunctionTool +from agent_framework import SKIP_PARSING, Content, FunctionTool from agent_framework._tools import ApprovalMode, normalize_tools -from pydantic import BaseModel, Field from ._instructions import build_codeact_instructions, build_execute_code_description from ._types import AllowedDomain, AllowedDomainInput, FileMount, FileMountHostPath, FileMountInput DEFAULT_HYPERLIGHT_BACKEND = "wasm" DEFAULT_HYPERLIGHT_MODULE = "python_guest.path" -EXECUTE_CODE_INPUT_DESCRIPTION = "Python code to execute in an isolated Hyperlight sandbox." +EXECUTE_CODE_TOOL_DESCRIPTION = "Execute Python in an isolated Hyperlight sandbox." OUTPUT_FILE_RETRY_ATTEMPTS = 10 OUTPUT_FILE_RETRY_DELAY_SECONDS = 0.1 - -class _ExecuteCodeInput(BaseModel): - code: Annotated[str, Field(description=EXECUTE_CODE_INPUT_DESCRIPTION)] - - -@dataclass(frozen=True, slots=True) -class _StoredFileMount: - host_path: Path - mount_path: str +EXECUTE_CODE_INPUT_SCHEMA: dict[str, Any] = { + "type": "object", + "title": "_ExecuteCodeInput", + "properties": { + "code": { + "type": "string", + "title": "Code", + "description": "Python code to execute in an isolated Hyperlight sandbox.", + }, + }, + "required": ["code"], +} @dataclass(frozen=True, slots=True) @@ -85,13 +86,42 @@ class SandboxRuntime(Protocol): def execute(self, *, config: _RunConfig, code: str) -> list[Content]: ... +_T = TypeVar("_T") + + +class _SandboxWorker: + """Single-threaded executor that confines all sandbox operations to one OS thread. + + The Hyperlight ``WasmSandbox`` is declared ``unsendable`` in PyO3, meaning it can only be + accessed from the OS thread that created it; touching it from any other thread triggers a + Rust panic that cannot be caught from Python. Every cached :class:`_SandboxEntry` therefore + owns its own ``_SandboxWorker``, and *all* lifecycle and execution calls against the + underlying sandbox object must be routed through :meth:`submit`/:meth:`run`. + """ + + __slots__ = ("_executor",) + + def __init__(self, *, name: str = "hl-sandbox") -> None: + self._executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix=name) + + def submit(self, fn: Callable[..., _T], /, *args: Any, **kwargs: Any) -> Future[_T]: + return self._executor.submit(fn, *args, **kwargs) + + def run(self, fn: Callable[..., _T], /, *args: Any, **kwargs: Any) -> _T: + return self._executor.submit(fn, *args, **kwargs).result() + + def shutdown(self) -> None: + # Do not block on in-flight runs; the worker thread exits once the current task returns. + self._executor.shutdown(wait=False, cancel_futures=False) + + @dataclass class _SandboxEntry: sandbox: Any snapshot: Any input_dir: TemporaryDirectory[str] | None output_dir: TemporaryDirectory[str] | None - lock: threading.RLock + worker: _SandboxWorker = field(default_factory=_SandboxWorker) def _load_sandbox_class() -> type[Any]: @@ -106,10 +136,6 @@ def _load_sandbox_class() -> type[Any]: return Sandbox -def _passthrough_result_parser(result: Any) -> str: - return repr(result) - - def _collect_tools(*tool_groups: Any) -> list[FunctionTool]: tools_by_name: dict[str, FunctionTool] = {} @@ -166,7 +192,7 @@ def _is_file_mount_pair(value: Any) -> TypeGuard[FileMount | tuple[FileMountHost return isinstance(host_path, (str, Path)) and isinstance(mount_path, str) -def _normalize_file_mount_input(file_mount: FileMountInput) -> _StoredFileMount: +def _normalize_file_mount_input(file_mount: FileMountInput) -> FileMount: host_path: FileMountHostPath mount_path: str if isinstance(file_mount, str): @@ -176,7 +202,7 @@ def _normalize_file_mount_input(file_mount: FileMountInput) -> _StoredFileMount: host_path = file_mount[0] mount_path = file_mount[1] - return _StoredFileMount( + return FileMount( host_path=_resolve_existing_path(host_path), mount_path=_normalize_mount_path(mount_path), ) @@ -445,18 +471,13 @@ def _build_execution_contents( def _make_sandbox_callback(tool_obj: FunctionTool) -> Callable[..., Any]: - sandbox_tool = copy.copy(tool_obj) - # Auto-assign a passthrough parser so the raw return value round-trips through - # `ast.literal_eval` in the sandbox callback below. User-supplied parsers are - # left in place so callers can customize how results are exposed to the guest. - if sandbox_tool.result_parser is None: - sandbox_tool.result_parser = _passthrough_result_parser + sandbox_tool = tool_obj def _callback(**kwargs: Any) -> Any: - async def _invoke() -> list[Content]: - return await sandbox_tool.invoke(arguments=kwargs) + async def _invoke() -> Any: + return await sandbox_tool.invoke(arguments=kwargs, result_parser=SKIP_PARSING) - # FunctionTool.invoke() is always async. The real Hyperlight backend invokes + # FunctionTool.invoke() is async. The real Hyperlight backend invokes # registered callbacks synchronously via FFI, so this must be a sync function. # We run the async call on a dedicated thread to avoid conflicts with any # event loop that may be running on the current thread. @@ -474,22 +495,11 @@ def _run() -> None: worker.join() if error_box: raise error_box[0] - contents: list[Content] = result_box[0] - - values: list[Any] = [] - for content in contents: - if content.type == "text" and content.text is not None: - try: - values.append(ast.literal_eval(content.text)) - except (SyntaxError, ValueError): - values.append(content.text) - continue - - values.append(content.to_dict()) - - if len(values) == 1: - return values[0] - return values + # Return the raw value. The Hyperlight FFI marshals primitives (dict, list, + # str, int, float, bool, None) natively into the guest, and falls back to + # repr()/str() for unsupported types — so the guest receives real Python + # objects without a lossy host-side serialization round-trip. + return result_box[0] return _callback @@ -509,7 +519,7 @@ def _clear_directory(output_dir: TemporaryDirectory[str] | None) -> None: pass -class _SandboxRegistry: +class _SandboxRegistry(SandboxRuntime): def __init__(self) -> None: self._entries: dict[tuple[Any, ...], _SandboxEntry] = {} self._entries_lock = threading.RLock() @@ -517,28 +527,42 @@ def __init__(self) -> None: def execute(self, *, config: _RunConfig, code: str) -> list[Content]: """Execute code in a cached sandbox matching the given config. - Entries are keyed by ``config.cache_key()``. Concurrent calls with the same - key are serialized by the entry lock so they never race, but they share the - same sandbox instance. For true parallel execution, use distinct provider - instances or configs that produce different cache keys. + Entries are keyed by ``config.cache_key()``. All operations against the underlying + sandbox object are routed through the entry's dedicated single-threaded worker, which + both serializes concurrent callers and satisfies the PyO3 ``unsendable`` invariant + that the sandbox can only be touched from the thread that created it. """ + entry = self._get_or_create_entry(config) + return entry.worker.run(self._run_on_worker, entry, code) + + @staticmethod + def _run_on_worker(entry: _SandboxEntry, code: str) -> list[Content]: + entry.sandbox.restore(entry.snapshot) + _clear_directory(entry.output_dir) + result = entry.sandbox.run(code=code) + return _build_execution_contents( + result=result, + sandbox=entry.sandbox, + output_dir=entry.output_dir, + code=code, + ) + + def _get_or_create_entry(self, config: _RunConfig) -> _SandboxEntry: cache_key = config.cache_key() with self._entries_lock: entry = self._entries.get(cache_key) if entry is None: entry = self._create_entry(config) self._entries[cache_key] = entry + return entry - with entry.lock: - entry.sandbox.restore(entry.snapshot) - _clear_directory(entry.output_dir) - result = entry.sandbox.run(code=code) - return _build_execution_contents( - result=result, - sandbox=entry.sandbox, - output_dir=entry.output_dir, - code=code, - ) + def close(self) -> None: + """Shut down all per-entry worker threads. Safe to call multiple times.""" + with self._entries_lock: + entries = list(self._entries.values()) + self._entries.clear() + for entry in entries: + entry.worker.shutdown() def _create_entry(self, config: _RunConfig) -> _SandboxEntry: input_dir_handle = TemporaryDirectory() if config.filesystem_enabled else None @@ -578,26 +602,37 @@ def _configure_sandbox(*, sandbox: Any, expand_missing_scheme: bool) -> None: methods=list(allowed_domain.methods) if allowed_domain.methods is not None else None, ) - sandbox = _create_sandbox() - _configure_sandbox(sandbox=sandbox, expand_missing_scheme=False) - - try: - sandbox.run("None") - except RuntimeError as exc: - if not _should_retry_allowed_domain_registration(error=exc, allowed_domains=config.allowed_domains): - raise + worker = _SandboxWorker() + def _build_sandbox() -> tuple[Any, Any]: sandbox = _create_sandbox() - _configure_sandbox(sandbox=sandbox, expand_missing_scheme=True) - sandbox.run("None") + _configure_sandbox(sandbox=sandbox, expand_missing_scheme=False) + + try: + sandbox.run("None") + except RuntimeError as exc: + if not _should_retry_allowed_domain_registration(error=exc, allowed_domains=config.allowed_domains): + raise + + sandbox = _create_sandbox() + _configure_sandbox(sandbox=sandbox, expand_missing_scheme=True) + sandbox.run("None") + + snapshot = sandbox.snapshot() + return sandbox, snapshot + + try: + sandbox, snapshot = worker.run(_build_sandbox) + except BaseException: + worker.shutdown() + raise - snapshot = sandbox.snapshot() return _SandboxEntry( sandbox=sandbox, snapshot=snapshot, input_dir=input_dir_handle, output_dir=output_dir_handle, - lock=threading.RLock(), + worker=worker, ) @@ -619,10 +654,10 @@ def __init__( ) -> None: super().__init__( name="execute_code", - description=EXECUTE_CODE_INPUT_DESCRIPTION, + description=EXECUTE_CODE_TOOL_DESCRIPTION, approval_mode="never_require", func=self._run_code, - input_model=_ExecuteCodeInput, + input_model=EXECUTE_CODE_INPUT_SCHEMA, ) self._state_lock = threading.RLock() self._registry = _registry or _SandboxRegistry() @@ -632,7 +667,7 @@ def __init__( self._module: str | None = module self._module_path: str | None = module_path self._managed_tools: list[FunctionTool] = [] - self._file_mounts: dict[str, _StoredFileMount] = {} + self._file_mounts: dict[str, FileMount] = {} self._allowed_domains: dict[str, AllowedDomain] = {} if tools is not None: @@ -648,7 +683,7 @@ def __init__( def description(self) -> str: state_lock = getattr(self, "_state_lock", None) if state_lock is None: - return str(self.__dict__.get("description", EXECUTE_CODE_INPUT_DESCRIPTION)) + return str(self.__dict__.get("description", EXECUTE_CODE_TOOL_DESCRIPTION)) with state_lock: allowed_domains = sorted(self._allowed_domains.values(), key=lambda value: value.target) @@ -841,9 +876,9 @@ def _build_run_config(self) -> _RunConfig: workspace_signature = _path_tree_signature(workspace_root) if workspace_root is not None else () normalized_mounts = tuple( _NormalizedFileMount( - host_path=mount.host_path, + host_path=Path(mount.host_path), mount_path=mount.mount_path, - path_signature=_path_tree_signature(mount.host_path), + path_signature=_path_tree_signature(Path(mount.host_path)), ) for mount in stored_mounts ) diff --git a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py index ab6a3f7c78..d20fb3b9c7 100644 --- a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py +++ b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py @@ -937,3 +937,158 @@ async def _concurrent_task(): assert concurrent_ran, "Event loop was blocked during sandbox execution" assert result[0].type == "text" + + +class _ThreadAffinityFakeSandbox(_FakeSandbox): + """Fake sandbox that records the OS thread of every method invocation. + + Mirrors the PyO3 ``unsendable`` invariant of ``hyperlight_sandbox.WasmSandbox``: + if ``__init__``, ``register_tool``, ``allow_domain``, ``run``, ``snapshot`` or ``restore`` + are ever called from more than one thread for a given instance, the test fails. + """ + + affinity_failures: list[str] = [] + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + self._owner_thread = threading.get_ident() + self.thread_ids: set[int] = {self._owner_thread} + + def _record(self, method: str) -> None: + ident = threading.get_ident() + self.thread_ids.add(ident) + if ident != self._owner_thread: + _ThreadAffinityFakeSandbox.affinity_failures.append( + f"{method} called from thread {ident}, expected {self._owner_thread}" + ) + + def register_tool(self, name_or_tool: Any, callback: Any | None = None) -> None: + self._record("register_tool") + super().register_tool(name_or_tool, callback) + + def allow_domain(self, target: str, methods: list[str] | None = None) -> None: + self._record("allow_domain") + super().allow_domain(target, methods) + + def run(self, code: str) -> _FakeResult: + self._record("run") + return super().run(code) + + def snapshot(self) -> str: + self._record("snapshot") + return super().snapshot() + + def restore(self, snapshot: Any) -> None: + self._record("restore") + super().restore(snapshot) + + +async def test_sandbox_calls_are_pinned_to_owning_worker_thread( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Regression: WasmSandbox is unsendable; every sandbox call must run on its owner thread.""" + _ThreadAffinityFakeSandbox.instances.clear() + _ThreadAffinityFakeSandbox.affinity_failures.clear() + monkeypatch.setattr(execute_code_module, "_load_sandbox_class", lambda: _ThreadAffinityFakeSandbox) + + execute_code = HyperlightExecuteCodeTool() + + # Invoke many times concurrently; asyncio.to_thread will spread these across the default + # executor's worker threads, which previously caused PyO3 to panic when a different thread + # touched the cached sandbox. + results = await asyncio.gather(*[execute_code.invoke(arguments={"code": "None"}) for _ in range(8)]) + for result in results: + assert result[0].type == "text" + + assert _ThreadAffinityFakeSandbox.affinity_failures == [] + assert len(_ThreadAffinityFakeSandbox.instances) == 1 + sandbox = _ThreadAffinityFakeSandbox.instances[0] + # All sandbox-touching calls must have stayed on a single owning thread, distinct from the + # caller thread that asyncio.to_thread used for dispatch. + assert sandbox.thread_ids == {sandbox._owner_thread} + assert sandbox._owner_thread != threading.get_ident() + + +async def test_sandbox_owner_thread_persists_across_dispatch_threads( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Sequential calls landing on different dispatch threads still share one sandbox thread.""" + _ThreadAffinityFakeSandbox.instances.clear() + _ThreadAffinityFakeSandbox.affinity_failures.clear() + monkeypatch.setattr(execute_code_module, "_load_sandbox_class", lambda: _ThreadAffinityFakeSandbox) + + execute_code = HyperlightExecuteCodeTool() + + for _ in range(5): + result = await execute_code.invoke(arguments={"code": "None"}) + assert result[0].type == "text" + + assert _ThreadAffinityFakeSandbox.affinity_failures == [] + assert len(_ThreadAffinityFakeSandbox.instances) == 1 + + +def test_sandbox_registry_close_shuts_down_workers(monkeypatch: pytest.MonkeyPatch) -> None: + _FakeSandbox.instances.clear() + monkeypatch.setattr(execute_code_module, "_load_sandbox_class", lambda: _FakeSandbox) + + registry = execute_code_module._SandboxRegistry() + execute_code = HyperlightExecuteCodeTool(_registry=registry) + asyncio.run(execute_code.invoke(arguments={"code": "None"})) + + entries = list(registry._entries.values()) + assert len(entries) == 1 + worker = entries[0].worker + + registry.close() + + assert registry._entries == {} + # Submitting after shutdown must fail; this proves the executor was actually torn down. + with pytest.raises(RuntimeError): + worker.submit(lambda: None) + + +async def test_make_sandbox_callback_returns_native_dict() -> None: + """Host tool returning a dict must be forwarded as a native dict (no repr round-trip).""" + + @tool + def get_weather(city: str) -> dict[str, Any]: + """Get weather.""" + return {"city": city, "temp_c": 21.5} + + callback = execute_code_module._make_sandbox_callback(get_weather) + result = callback(city="Seattle") + + assert isinstance(result, dict) + assert result == {"city": "Seattle", "temp_c": 21.5} + + +async def test_make_sandbox_callback_bypasses_user_result_parser() -> None: + """Documented behavior change: result_parser is bypassed in the sandbox path.""" + + parser_calls: list[Any] = [] + + def parser(value: Any) -> str: + parser_calls.append(value) + return "PARSED" + + @tool(result_parser=parser) + def make_payload() -> dict[str, int]: + """Returns a dict.""" + return {"a": 1, "b": 2} + + callback = execute_code_module._make_sandbox_callback(make_payload) + result = callback() + + assert result == {"a": 1, "b": 2} + assert parser_calls == [], "result_parser must not run on the sandbox path" + + +async def test_make_sandbox_callback_propagates_exceptions() -> None: + @tool + def boom(x: int) -> int: + """Always fails.""" + raise RuntimeError("nope") + + callback = execute_code_module._make_sandbox_callback(boom) + with pytest.raises(RuntimeError, match="nope"): + callback(x=1) From 003540bf077953125207fad8b64d600c77b1369d Mon Sep 17 00:00:00 2001 From: Eduard van Valkenburg Date: Wed, 22 Apr 2026 12:53:34 +0200 Subject: [PATCH 2/4] Address PR review: skip_parsing flag, broader registry close, comment fix - FunctionTool.invoke now takes a boolean skip_parsing flag instead of the SKIP_PARSING sentinel; the sentinel is still accepted as result_parser at construction time to opt out of parsing for every call. The two paths are equivalent. - _SandboxRegistry.close now invokes any sandbox close/shutdown hook on the entry's own worker thread (PyO3 unsendable), then shuts the worker down, then cleans up the per-entry temporary directories. - Clarified the _SandboxWorker.shutdown comment to describe the actual ThreadPoolExecutor.shutdown(wait=False, cancel_futures=False) semantics. - Hyperlight host callback uses skip_parsing=True (the new flag). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../packages/core/agent_framework/_tools.py | 59 +++++++++++-------- python/packages/core/tests/core/test_tools.py | 44 +++++++++----- .../_execute_code_tool.py | 22 +++++-- .../hyperlight/test_hyperlight_codeact.py | 33 +++++++++++ 4 files changed, 115 insertions(+), 43 deletions(-) diff --git a/python/packages/core/agent_framework/_tools.py b/python/packages/core/agent_framework/_tools.py index 91db646855..560a96f8f5 100644 --- a/python/packages/core/agent_framework/_tools.py +++ b/python/packages/core/agent_framework/_tools.py @@ -98,9 +98,11 @@ class _SkipParsingSentinel: """Sentinel signaling that :meth:`FunctionTool.invoke` should return the raw value. - When passed as the ``result_parser`` keyword argument to ``invoke``, the configured - ``result_parser`` (and the default :meth:`FunctionTool.parse_result`) are bypassed - and the wrapped function's return value is returned unchanged. + When passed as ``result_parser`` to :class:`FunctionTool` (or the ``@tool`` decorator), + the default :meth:`FunctionTool.parse_result` is bypassed and the wrapped function's + return value is returned unchanged from :meth:`FunctionTool.invoke`. Callers may also + request the raw value on a per-call basis by passing ``skip_parsing=True`` to + :meth:`FunctionTool.invoke`. Use the module-level ``SKIP_PARSING`` singleton — do not instantiate this class. """ @@ -117,7 +119,7 @@ def __repr__(self) -> str: SKIP_PARSING: Final[_SkipParsingSentinel] = _SkipParsingSentinel() -"""Sentinel for ``FunctionTool.invoke(result_parser=...)`` meaning "do not parse the result".""" +"""Sentinel for ``FunctionTool(result_parser=...)`` meaning "do not parse the result".""" # region Helpers @@ -304,7 +306,7 @@ def __init__( additional_properties: dict[str, Any] | None = None, func: Callable[..., Any] | None = None, input_model: type[BaseModel] | Mapping[str, Any] | None = None, - result_parser: Callable[[Any], str | list[Content]] | None = None, + result_parser: Callable[[Any], str | list[Content]] | _SkipParsingSentinel | None = None, **kwargs: Any, ) -> None: """Initialize the FunctionTool. @@ -352,9 +354,11 @@ def __init__( result_parser: An optional callable with signature ``Callable[[Any], str]`` that overrides the default result parsing behavior. When provided, this callable is used to convert the raw function return value to a string instead of the - built-in :meth:`parse_result` logic. Depending on your function, it may be - easiest to just do the serialization directly in the function body rather - than providing a custom ``result_parser``. + built-in :meth:`parse_result` logic. Pass the :data:`SKIP_PARSING` sentinel + instead of a callable to opt out of parsing entirely; in that case + :meth:`invoke` returns the wrapped function's raw return value. Depending + on your function, it may be easiest to just do the serialization directly + in the function body rather than providing a custom ``result_parser``. **kwargs: Additional keyword arguments. """ # Core attributes (formerly from BaseTool) @@ -540,7 +544,7 @@ async def invoke( arguments: BaseModel | Mapping[str, Any] | None = None, context: FunctionInvocationContext | None = None, tool_call_id: str | None = None, - result_parser: _SkipParsingSentinel, + skip_parsing: Literal[True], **kwargs: Any, ) -> Any: ... @@ -551,7 +555,7 @@ async def invoke( arguments: BaseModel | Mapping[str, Any] | None = None, context: FunctionInvocationContext | None = None, tool_call_id: str | None = None, - result_parser: None = None, + skip_parsing: Literal[False] = False, **kwargs: Any, ) -> list[Content]: ... @@ -561,7 +565,7 @@ async def invoke( arguments: BaseModel | Mapping[str, Any] | None = None, context: FunctionInvocationContext | None = None, tool_call_id: str | None = None, - result_parser: _SkipParsingSentinel | None = None, + skip_parsing: bool = False, **kwargs: Any, ) -> list[Content] | Any: """Run the AI function with the provided arguments as a Pydantic model. @@ -571,25 +575,27 @@ async def invoke( configured on the tool. Every result — text, rich media, or serialized objects — is represented uniformly as Content items. - Pass the module-level :data:`SKIP_PARSING` sentinel as ``result_parser`` to - bypass parsing entirely and receive the wrapped function's raw return value. - This is intended for callers (e.g. sandboxed runtimes) that consume the value - from Python directly and would otherwise undo the ``Content`` wrapping. + Parsing can be skipped in two ways: configure the tool with + ``result_parser=SKIP_PARSING`` to always skip parsing, or pass + ``skip_parsing=True`` per call. Either way the wrapped function's raw value + is returned. This is intended for callers (e.g. sandboxed runtimes) that + consume the value from Python directly and would otherwise undo the + ``Content`` wrapping. Keyword Args: arguments: A mapping or model instance containing the arguments for the function. context: Explicit function invocation context carrying runtime kwargs. tool_call_id: Optional tool call identifier used for telemetry and tracing. - result_parser: Per-call override. Pass :data:`SKIP_PARSING` to skip parsing - and return the wrapped function's raw value. When omitted (the default), - the tool's configured ``result_parser`` (or :meth:`parse_result`) is used. + skip_parsing: When ``True``, bypass parsing and return the wrapped function's + raw value instead of a ``list[Content]``. Defaults to ``False``. kwargs: Direct function argument values. When provided, every keyword must match a declared tool parameter. Runtime data must be passed via ``context``. Returns: ``list[Content]`` by default. The raw function return value (``Any``) when - ``result_parser=SKIP_PARSING`` is supplied. + ``skip_parsing=True`` (or the tool was constructed with + ``result_parser=SKIP_PARSING``). Raises: TypeError: If arguments is not mapping-like or fails schema checks. @@ -601,8 +607,13 @@ async def invoke( from ._types import Content from .observability import OBSERVABILITY_SETTINGS - skip_parsing = result_parser is SKIP_PARSING - parser = self.result_parser or FunctionTool.parse_result + configured_parser = self.result_parser + skip_parsing = skip_parsing or configured_parser is SKIP_PARSING + parser = ( + configured_parser + if callable(configured_parser) and configured_parser is not SKIP_PARSING + else FunctionTool.parse_result + ) parameter_names = set(self.parameters().get("properties", {}).keys()) direct_argument_kwargs = ( @@ -1136,7 +1147,7 @@ def tool( max_invocations: int | None = None, max_invocation_exceptions: int | None = None, additional_properties: dict[str, Any] | None = None, - result_parser: Callable[[Any], str | list[Content]] | None = None, + result_parser: Callable[[Any], str | list[Content]] | _SkipParsingSentinel | None = None, ) -> FunctionTool: ... @@ -1152,7 +1163,7 @@ def tool( max_invocations: int | None = None, max_invocation_exceptions: int | None = None, additional_properties: dict[str, Any] | None = None, - result_parser: Callable[[Any], str | list[Content]] | None = None, + result_parser: Callable[[Any], str | list[Content]] | _SkipParsingSentinel | None = None, ) -> Callable[[Callable[..., Any]], FunctionTool]: ... @@ -1167,7 +1178,7 @@ def tool( max_invocations: int | None = None, max_invocation_exceptions: int | None = None, additional_properties: dict[str, Any] | None = None, - result_parser: Callable[[Any], str | list[Content]] | None = None, + result_parser: Callable[[Any], str | list[Content]] | _SkipParsingSentinel | None = None, ) -> FunctionTool | Callable[[Callable[..., Any]], FunctionTool]: """Decorate a function to turn it into a FunctionTool that can be passed to models and executed automatically. diff --git a/python/packages/core/tests/core/test_tools.py b/python/packages/core/tests/core/test_tools.py index 95c32c1f4f..b3762bf4ef 100644 --- a/python/packages/core/tests/core/test_tools.py +++ b/python/packages/core/tests/core/test_tools.py @@ -1299,25 +1299,27 @@ def __len__(self) -> int: assert len(normalized) == 2 assert normalized[0] is bundled assert normalized[1] is standalone -# region SKIP_PARSING sentinel + + +# region SKIP_PARSING sentinel & skip_parsing async def test_invoke_skip_parsing_returns_native_value() -> None: - """invoke(result_parser=SKIP_PARSING) returns the wrapped function's raw value.""" + """invoke(skip_parsing=True) returns the wrapped function's raw value.""" @tool def get_weather(city: str) -> dict[str, Any]: """Get the weather.""" return {"city": city, "temperature_c": 21.5, "conditions": "partly cloudy"} - raw = await get_weather.invoke(arguments={"city": "Seattle"}, result_parser=SKIP_PARSING) + raw = await get_weather.invoke(arguments={"city": "Seattle"}, skip_parsing=True) assert isinstance(raw, dict) assert raw == {"city": "Seattle", "temperature_c": 21.5, "conditions": "partly cloudy"} async def test_invoke_skip_parsing_passes_through_custom_objects() -> None: - """SKIP_PARSING must not call str()/repr() on the result.""" + """skip_parsing must not call str()/repr() on the result.""" class Custom: # noqa: B903 def __init__(self, value: int) -> None: @@ -1328,7 +1330,7 @@ def make() -> Custom: """Make a custom object.""" return Custom(42) - raw = await make.invoke(result_parser=SKIP_PARSING) + raw = await make.invoke(skip_parsing=True) assert isinstance(raw, Custom) assert raw.value == 42 @@ -1340,12 +1342,12 @@ async def slow(x: int) -> int: """Async tool.""" return x * 2 - raw = await slow.invoke(arguments={"x": 21}, result_parser=SKIP_PARSING) + raw = await slow.invoke(arguments={"x": 21}, skip_parsing=True) assert raw == 42 async def test_invoke_skip_parsing_bypasses_configured_result_parser() -> None: - """The tool's own result_parser is bypassed when SKIP_PARSING is requested.""" + """The tool's own result_parser is bypassed when skip_parsing=True is requested.""" parser_calls: list[Any] = [] def parser(value: Any) -> str: @@ -1357,16 +1359,28 @@ def make_dict() -> dict[str, int]: """Returns a dict.""" return {"a": 1} - raw = await make_dict.invoke(result_parser=SKIP_PARSING) + raw = await make_dict.invoke(skip_parsing=True) assert raw == {"a": 1} assert parser_calls == [] - # Sanity: omitting result_parser still applies the configured parser. + # Sanity: omitting skip_parsing still applies the configured parser. parsed = await make_dict.invoke() assert parsed[0].type == "text" assert parsed[0].text == "PARSED" +async def test_constructor_skip_parsing_sentinel_returns_raw_by_default() -> None: + """Constructing a tool with result_parser=SKIP_PARSING makes invoke return the raw value.""" + + @tool(result_parser=SKIP_PARSING) + def make_dict() -> dict[str, int]: + """Returns a dict.""" + return {"a": 1} + + raw = await make_dict.invoke() + assert raw == {"a": 1} + + async def test_invoke_skip_parsing_validates_arguments() -> None: """Argument validation is shared with the default path.""" @@ -1376,7 +1390,7 @@ def adder(x: int, y: int) -> int: return x + y with pytest.raises(TypeError): - await adder.invoke(arguments={"x": "not-an-int", "y": 1}, result_parser=SKIP_PARSING) + await adder.invoke(arguments={"x": "not-an-int", "y": 1}, skip_parsing=True) async def test_invoke_skip_parsing_rejects_unexpected_runtime_kwargs() -> None: @@ -1386,7 +1400,7 @@ async def echo(message: str) -> str: return message with pytest.raises(TypeError, match="Unexpected keyword argument"): - await echo.invoke(arguments={"message": "hi"}, result_parser=SKIP_PARSING, api_token="secret") + await echo.invoke(arguments={"message": "hi"}, skip_parsing=True, api_token="secret") async def test_invoke_skip_parsing_raises_for_declaration_only_tool() -> None: @@ -1395,11 +1409,11 @@ async def test_invoke_skip_parsing_raises_for_declaration_only_tool() -> None: from agent_framework.exceptions import ToolException with pytest.raises(ToolException): - await declared.invoke(arguments={}, result_parser=SKIP_PARSING) + await declared.invoke(arguments={}, skip_parsing=True) async def test_invoke_skip_parsing_records_telemetry(span_exporter: InMemorySpanExporter) -> None: - """SKIP_PARSING participates in OTEL spans and records str(raw) as TOOL_RESULT.""" + """skip_parsing participates in OTEL spans and records str(raw) as TOOL_RESULT.""" @tool(name="raw_tool", description="raw tool") def returns_dict(x: int) -> dict[str, int]: @@ -1407,7 +1421,7 @@ def returns_dict(x: int) -> dict[str, int]: return {"value": x} span_exporter.clear() - raw = await returns_dict.invoke(arguments={"x": 5}, tool_call_id="raw_call", result_parser=SKIP_PARSING) + raw = await returns_dict.invoke(arguments={"x": 5}, tool_call_id="raw_call", skip_parsing=True) assert raw == {"value": 5} spans = span_exporter.get_finished_spans() @@ -1421,7 +1435,7 @@ def returns_dict(x: int) -> dict[str, int]: async def test_invoke_default_path_records_parsed_telemetry( span_exporter: InMemorySpanExporter, ) -> None: - """Regression: omitting SKIP_PARSING still records the parsed result in telemetry.""" + """Regression: omitting skip_parsing still records the parsed result in telemetry.""" def parser(value: Any) -> str: return f"parsed:{value}" diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py index 1b08705076..2aad40a767 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py +++ b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py @@ -9,13 +9,14 @@ import time from collections.abc import Callable, Sequence from concurrent.futures import Future, ThreadPoolExecutor +from contextlib import suppress from dataclasses import dataclass, field from pathlib import Path, PurePosixPath from tempfile import TemporaryDirectory from typing import Any, Protocol, TypeGuard, TypeVar, cast from urllib.parse import urlparse -from agent_framework import SKIP_PARSING, Content, FunctionTool +from agent_framework import Content, FunctionTool from agent_framework._tools import ApprovalMode, normalize_tools from ._instructions import build_codeact_instructions, build_execute_code_description @@ -111,7 +112,8 @@ def run(self, fn: Callable[..., _T], /, *args: Any, **kwargs: Any) -> _T: return self._executor.submit(fn, *args, **kwargs).result() def shutdown(self) -> None: - # Do not block on in-flight runs; the worker thread exits once the current task returns. + # Do not block on shutdown; stop accepting new tasks, but allow the currently running + # task and any already-queued tasks to finish before the worker thread exits. self._executor.shutdown(wait=False, cancel_futures=False) @@ -475,7 +477,7 @@ def _make_sandbox_callback(tool_obj: FunctionTool) -> Callable[..., Any]: def _callback(**kwargs: Any) -> Any: async def _invoke() -> Any: - return await sandbox_tool.invoke(arguments=kwargs, result_parser=SKIP_PARSING) + return await sandbox_tool.invoke(arguments=kwargs, skip_parsing=True) # FunctionTool.invoke() is async. The real Hyperlight backend invokes # registered callbacks synchronously via FFI, so this must be a sync function. @@ -557,12 +559,24 @@ def _get_or_create_entry(self, config: _RunConfig) -> _SandboxEntry: return entry def close(self) -> None: - """Shut down all per-entry worker threads. Safe to call multiple times.""" + """Shut down all per-entry worker threads and release per-entry resources. + + Safe to call multiple times. Runs any sandbox close hook on the entry's + own worker thread to honor the PyO3 ``unsendable`` invariant. + """ with self._entries_lock: entries = list(self._entries.values()) self._entries.clear() for entry in entries: + close_hook = getattr(entry.sandbox, "close", None) or getattr(entry.sandbox, "shutdown", None) + if callable(close_hook): + with suppress(Exception): + entry.worker.run(close_hook) entry.worker.shutdown() + for tmp_dir in (entry.input_dir, entry.output_dir): + if tmp_dir is not None: + with suppress(Exception): + tmp_dir.cleanup() def _create_entry(self, config: _RunConfig) -> _SandboxEntry: input_dir_handle = TemporaryDirectory() if config.filesystem_enabled else None diff --git a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py index d20fb3b9c7..e41a5a6ee1 100644 --- a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py +++ b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py @@ -1047,6 +1047,39 @@ def test_sandbox_registry_close_shuts_down_workers(monkeypatch: pytest.MonkeyPat worker.submit(lambda: None) +def test_sandbox_registry_close_releases_per_entry_resources(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + """close() must invoke any sandbox close hook and release temp directories.""" + + close_calls: list[int] = [] + + class _ClosableFakeSandbox(_FakeSandbox): + def close(self) -> None: + close_calls.append(1) + + _FakeSandbox.instances.clear() + monkeypatch.setattr(execute_code_module, "_load_sandbox_class", lambda: _ClosableFakeSandbox) + + workspace = tmp_path / "workspace" + workspace.mkdir() + registry = execute_code_module._SandboxRegistry() + execute_code = HyperlightExecuteCodeTool(workspace_root=workspace, _registry=registry) + asyncio.run(execute_code.invoke(arguments={"code": "None"})) + + entries = list(registry._entries.values()) + assert len(entries) == 1 + entry = entries[0] + assert entry.input_dir is not None and entry.output_dir is not None + input_path = Path(entry.input_dir.name) + output_path = Path(entry.output_dir.name) + assert input_path.exists() and output_path.exists() + + registry.close() + + assert close_calls == [1] + assert not input_path.exists() + assert not output_path.exists() + + async def test_make_sandbox_callback_returns_native_dict() -> None: """Host tool returning a dict must be forwarded as a native dict (no repr round-trip).""" From eed884f996dbce6a493a3ad4f85a4e9506ebce0b Mon Sep 17 00:00:00 2001 From: Eduard van Valkenburg Date: Wed, 22 Apr 2026 13:37:34 +0200 Subject: [PATCH 3/4] Drop redundant 'is not SKIP_PARSING' guard that mypy 1.x flags After callable(configured_parser) the sentinel is already excluded; the extra identity check tripped mypy's non-overlapping identity warning. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- python/packages/core/agent_framework/_tools.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/packages/core/agent_framework/_tools.py b/python/packages/core/agent_framework/_tools.py index 560a96f8f5..3f15472a5a 100644 --- a/python/packages/core/agent_framework/_tools.py +++ b/python/packages/core/agent_framework/_tools.py @@ -609,11 +609,7 @@ async def invoke( configured_parser = self.result_parser skip_parsing = skip_parsing or configured_parser is SKIP_PARSING - parser = ( - configured_parser - if callable(configured_parser) and configured_parser is not SKIP_PARSING - else FunctionTool.parse_result - ) + parser = configured_parser if callable(configured_parser) else FunctionTool.parse_result parameter_names = set(self.parameters().get("properties", {}).keys()) direct_argument_kwargs = ( From 503c04e147784e0a8716032256c239b8160e37a9 Mon Sep 17 00:00:00 2001 From: Eduard van Valkenburg Date: Thu, 23 Apr 2026 09:37:36 +0200 Subject: [PATCH 4/4] fixed sandbox working on copy of tool --- .../agent_framework_hyperlight/_execute_code_tool.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py index 2aad40a767..f91eeb5215 100644 --- a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py +++ b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py @@ -10,6 +10,7 @@ from collections.abc import Callable, Sequence from concurrent.futures import Future, ThreadPoolExecutor from contextlib import suppress +from copy import copy from dataclasses import dataclass, field from pathlib import Path, PurePosixPath from tempfile import TemporaryDirectory @@ -473,7 +474,7 @@ def _build_execution_contents( def _make_sandbox_callback(tool_obj: FunctionTool) -> Callable[..., Any]: - sandbox_tool = tool_obj + sandbox_tool = copy(tool_obj) def _callback(**kwargs: Any) -> Any: async def _invoke() -> Any: