From 4acb529db87cf1a6aeaee850f519181f7e034c8a Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 11 Feb 2026 17:58:23 +0000 Subject: [PATCH 1/3] feat: Add TestLLM class for testing without mocking LiteLLM This commit introduces a new TestLLM class in openhands.sdk.testing that provides a clean way to write tests without needing to mock LiteLLM internals. Key features: - TestLLM is a real LLM subclass that works anywhere an LLM is accepted - No @patch decorators needed - just pass TestLLM as the llm= argument - Tests speak in SDK types (Message, TextContent, MessageToolCall) - Clear error when scripted responses are exhausted - Zero-cost metrics by default - Always uses completion() path (uses_responses_api returns False) Also refactors test_agent_status_transition.py to demonstrate the new TestLLM usage, replacing ~20 lines of mock setup with ~3 lines. Closes #2005 --- .../openhands/sdk/testing/__init__.py | 10 + .../openhands/sdk/testing/test_llm.py | 282 +++++++++++++++ .../local/test_agent_status_transition.py | 329 ++++++------------ 3 files changed, 408 insertions(+), 213 deletions(-) create mode 100644 openhands-sdk/openhands/sdk/testing/__init__.py create mode 100644 openhands-sdk/openhands/sdk/testing/test_llm.py diff --git a/openhands-sdk/openhands/sdk/testing/__init__.py b/openhands-sdk/openhands/sdk/testing/__init__.py new file mode 100644 index 0000000000..32de415071 --- /dev/null +++ b/openhands-sdk/openhands/sdk/testing/__init__.py @@ -0,0 +1,10 @@ +"""Testing utilities for OpenHands SDK. + +This module provides test utilities that make it easy to write tests for +code that uses the OpenHands SDK, without needing to mock LiteLLM internals. +""" + +from openhands.sdk.testing.test_llm import TestLLM, TestLLMExhaustedError + + +__all__ = ["TestLLM", "TestLLMExhaustedError"] diff --git a/openhands-sdk/openhands/sdk/testing/test_llm.py b/openhands-sdk/openhands/sdk/testing/test_llm.py new file mode 100644 index 0000000000..b07102e3ff --- /dev/null +++ b/openhands-sdk/openhands/sdk/testing/test_llm.py @@ -0,0 +1,282 @@ +"""TestLLM - A mock LLM for testing. + +TestLLM is a real LLM subclass that returns scripted responses, eliminating +the need for @patch decorators and understanding of LiteLLM internals. + +Example: + >>> from openhands.sdk.testing import TestLLM + >>> from openhands.sdk.llm import Message, TextContent + >>> + >>> # Create a TestLLM with scripted responses + >>> llm = TestLLM.from_messages([ + ... Message(role="assistant", content=[TextContent(text="Hello!")]), + ... Message(role="assistant", content=[TextContent(text="Goodbye!")]), + ... ]) + >>> + >>> # Use it like a normal LLM + >>> user_msg = Message(role="user", content=[TextContent(text="Hi")]) + >>> response = llm.completion([user_msg]) + >>> print(response.message.content[0].text) # "Hello!" 
+""" + +from __future__ import annotations + +from collections.abc import Sequence +from typing import TYPE_CHECKING, Any, ClassVar + +from litellm.types.utils import Choices, Message as LiteLLMMessage, ModelResponse +from pydantic import ConfigDict, Field, PrivateAttr + +from openhands.sdk.llm.llm import LLM +from openhands.sdk.llm.llm_response import LLMResponse +from openhands.sdk.llm.message import Message +from openhands.sdk.llm.streaming import TokenCallbackType +from openhands.sdk.llm.utils.metrics import MetricsSnapshot, TokenUsage + + +if TYPE_CHECKING: + from openhands.sdk.tool.tool import ToolDefinition + + +__all__ = ["TestLLM", "TestLLMExhaustedError"] + + +class TestLLMExhaustedError(Exception): + """Raised when TestLLM has no more scripted responses.""" + + pass + + +class TestLLM(LLM): + """A mock LLM for testing that returns scripted responses. + + TestLLM is a real LLM subclass that can be used anywhere an LLM is accepted: + in Agent(llm=...), in fallback_llms, in condensers, in routers, etc. + + Key features: + - No patching needed: just pass TestLLM as the llm= argument + - Tests speak in SDK types (Message, TextContent, MessageToolCall) + - Clear error when responses are exhausted + - Zero-cost metrics by default + - Always uses completion() path (uses_responses_api returns False) + + Example: + >>> from openhands.sdk.testing import TestLLM + >>> from openhands.sdk.llm import Message, TextContent, MessageToolCall + >>> + >>> # Simple text response + >>> llm = TestLLM.from_messages([ + ... Message(role="assistant", content=[TextContent(text="Done!")]), + ... ]) + >>> + >>> # Response with tool calls + >>> llm = TestLLM.from_messages([ + ... Message( + ... role="assistant", + ... content=[TextContent(text="")], + ... tool_calls=[ + ... MessageToolCall( + ... id="call_1", + ... name="my_tool", + ... arguments='{"arg": "value"}', + ... origin="completion", + ... ) + ... ], + ... ), + ... Message(role="assistant", content=[TextContent(text="Done!")]), + ... ]) + """ + + # Prevent pytest from collecting this class as a test + __test__ = False + + model: str = Field(default="test-model") + _scripted_responses: list[Message] = PrivateAttr(default_factory=list) + _call_count: int = PrivateAttr(default=0) + + model_config: ClassVar[ConfigDict] = ConfigDict( + extra="ignore", arbitrary_types_allowed=True + ) + + def __init__(self, **data: Any) -> None: + # Extract scripted_responses before calling super().__init__ + scripted_responses = data.pop("scripted_responses", []) + super().__init__(**data) + self._scripted_responses = list(scripted_responses) + self._call_count = 0 + + @classmethod + def from_messages( + cls, + messages: list[Message], + *, + model: str = "test-model", + usage_id: str = "test-llm", + **kwargs: Any, + ) -> TestLLM: + """Create a TestLLM with scripted responses. + + Args: + messages: List of Message objects to return in order. + Each call to completion() or responses() will return + the next message from this list. + model: Model name (default: "test-model") + usage_id: Usage ID for metrics (default: "test-llm") + **kwargs: Additional LLM configuration options + + Returns: + A TestLLM instance configured with the scripted responses. + + Example: + >>> llm = TestLLM.from_messages([ + ... Message(role="assistant", content=[TextContent(text="First")]), + ... Message(role="assistant", content=[TextContent(text="Second")]), + ... 
]) + """ + return cls( + model=model, + usage_id=usage_id, + scripted_responses=messages, + **kwargs, + ) + + def completion( + self, + messages: list[Message], # noqa: ARG002 + tools: Sequence[ToolDefinition] | None = None, # noqa: ARG002 + _return_metrics: bool = False, + add_security_risk_prediction: bool = False, # noqa: ARG002 + on_token: TokenCallbackType | None = None, # noqa: ARG002 + **kwargs: Any, # noqa: ARG002 + ) -> LLMResponse: + """Return the next scripted response. + + Args: + messages: Input messages (ignored, but required for API compatibility) + tools: Available tools (ignored) + _return_metrics: Whether to return metrics (ignored) + add_security_risk_prediction: Add security risk field (ignored) + on_token: Streaming callback (ignored) + **kwargs: Additional arguments (ignored) + + Returns: + LLMResponse containing the next scripted message. + + Raises: + TestLLMExhaustedError: When no more scripted responses are available. + """ + if not self._scripted_responses: + raise TestLLMExhaustedError( + f"TestLLM: no more scripted responses " + f"(exhausted after {self._call_count} calls)" + ) + + message = self._scripted_responses.pop(0) + self._call_count += 1 + + # Create a minimal ModelResponse for raw_response + raw_response = self._create_model_response(message) + + return LLMResponse( + message=message, + metrics=self._zero_metrics(), + raw_response=raw_response, + ) + + def responses( + self, + messages: list[Message], + tools: Sequence[ToolDefinition] | None = None, + include: list[str] | None = None, # noqa: ARG002 + store: bool | None = None, # noqa: ARG002 + _return_metrics: bool = False, + add_security_risk_prediction: bool = False, + on_token: TokenCallbackType | None = None, + **kwargs: Any, + ) -> LLMResponse: + """Return the next scripted response (delegates to completion). + + For TestLLM, both completion() and responses() return from the same + queue of scripted responses. + """ + return self.completion( + messages=messages, + tools=tools, + _return_metrics=_return_metrics, + add_security_risk_prediction=add_security_risk_prediction, + on_token=on_token, + **kwargs, + ) + + def uses_responses_api(self) -> bool: + """TestLLM always uses the completion path.""" + return False + + def _zero_metrics(self) -> MetricsSnapshot: + """Return a zero-cost metrics snapshot.""" + return MetricsSnapshot( + model_name=self.model, + accumulated_cost=0.0, + max_budget_per_task=None, + accumulated_token_usage=TokenUsage( + model=self.model, + prompt_tokens=0, + completion_tokens=0, + ), + ) + + def _create_model_response(self, message: Message) -> ModelResponse: + """Create a minimal ModelResponse from a Message. + + This creates a valid ModelResponse that can be used as raw_response + in LLMResponse. 
+ """ + # Build the LiteLLM message dict + litellm_message_dict: dict[str, Any] = { + "role": message.role, + "content": self._content_to_string(message), + } + + # Add tool_calls if present + if message.tool_calls: + litellm_message_dict["tool_calls"] = [ + { + "id": tc.id, + "type": "function", + "function": { + "name": tc.name, + "arguments": tc.arguments, + }, + } + for tc in message.tool_calls + ] + + litellm_message = LiteLLMMessage(**litellm_message_dict) + + return ModelResponse( + id=f"test-response-{self._call_count}", + choices=[Choices(message=litellm_message, index=0, finish_reason="stop")], + created=0, + model=self.model, + object="chat.completion", + ) + + def _content_to_string(self, message: Message) -> str: + """Convert message content to a string.""" + from openhands.sdk.llm.message import TextContent + + parts = [] + for item in message.content: + if isinstance(item, TextContent): + parts.append(item.text) + return "\n".join(parts) + + @property + def remaining_responses(self) -> int: + """Return the number of remaining scripted responses.""" + return len(self._scripted_responses) + + @property + def call_count(self) -> int: + """Return the number of calls made to this TestLLM.""" + return self._call_count diff --git a/tests/sdk/conversation/local/test_agent_status_transition.py b/tests/sdk/conversation/local/test_agent_status_transition.py index 37d7af34b8..4c6055c777 100644 --- a/tests/sdk/conversation/local/test_agent_status_transition.py +++ b/tests/sdk/conversation/local/test_agent_status_transition.py @@ -18,22 +18,13 @@ import threading from collections.abc import Sequence from typing import ClassVar -from unittest.mock import patch - -from litellm import ChatCompletionMessageToolCall -from litellm.types.utils import ( - Choices, - Function, - Message as LiteLLMMessage, - ModelResponse, -) -from pydantic import SecretStr from openhands.sdk.agent import Agent from openhands.sdk.conversation import Conversation from openhands.sdk.conversation.state import ConversationExecutionStatus from openhands.sdk.event import MessageEvent -from openhands.sdk.llm import LLM, ImageContent, Message, TextContent +from openhands.sdk.llm import ImageContent, Message, MessageToolCall, TextContent +from openhands.sdk.testing import TestLLM from openhands.sdk.tool import ( Action, Observation, @@ -100,8 +91,7 @@ def create( ] -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_execution_status_transitions_to_running_from_idle(mock_completion): +def test_execution_status_transitions_to_running_from_idle(): """Test that agent status transitions to RUNNING when run() is called from IDLE.""" status_during_execution: list[ConversationExecutionStatus] = [] @@ -112,24 +102,18 @@ def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: register_tool("test_tool", _make_tool) - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + # Use TestLLM with a scripted response + llm = TestLLM.from_messages( + [ + Message(role="assistant", content=[TextContent(text="Task completed")]), + ] + ) agent = Agent(llm=llm, tools=[]) conversation = Conversation(agent=agent) # Verify initial state is IDLE assert conversation.state.execution_status == ConversationExecutionStatus.IDLE - # Mock LLM to return a message that finishes execution - mock_completion.return_value = ModelResponse( - id="response_msg", - choices=[ - Choices(message=LiteLLMMessage(role="assistant", content="Task completed")) - ], - created=0, - model="test-model", - 
object="chat.completion", - ) - # Send message and run conversation.send_message(Message(role="user", content=[TextContent(text="Hello")])) conversation.run() @@ -146,20 +130,49 @@ def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: assert len(agent_messages) == 1 -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_execution_status_is_running_during_execution_from_idle(mock_completion): +def test_execution_status_is_running_during_execution_from_idle(): """Test that agent status is RUNNING during execution when started from IDLE.""" status_during_execution: list[ConversationExecutionStatus] = [] execution_started = threading.Event() + class SignalingExecutor( + ToolExecutor[StatusTransitionMockAction, StatusTransitionMockObservation] + ): + """Executor that signals when execution starts and captures status.""" + + def __call__( + self, action: StatusTransitionMockAction, conversation=None + ) -> StatusTransitionMockObservation: + # Signal that execution has started + execution_started.set() + # Capture the agent status during execution + if conversation: + status_during_execution.append(conversation.state.execution_status) + return StatusTransitionMockObservation(result=f"Executed: {action.command}") + def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: - return StatusTransitionTestTool.create( - executor=StatusCheckingExecutor(status_during_execution) - ) + return StatusTransitionTestTool.create(executor=SignalingExecutor()) register_tool("test_tool", _make_tool) - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + # Use TestLLM with scripted responses: first a tool call, then completion + llm = TestLLM.from_messages( + [ + Message( + role="assistant", + content=[TextContent(text="")], + tool_calls=[ + MessageToolCall( + id="call_1", + name="test_tool", + arguments='{"command": "test_command"}', + origin="completion", + ) + ], + ), + Message(role="assistant", content=[TextContent(text="Task completed")]), + ] + ) agent = Agent( llm=llm, tools=[Tool(name="test_tool")], @@ -169,63 +182,12 @@ def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: # Verify initial state is IDLE assert conversation.state.execution_status == ConversationExecutionStatus.IDLE - # Mock LLM to return an action first, then finish - tool_call = ChatCompletionMessageToolCall( - id="call_1", - type="function", - function=Function( - name="test_tool", - arguments='{"command": "test_command"}', - ), - ) - - call_count = [0] - - def side_effect(*args, **kwargs): - call_count[0] += 1 - if call_count[0] == 1: - # First call: return tool call - execution_started.set() - return ModelResponse( - id="response_action", - choices=[ - Choices( - message=LiteLLMMessage( - role="assistant", - content="", - tool_calls=[tool_call], - ) - ) - ], - created=0, - model="test-model", - object="chat.completion", - ) - else: - # Second call: finish - return ModelResponse( - id="response_msg", - choices=[ - Choices( - message=LiteLLMMessage( - role="assistant", content="Task completed" - ) - ) - ], - created=0, - model="test-model", - object="chat.completion", - ) - - mock_completion.side_effect = side_effect - # Send message conversation.send_message( Message(role="user", content=[TextContent(text="Execute command")]) ) # Run in a separate thread so we can check status during execution - status_checked = threading.Event() run_complete = threading.Event() status_during_run: list[ConversationExecutionStatus | None] = [None] @@ -241,7 +203,6 @@ def 
run_agent(): # Check status while running status_during_run[0] = conversation.state.execution_status - status_checked.set() # Wait for run to complete assert run_complete.wait(timeout=2.0), "Run did not complete" @@ -256,11 +217,15 @@ def run_agent(): assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_execution_status_transitions_to_running_from_paused(mock_completion): +def test_execution_status_transitions_to_running_from_paused(): """Test that agent status transitions to RUNNING when run() is called from PAUSED.""" - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + # Use TestLLM with a scripted response + llm = TestLLM.from_messages( + [ + Message(role="assistant", content=[TextContent(text="Task completed")]), + ] + ) agent = Agent(llm=llm, tools=[]) conversation = Conversation(agent=agent) @@ -268,17 +233,6 @@ def test_execution_status_transitions_to_running_from_paused(mock_completion): conversation.pause() assert conversation.state.execution_status == ConversationExecutionStatus.PAUSED - # Mock LLM to return a message that finishes execution - mock_completion.return_value = ModelResponse( - id="response_msg", - choices=[ - Choices(message=LiteLLMMessage(role="assistant", content="Task completed")) - ], - created=0, - model="test-model", - object="chat.completion", - ) - # Send message and run conversation.send_message(Message(role="user", content=[TextContent(text="Hello")])) conversation.run() @@ -295,70 +249,37 @@ def test_execution_status_transitions_to_running_from_paused(mock_completion): assert len(agent_messages) == 1 -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_execution_status_transitions_from_waiting_for_confirmation(mock_completion): +def test_execution_status_transitions_from_waiting_for_confirmation(): """Test WAITING_FOR_CONFIRMATION -> RUNNING transition when run() is called.""" from openhands.sdk.security.confirmation_policy import AlwaysConfirm - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") - def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: return StatusTransitionTestTool.create(executor=StatusCheckingExecutor([])) register_tool("test_tool", _make_tool) - agent = Agent(llm=llm, tools=[Tool(name="test_tool")]) - conversation = Conversation(agent=agent) - conversation.set_confirmation_policy(AlwaysConfirm()) - - # Mock LLM to return an action first, then finish - tool_call = ChatCompletionMessageToolCall( - id="call_1", - type="function", - function=Function( - name="test_tool", - arguments='{"command": "test_command"}', - ), - ) - - call_count = [0] - - def side_effect(*args, **kwargs): - call_count[0] += 1 - if call_count[0] == 1: - # First call: return tool call - return ModelResponse( - id="response_action", - choices=[ - Choices( - message=LiteLLMMessage( - role="assistant", - content="", - tool_calls=[tool_call], - ) + # Use TestLLM with scripted responses: first a tool call, then completion + llm = TestLLM.from_messages( + [ + Message( + role="assistant", + content=[TextContent(text="")], + tool_calls=[ + MessageToolCall( + id="call_1", + name="test_tool", + arguments='{"command": "test_command"}', + origin="completion", ) ], - created=0, - model="test-model", - object="chat.completion", - ) - else: - # Second call: finish - return ModelResponse( - id="response_msg", - choices=[ - Choices( - message=LiteLLMMessage( - role="assistant", content="Task 
completed" - ) - ) - ], - created=0, - model="test-model", - object="chat.completion", - ) + ), + Message(role="assistant", content=[TextContent(text="Task completed")]), + ] + ) - mock_completion.side_effect = side_effect + agent = Agent(llm=llm, tools=[Tool(name="test_tool")]) + conversation = Conversation(agent=agent) + conversation.set_confirmation_policy(AlwaysConfirm()) # Send message and run - should stop at WAITING_FOR_CONFIRMATION conversation.send_message( @@ -379,24 +300,18 @@ def side_effect(*args, **kwargs): assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_execution_status_finished_to_idle_to_running(mock_completion): +def test_execution_status_finished_to_idle_to_running(): """Test FINISHED -> IDLE -> RUNNING transition when new message is sent.""" - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + # Use TestLLM with two scripted responses (one for each run) + llm = TestLLM.from_messages( + [ + Message(role="assistant", content=[TextContent(text="Task completed")]), + Message(role="assistant", content=[TextContent(text="Task completed")]), + ] + ) agent = Agent(llm=llm, tools=[]) conversation = Conversation(agent=agent) - # Mock LLM to return completion messages - mock_completion.return_value = ModelResponse( - id="response_msg", - choices=[ - Choices(message=LiteLLMMessage(role="assistant", content="Task completed")) - ], - created=0, - model="test-model", - object="chat.completion", - ) - # First conversation - should end in FINISHED conversation.send_message( Message(role="user", content=[TextContent(text="First task")]) @@ -415,24 +330,17 @@ def test_execution_status_finished_to_idle_to_running(mock_completion): assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_run_exits_immediately_when_already_finished(mock_completion): +def test_run_exits_immediately_when_already_finished(): """Test that run() exits immediately when status is already FINISHED.""" - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + # Use TestLLM with a single scripted response + llm = TestLLM.from_messages( + [ + Message(role="assistant", content=[TextContent(text="Task completed")]), + ] + ) agent = Agent(llm=llm, tools=[]) conversation = Conversation(agent=agent) - # Mock LLM - mock_completion.return_value = ModelResponse( - id="response_msg", - choices=[ - Choices(message=LiteLLMMessage(role="assistant", content="Task completed")) - ], - created=0, - model="test-model", - object="chat.completion", - ) - # Complete a task conversation.send_message(Message(role="user", content=[TextContent(text="Task")])) conversation.run() @@ -440,19 +348,19 @@ def test_run_exits_immediately_when_already_finished(mock_completion): # Call run again without sending a new message # Should exit immediately without calling LLM again - initial_call_count = mock_completion.call_count + initial_call_count = llm.call_count conversation.run() # Status should still be FINISHED assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED # LLM should not be called again - assert mock_completion.call_count == initial_call_count + assert llm.call_count == initial_call_count -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_run_exits_immediately_when_stuck(mock_completion): +def test_run_exits_immediately_when_stuck(): """Test that run() 
exits immediately when status is STUCK.""" - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + # Use TestLLM with no scripted responses (should not be called) + llm = TestLLM.from_messages([]) agent = Agent(llm=llm, tools=[]) conversation = Conversation(agent=agent) @@ -465,11 +373,10 @@ def test_run_exits_immediately_when_stuck(mock_completion): # Status should still be STUCK assert conversation.state.execution_status == ConversationExecutionStatus.STUCK # LLM should not be called - assert mock_completion.call_count == 0 + assert llm.call_count == 0 -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_execution_status_error_on_max_iterations(mock_completion): +def test_execution_status_error_on_max_iterations(): """Test that status is set to ERROR with clear message when max iterations hit.""" from openhands.sdk.event.conversation_error import ConversationErrorEvent @@ -483,7 +390,29 @@ def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: register_tool("test_tool", _make_tool) - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + # Create a tool call message that will be returned repeatedly + tool_call_message = Message( + role="assistant", + content=[TextContent(text="")], + tool_calls=[ + MessageToolCall( + id="call_1", + name="test_tool", + arguments='{"command": "test_command"}', + origin="completion", + ) + ], + ) + + # Use TestLLM with enough responses to hit max iterations + # max_iteration_per_run=2 means we need at least 2 tool call responses + llm = TestLLM.from_messages( + [ + tool_call_message, + tool_call_message, + tool_call_message, # Extra in case needed + ] + ) agent = Agent(llm=llm, tools=[Tool(name="test_tool")]) # Set max_iteration_per_run to 2 to quickly hit the limit conversation = Conversation( @@ -492,32 +421,6 @@ def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: callbacks=[lambda e: events_received.append(e)], ) - # Mock LLM to always return tool calls (never finish) - tool_call = ChatCompletionMessageToolCall( - id="call_1", - type="function", - function=Function( - name="test_tool", - arguments='{"command": "test_command"}', - ), - ) - - mock_completion.return_value = ModelResponse( - id="response_action", - choices=[ - Choices( - message=LiteLLMMessage( - role="assistant", - content="", - tool_calls=[tool_call], - ) - ) - ], - created=0, - model="test-model", - object="chat.completion", - ) - # Send message and run conversation.send_message( Message(role="user", content=[TextContent(text="Execute command")]) From 3f16332cf90cb3ff875bfc9925322c2dcaae2dfd Mon Sep 17 00:00:00 2001 From: VascoSch92 Date: Wed, 11 Feb 2026 23:42:03 +0000 Subject: [PATCH 2/3] update class TestLLM to use deque instead of a stack --- openhands-sdk/openhands/sdk/testing/test_llm.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/openhands-sdk/openhands/sdk/testing/test_llm.py b/openhands-sdk/openhands/sdk/testing/test_llm.py index b07102e3ff..53e80da133 100644 --- a/openhands-sdk/openhands/sdk/testing/test_llm.py +++ b/openhands-sdk/openhands/sdk/testing/test_llm.py @@ -37,6 +37,8 @@ if TYPE_CHECKING: from openhands.sdk.tool.tool import ToolDefinition +from collections import deque + __all__ = ["TestLLM", "TestLLMExhaustedError"] @@ -91,7 +93,7 @@ class TestLLM(LLM): __test__ = False model: str = Field(default="test-model") - _scripted_responses: list[Message] = PrivateAttr(default_factory=list) + _scripted_responses: deque[Message] = 
PrivateAttr(default_factory=deque)
     _call_count: int = PrivateAttr(default=0)
 
     model_config: ClassVar[ConfigDict] = ConfigDict(
@@ -102,7 +104,7 @@ def __init__(self, **data: Any) -> None:
         # Extract scripted_responses before calling super().__init__
         scripted_responses = data.pop("scripted_responses", [])
         super().__init__(**data)
-        self._scripted_responses = list(scripted_responses)
+        self._scripted_responses = deque(list(scripted_responses))
         self._call_count = 0
 
     @classmethod
@@ -171,7 +173,7 @@ def completion(
                 f"(exhausted after {self._call_count} calls)"
             )
 
-        message = self._scripted_responses.pop(0)
+        message = self._scripted_responses.popleft()
         self._call_count += 1
 
         # Create a minimal ModelResponse for raw_response

From 6224963c6993a1003feeb17813b55c6d722e7416 Mon Sep 17 00:00:00 2001
From: VascoSch92
Date: Fri, 13 Feb 2026 11:06:13 +0100
Subject: [PATCH 3/3] feat: allow TestLLM to raise scripted errors

---
 .../openhands/sdk/testing/test_llm.py         | 33 ++++++++++++++-----
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/testing/test_llm.py b/openhands-sdk/openhands/sdk/testing/test_llm.py
index 53e80da133..af56d820f1 100644
--- a/openhands-sdk/openhands/sdk/testing/test_llm.py
+++ b/openhands-sdk/openhands/sdk/testing/test_llm.py
@@ -17,6 +17,15 @@
     >>> user_msg = Message(role="user", content=[TextContent(text="Hi")])
     >>> response = llm.completion([user_msg])
     >>> print(response.message.content[0].text)  # "Hello!"
+
+    >>> # Scripted errors (like unittest.mock side_effect)
+    >>> from openhands.sdk.llm.exceptions import LLMContextWindowExceedError
+    >>> llm = TestLLM.from_messages([
+    ...     Message(role="assistant", content=[TextContent(text="OK")]),
+    ...     LLMContextWindowExceedError(),
+    ... ])
+    >>> llm.completion([...])  # returns "OK"
+    >>> llm.completion([...])  # raises LLMContextWindowExceedError
 """
 
 from __future__ import annotations
@@ -93,7 +102,7 @@ class TestLLM(LLM):
     __test__ = False
 
     model: str = Field(default="test-model")
-    _scripted_responses: deque[Message] = PrivateAttr(default_factory=deque)
+    _scripted_responses: deque[Message | Exception] = PrivateAttr(default_factory=deque)
     _call_count: int = PrivateAttr(default=0)
 
     model_config: ClassVar[ConfigDict] = ConfigDict(
@@ -110,18 +119,19 @@ def __init__(self, **data: Any) -> None:
     @classmethod
     def from_messages(
         cls,
-        messages: list[Message],
+        messages: list[Message | Exception],
         *,
         model: str = "test-model",
         usage_id: str = "test-llm",
         **kwargs: Any,
     ) -> TestLLM:
-        """Create a TestLLM with scripted responses.
+        """Create a TestLLM with scripted responses and/or errors.
 
         Args:
-            messages: List of Message objects to return in order.
-                Each call to completion() or responses() will return
-                the next message from this list.
+            messages: List of Message or Exception objects to return in order.
+                Each call to completion() or responses() consumes the next
+                item: Message objects are returned normally, Exception objects
+                are raised (like unittest.mock side_effect).
             model: Model name (default: "test-model")
            usage_id: Usage ID for metrics (default: "test-llm")
             **kwargs: Additional LLM configuration options
@@ -132,7 +142,7 @@ def from_messages(
         Example:
             >>> llm = TestLLM.from_messages([
             ...     Message(role="assistant", content=[TextContent(text="First")]),
-            ...     Message(role="assistant", content=[TextContent(text="Second")]),
+            ...     LLMContextWindowExceedError("context too long"),
             ... ])
         """
         return cls(
@@ -166,6 +176,7 @@ def completion(
 
         Raises:
             TestLLMExhaustedError: When no more scripted responses are available.
+ Exception: Any scripted exception placed in the response queue. """ if not self._scripted_responses: raise TestLLMExhaustedError( @@ -173,9 +184,15 @@ def completion( f"(exhausted after {self._call_count} calls)" ) - message = self._scripted_responses.popleft() + item = self._scripted_responses.popleft() self._call_count += 1 + # Raise scripted exceptions (like unittest.mock side_effect) + if isinstance(item, Exception): + raise item + + message = item + # Create a minimal ModelResponse for raw_response raw_response = self._create_model_response(message)
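A minimal sketch (not part of the patch series) of how a test could exercise the scripted-error path added in PATCH 3/3. It uses only names introduced by these patches (TestLLM.from_messages, TestLLMExhaustedError, Message, TextContent); the test name and the RuntimeError used as the scripted exception are illustrative assumptions.

```python
import pytest

from openhands.sdk.llm import Message, TextContent
from openhands.sdk.testing import TestLLM, TestLLMExhaustedError


def test_scripted_message_then_error_then_exhaustion():
    # Queue one normal reply followed by an exception to be raised.
    llm = TestLLM.from_messages(
        [
            Message(role="assistant", content=[TextContent(text="OK")]),
            RuntimeError("boom"),  # any Exception instance is raised when reached
        ]
    )
    user = Message(role="user", content=[TextContent(text="Hi")])

    # First call consumes the Message and returns it as an LLMResponse.
    assert llm.completion([user]).message.content[0].text == "OK"

    # Second call consumes the Exception and raises it (side_effect-style).
    with pytest.raises(RuntimeError, match="boom"):
        llm.completion([user])

    # Once the queue is empty, TestLLM fails loudly instead of improvising.
    with pytest.raises(TestLLMExhaustedError):
        llm.completion([user])

    # The exhausted attempt does not count as a scripted call.
    assert llm.call_count == 2
```

Because exhaustion raises an explicit TestLLMExhaustedError, a test that under-scripts its LLM fails with a clear message rather than a confusing mock artifact.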