From 4acb529db87cf1a6aeaee850f519181f7e034c8a Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 11 Feb 2026 17:58:23 +0000 Subject: [PATCH 1/3] feat: Add TestLLM class for testing without mocking LiteLLM This commit introduces a new TestLLM class in openhands.sdk.testing that provides a clean way to write tests without needing to mock LiteLLM internals. Key features: - TestLLM is a real LLM subclass that works anywhere an LLM is accepted - No @patch decorators needed - just pass TestLLM as the llm= argument - Tests speak in SDK types (Message, TextContent, MessageToolCall) - Clear error when scripted responses are exhausted - Zero-cost metrics by default - Always uses completion() path (uses_responses_api returns False) Also refactors test_agent_status_transition.py to demonstrate the new TestLLM usage, replacing ~20 lines of mock setup with ~3 lines. Closes #2005 --- .../openhands/sdk/testing/__init__.py | 10 + .../openhands/sdk/testing/test_llm.py | 282 +++++++++++++++ .../local/test_agent_status_transition.py | 329 ++++++------------ 3 files changed, 408 insertions(+), 213 deletions(-) create mode 100644 openhands-sdk/openhands/sdk/testing/__init__.py create mode 100644 openhands-sdk/openhands/sdk/testing/test_llm.py diff --git a/openhands-sdk/openhands/sdk/testing/__init__.py b/openhands-sdk/openhands/sdk/testing/__init__.py new file mode 100644 index 0000000000..32de415071 --- /dev/null +++ b/openhands-sdk/openhands/sdk/testing/__init__.py @@ -0,0 +1,10 @@ +"""Testing utilities for OpenHands SDK. + +This module provides test utilities that make it easy to write tests for +code that uses the OpenHands SDK, without needing to mock LiteLLM internals. +""" + +from openhands.sdk.testing.test_llm import TestLLM, TestLLMExhaustedError + + +__all__ = ["TestLLM", "TestLLMExhaustedError"] diff --git a/openhands-sdk/openhands/sdk/testing/test_llm.py b/openhands-sdk/openhands/sdk/testing/test_llm.py new file mode 100644 index 0000000000..b07102e3ff --- /dev/null +++ b/openhands-sdk/openhands/sdk/testing/test_llm.py @@ -0,0 +1,282 @@ +"""TestLLM - A mock LLM for testing. + +TestLLM is a real LLM subclass that returns scripted responses, eliminating +the need for @patch decorators and understanding of LiteLLM internals. + +Example: + >>> from openhands.sdk.testing import TestLLM + >>> from openhands.sdk.llm import Message, TextContent + >>> + >>> # Create a TestLLM with scripted responses + >>> llm = TestLLM.from_messages([ + ... Message(role="assistant", content=[TextContent(text="Hello!")]), + ... Message(role="assistant", content=[TextContent(text="Goodbye!")]), + ... ]) + >>> + >>> # Use it like a normal LLM + >>> user_msg = Message(role="user", content=[TextContent(text="Hi")]) + >>> response = llm.completion([user_msg]) + >>> print(response.message.content[0].text) # "Hello!" 
+""" + +from __future__ import annotations + +from collections.abc import Sequence +from typing import TYPE_CHECKING, Any, ClassVar + +from litellm.types.utils import Choices, Message as LiteLLMMessage, ModelResponse +from pydantic import ConfigDict, Field, PrivateAttr + +from openhands.sdk.llm.llm import LLM +from openhands.sdk.llm.llm_response import LLMResponse +from openhands.sdk.llm.message import Message +from openhands.sdk.llm.streaming import TokenCallbackType +from openhands.sdk.llm.utils.metrics import MetricsSnapshot, TokenUsage + + +if TYPE_CHECKING: + from openhands.sdk.tool.tool import ToolDefinition + + +__all__ = ["TestLLM", "TestLLMExhaustedError"] + + +class TestLLMExhaustedError(Exception): + """Raised when TestLLM has no more scripted responses.""" + + pass + + +class TestLLM(LLM): + """A mock LLM for testing that returns scripted responses. + + TestLLM is a real LLM subclass that can be used anywhere an LLM is accepted: + in Agent(llm=...), in fallback_llms, in condensers, in routers, etc. + + Key features: + - No patching needed: just pass TestLLM as the llm= argument + - Tests speak in SDK types (Message, TextContent, MessageToolCall) + - Clear error when responses are exhausted + - Zero-cost metrics by default + - Always uses completion() path (uses_responses_api returns False) + + Example: + >>> from openhands.sdk.testing import TestLLM + >>> from openhands.sdk.llm import Message, TextContent, MessageToolCall + >>> + >>> # Simple text response + >>> llm = TestLLM.from_messages([ + ... Message(role="assistant", content=[TextContent(text="Done!")]), + ... ]) + >>> + >>> # Response with tool calls + >>> llm = TestLLM.from_messages([ + ... Message( + ... role="assistant", + ... content=[TextContent(text="")], + ... tool_calls=[ + ... MessageToolCall( + ... id="call_1", + ... name="my_tool", + ... arguments='{"arg": "value"}', + ... origin="completion", + ... ) + ... ], + ... ), + ... Message(role="assistant", content=[TextContent(text="Done!")]), + ... ]) + """ + + # Prevent pytest from collecting this class as a test + __test__ = False + + model: str = Field(default="test-model") + _scripted_responses: list[Message] = PrivateAttr(default_factory=list) + _call_count: int = PrivateAttr(default=0) + + model_config: ClassVar[ConfigDict] = ConfigDict( + extra="ignore", arbitrary_types_allowed=True + ) + + def __init__(self, **data: Any) -> None: + # Extract scripted_responses before calling super().__init__ + scripted_responses = data.pop("scripted_responses", []) + super().__init__(**data) + self._scripted_responses = list(scripted_responses) + self._call_count = 0 + + @classmethod + def from_messages( + cls, + messages: list[Message], + *, + model: str = "test-model", + usage_id: str = "test-llm", + **kwargs: Any, + ) -> TestLLM: + """Create a TestLLM with scripted responses. + + Args: + messages: List of Message objects to return in order. + Each call to completion() or responses() will return + the next message from this list. + model: Model name (default: "test-model") + usage_id: Usage ID for metrics (default: "test-llm") + **kwargs: Additional LLM configuration options + + Returns: + A TestLLM instance configured with the scripted responses. + + Example: + >>> llm = TestLLM.from_messages([ + ... Message(role="assistant", content=[TextContent(text="First")]), + ... Message(role="assistant", content=[TextContent(text="Second")]), + ... 
]) + """ + return cls( + model=model, + usage_id=usage_id, + scripted_responses=messages, + **kwargs, + ) + + def completion( + self, + messages: list[Message], # noqa: ARG002 + tools: Sequence[ToolDefinition] | None = None, # noqa: ARG002 + _return_metrics: bool = False, + add_security_risk_prediction: bool = False, # noqa: ARG002 + on_token: TokenCallbackType | None = None, # noqa: ARG002 + **kwargs: Any, # noqa: ARG002 + ) -> LLMResponse: + """Return the next scripted response. + + Args: + messages: Input messages (ignored, but required for API compatibility) + tools: Available tools (ignored) + _return_metrics: Whether to return metrics (ignored) + add_security_risk_prediction: Add security risk field (ignored) + on_token: Streaming callback (ignored) + **kwargs: Additional arguments (ignored) + + Returns: + LLMResponse containing the next scripted message. + + Raises: + TestLLMExhaustedError: When no more scripted responses are available. + """ + if not self._scripted_responses: + raise TestLLMExhaustedError( + f"TestLLM: no more scripted responses " + f"(exhausted after {self._call_count} calls)" + ) + + message = self._scripted_responses.pop(0) + self._call_count += 1 + + # Create a minimal ModelResponse for raw_response + raw_response = self._create_model_response(message) + + return LLMResponse( + message=message, + metrics=self._zero_metrics(), + raw_response=raw_response, + ) + + def responses( + self, + messages: list[Message], + tools: Sequence[ToolDefinition] | None = None, + include: list[str] | None = None, # noqa: ARG002 + store: bool | None = None, # noqa: ARG002 + _return_metrics: bool = False, + add_security_risk_prediction: bool = False, + on_token: TokenCallbackType | None = None, + **kwargs: Any, + ) -> LLMResponse: + """Return the next scripted response (delegates to completion). + + For TestLLM, both completion() and responses() return from the same + queue of scripted responses. + """ + return self.completion( + messages=messages, + tools=tools, + _return_metrics=_return_metrics, + add_security_risk_prediction=add_security_risk_prediction, + on_token=on_token, + **kwargs, + ) + + def uses_responses_api(self) -> bool: + """TestLLM always uses the completion path.""" + return False + + def _zero_metrics(self) -> MetricsSnapshot: + """Return a zero-cost metrics snapshot.""" + return MetricsSnapshot( + model_name=self.model, + accumulated_cost=0.0, + max_budget_per_task=None, + accumulated_token_usage=TokenUsage( + model=self.model, + prompt_tokens=0, + completion_tokens=0, + ), + ) + + def _create_model_response(self, message: Message) -> ModelResponse: + """Create a minimal ModelResponse from a Message. + + This creates a valid ModelResponse that can be used as raw_response + in LLMResponse. 
+ """ + # Build the LiteLLM message dict + litellm_message_dict: dict[str, Any] = { + "role": message.role, + "content": self._content_to_string(message), + } + + # Add tool_calls if present + if message.tool_calls: + litellm_message_dict["tool_calls"] = [ + { + "id": tc.id, + "type": "function", + "function": { + "name": tc.name, + "arguments": tc.arguments, + }, + } + for tc in message.tool_calls + ] + + litellm_message = LiteLLMMessage(**litellm_message_dict) + + return ModelResponse( + id=f"test-response-{self._call_count}", + choices=[Choices(message=litellm_message, index=0, finish_reason="stop")], + created=0, + model=self.model, + object="chat.completion", + ) + + def _content_to_string(self, message: Message) -> str: + """Convert message content to a string.""" + from openhands.sdk.llm.message import TextContent + + parts = [] + for item in message.content: + if isinstance(item, TextContent): + parts.append(item.text) + return "\n".join(parts) + + @property + def remaining_responses(self) -> int: + """Return the number of remaining scripted responses.""" + return len(self._scripted_responses) + + @property + def call_count(self) -> int: + """Return the number of calls made to this TestLLM.""" + return self._call_count diff --git a/tests/sdk/conversation/local/test_agent_status_transition.py b/tests/sdk/conversation/local/test_agent_status_transition.py index 37d7af34b8..4c6055c777 100644 --- a/tests/sdk/conversation/local/test_agent_status_transition.py +++ b/tests/sdk/conversation/local/test_agent_status_transition.py @@ -18,22 +18,13 @@ import threading from collections.abc import Sequence from typing import ClassVar -from unittest.mock import patch - -from litellm import ChatCompletionMessageToolCall -from litellm.types.utils import ( - Choices, - Function, - Message as LiteLLMMessage, - ModelResponse, -) -from pydantic import SecretStr from openhands.sdk.agent import Agent from openhands.sdk.conversation import Conversation from openhands.sdk.conversation.state import ConversationExecutionStatus from openhands.sdk.event import MessageEvent -from openhands.sdk.llm import LLM, ImageContent, Message, TextContent +from openhands.sdk.llm import ImageContent, Message, MessageToolCall, TextContent +from openhands.sdk.testing import TestLLM from openhands.sdk.tool import ( Action, Observation, @@ -100,8 +91,7 @@ def create( ] -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_execution_status_transitions_to_running_from_idle(mock_completion): +def test_execution_status_transitions_to_running_from_idle(): """Test that agent status transitions to RUNNING when run() is called from IDLE.""" status_during_execution: list[ConversationExecutionStatus] = [] @@ -112,24 +102,18 @@ def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: register_tool("test_tool", _make_tool) - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + # Use TestLLM with a scripted response + llm = TestLLM.from_messages( + [ + Message(role="assistant", content=[TextContent(text="Task completed")]), + ] + ) agent = Agent(llm=llm, tools=[]) conversation = Conversation(agent=agent) # Verify initial state is IDLE assert conversation.state.execution_status == ConversationExecutionStatus.IDLE - # Mock LLM to return a message that finishes execution - mock_completion.return_value = ModelResponse( - id="response_msg", - choices=[ - Choices(message=LiteLLMMessage(role="assistant", content="Task completed")) - ], - created=0, - model="test-model", - 
object="chat.completion", - ) - # Send message and run conversation.send_message(Message(role="user", content=[TextContent(text="Hello")])) conversation.run() @@ -146,20 +130,49 @@ def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: assert len(agent_messages) == 1 -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_execution_status_is_running_during_execution_from_idle(mock_completion): +def test_execution_status_is_running_during_execution_from_idle(): """Test that agent status is RUNNING during execution when started from IDLE.""" status_during_execution: list[ConversationExecutionStatus] = [] execution_started = threading.Event() + class SignalingExecutor( + ToolExecutor[StatusTransitionMockAction, StatusTransitionMockObservation] + ): + """Executor that signals when execution starts and captures status.""" + + def __call__( + self, action: StatusTransitionMockAction, conversation=None + ) -> StatusTransitionMockObservation: + # Signal that execution has started + execution_started.set() + # Capture the agent status during execution + if conversation: + status_during_execution.append(conversation.state.execution_status) + return StatusTransitionMockObservation(result=f"Executed: {action.command}") + def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: - return StatusTransitionTestTool.create( - executor=StatusCheckingExecutor(status_during_execution) - ) + return StatusTransitionTestTool.create(executor=SignalingExecutor()) register_tool("test_tool", _make_tool) - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + # Use TestLLM with scripted responses: first a tool call, then completion + llm = TestLLM.from_messages( + [ + Message( + role="assistant", + content=[TextContent(text="")], + tool_calls=[ + MessageToolCall( + id="call_1", + name="test_tool", + arguments='{"command": "test_command"}', + origin="completion", + ) + ], + ), + Message(role="assistant", content=[TextContent(text="Task completed")]), + ] + ) agent = Agent( llm=llm, tools=[Tool(name="test_tool")], @@ -169,63 +182,12 @@ def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: # Verify initial state is IDLE assert conversation.state.execution_status == ConversationExecutionStatus.IDLE - # Mock LLM to return an action first, then finish - tool_call = ChatCompletionMessageToolCall( - id="call_1", - type="function", - function=Function( - name="test_tool", - arguments='{"command": "test_command"}', - ), - ) - - call_count = [0] - - def side_effect(*args, **kwargs): - call_count[0] += 1 - if call_count[0] == 1: - # First call: return tool call - execution_started.set() - return ModelResponse( - id="response_action", - choices=[ - Choices( - message=LiteLLMMessage( - role="assistant", - content="", - tool_calls=[tool_call], - ) - ) - ], - created=0, - model="test-model", - object="chat.completion", - ) - else: - # Second call: finish - return ModelResponse( - id="response_msg", - choices=[ - Choices( - message=LiteLLMMessage( - role="assistant", content="Task completed" - ) - ) - ], - created=0, - model="test-model", - object="chat.completion", - ) - - mock_completion.side_effect = side_effect - # Send message conversation.send_message( Message(role="user", content=[TextContent(text="Execute command")]) ) # Run in a separate thread so we can check status during execution - status_checked = threading.Event() run_complete = threading.Event() status_during_run: list[ConversationExecutionStatus | None] = [None] @@ -241,7 +203,6 @@ def 
run_agent(): # Check status while running status_during_run[0] = conversation.state.execution_status - status_checked.set() # Wait for run to complete assert run_complete.wait(timeout=2.0), "Run did not complete" @@ -256,11 +217,15 @@ def run_agent(): assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_execution_status_transitions_to_running_from_paused(mock_completion): +def test_execution_status_transitions_to_running_from_paused(): """Test that agent status transitions to RUNNING when run() is called from PAUSED.""" - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + # Use TestLLM with a scripted response + llm = TestLLM.from_messages( + [ + Message(role="assistant", content=[TextContent(text="Task completed")]), + ] + ) agent = Agent(llm=llm, tools=[]) conversation = Conversation(agent=agent) @@ -268,17 +233,6 @@ def test_execution_status_transitions_to_running_from_paused(mock_completion): conversation.pause() assert conversation.state.execution_status == ConversationExecutionStatus.PAUSED - # Mock LLM to return a message that finishes execution - mock_completion.return_value = ModelResponse( - id="response_msg", - choices=[ - Choices(message=LiteLLMMessage(role="assistant", content="Task completed")) - ], - created=0, - model="test-model", - object="chat.completion", - ) - # Send message and run conversation.send_message(Message(role="user", content=[TextContent(text="Hello")])) conversation.run() @@ -295,70 +249,37 @@ def test_execution_status_transitions_to_running_from_paused(mock_completion): assert len(agent_messages) == 1 -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_execution_status_transitions_from_waiting_for_confirmation(mock_completion): +def test_execution_status_transitions_from_waiting_for_confirmation(): """Test WAITING_FOR_CONFIRMATION -> RUNNING transition when run() is called.""" from openhands.sdk.security.confirmation_policy import AlwaysConfirm - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") - def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: return StatusTransitionTestTool.create(executor=StatusCheckingExecutor([])) register_tool("test_tool", _make_tool) - agent = Agent(llm=llm, tools=[Tool(name="test_tool")]) - conversation = Conversation(agent=agent) - conversation.set_confirmation_policy(AlwaysConfirm()) - - # Mock LLM to return an action first, then finish - tool_call = ChatCompletionMessageToolCall( - id="call_1", - type="function", - function=Function( - name="test_tool", - arguments='{"command": "test_command"}', - ), - ) - - call_count = [0] - - def side_effect(*args, **kwargs): - call_count[0] += 1 - if call_count[0] == 1: - # First call: return tool call - return ModelResponse( - id="response_action", - choices=[ - Choices( - message=LiteLLMMessage( - role="assistant", - content="", - tool_calls=[tool_call], - ) + # Use TestLLM with scripted responses: first a tool call, then completion + llm = TestLLM.from_messages( + [ + Message( + role="assistant", + content=[TextContent(text="")], + tool_calls=[ + MessageToolCall( + id="call_1", + name="test_tool", + arguments='{"command": "test_command"}', + origin="completion", ) ], - created=0, - model="test-model", - object="chat.completion", - ) - else: - # Second call: finish - return ModelResponse( - id="response_msg", - choices=[ - Choices( - message=LiteLLMMessage( - role="assistant", content="Task 
completed" - ) - ) - ], - created=0, - model="test-model", - object="chat.completion", - ) + ), + Message(role="assistant", content=[TextContent(text="Task completed")]), + ] + ) - mock_completion.side_effect = side_effect + agent = Agent(llm=llm, tools=[Tool(name="test_tool")]) + conversation = Conversation(agent=agent) + conversation.set_confirmation_policy(AlwaysConfirm()) # Send message and run - should stop at WAITING_FOR_CONFIRMATION conversation.send_message( @@ -379,24 +300,18 @@ def side_effect(*args, **kwargs): assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_execution_status_finished_to_idle_to_running(mock_completion): +def test_execution_status_finished_to_idle_to_running(): """Test FINISHED -> IDLE -> RUNNING transition when new message is sent.""" - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + # Use TestLLM with two scripted responses (one for each run) + llm = TestLLM.from_messages( + [ + Message(role="assistant", content=[TextContent(text="Task completed")]), + Message(role="assistant", content=[TextContent(text="Task completed")]), + ] + ) agent = Agent(llm=llm, tools=[]) conversation = Conversation(agent=agent) - # Mock LLM to return completion messages - mock_completion.return_value = ModelResponse( - id="response_msg", - choices=[ - Choices(message=LiteLLMMessage(role="assistant", content="Task completed")) - ], - created=0, - model="test-model", - object="chat.completion", - ) - # First conversation - should end in FINISHED conversation.send_message( Message(role="user", content=[TextContent(text="First task")]) @@ -415,24 +330,17 @@ def test_execution_status_finished_to_idle_to_running(mock_completion): assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_run_exits_immediately_when_already_finished(mock_completion): +def test_run_exits_immediately_when_already_finished(): """Test that run() exits immediately when status is already FINISHED.""" - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + # Use TestLLM with a single scripted response + llm = TestLLM.from_messages( + [ + Message(role="assistant", content=[TextContent(text="Task completed")]), + ] + ) agent = Agent(llm=llm, tools=[]) conversation = Conversation(agent=agent) - # Mock LLM - mock_completion.return_value = ModelResponse( - id="response_msg", - choices=[ - Choices(message=LiteLLMMessage(role="assistant", content="Task completed")) - ], - created=0, - model="test-model", - object="chat.completion", - ) - # Complete a task conversation.send_message(Message(role="user", content=[TextContent(text="Task")])) conversation.run() @@ -440,19 +348,19 @@ def test_run_exits_immediately_when_already_finished(mock_completion): # Call run again without sending a new message # Should exit immediately without calling LLM again - initial_call_count = mock_completion.call_count + initial_call_count = llm.call_count conversation.run() # Status should still be FINISHED assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED # LLM should not be called again - assert mock_completion.call_count == initial_call_count + assert llm.call_count == initial_call_count -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_run_exits_immediately_when_stuck(mock_completion): +def test_run_exits_immediately_when_stuck(): """Test that run() 
exits immediately when status is STUCK.""" - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + # Use TestLLM with no scripted responses (should not be called) + llm = TestLLM.from_messages([]) agent = Agent(llm=llm, tools=[]) conversation = Conversation(agent=agent) @@ -465,11 +373,10 @@ def test_run_exits_immediately_when_stuck(mock_completion): # Status should still be STUCK assert conversation.state.execution_status == ConversationExecutionStatus.STUCK # LLM should not be called - assert mock_completion.call_count == 0 + assert llm.call_count == 0 -@patch("openhands.sdk.llm.llm.litellm_completion") -def test_execution_status_error_on_max_iterations(mock_completion): +def test_execution_status_error_on_max_iterations(): """Test that status is set to ERROR with clear message when max iterations hit.""" from openhands.sdk.event.conversation_error import ConversationErrorEvent @@ -483,7 +390,29 @@ def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: register_tool("test_tool", _make_tool) - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + # Create a tool call message that will be returned repeatedly + tool_call_message = Message( + role="assistant", + content=[TextContent(text="")], + tool_calls=[ + MessageToolCall( + id="call_1", + name="test_tool", + arguments='{"command": "test_command"}', + origin="completion", + ) + ], + ) + + # Use TestLLM with enough responses to hit max iterations + # max_iteration_per_run=2 means we need at least 2 tool call responses + llm = TestLLM.from_messages( + [ + tool_call_message, + tool_call_message, + tool_call_message, # Extra in case needed + ] + ) agent = Agent(llm=llm, tools=[Tool(name="test_tool")]) # Set max_iteration_per_run to 2 to quickly hit the limit conversation = Conversation( @@ -492,32 +421,6 @@ def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: callbacks=[lambda e: events_received.append(e)], ) - # Mock LLM to always return tool calls (never finish) - tool_call = ChatCompletionMessageToolCall( - id="call_1", - type="function", - function=Function( - name="test_tool", - arguments='{"command": "test_command"}', - ), - ) - - mock_completion.return_value = ModelResponse( - id="response_action", - choices=[ - Choices( - message=LiteLLMMessage( - role="assistant", - content="", - tool_calls=[tool_call], - ) - ) - ], - created=0, - model="test-model", - object="chat.completion", - ) - # Send message and run conversation.send_message( Message(role="user", content=[TextContent(text="Execute command")]) From 3f16332cf90cb3ff875bfc9925322c2dcaae2dfd Mon Sep 17 00:00:00 2001 From: VascoSch92 Date: Wed, 11 Feb 2026 23:42:03 +0000 Subject: [PATCH 2/3] update class TestLLM to use deque instead of a stack --- openhands-sdk/openhands/sdk/testing/test_llm.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/openhands-sdk/openhands/sdk/testing/test_llm.py b/openhands-sdk/openhands/sdk/testing/test_llm.py index b07102e3ff..53e80da133 100644 --- a/openhands-sdk/openhands/sdk/testing/test_llm.py +++ b/openhands-sdk/openhands/sdk/testing/test_llm.py @@ -37,6 +37,8 @@ if TYPE_CHECKING: from openhands.sdk.tool.tool import ToolDefinition +from collections import deque + __all__ = ["TestLLM", "TestLLMExhaustedError"] @@ -91,7 +93,7 @@ class TestLLM(LLM): __test__ = False model: str = Field(default="test-model") - _scripted_responses: list[Message] = PrivateAttr(default_factory=list) + _scripted_responses: deque[Message] = 
PrivateAttr(default_factory=deque)
     _call_count: int = PrivateAttr(default=0)
 
     model_config: ClassVar[ConfigDict] = ConfigDict(
@@ -102,7 +104,7 @@ def __init__(self, **data: Any) -> None:
         # Extract scripted_responses before calling super().__init__
         scripted_responses = data.pop("scripted_responses", [])
         super().__init__(**data)
-        self._scripted_responses = list(scripted_responses)
+        self._scripted_responses = deque(list(scripted_responses))
         self._call_count = 0
 
     @classmethod
@@ -171,7 +173,7 @@ def completion(
                 f"(exhausted after {self._call_count} calls)"
             )
 
-        message = self._scripted_responses.pop(0)
+        message = self._scripted_responses.popleft()
         self._call_count += 1
 
         # Create a minimal ModelResponse for raw_response

From 6224963c6993a1003feeb17813b55c6d722e7416 Mon Sep 17 00:00:00 2001
From: VascoSch92
Date: Fri, 13 Feb 2026 11:06:13 +0100
Subject: [PATCH 3/3] feat: allow TestLLM to raise scripted errors

---
 .../openhands/sdk/testing/test_llm.py         | 33 ++++++++++++++-----
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/testing/test_llm.py b/openhands-sdk/openhands/sdk/testing/test_llm.py
index 53e80da133..af56d820f1 100644
--- a/openhands-sdk/openhands/sdk/testing/test_llm.py
+++ b/openhands-sdk/openhands/sdk/testing/test_llm.py
@@ -17,6 +17,15 @@
     >>> user_msg = Message(role="user", content=[TextContent(text="Hi")])
     >>> response = llm.completion([user_msg])
     >>> print(response.message.content[0].text)  # "Hello!"
+
+    >>> # Scripted errors (like unittest.mock side_effect)
+    >>> from openhands.sdk.llm.exceptions import LLMContextWindowExceedError
+    >>> llm = TestLLM.from_messages([
+    ...     Message(role="assistant", content=[TextContent(text="OK")]),
+    ...     LLMContextWindowExceedError(),
+    ... ])
+    >>> llm.completion([...])  # returns "OK"
+    >>> llm.completion([...])  # raises LLMContextWindowExceedError
 """
 
 from __future__ import annotations
@@ -93,7 +102,7 @@ class TestLLM(LLM):
     __test__ = False
 
     model: str = Field(default="test-model")
-    _scripted_responses: deque[Message] = PrivateAttr(default_factory=deque)
+    _scripted_responses: deque[Message | Exception] = PrivateAttr(default_factory=deque)
     _call_count: int = PrivateAttr(default=0)
 
     model_config: ClassVar[ConfigDict] = ConfigDict(
@@ -110,18 +119,19 @@ def __init__(self, **data: Any) -> None:
     @classmethod
     def from_messages(
         cls,
-        messages: list[Message],
+        messages: list[Message | Exception],
         *,
         model: str = "test-model",
         usage_id: str = "test-llm",
         **kwargs: Any,
     ) -> TestLLM:
-        """Create a TestLLM with scripted responses.
+        """Create a TestLLM with scripted responses and/or errors.
 
         Args:
-            messages: List of Message objects to return in order.
-                Each call to completion() or responses() will return
-                the next message from this list.
+            messages: List of Message or Exception objects to return in order.
+                Each call to completion() or responses() consumes the next
+                item: Message objects are returned normally, Exception objects
+                are raised (like unittest.mock side_effect).
             model: Model name (default: "test-model")
            usage_id: Usage ID for metrics (default: "test-llm")
             **kwargs: Additional LLM configuration options
@@ -132,7 +142,7 @@ def from_messages(
         Example:
             >>> llm = TestLLM.from_messages([
             ...     Message(role="assistant", content=[TextContent(text="First")]),
-            ...     Message(role="assistant", content=[TextContent(text="Second")]),
+            ...     LLMContextWindowExceedError("context too long"),
             ... ])
         """
         return cls(
@@ -166,6 +176,7 @@ def completion(
 
         Raises:
             TestLLMExhaustedError: When no more scripted responses are available.
+ Exception: Any scripted exception placed in the response queue. """ if not self._scripted_responses: raise TestLLMExhaustedError( @@ -173,9 +184,15 @@ def completion( f"(exhausted after {self._call_count} calls)" ) - message = self._scripted_responses.popleft() + item = self._scripted_responses.popleft() self._call_count += 1 + # Raise scripted exceptions (like unittest.mock side_effect) + if isinstance(item, Exception): + raise item + + message = item + # Create a minimal ModelResponse for raw_response raw_response = self._create_model_response(message)
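A minimal sketch (not part of the patch series) of how a test could exercise the scripted-error path added in PATCH 3/3. It uses only names introduced by these patches (TestLLM.from_messages, TestLLMExhaustedError, Message, TextContent); the test name and the RuntimeError used as the scripted exception are illustrative assumptions.

```python
import pytest

from openhands.sdk.llm import Message, TextContent
from openhands.sdk.testing import TestLLM, TestLLMExhaustedError


def test_scripted_message_then_error_then_exhaustion():
    # Queue one normal reply followed by an exception to be raised.
    llm = TestLLM.from_messages(
        [
            Message(role="assistant", content=[TextContent(text="OK")]),
            RuntimeError("boom"),  # any Exception instance is raised when reached
        ]
    )
    user = Message(role="user", content=[TextContent(text="Hi")])

    # First call consumes the Message and returns it as an LLMResponse.
    assert llm.completion([user]).message.content[0].text == "OK"

    # Second call consumes the Exception and raises it (side_effect-style).
    with pytest.raises(RuntimeError, match="boom"):
        llm.completion([user])

    # Once the queue is empty, TestLLM fails loudly instead of improvising.
    with pytest.raises(TestLLMExhaustedError):
        llm.completion([user])

    # The exhausted attempt does not count as a scripted call.
    assert llm.call_count == 2
```

Because exhaustion raises an explicit TestLLMExhaustedError, a test that under-scripts its LLM fails with a clear message rather than a confusing mock artifact.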