10 changes: 10 additions & 0 deletions openhands-sdk/openhands/sdk/testing/__init__.py
@@ -0,0 +1,10 @@
"""Testing utilities for OpenHands SDK.

This module provides test utilities that make it easy to write tests for
code that uses the OpenHands SDK, without needing to mock LiteLLM internals.
"""

from openhands.sdk.testing.test_llm import TestLLM, TestLLMExhaustedError


__all__ = ["TestLLM", "TestLLMExhaustedError"]
301 changes: 301 additions & 0 deletions openhands-sdk/openhands/sdk/testing/test_llm.py
@@ -0,0 +1,301 @@
"""TestLLM - A mock LLM for testing.

TestLLM is a real LLM subclass that returns scripted responses, eliminating
the need for @patch decorators and understanding of LiteLLM internals.

Example:
>>> from openhands.sdk.testing import TestLLM
>>> from openhands.sdk.llm import Message, TextContent
>>>
>>> # Create a TestLLM with scripted responses
>>> llm = TestLLM.from_messages([
... Message(role="assistant", content=[TextContent(text="Hello!")]),
... Message(role="assistant", content=[TextContent(text="Goodbye!")]),
... ])
>>>
>>> # Use it like a normal LLM
>>> user_msg = Message(role="user", content=[TextContent(text="Hi")])
>>> response = llm.completion([user_msg])
>>> print(response.message.content[0].text) # "Hello!"

>>> # Scripted errors (like unittest.mock side_effect)
>>> from openhands.sdk.llm.exceptions import LLMContextWindowExceedError
>>> llm = TestLLM.from_responses([
... Message(role="assistant", content=[TextContent(text="OK")]),
... LLMContextWindowExceedError(),
... ])
>>> llm.completion([...]) # returns "OK"
>>> llm.completion([...]) # raises LLMContextWindowExceedError
"""

🟠 Important: No unit tests for TestLLM itself. A testing utility should have its own tests to ensure:

  • _create_model_response produces valid LiteLLM responses
  • _content_to_string handles edge cases correctly
  • TestLLMExhaustedError is raised when responses are exhausted
  • Exception responses in the queue are raised correctly
  • remaining_responses and call_count properties work

Without these tests, bugs in TestLLM could cause confusing test failures.
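
A minimal sketch of what that suite could look like, assuming pytest and only the public API shown in this diff (test names are illustrative):

import pytest

from openhands.sdk.llm import Message, TextContent
from openhands.sdk.testing import TestLLM, TestLLMExhaustedError


def test_counters_and_exhaustion():
    llm = TestLLM.from_messages(
        [Message(role="assistant", content=[TextContent(text="Hello!")])]
    )
    assert llm.remaining_responses == 1
    assert llm.call_count == 0

    user = Message(role="user", content=[TextContent(text="Hi")])
    response = llm.completion([user])
    assert response.message.content[0].text == "Hello!"
    assert llm.remaining_responses == 0
    assert llm.call_count == 1

    # The queue is now empty, so the next call must fail loudly.
    with pytest.raises(TestLLMExhaustedError):
        llm.completion([user])


def test_scripted_exception_is_raised():
    llm = TestLLM.from_messages([RuntimeError("boom")])
    with pytest.raises(RuntimeError, match="boom"):
        llm.completion([Message(role="user", content=[TextContent(text="Hi")])])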

from __future__ import annotations

from collections import deque
from collections.abc import Sequence
from typing import TYPE_CHECKING, Any, ClassVar

from litellm.types.utils import Choices, Message as LiteLLMMessage, ModelResponse
from pydantic import ConfigDict, Field, PrivateAttr

from openhands.sdk.llm.llm import LLM
from openhands.sdk.llm.llm_response import LLMResponse
from openhands.sdk.llm.message import Message
from openhands.sdk.llm.streaming import TokenCallbackType
from openhands.sdk.llm.utils.metrics import MetricsSnapshot, TokenUsage


if TYPE_CHECKING:
    from openhands.sdk.tool.tool import ToolDefinition


__all__ = ["TestLLM", "TestLLMExhaustedError"]


class TestLLMExhaustedError(Exception):
    """Raised when TestLLM has no more scripted responses."""

    pass


class TestLLM(LLM):
    """A mock LLM for testing that returns scripted responses.

    TestLLM is a real LLM subclass that can be used anywhere an LLM is accepted:
    in Agent(llm=...), in fallback_llms, in condensers, in routers, etc.

    Key features:
    - No patching needed: just pass TestLLM as the llm= argument
    - Tests speak in SDK types (Message, TextContent, MessageToolCall)
    - Clear error when responses are exhausted
    - Zero-cost metrics by default
    - Always uses the completion() path (uses_responses_api returns False)

    Example:
        >>> from openhands.sdk.testing import TestLLM
        >>> from openhands.sdk.llm import Message, TextContent, MessageToolCall
        >>>
        >>> # Simple text response
        >>> llm = TestLLM.from_messages([
        ...     Message(role="assistant", content=[TextContent(text="Done!")]),
        ... ])
        >>>
        >>> # Response with tool calls
        >>> llm = TestLLM.from_messages([
        ...     Message(
        ...         role="assistant",
        ...         content=[TextContent(text="")],
        ...         tool_calls=[
        ...             MessageToolCall(
        ...                 id="call_1",
        ...                 name="my_tool",
        ...                 arguments='{"arg": "value"}',
        ...                 origin="completion",
        ...             )
        ...         ],
        ...     ),
        ...     Message(role="assistant", content=[TextContent(text="Done!")]),
        ... ])
    """

    # Prevent pytest from collecting this class as a test
    __test__ = False

    model: str = Field(default="test-model")
    _scripted_responses: deque[Message | Exception] = PrivateAttr(
        default_factory=deque
    )
    _call_count: int = PrivateAttr(default=0)

    model_config: ClassVar[ConfigDict] = ConfigDict(
        extra="ignore", arbitrary_types_allowed=True
    )

    def __init__(self, **data: Any) -> None:
        # Extract scripted_responses before calling super().__init__
        scripted_responses = data.pop("scripted_responses", [])
        super().__init__(**data)
        self._scripted_responses = deque(list(scripted_responses))
        self._call_count = 0

    @classmethod
    def from_messages(
        cls,
        messages: list[Message | Exception],
        *,
        model: str = "test-model",
        usage_id: str = "test-llm",
        **kwargs: Any,
    ) -> TestLLM:
        """Create a TestLLM with scripted responses and/or errors.

        Args:
            messages: List of Message or Exception objects to return in order.
                Each call to completion() or responses() consumes the next
                item: Message objects are returned normally, Exception objects
                are raised (like unittest.mock side_effect).
            model: Model name (default: "test-model")
            usage_id: Usage ID for metrics (default: "test-llm")
            **kwargs: Additional LLM configuration options

        Returns:
            A TestLLM instance configured with the scripted responses.

        Example:
            >>> llm = TestLLM.from_messages([
            ...     Message(role="assistant", content=[TextContent(text="First")]),
            ...     LLMContextWindowExceedError("context too long"),
            ... ])
        """
        return cls(
            model=model,
            usage_id=usage_id,
            scripted_responses=messages,
            **kwargs,
        )

    def completion(
        self,
        messages: list[Message],  # noqa: ARG002
        tools: Sequence[ToolDefinition] | None = None,  # noqa: ARG002
        _return_metrics: bool = False,
        add_security_risk_prediction: bool = False,  # noqa: ARG002
        on_token: TokenCallbackType | None = None,  # noqa: ARG002
        **kwargs: Any,  # noqa: ARG002
    ) -> LLMResponse:
        """Return the next scripted response.

        Args:
            messages: Input messages (ignored, but required for API compatibility)
            tools: Available tools (ignored)
            _return_metrics: Whether to return metrics (ignored)
            add_security_risk_prediction: Add security risk field (ignored)
            on_token: Streaming callback (ignored)
            **kwargs: Additional arguments (ignored)

        Returns:
            LLMResponse containing the next scripted message.

        Raises:
            TestLLMExhaustedError: When no more scripted responses are available.
            Exception: Any scripted exception placed in the response queue.
        """
        if not self._scripted_responses:
            raise TestLLMExhaustedError(
                f"TestLLM: no more scripted responses "
                f"(exhausted after {self._call_count} calls)"
            )

        item = self._scripted_responses.popleft()
        self._call_count += 1

        # Raise scripted exceptions (like unittest.mock side_effect)
        if isinstance(item, Exception):
            raise item

        message = item

        # Create a minimal ModelResponse for raw_response
        raw_response = self._create_model_response(message)

        return LLMResponse(
            message=message,
            metrics=self._zero_metrics(),
            raw_response=raw_response,
        )

    def responses(
        self,
        messages: list[Message],
        tools: Sequence[ToolDefinition] | None = None,
        include: list[str] | None = None,  # noqa: ARG002
        store: bool | None = None,  # noqa: ARG002
        _return_metrics: bool = False,
        add_security_risk_prediction: bool = False,
        on_token: TokenCallbackType | None = None,
        **kwargs: Any,
    ) -> LLMResponse:
        """Return the next scripted response (delegates to completion).

        For TestLLM, both completion() and responses() return from the same
        queue of scripted responses.
        """
        return self.completion(
            messages=messages,
            tools=tools,
            _return_metrics=_return_metrics,
            add_security_risk_prediction=add_security_risk_prediction,
            on_token=on_token,
🟡 Suggestion: uses_responses_api() always returning False means TestLLM cannot be used to test code paths that specifically use the responses API.

This is a significant limitation that should be documented in the class docstring, or consider adding a parameter to control this behavior:

def __init__(self, *, use_responses_api: bool = False, **data: Any) -> None:
    self._use_responses_api = use_responses_api
    # ...

def uses_responses_api(self) -> bool:
    return self._use_responses_api
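
Call sites could then opt in per test; use_responses_api below is the hypothetical kwarg from the sketch above, reaching __init__ via from_messages(**kwargs):

llm = TestLLM.from_messages(
    [Message(role="assistant", content=[TextContent(text="OK")])],
    use_responses_api=True,  # hypothetical flag, not part of this PR
)
assert llm.uses_responses_api()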

            **kwargs,
        )

    def uses_responses_api(self) -> bool:
        """TestLLM always uses the completion path."""
        return False

    def _zero_metrics(self) -> MetricsSnapshot:
        """Return a zero-cost metrics snapshot."""
        return MetricsSnapshot(
            model_name=self.model,
            accumulated_cost=0.0,
            max_budget_per_task=None,
            accumulated_token_usage=TokenUsage(
                model=self.model,
                prompt_tokens=0,
                completion_tokens=0,
            ),
        )
🟡 Suggestion: _create_model_response creates a minimal ModelResponse that might not match real LLM behavior. Real LLMs may include additional fields that downstream code depends on (e.g., usage stats, model-specific metadata).

Consider adding integration tests that verify TestLLM produces responses processed identically to real LLM responses by the Agent/Conversation machinery. This would catch cases where the minimal response causes different behavior than production.
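
One rough shape such a check could take, staying close to the types this diff already constructs (the assertions about LiteLLM's parsed tool-call objects are assumptions about its type coercion, not verified against this PR):

from openhands.sdk.llm import Message, MessageToolCall, TextContent
from openhands.sdk.testing import TestLLM


def test_raw_response_looks_like_a_real_completion():
    llm = TestLLM.from_messages(
        [
            Message(
                role="assistant",
                content=[TextContent(text="calling a tool")],
                tool_calls=[
                    MessageToolCall(
                        id="call_1",
                        name="my_tool",
                        arguments='{"arg": "value"}',
                        origin="completion",
                    )
                ],
            )
        ]
    )
    result = llm.completion([Message(role="user", content=[TextContent(text="go")])])

    raw = result.raw_response
    assert raw.object == "chat.completion"
    assert raw.choices[0].finish_reason == "stop"
    assert raw.choices[0].message.content == "calling a tool"
    assert raw.choices[0].message.tool_calls[0].function.name == "my_tool"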


    def _create_model_response(self, message: Message) -> ModelResponse:
        """Create a minimal ModelResponse from a Message.

        This creates a valid ModelResponse that can be used as raw_response
        in LLMResponse.
        """
        # Build the LiteLLM message dict
        litellm_message_dict: dict[str, Any] = {
            "role": message.role,
            "content": self._content_to_string(message),
        }

        # Add tool_calls if present
        if message.tool_calls:
            litellm_message_dict["tool_calls"] = [
                {
                    "id": tc.id,
                    "type": "function",
                    "function": {
                        "name": tc.name,
                        "arguments": tc.arguments,
                    },
                }
                for tc in message.tool_calls
            ]

        litellm_message = LiteLLMMessage(**litellm_message_dict)

        return ModelResponse(
            id=f"test-response-{self._call_count}",
            choices=[Choices(message=litellm_message, index=0, finish_reason="stop")],
            created=0,
            model=self.model,
            object="chat.completion",
        )

    def _content_to_string(self, message: Message) -> str:
        """Convert message content to a string."""
        from openhands.sdk.llm.message import TextContent

        parts = []
🟠 Important: _content_to_string only handles TextContent and silently ignores other content types like ImageContent. This could mask bugs in tests where messages accidentally contain unsupported content types.

Suggest either:

  1. Handle all content types appropriately, or
  2. Raise an error for unsupported types with a clear message
Suggested change
        parts = []
    def _content_to_string(self, message: Message) -> str:
        """Convert message content to a string."""
        from openhands.sdk.llm.message import TextContent

        parts = []
        for item in message.content:
            if isinstance(item, TextContent):
                parts.append(item.text)
            else:
                raise ValueError(
                    f"TestLLM only supports TextContent, got {type(item).__name__}"
                )
        return "\n".join(parts)

        for item in message.content:
            if isinstance(item, TextContent):
                parts.append(item.text)
        return "\n".join(parts)

    @property
    def remaining_responses(self) -> int:
        """Return the number of remaining scripted responses."""
        return len(self._scripted_responses)

    @property
    def call_count(self) -> int:
        """Return the number of calls made to this TestLLM."""
        return self._call_count