From 81c933e456233340fead3db363859cf08c3c7c4d Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Feb 2026 09:03:19 +0000 Subject: [PATCH 01/16] Update integration tests to use claude-sonnet-4-6 Replace claude-sonnet-4-5-20250929 with claude-sonnet-4-6 in the integration test workflow default model list. Co-authored-by: openhands --- .github/workflows/integration-runner.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 63cb881303..86ae4a1f60 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -22,7 +22,7 @@ on: model_ids: description: >- Comma-separated model IDs to test (from resolve_model_config.py). - Example: claude-sonnet-4-5-20250929,glm-4.7. Defaults to a standard set. + Example: claude-sonnet-4-6,glm-4.7. Defaults to a standard set. required: false default: '' type: string @@ -50,7 +50,7 @@ on: env: N_PROCESSES: 4 # Global configuration for number of parallel processes for evaluation # Default models for scheduled/label-triggered runs (subset of models from resolve_model_config.py) - DEFAULT_MODEL_IDS: claude-sonnet-4-5-20250929,deepseek-v3.2-reasoner,kimi-k2-thinking,gemini-3-pro + DEFAULT_MODEL_IDS: claude-sonnet-4-6,deepseek-v3.2-reasoner,kimi-k2-thinking,gemini-3-pro jobs: setup-matrix: From 8d7940945d92e74b87d80132b1b8af8131340040 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Feb 2026 14:24:41 +0000 Subject: [PATCH 02/16] fix: install litellm before resolving model configs in integration workflow The setup-matrix job was failing because resolve_model_config.py imports litellm, but the dependency was not installed before running the script. Co-authored-by: openhands --- .github/workflows/integration-runner.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 86ae4a1f60..4b2684b8bb 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -71,6 +71,9 @@ jobs: with: python-version: '3.13' + - name: Install required dependencies + run: pip install litellm + - name: Resolve model configurations id: resolve-models env: From 45ec6985556fcf8b43645da5c020c9d11d1176cf Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Feb 2026 14:28:45 +0000 Subject: [PATCH 03/16] fix: make litellm import lazy in resolve_model_config.py The MODELS dictionary is used by the integration-runner workflow to resolve model configurations without needing litellm installed. By moving the litellm import inside the test_model function, we allow importing MODELS without requiring the litellm dependency. This fixes the setup-matrix job failure in pull_request_target workflows where the workflow file from main branch is used but the PR code is checked out. 
Co-authored-by: openhands --- .github/run-eval/resolve_model_config.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index c6cb7edc31..445f3cdfa3 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -12,12 +12,16 @@ - models_json: JSON array of full model configs with display names """ +from __future__ import annotations + import json import os import sys -from typing import Any +from typing import TYPE_CHECKING, Any + -import litellm +if TYPE_CHECKING: + pass # Model configurations dictionary @@ -235,6 +239,8 @@ def test_model( Returns: Tuple of (success: bool, message: str) """ + import litellm + llm_config = model_config.get("llm_config", {}) model_name = llm_config.get("model", "unknown") display_name = model_config.get("display_name", model_name) From f595d48cc998c1f71e2c957dda6a083273fe3e57 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Feb 2026 14:32:19 +0000 Subject: [PATCH 04/16] ci: add temporary push trigger for testing workflow changes Add a push trigger for this branch so the workflow runs using the PR's code instead of the main branch's workflow file. TODO: Remove the push trigger after this PR is merged. Co-authored-by: openhands --- .github/workflows/integration-runner.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 4b2684b8bb..265635bab6 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -9,6 +9,10 @@ on: pull_request_target: types: - labeled + # TODO: Remove this push trigger after PR #2113 is merged + push: + branches: + - update-integration-test-model-to-sonnet-4-6 workflow_dispatch: inputs: reason: @@ -215,6 +219,7 @@ jobs: ) ) || github.event_name == 'workflow_dispatch' || + github.event_name == 'push' || (github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk') ) && needs.setup-matrix.result == 'success' needs: [setup-matrix, post-label-comment, post-dispatch-comment] @@ -368,6 +373,7 @@ jobs: ) ) || github.event_name == 'workflow_dispatch' || + github.event_name == 'push' || (github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk') ) runs-on: ubuntu-24.04 From bcc0ab41af0d637975d053e9e5d8de70a4ad5a88 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Feb 2026 17:46:39 +0000 Subject: [PATCH 05/16] Fix: Run only integration tests on push trigger The push trigger was running the full test suite (18 tests including behavior and condenser tests), causing runs to take hours with reasoning models. Now it runs only integration tests (8 tests) like the schedule trigger. Co-authored-by: openhands --- .github/workflows/integration-runner.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 265635bab6..8eba56386f 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -287,6 +287,9 @@ jobs: elif [ "${{ github.event_name }}" = "schedule" ]; then TEST_TYPE_ARGS="--test-type integration" echo "Scheduled run; running integration tests only." + elif [ "${{ github.event_name }}" = "push" ]; then + TEST_TYPE_ARGS="--test-type integration" + echo "Push trigger; running integration tests only." else echo "Running full integration test suite." 
fi From bca02e843f5e324878a4e4c3038ec1294a463e31 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Feb 2026 18:15:23 +0000 Subject: [PATCH 06/16] Fix claude-sonnet-4-6 config: set top_p=None to avoid conflict Anthropic's claude-sonnet-4-6 model doesn't support having both temperature and top_p specified simultaneously. Set top_p=None to override the SDK default (1.0) when temperature is set. Co-authored-by: openhands --- .github/run-eval/resolve_model_config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index 445f3cdfa3..8bcbc2c19b 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -79,7 +79,10 @@ "display_name": "Claude Sonnet 4.6", "llm_config": { "model": "litellm_proxy/anthropic/claude-sonnet-4-6", + # Note: claude-sonnet-4-6 doesn't support both temperature and top_p + # so we only set temperature (SDK default top_p is 1.0) "temperature": 0.0, + "top_p": None, }, }, "gemini-3-pro": { From 878ae4c4fca2bc2fa74c2e99940e273a587cf804 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Feb 2026 19:42:21 +0000 Subject: [PATCH 07/16] Add supports_top_p feature for claude-sonnet-4-6 Claude Sonnet 4.6 doesn't support both temperature and top_p specified at the same time. This adds a new model feature flag supports_top_p that removes top_p from API calls for models that don't support it. Changes: - Add supports_top_p field to ModelFeatures dataclass - Add SUPPORTS_TOP_P_FALSE_MODELS list with claude-sonnet-4-6 - Update chat_options.py to remove top_p for unsupported models - Revert the top_p=None workaround in resolve_model_config.py - Add tests for the new feature Co-authored-by: openhands --- .github/run-eval/resolve_model_config.py | 3 --- .../openhands/sdk/llm/options/chat_options.py | 5 +++++ .../openhands/sdk/llm/utils/model_features.py | 9 +++++++++ tests/sdk/llm/test_chat_options.py | 14 +++++++++++++ tests/sdk/llm/test_model_features.py | 20 +++++++++++++++++++ 5 files changed, 48 insertions(+), 3 deletions(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index 8bcbc2c19b..445f3cdfa3 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -79,10 +79,7 @@ "display_name": "Claude Sonnet 4.6", "llm_config": { "model": "litellm_proxy/anthropic/claude-sonnet-4-6", - # Note: claude-sonnet-4-6 doesn't support both temperature and top_p - # so we only set temperature (SDK default top_p is 1.0) "temperature": 0.0, - "top_p": None, }, }, "gemini-3-pro": { diff --git a/openhands-sdk/openhands/sdk/llm/options/chat_options.py b/openhands-sdk/openhands/sdk/llm/options/chat_options.py index 7f29acd8db..cbca080f2f 100644 --- a/openhands-sdk/openhands/sdk/llm/options/chat_options.py +++ b/openhands-sdk/openhands/sdk/llm/options/chat_options.py @@ -94,4 +94,9 @@ def select_chat_options( if llm.litellm_extra_body: out["extra_body"] = llm.litellm_extra_body + # Remove top_p for models that don't support it (e.g., claude-sonnet-4-6 doesn't + # support having both temperature and top_p specified) + if not get_features(llm.model).supports_top_p: + out.pop("top_p", None) + return out diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py index d454f5aa88..58f9209838 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py +++ 
b/openhands-sdk/openhands/sdk/llm/utils/model_features.py @@ -47,6 +47,7 @@ class ModelFeatures: force_string_serializer: bool send_reasoning_content: bool supports_prompt_cache_retention: bool + supports_top_p: bool # Model lists capturing current behavior. Keep entries lowercase. @@ -162,6 +163,13 @@ class ModelFeatures: "deepseek/deepseek-reasoner", ] +# Models that do NOT support top_p parameter (or don't support both temperature +# and top_p). When specified, these models will have top_p removed from API calls. +SUPPORTS_TOP_P_FALSE_MODELS: list[str] = [ + # Claude Sonnet 4.6 rejects requests with both temperature and top_p specified + "claude-sonnet-4-6", +] + def get_features(model: str) -> ModelFeatures: """Get model features.""" @@ -177,6 +185,7 @@ def get_features(model: str) -> ModelFeatures: supports_prompt_cache_retention=apply_ordered_model_rules( model, PROMPT_CACHE_RETENTION_MODELS ), + supports_top_p=not model_matches(model, SUPPORTS_TOP_P_FALSE_MODELS), ) diff --git a/tests/sdk/llm/test_chat_options.py b/tests/sdk/llm/test_chat_options.py index ce47695a4f..6e4a504e50 100644 --- a/tests/sdk/llm/test_chat_options.py +++ b/tests/sdk/llm/test_chat_options.py @@ -76,6 +76,20 @@ def test_non_reasoning_model_preserves_temp_and_top_p(): assert out.get("top_p") == 0.7 +def test_claude_sonnet_4_6_strips_top_p(): + """Claude Sonnet 4.6 doesn't support both temperature and top_p.""" + llm = DummyLLM( + model="litellm_proxy/anthropic/claude-sonnet-4-6", + temperature=0.0, + top_p=1.0, + ) + out = select_chat_options(llm, user_kwargs={}, has_tools=True) + + # temperature should be preserved, top_p should be removed + assert out.get("temperature") == 0.0 + assert "top_p" not in out + + def test_azure_renames_max_completion_tokens_to_max_tokens(): llm = DummyLLM(model="azure/gpt-4o") out = select_chat_options(llm, user_kwargs={}, has_tools=True) diff --git a/tests/sdk/llm/test_model_features.py b/tests/sdk/llm/test_model_features.py index 304448a728..c81b146ecd 100644 --- a/tests/sdk/llm/test_model_features.py +++ b/tests/sdk/llm/test_model_features.py @@ -363,3 +363,23 @@ def test_get_default_temperature_case_insensitive(): assert get_default_temperature("KIMI-K2-THINKING") == 1.0 assert get_default_temperature("Kimi-K2-Thinking") == 1.0 assert get_default_temperature("KiMi-k2-ThInKiNg") == 1.0 + + +@pytest.mark.parametrize( + "model,expected_supports_top_p", + [ + # Claude Sonnet 4.6 doesn't support top_p with temperature + ("claude-sonnet-4-6", False), + ("anthropic/claude-sonnet-4-6", False), + ("litellm_proxy/anthropic/claude-sonnet-4-6", False), + # Other models should support top_p + ("gpt-4o", True), + ("claude-sonnet-4-5", True), + ("claude-3-5-sonnet", True), + ("unknown-model", True), + ], +) +def test_supports_top_p(model, expected_supports_top_p): + """Test that models correctly report top_p support.""" + features = get_features(model) + assert features.supports_top_p is expected_supports_top_p From 1162f3b0766c9954f2715a3a76b7665b42176caa Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Feb 2026 23:25:57 +0800 Subject: [PATCH 08/16] Revert "Add supports_top_p feature for claude-sonnet-4-6" This reverts commit 878ae4c4fca2bc2fa74c2e99940e273a587cf804. 
--- .github/run-eval/resolve_model_config.py | 3 +++ .../openhands/sdk/llm/options/chat_options.py | 5 ----- .../openhands/sdk/llm/utils/model_features.py | 9 --------- tests/sdk/llm/test_chat_options.py | 14 ------------- tests/sdk/llm/test_model_features.py | 20 ------------------- 5 files changed, 3 insertions(+), 48 deletions(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index 445f3cdfa3..8bcbc2c19b 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -79,7 +79,10 @@ "display_name": "Claude Sonnet 4.6", "llm_config": { "model": "litellm_proxy/anthropic/claude-sonnet-4-6", + # Note: claude-sonnet-4-6 doesn't support both temperature and top_p + # so we only set temperature (SDK default top_p is 1.0) "temperature": 0.0, + "top_p": None, }, }, "gemini-3-pro": { diff --git a/openhands-sdk/openhands/sdk/llm/options/chat_options.py b/openhands-sdk/openhands/sdk/llm/options/chat_options.py index cbca080f2f..7f29acd8db 100644 --- a/openhands-sdk/openhands/sdk/llm/options/chat_options.py +++ b/openhands-sdk/openhands/sdk/llm/options/chat_options.py @@ -94,9 +94,4 @@ def select_chat_options( if llm.litellm_extra_body: out["extra_body"] = llm.litellm_extra_body - # Remove top_p for models that don't support it (e.g., claude-sonnet-4-6 doesn't - # support having both temperature and top_p specified) - if not get_features(llm.model).supports_top_p: - out.pop("top_p", None) - return out diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py index 58f9209838..d454f5aa88 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py +++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py @@ -47,7 +47,6 @@ class ModelFeatures: force_string_serializer: bool send_reasoning_content: bool supports_prompt_cache_retention: bool - supports_top_p: bool # Model lists capturing current behavior. Keep entries lowercase. @@ -163,13 +162,6 @@ class ModelFeatures: "deepseek/deepseek-reasoner", ] -# Models that do NOT support top_p parameter (or don't support both temperature -# and top_p). When specified, these models will have top_p removed from API calls. 
-SUPPORTS_TOP_P_FALSE_MODELS: list[str] = [ - # Claude Sonnet 4.6 rejects requests with both temperature and top_p specified - "claude-sonnet-4-6", -] - def get_features(model: str) -> ModelFeatures: """Get model features.""" @@ -185,7 +177,6 @@ def get_features(model: str) -> ModelFeatures: supports_prompt_cache_retention=apply_ordered_model_rules( model, PROMPT_CACHE_RETENTION_MODELS ), - supports_top_p=not model_matches(model, SUPPORTS_TOP_P_FALSE_MODELS), ) diff --git a/tests/sdk/llm/test_chat_options.py b/tests/sdk/llm/test_chat_options.py index 6e4a504e50..ce47695a4f 100644 --- a/tests/sdk/llm/test_chat_options.py +++ b/tests/sdk/llm/test_chat_options.py @@ -76,20 +76,6 @@ def test_non_reasoning_model_preserves_temp_and_top_p(): assert out.get("top_p") == 0.7 -def test_claude_sonnet_4_6_strips_top_p(): - """Claude Sonnet 4.6 doesn't support both temperature and top_p.""" - llm = DummyLLM( - model="litellm_proxy/anthropic/claude-sonnet-4-6", - temperature=0.0, - top_p=1.0, - ) - out = select_chat_options(llm, user_kwargs={}, has_tools=True) - - # temperature should be preserved, top_p should be removed - assert out.get("temperature") == 0.0 - assert "top_p" not in out - - def test_azure_renames_max_completion_tokens_to_max_tokens(): llm = DummyLLM(model="azure/gpt-4o") out = select_chat_options(llm, user_kwargs={}, has_tools=True) diff --git a/tests/sdk/llm/test_model_features.py b/tests/sdk/llm/test_model_features.py index c81b146ecd..304448a728 100644 --- a/tests/sdk/llm/test_model_features.py +++ b/tests/sdk/llm/test_model_features.py @@ -363,23 +363,3 @@ def test_get_default_temperature_case_insensitive(): assert get_default_temperature("KIMI-K2-THINKING") == 1.0 assert get_default_temperature("Kimi-K2-Thinking") == 1.0 assert get_default_temperature("KiMi-k2-ThInKiNg") == 1.0 - - -@pytest.mark.parametrize( - "model,expected_supports_top_p", - [ - # Claude Sonnet 4.6 doesn't support top_p with temperature - ("claude-sonnet-4-6", False), - ("anthropic/claude-sonnet-4-6", False), - ("litellm_proxy/anthropic/claude-sonnet-4-6", False), - # Other models should support top_p - ("gpt-4o", True), - ("claude-sonnet-4-5", True), - ("claude-3-5-sonnet", True), - ("unknown-model", True), - ], -) -def test_supports_top_p(model, expected_supports_top_p): - """Test that models correctly report top_p support.""" - features = get_features(model) - assert features.supports_top_p is expected_supports_top_p From 91d72a316ad82a1b3125bd99754d46f0d9a2f6e1 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Feb 2026 23:26:04 +0800 Subject: [PATCH 09/16] Revert "Fix claude-sonnet-4-6 config: set top_p=None to avoid conflict" This reverts commit bca02e843f5e324878a4e4c3038ec1294a463e31. 
--- .github/run-eval/resolve_model_config.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index 8bcbc2c19b..445f3cdfa3 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -79,10 +79,7 @@ "display_name": "Claude Sonnet 4.6", "llm_config": { "model": "litellm_proxy/anthropic/claude-sonnet-4-6", - # Note: claude-sonnet-4-6 doesn't support both temperature and top_p - # so we only set temperature (SDK default top_p is 1.0) "temperature": 0.0, - "top_p": None, }, }, "gemini-3-pro": { From c52d46910dc2b9ecb8d02921638f44260e966d0d Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Feb 2026 23:26:22 +0800 Subject: [PATCH 10/16] Revert "Fix: Run only integration tests on push trigger" This reverts commit bcc0ab41af0d637975d053e9e5d8de70a4ad5a88. --- .github/workflows/integration-runner.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 8eba56386f..265635bab6 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -287,9 +287,6 @@ jobs: elif [ "${{ github.event_name }}" = "schedule" ]; then TEST_TYPE_ARGS="--test-type integration" echo "Scheduled run; running integration tests only." - elif [ "${{ github.event_name }}" = "push" ]; then - TEST_TYPE_ARGS="--test-type integration" - echo "Push trigger; running integration tests only." else echo "Running full integration test suite." fi From eb6a3c1164ac06697ba147a4afa93819c3e901e2 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Feb 2026 23:26:25 +0800 Subject: [PATCH 11/16] Revert "ci: add temporary push trigger for testing workflow changes" This reverts commit f595d48cc998c1f71e2c957dda6a083273fe3e57. --- .github/workflows/integration-runner.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 265635bab6..4b2684b8bb 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -9,10 +9,6 @@ on: pull_request_target: types: - labeled - # TODO: Remove this push trigger after PR #2113 is merged - push: - branches: - - update-integration-test-model-to-sonnet-4-6 workflow_dispatch: inputs: reason: @@ -219,7 +215,6 @@ jobs: ) ) || github.event_name == 'workflow_dispatch' || - github.event_name == 'push' || (github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk') ) && needs.setup-matrix.result == 'success' needs: [setup-matrix, post-label-comment, post-dispatch-comment] @@ -373,7 +368,6 @@ jobs: ) ) || github.event_name == 'workflow_dispatch' || - github.event_name == 'push' || (github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk') ) runs-on: ubuntu-24.04 From 9adeca1663387e685d5c5661e6bc224f30638390 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Feb 2026 23:26:29 +0800 Subject: [PATCH 12/16] Revert "fix: make litellm import lazy in resolve_model_config.py" This reverts commit 45ec6985556fcf8b43645da5c020c9d11d1176cf. 
--- .github/run-eval/resolve_model_config.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index 445f3cdfa3..c6cb7edc31 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -12,16 +12,12 @@ - models_json: JSON array of full model configs with display names """ -from __future__ import annotations - import json import os import sys -from typing import TYPE_CHECKING, Any - +from typing import Any -if TYPE_CHECKING: - pass +import litellm # Model configurations dictionary @@ -239,8 +235,6 @@ def test_model( Returns: Tuple of (success: bool, message: str) """ - import litellm - llm_config = model_config.get("llm_config", {}) model_name = llm_config.get("model", "unknown") display_name = model_config.get("display_name", model_name) From ccd9d13fbe81b460425f8e1443d060c0662fff60 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Feb 2026 23:26:32 +0800 Subject: [PATCH 13/16] Revert "fix: install litellm before resolving model configs in integration workflow" This reverts commit 8d7940945d92e74b87d80132b1b8af8131340040. --- .github/workflows/integration-runner.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 4b2684b8bb..86ae4a1f60 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -71,9 +71,6 @@ jobs: with: python-version: '3.13' - - name: Install required dependencies - run: pip install litellm - - name: Resolve model configurations id: resolve-models env: From f7a176e602b8ca2f5458cf430cb63269558f295a Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Feb 2026 23:27:33 +0800 Subject: [PATCH 14/16] add sonnet to extended thinking and prompt caching models --- openhands-sdk/openhands/sdk/llm/utils/model_features.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py index d454f5aa88..3f16b9c8ab 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py +++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py @@ -71,6 +71,7 @@ class ModelFeatures: # Anthropic Opus 4.5 and 4.6 "claude-opus-4-5", "claude-opus-4-6", + "claude-sonnet-4-6", # Nova 2 Lite "nova-2-lite", ] @@ -96,6 +97,7 @@ class ModelFeatures: "claude-haiku-4-5", "claude-opus-4-5", "claude-opus-4-6", + "claude-sonnet-4-6", ] # Models that support a top-level prompt_cache_retention parameter From 2f8bba5766e2344badc62f907db099cb02f50b46 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 19 Feb 2026 16:20:40 +0000 Subject: [PATCH 15/16] fix: make litellm import lazy in resolve_model_config.py Move the litellm import inside the test_model function to prevent import errors when the workflow only needs to access the MODELS dict. The integration workflow setup-matrix step imports the module to read model configurations, but doesn't need litellm at that point. 
Co-authored-by: openhands --- .github/run-eval/resolve_model_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index c6cb7edc31..bdaebb3512 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -17,8 +17,6 @@ import sys from typing import Any -import litellm - # Model configurations dictionary MODELS = { @@ -235,6 +233,8 @@ def test_model( Returns: Tuple of (success: bool, message: str) """ + import litellm + llm_config = model_config.get("llm_config", {}) model_name = llm_config.get("model", "unknown") display_name = model_config.get("display_name", model_name) From 70133088b6ba54a4df873e59bac191a887af14b1 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 19 Feb 2026 16:48:16 +0000 Subject: [PATCH 16/16] ci: restore Blacksmith runners for integration tests Switch back to Blacksmith CI runners for performance-critical jobs: - run-integration-tests: blacksmith-4vcpu-ubuntu-2204 (was ubuntu-22.04) - consolidate-results: blacksmith-2vcpu-ubuntu-2404 (was ubuntu-24.04) This should significantly improve integration test execution time. Co-authored-by: openhands --- .github/workflows/integration-runner.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 86ae4a1f60..f2d2e45561 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -215,7 +215,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk') ) && needs.setup-matrix.result == 'success' needs: [setup-matrix, post-label-comment, post-dispatch-comment] - runs-on: ubuntu-22.04 + runs-on: blacksmith-4vcpu-ubuntu-2204 permissions: contents: read id-token: write @@ -367,7 +367,7 @@ jobs: github.event_name == 'workflow_dispatch' || (github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk') ) - runs-on: ubuntu-24.04 + runs-on: blacksmith-2vcpu-ubuntu-2404 permissions: contents: read pull-requests: write
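For reference, the lazy-import pattern that patches 03, 12, and 15 converge on is sketched below. Only the MODELS entry and the placement of `import litellm` mirror the series; the body of test_model is an illustrative stub under those assumptions, not the script's exact code.

    # A minimal sketch of the lazy-import pattern, assuming the same module
    # layout as resolve_model_config.py; only the MODELS entry and the
    # placement of `import litellm` mirror the patch series, the rest is a
    # stub.

    MODELS = {
        "claude-sonnet-4-6": {
            "display_name": "Claude Sonnet 4.6",
            "llm_config": {
                "model": "litellm_proxy/anthropic/claude-sonnet-4-6",
                "temperature": 0.0,
            },
        },
    }

    def test_model(model_config: dict) -> tuple[bool, str]:
        # Imported lazily so the setup-matrix job can import this module
        # just to read MODELS without having litellm installed.
        import litellm  # noqa: F401

        llm_config = model_config.get("llm_config", {})
        model_name = llm_config.get("model", "unknown")
        display_name = model_config.get("display_name", model_name)
        # ... the real script exercises the model via litellm here ...
        return True, f"{display_name} ({model_name}): litellm importable"

This keeps the setup-matrix job dependency-free — which is why the `pip install litellm` step from patch 02 could be reverted in patch 13 — while test_model still gets litellm at the point where a live model check actually runs.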