From 81c933e456233340fead3db363859cf08c3c7c4d Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Feb 2026 09:03:19 +0000 Subject: [PATCH 01/16] Update integration tests to use claude-sonnet-4-6 Replace claude-sonnet-4-5-20250929 with claude-sonnet-4-6 in the integration test workflow default model list. Co-authored-by: openhands --- .github/workflows/integration-runner.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 63cb881303..86ae4a1f60 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -22,7 +22,7 @@ on: model_ids: description: >- Comma-separated model IDs to test (from resolve_model_config.py). - Example: claude-sonnet-4-5-20250929,glm-4.7. Defaults to a standard set. + Example: claude-sonnet-4-6,glm-4.7. Defaults to a standard set. required: false default: '' type: string @@ -50,7 +50,7 @@ on: env: N_PROCESSES: 4 # Global configuration for number of parallel processes for evaluation # Default models for scheduled/label-triggered runs (subset of models from resolve_model_config.py) - DEFAULT_MODEL_IDS: claude-sonnet-4-5-20250929,deepseek-v3.2-reasoner,kimi-k2-thinking,gemini-3-pro + DEFAULT_MODEL_IDS: claude-sonnet-4-6,deepseek-v3.2-reasoner,kimi-k2-thinking,gemini-3-pro jobs: setup-matrix: From 8d7940945d92e74b87d80132b1b8af8131340040 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Feb 2026 14:24:41 +0000 Subject: [PATCH 02/16] fix: install litellm before resolving model configs in integration workflow The setup-matrix job was failing because resolve_model_config.py imports litellm, but the dependency was not installed before running the script. Co-authored-by: openhands --- .github/workflows/integration-runner.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 86ae4a1f60..4b2684b8bb 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -71,6 +71,9 @@ jobs: with: python-version: '3.13' + - name: Install required dependencies + run: pip install litellm + - name: Resolve model configurations id: resolve-models env: From 45ec6985556fcf8b43645da5c020c9d11d1176cf Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Feb 2026 14:28:45 +0000 Subject: [PATCH 03/16] fix: make litellm import lazy in resolve_model_config.py The MODELS dictionary is used by the integration-runner workflow to resolve model configurations without needing litellm installed. By moving the litellm import inside the test_model function, we allow importing MODELS without requiring the litellm dependency. This fixes the setup-matrix job failure in pull_request_target workflows where the workflow file from main branch is used but the PR code is checked out. 
Co-authored-by: openhands --- .github/run-eval/resolve_model_config.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index c6cb7edc31..445f3cdfa3 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -12,12 +12,16 @@ - models_json: JSON array of full model configs with display names """ +from __future__ import annotations + import json import os import sys -from typing import Any +from typing import TYPE_CHECKING, Any + -import litellm +if TYPE_CHECKING: + pass # Model configurations dictionary @@ -235,6 +239,8 @@ def test_model( Returns: Tuple of (success: bool, message: str) """ + import litellm + llm_config = model_config.get("llm_config", {}) model_name = llm_config.get("model", "unknown") display_name = model_config.get("display_name", model_name) From f595d48cc998c1f71e2c957dda6a083273fe3e57 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Feb 2026 14:32:19 +0000 Subject: [PATCH 04/16] ci: add temporary push trigger for testing workflow changes Add a push trigger for this branch so the workflow runs using the PR's code instead of the main branch's workflow file. TODO: Remove the push trigger after this PR is merged. Co-authored-by: openhands --- .github/workflows/integration-runner.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 4b2684b8bb..265635bab6 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -9,6 +9,10 @@ on: pull_request_target: types: - labeled + # TODO: Remove this push trigger after PR #2113 is merged + push: + branches: + - update-integration-test-model-to-sonnet-4-6 workflow_dispatch: inputs: reason: @@ -215,6 +219,7 @@ jobs: ) ) || github.event_name == 'workflow_dispatch' || + github.event_name == 'push' || (github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk') ) && needs.setup-matrix.result == 'success' needs: [setup-matrix, post-label-comment, post-dispatch-comment] @@ -368,6 +373,7 @@ jobs: ) ) || github.event_name == 'workflow_dispatch' || + github.event_name == 'push' || (github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk') ) runs-on: ubuntu-24.04 From bcc0ab41af0d637975d053e9e5d8de70a4ad5a88 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Feb 2026 17:46:39 +0000 Subject: [PATCH 05/16] Fix: Run only integration tests on push trigger The push trigger was running the full test suite (18 tests including behavior and condenser tests), causing runs to take hours with reasoning models. Now it runs only integration tests (8 tests) like the schedule trigger. Co-authored-by: openhands --- .github/workflows/integration-runner.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 265635bab6..8eba56386f 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -287,6 +287,9 @@ jobs: elif [ "${{ github.event_name }}" = "schedule" ]; then TEST_TYPE_ARGS="--test-type integration" echo "Scheduled run; running integration tests only." + elif [ "${{ github.event_name }}" = "push" ]; then + TEST_TYPE_ARGS="--test-type integration" + echo "Push trigger; running integration tests only." else echo "Running full integration test suite." 
fi From bca02e843f5e324878a4e4c3038ec1294a463e31 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Feb 2026 18:15:23 +0000 Subject: [PATCH 06/16] Fix claude-sonnet-4-6 config: set top_p=None to avoid conflict Anthropic's claude-sonnet-4-6 model doesn't support having both temperature and top_p specified simultaneously. Set top_p=None to override the SDK default (1.0) when temperature is set. Co-authored-by: openhands --- .github/run-eval/resolve_model_config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index 445f3cdfa3..8bcbc2c19b 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -79,7 +79,10 @@ "display_name": "Claude Sonnet 4.6", "llm_config": { "model": "litellm_proxy/anthropic/claude-sonnet-4-6", + # Note: claude-sonnet-4-6 doesn't support both temperature and top_p + # so we only set temperature (SDK default top_p is 1.0) "temperature": 0.0, + "top_p": None, }, }, "gemini-3-pro": { From 878ae4c4fca2bc2fa74c2e99940e273a587cf804 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Feb 2026 19:42:21 +0000 Subject: [PATCH 07/16] Add supports_top_p feature for claude-sonnet-4-6 Claude Sonnet 4.6 doesn't support both temperature and top_p specified at the same time. This adds a new model feature flag supports_top_p that removes top_p from API calls for models that don't support it. Changes: - Add supports_top_p field to ModelFeatures dataclass - Add SUPPORTS_TOP_P_FALSE_MODELS list with claude-sonnet-4-6 - Update chat_options.py to remove top_p for unsupported models - Revert the top_p=None workaround in resolve_model_config.py - Add tests for the new feature Co-authored-by: openhands --- .github/run-eval/resolve_model_config.py | 3 --- .../openhands/sdk/llm/options/chat_options.py | 5 +++++ .../openhands/sdk/llm/utils/model_features.py | 9 +++++++++ tests/sdk/llm/test_chat_options.py | 14 +++++++++++++ tests/sdk/llm/test_model_features.py | 20 +++++++++++++++++++ 5 files changed, 48 insertions(+), 3 deletions(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index 8bcbc2c19b..445f3cdfa3 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -79,10 +79,7 @@ "display_name": "Claude Sonnet 4.6", "llm_config": { "model": "litellm_proxy/anthropic/claude-sonnet-4-6", - # Note: claude-sonnet-4-6 doesn't support both temperature and top_p - # so we only set temperature (SDK default top_p is 1.0) "temperature": 0.0, - "top_p": None, }, }, "gemini-3-pro": { diff --git a/openhands-sdk/openhands/sdk/llm/options/chat_options.py b/openhands-sdk/openhands/sdk/llm/options/chat_options.py index 7f29acd8db..cbca080f2f 100644 --- a/openhands-sdk/openhands/sdk/llm/options/chat_options.py +++ b/openhands-sdk/openhands/sdk/llm/options/chat_options.py @@ -94,4 +94,9 @@ def select_chat_options( if llm.litellm_extra_body: out["extra_body"] = llm.litellm_extra_body + # Remove top_p for models that don't support it (e.g., claude-sonnet-4-6 doesn't + # support having both temperature and top_p specified) + if not get_features(llm.model).supports_top_p: + out.pop("top_p", None) + return out diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py index d454f5aa88..58f9209838 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py +++ 
b/openhands-sdk/openhands/sdk/llm/utils/model_features.py @@ -47,6 +47,7 @@ class ModelFeatures: force_string_serializer: bool send_reasoning_content: bool supports_prompt_cache_retention: bool + supports_top_p: bool # Model lists capturing current behavior. Keep entries lowercase. @@ -162,6 +163,13 @@ class ModelFeatures: "deepseek/deepseek-reasoner", ] +# Models that do NOT support top_p parameter (or don't support both temperature +# and top_p). When specified, these models will have top_p removed from API calls. +SUPPORTS_TOP_P_FALSE_MODELS: list[str] = [ + # Claude Sonnet 4.6 rejects requests with both temperature and top_p specified + "claude-sonnet-4-6", +] + def get_features(model: str) -> ModelFeatures: """Get model features.""" @@ -177,6 +185,7 @@ def get_features(model: str) -> ModelFeatures: supports_prompt_cache_retention=apply_ordered_model_rules( model, PROMPT_CACHE_RETENTION_MODELS ), + supports_top_p=not model_matches(model, SUPPORTS_TOP_P_FALSE_MODELS), ) diff --git a/tests/sdk/llm/test_chat_options.py b/tests/sdk/llm/test_chat_options.py index ce47695a4f..6e4a504e50 100644 --- a/tests/sdk/llm/test_chat_options.py +++ b/tests/sdk/llm/test_chat_options.py @@ -76,6 +76,20 @@ def test_non_reasoning_model_preserves_temp_and_top_p(): assert out.get("top_p") == 0.7 +def test_claude_sonnet_4_6_strips_top_p(): + """Claude Sonnet 4.6 doesn't support both temperature and top_p.""" + llm = DummyLLM( + model="litellm_proxy/anthropic/claude-sonnet-4-6", + temperature=0.0, + top_p=1.0, + ) + out = select_chat_options(llm, user_kwargs={}, has_tools=True) + + # temperature should be preserved, top_p should be removed + assert out.get("temperature") == 0.0 + assert "top_p" not in out + + def test_azure_renames_max_completion_tokens_to_max_tokens(): llm = DummyLLM(model="azure/gpt-4o") out = select_chat_options(llm, user_kwargs={}, has_tools=True) diff --git a/tests/sdk/llm/test_model_features.py b/tests/sdk/llm/test_model_features.py index 304448a728..c81b146ecd 100644 --- a/tests/sdk/llm/test_model_features.py +++ b/tests/sdk/llm/test_model_features.py @@ -363,3 +363,23 @@ def test_get_default_temperature_case_insensitive(): assert get_default_temperature("KIMI-K2-THINKING") == 1.0 assert get_default_temperature("Kimi-K2-Thinking") == 1.0 assert get_default_temperature("KiMi-k2-ThInKiNg") == 1.0 + + +@pytest.mark.parametrize( + "model,expected_supports_top_p", + [ + # Claude Sonnet 4.6 doesn't support top_p with temperature + ("claude-sonnet-4-6", False), + ("anthropic/claude-sonnet-4-6", False), + ("litellm_proxy/anthropic/claude-sonnet-4-6", False), + # Other models should support top_p + ("gpt-4o", True), + ("claude-sonnet-4-5", True), + ("claude-3-5-sonnet", True), + ("unknown-model", True), + ], +) +def test_supports_top_p(model, expected_supports_top_p): + """Test that models correctly report top_p support.""" + features = get_features(model) + assert features.supports_top_p is expected_supports_top_p From 1162f3b0766c9954f2715a3a76b7665b42176caa Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Feb 2026 23:25:57 +0800 Subject: [PATCH 08/16] Revert "Add supports_top_p feature for claude-sonnet-4-6" This reverts commit 878ae4c4fca2bc2fa74c2e99940e273a587cf804. 
--- .github/run-eval/resolve_model_config.py | 3 +++ .../openhands/sdk/llm/options/chat_options.py | 5 ----- .../openhands/sdk/llm/utils/model_features.py | 9 --------- tests/sdk/llm/test_chat_options.py | 14 ------------- tests/sdk/llm/test_model_features.py | 20 ------------------- 5 files changed, 3 insertions(+), 48 deletions(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index 445f3cdfa3..8bcbc2c19b 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -79,7 +79,10 @@ "display_name": "Claude Sonnet 4.6", "llm_config": { "model": "litellm_proxy/anthropic/claude-sonnet-4-6", + # Note: claude-sonnet-4-6 doesn't support both temperature and top_p + # so we only set temperature (SDK default top_p is 1.0) "temperature": 0.0, + "top_p": None, }, }, "gemini-3-pro": { diff --git a/openhands-sdk/openhands/sdk/llm/options/chat_options.py b/openhands-sdk/openhands/sdk/llm/options/chat_options.py index cbca080f2f..7f29acd8db 100644 --- a/openhands-sdk/openhands/sdk/llm/options/chat_options.py +++ b/openhands-sdk/openhands/sdk/llm/options/chat_options.py @@ -94,9 +94,4 @@ def select_chat_options( if llm.litellm_extra_body: out["extra_body"] = llm.litellm_extra_body - # Remove top_p for models that don't support it (e.g., claude-sonnet-4-6 doesn't - # support having both temperature and top_p specified) - if not get_features(llm.model).supports_top_p: - out.pop("top_p", None) - return out diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py index 58f9209838..d454f5aa88 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py +++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py @@ -47,7 +47,6 @@ class ModelFeatures: force_string_serializer: bool send_reasoning_content: bool supports_prompt_cache_retention: bool - supports_top_p: bool # Model lists capturing current behavior. Keep entries lowercase. @@ -163,13 +162,6 @@ class ModelFeatures: "deepseek/deepseek-reasoner", ] -# Models that do NOT support top_p parameter (or don't support both temperature -# and top_p). When specified, these models will have top_p removed from API calls. 
-SUPPORTS_TOP_P_FALSE_MODELS: list[str] = [ - # Claude Sonnet 4.6 rejects requests with both temperature and top_p specified - "claude-sonnet-4-6", -] - def get_features(model: str) -> ModelFeatures: """Get model features.""" @@ -185,7 +177,6 @@ def get_features(model: str) -> ModelFeatures: supports_prompt_cache_retention=apply_ordered_model_rules( model, PROMPT_CACHE_RETENTION_MODELS ), - supports_top_p=not model_matches(model, SUPPORTS_TOP_P_FALSE_MODELS), ) diff --git a/tests/sdk/llm/test_chat_options.py b/tests/sdk/llm/test_chat_options.py index 6e4a504e50..ce47695a4f 100644 --- a/tests/sdk/llm/test_chat_options.py +++ b/tests/sdk/llm/test_chat_options.py @@ -76,20 +76,6 @@ def test_non_reasoning_model_preserves_temp_and_top_p(): assert out.get("top_p") == 0.7 -def test_claude_sonnet_4_6_strips_top_p(): - """Claude Sonnet 4.6 doesn't support both temperature and top_p.""" - llm = DummyLLM( - model="litellm_proxy/anthropic/claude-sonnet-4-6", - temperature=0.0, - top_p=1.0, - ) - out = select_chat_options(llm, user_kwargs={}, has_tools=True) - - # temperature should be preserved, top_p should be removed - assert out.get("temperature") == 0.0 - assert "top_p" not in out - - def test_azure_renames_max_completion_tokens_to_max_tokens(): llm = DummyLLM(model="azure/gpt-4o") out = select_chat_options(llm, user_kwargs={}, has_tools=True) diff --git a/tests/sdk/llm/test_model_features.py b/tests/sdk/llm/test_model_features.py index c81b146ecd..304448a728 100644 --- a/tests/sdk/llm/test_model_features.py +++ b/tests/sdk/llm/test_model_features.py @@ -363,23 +363,3 @@ def test_get_default_temperature_case_insensitive(): assert get_default_temperature("KIMI-K2-THINKING") == 1.0 assert get_default_temperature("Kimi-K2-Thinking") == 1.0 assert get_default_temperature("KiMi-k2-ThInKiNg") == 1.0 - - -@pytest.mark.parametrize( - "model,expected_supports_top_p", - [ - # Claude Sonnet 4.6 doesn't support top_p with temperature - ("claude-sonnet-4-6", False), - ("anthropic/claude-sonnet-4-6", False), - ("litellm_proxy/anthropic/claude-sonnet-4-6", False), - # Other models should support top_p - ("gpt-4o", True), - ("claude-sonnet-4-5", True), - ("claude-3-5-sonnet", True), - ("unknown-model", True), - ], -) -def test_supports_top_p(model, expected_supports_top_p): - """Test that models correctly report top_p support.""" - features = get_features(model) - assert features.supports_top_p is expected_supports_top_p From 91d72a316ad82a1b3125bd99754d46f0d9a2f6e1 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Feb 2026 23:26:04 +0800 Subject: [PATCH 09/16] Revert "Fix claude-sonnet-4-6 config: set top_p=None to avoid conflict" This reverts commit bca02e843f5e324878a4e4c3038ec1294a463e31. 
--- .github/run-eval/resolve_model_config.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index 8bcbc2c19b..445f3cdfa3 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -79,10 +79,7 @@ "display_name": "Claude Sonnet 4.6", "llm_config": { "model": "litellm_proxy/anthropic/claude-sonnet-4-6", - # Note: claude-sonnet-4-6 doesn't support both temperature and top_p - # so we only set temperature (SDK default top_p is 1.0) "temperature": 0.0, - "top_p": None, }, }, "gemini-3-pro": { From c52d46910dc2b9ecb8d02921638f44260e966d0d Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Feb 2026 23:26:22 +0800 Subject: [PATCH 10/16] Revert "Fix: Run only integration tests on push trigger" This reverts commit bcc0ab41af0d637975d053e9e5d8de70a4ad5a88. --- .github/workflows/integration-runner.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 8eba56386f..265635bab6 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -287,9 +287,6 @@ jobs: elif [ "${{ github.event_name }}" = "schedule" ]; then TEST_TYPE_ARGS="--test-type integration" echo "Scheduled run; running integration tests only." - elif [ "${{ github.event_name }}" = "push" ]; then - TEST_TYPE_ARGS="--test-type integration" - echo "Push trigger; running integration tests only." else echo "Running full integration test suite." fi From eb6a3c1164ac06697ba147a4afa93819c3e901e2 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Feb 2026 23:26:25 +0800 Subject: [PATCH 11/16] Revert "ci: add temporary push trigger for testing workflow changes" This reverts commit f595d48cc998c1f71e2c957dda6a083273fe3e57. --- .github/workflows/integration-runner.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 265635bab6..4b2684b8bb 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -9,10 +9,6 @@ on: pull_request_target: types: - labeled - # TODO: Remove this push trigger after PR #2113 is merged - push: - branches: - - update-integration-test-model-to-sonnet-4-6 workflow_dispatch: inputs: reason: @@ -219,7 +215,6 @@ jobs: ) ) || github.event_name == 'workflow_dispatch' || - github.event_name == 'push' || (github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk') ) && needs.setup-matrix.result == 'success' needs: [setup-matrix, post-label-comment, post-dispatch-comment] @@ -373,7 +368,6 @@ jobs: ) ) || github.event_name == 'workflow_dispatch' || - github.event_name == 'push' || (github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk') ) runs-on: ubuntu-24.04 From 9adeca1663387e685d5c5661e6bc224f30638390 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Feb 2026 23:26:29 +0800 Subject: [PATCH 12/16] Revert "fix: make litellm import lazy in resolve_model_config.py" This reverts commit 45ec6985556fcf8b43645da5c020c9d11d1176cf. 
--- .github/run-eval/resolve_model_config.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index 445f3cdfa3..c6cb7edc31 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -12,16 +12,12 @@ - models_json: JSON array of full model configs with display names """ -from __future__ import annotations - import json import os import sys -from typing import TYPE_CHECKING, Any - +from typing import Any -if TYPE_CHECKING: - pass +import litellm # Model configurations dictionary @@ -239,8 +235,6 @@ def test_model( Returns: Tuple of (success: bool, message: str) """ - import litellm - llm_config = model_config.get("llm_config", {}) model_name = llm_config.get("model", "unknown") display_name = model_config.get("display_name", model_name) From ccd9d13fbe81b460425f8e1443d060c0662fff60 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Feb 2026 23:26:32 +0800 Subject: [PATCH 13/16] Revert "fix: install litellm before resolving model configs in integration workflow" This reverts commit 8d7940945d92e74b87d80132b1b8af8131340040. --- .github/workflows/integration-runner.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 4b2684b8bb..86ae4a1f60 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -71,9 +71,6 @@ jobs: with: python-version: '3.13' - - name: Install required dependencies - run: pip install litellm - - name: Resolve model configurations id: resolve-models env: From f7a176e602b8ca2f5458cf430cb63269558f295a Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Feb 2026 23:27:33 +0800 Subject: [PATCH 14/16] add sonnet to extended thinking and prompt caching models --- openhands-sdk/openhands/sdk/llm/utils/model_features.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py index d454f5aa88..3f16b9c8ab 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py +++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py @@ -71,6 +71,7 @@ class ModelFeatures: # Anthropic Opus 4.5 and 4.6 "claude-opus-4-5", "claude-opus-4-6", + "claude-sonnet-4-6", # Nova 2 Lite "nova-2-lite", ] @@ -96,6 +97,7 @@ class ModelFeatures: "claude-haiku-4-5", "claude-opus-4-5", "claude-opus-4-6", + "claude-sonnet-4-6", ] # Models that support a top-level prompt_cache_retention parameter From 2f8bba5766e2344badc62f907db099cb02f50b46 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 19 Feb 2026 16:20:40 +0000 Subject: [PATCH 15/16] fix: make litellm import lazy in resolve_model_config.py Move the litellm import inside the test_model function to prevent import errors when the workflow only needs to access the MODELS dict. The integration workflow setup-matrix step imports the module to read model configurations, but doesn't need litellm at that point. 
Co-authored-by: openhands --- .github/run-eval/resolve_model_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index c6cb7edc31..bdaebb3512 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -17,8 +17,6 @@ import sys from typing import Any -import litellm - # Model configurations dictionary MODELS = { @@ -235,6 +233,8 @@ def test_model( Returns: Tuple of (success: bool, message: str) """ + import litellm + llm_config = model_config.get("llm_config", {}) model_name = llm_config.get("model", "unknown") display_name = model_config.get("display_name", model_name) From 70133088b6ba54a4df873e59bac191a887af14b1 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 19 Feb 2026 16:48:16 +0000 Subject: [PATCH 16/16] ci: restore Blacksmith runners for integration tests Switch back to Blacksmith CI runners for performance-critical jobs: - run-integration-tests: blacksmith-4vcpu-ubuntu-2204 (was ubuntu-22.04) - consolidate-results: blacksmith-2vcpu-ubuntu-2404 (was ubuntu-24.04) This should significantly improve integration test execution time. Co-authored-by: openhands --- .github/workflows/integration-runner.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 86ae4a1f60..f2d2e45561 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -215,7 +215,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk') ) && needs.setup-matrix.result == 'success' needs: [setup-matrix, post-label-comment, post-dispatch-comment] - runs-on: ubuntu-22.04 + runs-on: blacksmith-4vcpu-ubuntu-2204 permissions: contents: read id-token: write @@ -367,7 +367,7 @@ jobs: github.event_name == 'workflow_dispatch' || (github.event_name == 'schedule' && github.repository == 'OpenHands/software-agent-sdk') ) - runs-on: ubuntu-24.04 + runs-on: blacksmith-2vcpu-ubuntu-2404 permissions: contents: read pull-requests: write
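For reference, the lazy-import pattern that patches 03, 12, and 15 converge on is sketched below. Only the MODELS entry and the placement of `import litellm` mirror the series; the body of test_model is an illustrative stub under those assumptions, not the script's exact code.

    # A minimal sketch of the lazy-import pattern, assuming the same module
    # layout as resolve_model_config.py; only the MODELS entry and the
    # placement of `import litellm` mirror the patch series, the rest is a
    # stub.

    MODELS = {
        "claude-sonnet-4-6": {
            "display_name": "Claude Sonnet 4.6",
            "llm_config": {
                "model": "litellm_proxy/anthropic/claude-sonnet-4-6",
                "temperature": 0.0,
            },
        },
    }

    def test_model(model_config: dict) -> tuple[bool, str]:
        # Imported lazily so the setup-matrix job can import this module
        # just to read MODELS without having litellm installed.
        import litellm  # noqa: F401

        llm_config = model_config.get("llm_config", {})
        model_name = llm_config.get("model", "unknown")
        display_name = model_config.get("display_name", model_name)
        # ... the real script exercises the model via litellm here ...
        return True, f"{display_name} ({model_name}): litellm importable"

This keeps the setup-matrix job dependency-free — which is why the `pip install litellm` step from patch 02 could be reverted in patch 13 — while test_model still gets litellm at the point where a live model check actually runs.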