From 183a8b3a3e9a975511aac5781011a73e89b9e6ef Mon Sep 17 00:00:00 2001 From: Wojtek Date: Thu, 16 Apr 2026 14:20:02 -0400 Subject: [PATCH 1/2] Validate mixed tool serialization live --- cllama | 2 +- ...spike_openclaw_additive_tools_live_test.go | 134 ++++++++++++++++++ .../020-cllama-compiled-tool-mediation.md | 18 ++- site/changelog.md | 2 +- site/guide/tools.md | 5 +- 5 files changed, 150 insertions(+), 11 deletions(-) diff --git a/cllama b/cllama index 7de6f30..3ea4762 160000 --- a/cllama +++ b/cllama @@ -1 +1 @@ -Subproject commit 7de6f30ee7c3519e5e72e81a51055ca75ff3d299 +Subproject commit 3ea476274525bc3e33eecea8c477cd7a998b89bd diff --git a/cmd/claw/spike_openclaw_additive_tools_live_test.go b/cmd/claw/spike_openclaw_additive_tools_live_test.go index 1b98bb1..643939b 100644 --- a/cmd/claw/spike_openclaw_additive_tools_live_test.go +++ b/cmd/claw/spike_openclaw_additive_tools_live_test.go @@ -16,11 +16,16 @@ import ( "gopkg.in/yaml.v3" ) +const openClawAdditiveMixedSerializationIntervention = "managed_prefix_native_suffix_serialized" + // TestSpikeOpenClawAdditiveToolsLive exercises a real OpenClaw runner behind // cllama with managed tools enabled and validates additive tool availability in // one live session: // - turn 1 reads a nonce file via a runner-native tool // - turn 2 calls a cllama-mediated HTTP tool on the same session +// - turn 3 asks the model to emit managed+native tool calls in one response +// and proves cllama serialized the managed prefix before handing native +// execution back to the runner // - session history contains both non-mediated and mediated entries func TestSpikeOpenClawAdditiveToolsLive(t *testing.T) { _, thisFile, _, ok := runtime.Caller(0) @@ -141,10 +146,25 @@ func TestSpikeOpenClawAdditiveToolsLive(t *testing.T) { t.Fatalf("managed tool turn did not surface phrase %q\n%s", managedPhrase, managedOut) } + mixedPhrase := fmt.Sprintf("mixed-additive-%d", time.Now().UnixNano()) + mixedSince := time.Now().Add(-1 * time.Second) 
+ mixedOut := openClawAdditiveRunMixedTurnWithRetry( + t, + agentContainerID, + cllamaContainerID, + nativeProof, + mixedPhrase, + mixedSince, + ) + if !strings.Contains(mixedOut, mixedPhrase) || !strings.Contains(mixedOut, nativeProof) { + t.Fatalf("mixed tool turn did not surface phrase %q and proof %q\n%s", mixedPhrase, nativeProof, mixedOut) + } + rollcallAssertAuditTelemetry(t, podPath, "oc-additive", "openclaw", auditWindowStart) rollcallAssertSessionHistory(t, sessionHistoryDir, "oc-additive") rollcallAssertManagedToolTrace(t, sessionHistoryDir, "oc-additive", "tool-svc") openClawAdditiveAssertHistoryMix(t, sessionHistoryDir, "oc-additive") + openClawAdditiveAssertMixedHandoffHistory(t, sessionHistoryDir, "oc-additive", proxyRequest.APIFormat) } func openClawAdditiveWriteFixture(t *testing.T, dir, baseTag, model, nativeProof string) { @@ -172,6 +192,12 @@ Do not guess the file contents. If the user tells you to call the managed tool tool-svc.get_runtime_context, call it before you reply. +## Combined tool rule + +If the user explicitly asks for both the managed tool and the native read tool +in the same turn, emit both tool calls before any text reply. Call the managed +tool first and then the native read tool. Do not skip either tool. + ## Output rule When the user asks for exact text or exact file contents, reply with exactly @@ -295,6 +321,60 @@ func openClawAdditiveRunAgent(t *testing.T, containerID, sessionID, message stri return text } +func openClawAdditiveRunMixedTurnWithRetry(t *testing.T, agentContainerID, cllamaContainerID, nativeProof, phrase string, logSince time.Time) string { + t.Helper() + + prompt := fmt.Sprintf( + "Before any final text, emit exactly two tool calls in one response and no others. First call the managed tool tool-svc.get_runtime_context. Second call your native read tool on /proof/native-proof.txt. 
After both tool results arrive, reply with exactly %s %s and nothing else.", + phrase, + nativeProof, + ) + + var lastOut string + for attempt := 1; attempt <= 4; attempt++ { + sessionID := fmt.Sprintf("additive-mixed-live-%d-%d", time.Now().UnixNano(), attempt) + lastOut = openClawAdditiveRunAgent(t, agentContainerID, sessionID, prompt) + if !strings.Contains(lastOut, phrase) || !strings.Contains(lastOut, nativeProof) { + t.Logf("mixed turn attempt %d did not produce final text yet", attempt) + continue + } + if openClawAdditiveHasInterventionLog(t, cllamaContainerID, "oc-additive", openClawAdditiveMixedSerializationIntervention, logSince) { + t.Logf("mixed turn attempt %d confirmed live mixed-prefix serialization", attempt) + return lastOut + } + t.Logf("mixed turn attempt %d completed without serialization telemetry; retrying to force same-response mixed batch", attempt) + } + + t.Fatalf("did not observe live mixed-prefix serialization after retries; last output:\n%s", lastOut) + return "" +} + +func openClawAdditiveHasInterventionLog(t *testing.T, cllamaContainerID, agentName, reason string, since time.Time) bool { + t.Helper() + + args := []string{"logs", "--since", since.UTC().Format(time.RFC3339), cllamaContainerID} + out, err := exec.Command("docker", args...).CombinedOutput() + if err != nil { + t.Logf("warning: could not read cllama logs for mixed-serialization check: %v", err) + return false + } + + for _, line := range strings.Split(string(out), "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + var entry map[string]any + if err := json.Unmarshal([]byte(line), &entry); err != nil { + continue + } + if entry["type"] == "intervention" && entry["claw_id"] == agentName && entry["intervention"] == reason { + return true + } + } + return false +} + func openClawAdditiveAssertHistoryMix(t *testing.T, sessionHistoryDir, agentName string) { t.Helper() @@ -331,3 +411,57 @@ func openClawAdditiveAssertHistoryMix(t *testing.T, 
sessionHistoryDir, agentName } t.Logf("session history for %s confirms both native-only and managed turns", agentName) } + +func openClawAdditiveAssertMixedHandoffHistory(t *testing.T, sessionHistoryDir, agentName, apiFormat string) { + t.Helper() + + histFile := filepath.Join(sessionHistoryDir, agentName, "history.jsonl") + data, err := os.ReadFile(histFile) + if err != nil { + t.Fatalf("read session history for %s: %v", agentName, err) + } + + var sawMixedHandoff bool + for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") { + if strings.TrimSpace(line) == "" { + continue + } + var entry struct { + Usage struct { + TotalRounds int `json:"total_rounds"` + } `json:"usage"` + ToolTrace []json.RawMessage `json:"tool_trace"` + Response struct { + Format string `json:"format"` + JSON json.RawMessage `json:"json"` + Text string `json:"text"` + } `json:"response"` + } + if err := json.Unmarshal([]byte(line), &entry); err != nil { + t.Fatalf("parse session history for %s: %v\n%s", agentName, err, line) + } + if len(entry.ToolTrace) == 0 || entry.Usage.TotalRounds < 2 { + continue + } + + body := string(entry.Response.JSON) + if entry.Response.Format == "sse" { + body = entry.Response.Text + } + switch apiFormat { + case "anthropic": + if strings.Contains(body, `"tool_use"`) { + sawMixedHandoff = true + } + default: + if strings.Contains(body, `"tool_calls"`) { + sawMixedHandoff = true + } + } + } + + if !sawMixedHandoff { + t.Fatalf("expected a mediated handoff history entry for %s in %s", agentName, histFile) + } + t.Logf("session history for %s confirms a mediated native handoff entry", agentName) +} diff --git a/docs/decisions/020-cllama-compiled-tool-mediation.md b/docs/decisions/020-cllama-compiled-tool-mediation.md index 42dcd46..0bd72cc 100644 --- a/docs/decisions/020-cllama-compiled-tool-mediation.md +++ b/docs/decisions/020-cllama-compiled-tool-mediation.md @@ -330,11 +330,11 @@ A fundamental constraint: when the LLM returns tool_calls, the 
protocol requires `mediated` mode therefore partitions by response ownership rather than pretending both executors can satisfy the same tool round. -**Current rule:** runner-native and managed tools can coexist on the same request surface, but a single model response still has one owner: +**Current rule:** runner-native and managed tools can coexist on the same request surface, but cllama preserves a monotonic execution boundary inside each mediated chain: - If a response contains managed tool calls only, cllama owns that round and executes them internally. - If a response contains runner-native tool calls only, cllama passes the response back to the runner unchanged. If the downstream client originally requested streaming, cllama synthesizes an equivalent SSE stream so the runner still receives its expected protocol shape. -- If a single response mixes managed and runner-native tool calls, cllama fails closed. -- If cllama has already hidden managed rounds inside the current request and a later response contains runner-native tool calls only, cllama hands that response back to the runner and stores a one-shot continuity handoff so the hidden managed transcript is reinserted before the runner's follow-up tool-result request. +- If a single response contains a managed prefix followed by a runner-native suffix, cllama occludes the runner-native suffix, executes the managed prefix internally, appends the managed results into the hidden transcript, and asks the model to continue from that state. If the model later emits runner-native tool calls only, cllama hands that response back to the runner and stores the usual one-shot continuity handoff so the hidden managed transcript is reinserted before the runner's follow-up tool-result request. +- If a single response contains runner-native calls before later managed calls, or otherwise interleaves ownership, cllama fails closed with an explicit retry instruction rather than silently reordering the model's plan. 
**If the response contains managed tool_calls only:** 1. cllama validates each call against the manifest (reject unknown tools — fail closed) @@ -346,17 +346,21 @@ A fundamental constraint: when the LLM returns tool_calls, the protocol requires **If the response contains runner-native tool_calls only before any hidden managed round:** - Return the response to the runner so its native tool loop can continue normally. -**If the response contains mixed ownership in one model response:** -- Fail closed with a direct proxy error rather than fabricating transcript state. +**If the response contains a managed prefix and a runner-native suffix in one model response:** +- Serialize the round. cllama executes the managed prefix first, feeds those results back upstream, and waits for the model to re-emit any runner-native step cleanly in a later response. + +**If the response contains runner-native calls before later managed calls, or otherwise interleaves ownership:** +- Fail closed with a direct proxy error instructing the agent to emit managed service tools first and runner-native tools in a later response. **If the response contains only text:** - Return directly (or re-stream if the runner requested streaming). -This single-executor model handles the common cases cleanly: +This monotonic-executor model handles the common cases cleanly: - Service-only tool chains: cllama handles transparently, runner sees text - Runner-only tool chains in mediated requests: cllama preserves them, runner remains the executor +- Managed-first mixed batches: cllama serializes the managed prefix before letting the runner resume - Native additive tool chains: runner handles both local and pod-shared tools in `native` mode -- Mixed batches in `mediated` mode: refuse execution, feed errors back +- Native-first or interleaved mixed batches in `mediated` mode: refuse execution, feed errors back **Future:** `native` mode is the preferred additive path. 
Any later two-phase mediated execution would require an explicit runner-side protocol extension and is not the architectural target. diff --git a/site/changelog.md b/site/changelog.md index e949937..67f42e4 100644 --- a/site/changelog.md +++ b/site/changelog.md @@ -29,7 +29,7 @@ outline: deep ## Unreleased - +- **Managed-first mixed tool batches now serialize instead of hard-failing** ([#165](https://github.com/mostlydev/clawdapus/issues/165)) — when a mediated model response contains a managed tool prefix followed by runner-native tool calls, `cllama` no longer aborts the turn with `mixed managed and runner-native tool calls are not supported in one model response`. The proxy now occludes the runner-native suffix, executes the managed prefix internally, feeds those results back upstream, and waits for the model to re-issue any runner-native step cleanly in a later response. Native-first or interleaved mixed batches still fail closed, but the returned proxy error now explicitly tells the agent to emit managed service tools first and runner-native tools in a later response. OpenAI-compatible and Anthropic paths both have regression coverage. ## v0.8.13 {#v0-8-13} diff --git a/site/guide/tools.md b/site/guide/tools.md index 351dbda..0549ca7 100644 --- a/site/guide/tools.md +++ b/site/guide/tools.md @@ -178,8 +178,9 @@ When the LLM returns a response containing tool calls, cllama dispatches based o 1. **Managed tool calls only** — cllama validates each call against the manifest, executes the tools via HTTP against the declared services, and constructs a follow-up LLM request with the tool results appended. This loop repeats until the LLM returns terminal text. The runner never sees the intermediate managed rounds — only the terminal text is returned. 2. **Runner-native tool calls only** — cllama passes the response back to the runner unchanged. The runner executes its own tools and continues the conversation normally. -3. 
**A mix of managed and runner-native tool calls in the same response** — cllama fail-closes with a precise error rather than dropping or replacing tools. -4. **Managed first, native later in the same overall turn** — if cllama has already hidden managed rounds and a later model response contains only runner-native tool calls, cllama returns that native tool-call response to the runner and stores a one-shot continuity handoff. On the runner's follow-up request with the native tool result, cllama reinjects the hidden managed assistant/tool transcript immediately before the native tool-call message so the upstream model still sees a coherent history. +3. **Managed first, native later in the same response** — cllama serializes the round instead of hard-failing. It occludes the runner-native suffix, executes the managed prefix internally, appends the managed results into the hidden transcript, and asks the model to continue from there. If the model then emits runner-native tool calls only, cllama returns that response to the runner. +4. **Managed first, native later in the same overall turn** — once cllama has hidden managed rounds and a later model response contains only runner-native tool calls, cllama returns that native tool-call response to the runner and stores a one-shot continuity handoff. On the runner's follow-up request with the native tool result, cllama reinjects the hidden managed assistant/tool transcript immediately before the native tool-call message so the upstream model still sees a coherent history. +5. **Runner-native first, managed later in the same response** — cllama still fails closed. It does not silently reorder the model's plan; the proxy returns an explicit retry instruction telling the agent to emit managed service tools first and runner-native tools in a later response. Unknown managed tool names are rejected at validation time. 
From 2373bee72d506f80125a5185e73624c3d6eb370f Mon Sep 17 00:00:00 2001 From: Wojtek Date: Thu, 16 Apr 2026 14:38:48 -0400 Subject: [PATCH 2/2] Tighten mixed-tool spike follow-ups --- cllama | 2 +- ...spike_openclaw_additive_tools_live_test.go | 48 ++++++++++++++----- 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/cllama b/cllama index 3ea4762..0000f4b 160000 --- a/cllama +++ b/cllama @@ -1 +1 @@ -Subproject commit 3ea476274525bc3e33eecea8c477cd7a998b89bd +Subproject commit 0000f4bd7fad3537263474bd60a983b14a824594 diff --git a/cmd/claw/spike_openclaw_additive_tools_live_test.go b/cmd/claw/spike_openclaw_additive_tools_live_test.go index 643939b..9e2bd15 100644 --- a/cmd/claw/spike_openclaw_additive_tools_live_test.go +++ b/cmd/claw/spike_openclaw_additive_tools_live_test.go @@ -421,6 +421,16 @@ func openClawAdditiveAssertMixedHandoffHistory(t *testing.T, sessionHistoryDir, t.Fatalf("read session history for %s: %v", agentName, err) } + type toolCall struct { + Name string `json:"name"` + Service string `json:"service"` + StatusCode int `json:"status_code"` + } + type toolRound struct { + Round int `json:"round"` + ToolCalls []toolCall `json:"tool_calls"` + } + var sawMixedHandoff bool for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") { if strings.TrimSpace(line) == "" { @@ -430,7 +440,7 @@ func openClawAdditiveAssertMixedHandoffHistory(t *testing.T, sessionHistoryDir, Usage struct { TotalRounds int `json:"total_rounds"` } `json:"usage"` - ToolTrace []json.RawMessage `json:"tool_trace"` + ToolTrace []toolRound `json:"tool_trace"` Response struct { Format string `json:"format"` JSON json.RawMessage `json:"json"` @@ -444,24 +454,38 @@ func openClawAdditiveAssertMixedHandoffHistory(t *testing.T, sessionHistoryDir, continue } + var sawManagedTool bool + for _, round := range entry.ToolTrace { + for _, call := range round.ToolCalls { + if call.Name == "tool-svc.get_runtime_context" && call.Service == "tool-svc" && call.StatusCode 
== 200 { + sawManagedTool = true + break + } + } + if sawManagedTool { + break + } + } + if !sawManagedTool { + continue + } + body := string(entry.Response.JSON) if entry.Response.Format == "sse" { body = entry.Response.Text } - switch apiFormat { - case "anthropic": - if strings.Contains(body, `"tool_use"`) { - sawMixedHandoff = true - } - default: - if strings.Contains(body, `"tool_calls"`) { - sawMixedHandoff = true - } + handoffMarker := `"tool_calls"` + if apiFormat == "anthropic" { + handoffMarker = `"tool_use"` + } + if strings.Contains(body, `"runner_local"`) || strings.Contains(body, handoffMarker) { + sawMixedHandoff = true + t.Logf("session history for %s confirms mediated %s handoff after managed tool trace", agentName, apiFormat) + break } } if !sawMixedHandoff { - t.Fatalf("expected a mediated handoff history entry for %s in %s", agentName, histFile) + t.Fatalf("expected a mediated %s handoff entry for %s with tool-svc.get_runtime_context trace and native handoff marker in %s", apiFormat, agentName, histFile) } - t.Logf("session history for %s confirms a mediated native handoff entry", agentName) }