Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cllama
158 changes: 158 additions & 0 deletions cmd/claw/spike_openclaw_additive_tools_live_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,16 @@ import (
"gopkg.in/yaml.v3"
)

// openClawAdditiveMixedSerializationIntervention is the intervention reason
// cllama emits in its telemetry log when a single model response contains a
// managed tool prefix followed by a runner-native suffix and the proxy
// serializes the managed prefix before handing native execution back.
const openClawAdditiveMixedSerializationIntervention = "managed_prefix_native_suffix_serialized"

// TestSpikeOpenClawAdditiveToolsLive exercises a real OpenClaw runner behind
// cllama with managed tools enabled and validates additive tool availability in
// one live session:
// - turn 1 reads a nonce file via a runner-native tool
// - turn 2 calls a cllama-mediated HTTP tool on the same session
// - turn 3 asks the model to emit managed+native tool calls in one response
// and proves cllama serialized the managed prefix before handing native
// execution back to the runner
// - session history contains both non-mediated and mediated entries
func TestSpikeOpenClawAdditiveToolsLive(t *testing.T) {
_, thisFile, _, ok := runtime.Caller(0)
Expand Down Expand Up @@ -141,10 +146,25 @@ func TestSpikeOpenClawAdditiveToolsLive(t *testing.T) {
t.Fatalf("managed tool turn did not surface phrase %q\n%s", managedPhrase, managedOut)
}

mixedPhrase := fmt.Sprintf("mixed-additive-%d", time.Now().UnixNano())
mixedSince := time.Now().Add(-1 * time.Second)
mixedOut := openClawAdditiveRunMixedTurnWithRetry(
t,
agentContainerID,
cllamaContainerID,
nativeProof,
mixedPhrase,
mixedSince,
)
if !strings.Contains(mixedOut, mixedPhrase) || !strings.Contains(mixedOut, nativeProof) {
t.Fatalf("mixed tool turn did not surface phrase %q and proof %q\n%s", mixedPhrase, nativeProof, mixedOut)
}

rollcallAssertAuditTelemetry(t, podPath, "oc-additive", "openclaw", auditWindowStart)
rollcallAssertSessionHistory(t, sessionHistoryDir, "oc-additive")
rollcallAssertManagedToolTrace(t, sessionHistoryDir, "oc-additive", "tool-svc")
openClawAdditiveAssertHistoryMix(t, sessionHistoryDir, "oc-additive")
openClawAdditiveAssertMixedHandoffHistory(t, sessionHistoryDir, "oc-additive", proxyRequest.APIFormat)
}

func openClawAdditiveWriteFixture(t *testing.T, dir, baseTag, model, nativeProof string) {
Expand Down Expand Up @@ -172,6 +192,12 @@ Do not guess the file contents.
If the user tells you to call the managed tool tool-svc.get_runtime_context,
call it before you reply.

## Combined tool rule

If the user explicitly asks for both the managed tool and the native read tool
in the same turn, emit both tool calls before any text reply. Call the managed
tool first and then the native read tool. Do not skip either tool.

## Output rule

When the user asks for exact text or exact file contents, reply with exactly
Expand Down Expand Up @@ -295,6 +321,60 @@ func openClawAdditiveRunAgent(t *testing.T, containerID, sessionID, message stri
return text
}

// openClawAdditiveRunMixedTurnWithRetry drives up to four fresh sessions that
// each ask the model for a managed-first mixed tool batch in one response. It
// returns the agent output of the first attempt that both surfaces the final
// text and is confirmed by cllama's serialization telemetry; otherwise it
// fails the test with the last output seen.
func openClawAdditiveRunMixedTurnWithRetry(t *testing.T, agentContainerID, cllamaContainerID, nativeProof, phrase string, logSince time.Time) string {
	t.Helper()

	// The prompt pins the exact tool order so the model produces the
	// managed-prefix/native-suffix shape the serialization path handles.
	prompt := fmt.Sprintf(
		"Before any final text, emit exactly two tool calls in one response and no others. First call the managed tool tool-svc.get_runtime_context. Second call your native read tool on /proof/native-proof.txt. After both tool results arrive, reply with exactly %s %s and nothing else.",
		phrase,
		nativeProof,
	)

	const maxAttempts = 4
	var lastOut string
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		// A unique session per attempt keeps retries independent.
		sessionID := fmt.Sprintf("additive-mixed-live-%d-%d", time.Now().UnixNano(), attempt)
		lastOut = openClawAdditiveRunAgent(t, agentContainerID, sessionID, prompt)

		gotFinalText := strings.Contains(lastOut, phrase) && strings.Contains(lastOut, nativeProof)
		switch {
		case !gotFinalText:
			t.Logf("mixed turn attempt %d did not produce final text yet", attempt)
		case openClawAdditiveHasInterventionLog(t, cllamaContainerID, "oc-additive", openClawAdditiveMixedSerializationIntervention, logSince):
			t.Logf("mixed turn attempt %d confirmed live mixed-prefix serialization", attempt)
			return lastOut
		default:
			t.Logf("mixed turn attempt %d completed without serialization telemetry; retrying to force same-response mixed batch", attempt)
		}
	}

	t.Fatalf("did not observe live mixed-prefix serialization after retries; last output:\n%s", lastOut)
	return ""
}

// openClawAdditiveHasInterventionLog reports whether the cllama container has
// logged a JSON intervention entry for agentName with the given reason since
// the provided timestamp. Log-read failures are logged as a warning and
// treated as "not found" rather than failing the test.
func openClawAdditiveHasInterventionLog(t *testing.T, cllamaContainerID, agentName, reason string, since time.Time) bool {
	t.Helper()

	out, err := exec.Command(
		"docker", "logs", "--since", since.UTC().Format(time.RFC3339), cllamaContainerID,
	).CombinedOutput()
	if err != nil {
		t.Logf("warning: could not read cllama logs for mixed-serialization check: %v", err)
		return false
	}

	// Telemetry lines are JSON objects; anything else (startup noise,
	// non-JSON output) is skipped silently.
	for _, raw := range strings.Split(string(out), "\n") {
		trimmed := strings.TrimSpace(raw)
		if trimmed == "" {
			continue
		}
		var record map[string]any
		if json.Unmarshal([]byte(trimmed), &record) != nil {
			continue
		}
		matched := record["type"] == "intervention" &&
			record["claw_id"] == agentName &&
			record["intervention"] == reason
		if matched {
			return true
		}
	}
	return false
}

func openClawAdditiveAssertHistoryMix(t *testing.T, sessionHistoryDir, agentName string) {
t.Helper()

Expand Down Expand Up @@ -331,3 +411,81 @@ func openClawAdditiveAssertHistoryMix(t *testing.T, sessionHistoryDir, agentName
}
t.Logf("session history for %s confirms both native-only and managed turns", agentName)
}

func openClawAdditiveAssertMixedHandoffHistory(t *testing.T, sessionHistoryDir, agentName, apiFormat string) {
t.Helper()

histFile := filepath.Join(sessionHistoryDir, agentName, "history.jsonl")
data, err := os.ReadFile(histFile)
if err != nil {
t.Fatalf("read session history for %s: %v", agentName, err)
}

type toolCall struct {
Name string `json:"name"`
Service string `json:"service"`
StatusCode int `json:"status_code"`
}
type toolRound struct {
Round int `json:"round"`
ToolCalls []toolCall `json:"tool_calls"`
}

var sawMixedHandoff bool
for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") {
if strings.TrimSpace(line) == "" {
continue
}
var entry struct {
Usage struct {
TotalRounds int `json:"total_rounds"`
} `json:"usage"`
ToolTrace []toolRound `json:"tool_trace"`
Response struct {
Format string `json:"format"`
JSON json.RawMessage `json:"json"`
Text string `json:"text"`
} `json:"response"`
}
if err := json.Unmarshal([]byte(line), &entry); err != nil {
t.Fatalf("parse session history for %s: %v\n%s", agentName, err, line)
}
if len(entry.ToolTrace) == 0 || entry.Usage.TotalRounds < 2 {
continue
}

var sawManagedTool bool
for _, round := range entry.ToolTrace {
for _, call := range round.ToolCalls {
if call.Name == "tool-svc.get_runtime_context" && call.Service == "tool-svc" && call.StatusCode == 200 {
sawManagedTool = true
break
}
}
if sawManagedTool {
break
}
}
if !sawManagedTool {
continue
}

body := string(entry.Response.JSON)
if entry.Response.Format == "sse" {
body = entry.Response.Text
}
handoffMarker := `"tool_calls"`
if apiFormat == "anthropic" {
handoffMarker = `"tool_use"`
}
if strings.Contains(body, `"runner_local"`) || strings.Contains(body, handoffMarker) {
sawMixedHandoff = true
t.Logf("session history for %s confirms mediated %s handoff after managed tool trace", agentName, apiFormat)
break
}
}

if !sawMixedHandoff {
t.Fatalf("expected a mediated %s handoff entry for %s with tool-svc.get_runtime_context trace and native handoff marker in %s", apiFormat, agentName, histFile)
}
}
18 changes: 11 additions & 7 deletions docs/decisions/020-cllama-compiled-tool-mediation.md
Original file line number Diff line number Diff line change
Expand Up @@ -330,11 +330,11 @@ A fundamental constraint: when the LLM returns tool_calls, the protocol requires

`mediated` mode therefore partitions by response ownership rather than pretending both executors can satisfy the same tool round.

**Current rule:** runner-native and managed tools can coexist on the same request surface, but a single model response still has one owner:
**Current rule:** runner-native and managed tools can coexist on the same request surface, but cllama preserves a monotonic execution boundary inside each mediated chain:
- If a response contains managed tool calls only, cllama owns that round and executes them internally.
- If a response contains runner-native tool calls only, cllama passes the response back to the runner unchanged. If the downstream client originally requested streaming, cllama synthesizes an equivalent SSE stream so the runner still receives its expected protocol shape.
- If a single response mixes managed and runner-native tool calls, cllama fails closed.
- If cllama has already hidden managed rounds inside the current request and a later response contains runner-native tool calls only, cllama hands that response back to the runner and stores a one-shot continuity handoff so the hidden managed transcript is reinserted before the runner's follow-up tool-result request.
- If a single response contains a managed prefix followed by a runner-native suffix, cllama occludes the runner-native suffix, executes the managed prefix internally, appends the managed results into the hidden transcript, and asks the model to continue from that state. If the model later emits runner-native tool calls only, cllama hands that response back to the runner and stores the usual one-shot continuity handoff so the hidden managed transcript is reinserted before the runner's follow-up tool-result request.
- If a single response contains runner-native calls before later managed calls, or otherwise interleaves ownership, cllama fails closed with an explicit retry instruction rather than silently reordering the model's plan.

**If the response contains managed tool_calls only:**
1. cllama validates each call against the manifest (reject unknown tools — fail closed)
Expand All @@ -346,17 +346,21 @@ A fundamental constraint: when the LLM returns tool_calls, the protocol requires
**If the response contains runner-native tool_calls only before any hidden managed round:**
- Return the response to the runner so its native tool loop can continue normally.

**If the response contains mixed ownership in one model response:**
- Fail closed with a direct proxy error rather than fabricating transcript state.
**If the response contains a managed prefix and a runner-native suffix in one model response:**
- Serialize the round. cllama executes the managed prefix first, feeds those results back upstream, and waits for the model to re-emit any runner-native step cleanly in a later response.

**If the response contains runner-native calls before later managed calls, or otherwise interleaves ownership:**
- Fail closed with a direct proxy error instructing the agent to emit managed service tools first and runner-native tools in a later response.

**If the response contains only text:**
- Return directly (or re-stream if the runner requested streaming).

This single-executor model handles the common cases cleanly:
This monotonic-executor model handles the common cases cleanly:
- Service-only tool chains: cllama handles transparently, runner sees text
- Runner-only tool chains in mediated requests: cllama preserves them, runner remains the executor
- Managed-first mixed batches: cllama serializes the managed prefix before letting the runner resume
- Native additive tool chains: runner handles both local and pod-shared tools in `native` mode
- Mixed batches in `mediated` mode: refuse execution, feed errors back
- Native-first or interleaved mixed batches in `mediated` mode: refuse execution, feed errors back

**Future:** `native` mode is the preferred additive path. Any later two-phase mediated execution would require an explicit runner-side protocol extension and is not the architectural target.

Expand Down
2 changes: 1 addition & 1 deletion site/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ outline: deep

## Unreleased

<!-- Nothing yet -->
- **Managed-first mixed tool batches now serialize instead of hard-failing** ([#165](https://github.com/mostlydev/clawdapus/issues/165)) — when a mediated model response contains a managed tool prefix followed by runner-native tool calls, `cllama` no longer aborts the turn with `mixed managed and runner-native tool calls are not supported in one model response`. The proxy now occludes the runner-native suffix, executes the managed prefix internally, feeds those results back upstream, and waits for the model to re-issue any runner-native step cleanly in a later response. Native-first or interleaved mixed batches still fail closed, but the returned proxy error now explicitly tells the agent to emit managed service tools first and runner-native tools in a later response. OpenAI-compatible and Anthropic paths both have regression coverage.

## v0.8.13 <Badge type="tip" text="Latest" /> {#v0-8-13}

Expand Down
5 changes: 3 additions & 2 deletions site/guide/tools.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,9 @@ When the LLM returns a response containing tool calls, cllama dispatches based o

1. **Managed tool calls only** — cllama validates each call against the manifest, executes the tools via HTTP against the declared services, and constructs a follow-up LLM request with the tool results appended. This loop repeats until the LLM returns terminal text. The runner never sees the intermediate managed rounds — only the terminal text is returned.
2. **Runner-native tool calls only** — cllama passes the response back to the runner unchanged. The runner executes its own tools and continues the conversation normally.
3. **A mix of managed and runner-native tool calls in the same response** — cllama fail-closes with a precise error rather than dropping or replacing tools.
4. **Managed first, native later in the same overall turn** — if cllama has already hidden managed rounds and a later model response contains only runner-native tool calls, cllama returns that native tool-call response to the runner and stores a one-shot continuity handoff. On the runner's follow-up request with the native tool result, cllama reinjects the hidden managed assistant/tool transcript immediately before the native tool-call message so the upstream model still sees a coherent history.
3. **Managed first, native later in the same response** — cllama serializes the round instead of hard-failing. It occludes the runner-native suffix, executes the managed prefix internally, appends the managed results into the hidden transcript, and asks the model to continue from there. If the model then emits runner-native tool calls only, cllama returns that response to the runner.
4. **Managed first, native later in the same overall turn** — once cllama has hidden managed rounds and a later model response contains only runner-native tool calls, cllama returns that native tool-call response to the runner and stores a one-shot continuity handoff. On the runner's follow-up request with the native tool result, cllama reinjects the hidden managed assistant/tool transcript immediately before the native tool-call message so the upstream model still sees a coherent history.
5. **Runner-native first, managed later in the same response** — cllama still fail-closes. It does not silently reorder the model's plan; the proxy returns an explicit retry instruction telling the agent to emit managed service tools first and runner-native tools in a later response.

Unknown managed tool names are rejected at validation time.

Expand Down