From 8eeb80d067de17f2ad4177138cae00257a58d93c Mon Sep 17 00:00:00 2001
From: Wojtek <wojtek@grabski.ca>
Date: Thu, 9 Apr 2026 18:14:12 -0400
Subject: [PATCH] Spike: multi-model rollcall matrix exercising both cllama
 ingress surfaces
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refactor TestSpikeRollCall around a per-runtime config matrix so the spike
exercises both `/v1/messages` and `/v1/chat/completions` in a single run, and
add a dedicated openclaw + google/gemini variant as the end-to-end regression
test for the original #127 incident that motivated ADR-023.

Each matrix entry pins a model (real runners override `x-claw.models.primary`
so cllama's policy actually allows it; stubs set both `ROLLCALL_CLLAMA_MODEL`
and the same policy override) and an `expectedSurface` that a final
`ingress_surface_coverage` subtest asserts both canonical cllama surfaces have
been hit. `session_history_persistence` now skips matrix entries whose
required keys were missing so partial-coverage runs do not false-fail.

`pc-roll` is gated behind `CLAW_SPIKE_ENABLE_PICOCLAW` while #137 is open —
the picoclaw gateway port=0 failure reproduces on master and is unrelated to
this change. The capability-wave-live spike grows its own
`capabilityWaveProxyRequest` helper since the old shared single-model picker
was deleted.
---
 cmd/claw/spike_capability_wave_live_test.go |  38 ++-
 cmd/claw/spike_rollcall_test.go             | 301 +++++++++++++++++---
 2 files changed, 286 insertions(+), 53 deletions(-)

diff --git a/cmd/claw/spike_capability_wave_live_test.go b/cmd/claw/spike_capability_wave_live_test.go
index 617150f..30eee53 100644
--- a/cmd/claw/spike_capability_wave_live_test.go
+++ b/cmd/claw/spike_capability_wave_live_test.go
@@ -73,12 +73,7 @@ func TestSpikeCapabilityWaveLive(t *testing.T) {
 	t.Setenv("CLLAMA_UI_PORT", env["CLLAMA_UI_PORT"])
 	t.Setenv("CLAWDASH_ADDR", env["CLAWDASH_ADDR"])
 
-	proxyRequest := chooseRollcallProxyRequest(
-		t,
-		strings.TrimSpace(env["XAI_API_KEY"]),
-		strings.TrimSpace(env["ANTHROPIC_API_KEY"]),
-		strings.TrimSpace(env["OPENROUTER_API_KEY"]),
-	)
+	proxyRequest := capabilityWaveProxyRequest(t, env)
 
 	spikeBuildImage(t, dir, "openclaw:latest", "Dockerfile.openclaw-base")
 	spikeBuildImage(t, dir, "rollcall-openclaw:latest", "agents/oc-roll/Clawfile")
@@ -160,6 +155,37 @@ func TestSpikeCapabilityWaveLive(t *testing.T) {
 	t.Log("clawdash sidecar confirmed running")
 }
 
+// capabilityWaveProxyRequest picks a single inbound proxy-request shape for
+// the capability-wave-live spike. Unlike TestSpikeRollCall this test runs only
+// one runtime, so it just needs one provider/model that the local environment
+// can serve.
+func capabilityWaveProxyRequest(t *testing.T, env map[string]string) rollcallProxyRequest {
+	t.Helper()
+
+	xaiKey := strings.TrimSpace(env["XAI_API_KEY"])
+	anthropicKey := strings.TrimSpace(env["ANTHROPIC_API_KEY"])
+	openrouterKey := strings.TrimSpace(env["OPENROUTER_API_KEY"])
+
+	cfg := rollcallProxyRequest{CllamaEnv: make(map[string]string)}
+	switch {
+	case xaiKey != "":
+		cfg.APIFormat = "openai"
+		cfg.Model = "xai/grok-4-1-fast-reasoning"
+		cfg.CllamaEnv["XAI_API_KEY"] = xaiKey
+	case anthropicKey != "":
+		cfg.APIFormat = "anthropic"
+		cfg.Model = "claude-sonnet-4"
+		cfg.CllamaEnv["ANTHROPIC_API_KEY"] = anthropicKey
+	case openrouterKey != "":
+		cfg.APIFormat = "openai"
+		cfg.Model = "openrouter/anthropic/claude-sonnet-4"
+		cfg.CllamaEnv["OPENROUTER_API_KEY"] = openrouterKey
+	default:
+		t.Fatal("capability-wave proxy request requires at least one real provider key")
+	}
+	return cfg
+}
+
 func capabilityWaveLivePod(t *testing.T, expandedPod string, proxyRequest rollcallProxyRequest) string {
 	t.Helper()
 
diff --git a/cmd/claw/spike_rollcall_test.go b/cmd/claw/spike_rollcall_test.go
index 8917aeb..35b5362 100644
--- a/cmd/claw/spike_rollcall_test.go
+++ b/cmd/claw/spike_rollcall_test.go
@@ -82,7 +82,16 @@ func TestSpikeRollCall(t *testing.T) {
 	botToken := env["DISCORD_BOT_TOKEN"]
 	botID := env["DISCORD_BOT_ID"]
 	webhookURL := env["DISCORD_WEBHOOK_URL"]
-	proxyRequest := chooseRollcallProxyRequest(t, xaiKey, anthropicKey, openrouterKey)
+	geminiKey := strings.TrimSpace(env["GEMINI_API_KEY"])
+	if _, ok := env["GEMINI_API_KEY"]; !ok {
+		env["GEMINI_API_KEY"] = ""
+	}
+	availableKeys := map[string]string{
+		"XAI_API_KEY":        xaiKey,
+		"ANTHROPIC_API_KEY":  anthropicKey,
+		"OPENROUTER_API_KEY": openrouterKey,
+		"GEMINI_API_KEY":     geminiKey,
+	}
 	if webhookURL == "" {
 		t.Fatal("DISCORD_WEBHOOK_URL not set in rollcall/.env")
 	}
@@ -138,17 +147,116 @@ func TestSpikeRollCall(t *testing.T) {
 		spikeBuildImage(t, dir, a.image, a.dockerfile)
 	}
 
-	allAgents := []struct {
-		name    string
-		runtime string
-	}{
-		{"oc-roll", "openclaw"},
-		{"nc-roll", "nullclaw"},
-		{"mc-roll", "microclaw"},
-		{"nano-roll", "nanoclaw"},
-		{"nb-roll", "nanobot"},
-		{"pc-roll", "picoclaw"},
-		{"hm-roll", "hermes"},
+	// allAgents is the per-runtime test matrix. Each entry pins a model and
+	// (for stub runtimes) an inbound proxy request format so the spike
+	// exercises both cllama ingress surfaces and multiple distinct
+	// provider/model pairs in a single run.
+	//
+	// expectedSurface is the cllama ingress surface a runtime is expected to
+	// hit when its request reaches the proxy. It is asserted at the end of the
+	// test as coverage protection for ADR-023.
+	allAgents := []rollcallAgentEntry{
+		{
+			name:            "oc-roll",
+			runtime:         "openclaw",
+			subtestName:     "openclaw_openai_surface",
+			modelOverride:   "openrouter/anthropic/claude-sonnet-4",
+			expectedSurface: "openai-chat-completions",
+			requireKeys:     []string{"OPENROUTER_API_KEY"},
+		},
+		{
+			name:            "oc-roll",
+			runtime:         "openclaw",
+			subtestName:     "openclaw_anthropic_surface",
+			modelOverride:   "anthropic/claude-sonnet-4-6",
+			expectedSurface: "anthropic-messages",
+			requireKeys:     []string{"ANTHROPIC_API_KEY"},
+		},
+		{
+			// Direct regression test for issue #127: openclaw + google/gemini-*
+			// behind cllama. This is the exact provider that triggered the bug
+			// fixed by ADR-023's shared ingress surface matrix. The new code
+			// must compile this to api="openai-completions" — not the old
+			// vendor-native "google-generative-ai".
+			name:            "oc-roll",
+			runtime:         "openclaw",
+			subtestName:     "openclaw_google_surface",
+			modelOverride:   "google/gemini-2.5-flash",
+			expectedSurface: "openai-chat-completions",
+			requireKeys:     []string{"GEMINI_API_KEY"},
+		},
+		{
+			// Stubs send the bare provider/model ref directly via curl, so we
+			// must use a model name that Anthropic actually recognises today
+			// (claude-sonnet-4 alone is no longer a valid alias upstream).
+			name:            "nc-roll",
+			runtime:         "nullclaw",
+			proxyFormat:     "anthropic",
+			proxyModel:      "anthropic/claude-sonnet-4-6",
+			expectedSurface: "anthropic-messages",
+			requireKeys:     []string{"ANTHROPIC_API_KEY"},
+		},
+		{
+			name:            "mc-roll",
+			runtime:         "microclaw",
+			proxyFormat:     "openai",
+			proxyModel:      "openrouter/anthropic/claude-sonnet-4",
+			expectedSurface: "openai-chat-completions",
+			requireKeys:     []string{"OPENROUTER_API_KEY"},
+		},
+		{
+			name:            "nano-roll",
+			runtime:         "nanoclaw",
+			proxyFormat:     "anthropic",
+			proxyModel:      "anthropic/claude-sonnet-4-6",
+			expectedSurface: "anthropic-messages",
+			requireKeys:     []string{"ANTHROPIC_API_KEY"},
+		},
+		{
+			name:            "nb-roll",
+			runtime:         "nanobot",
+			proxyFormat:     "openai",
+			proxyModel:      "openrouter/anthropic/claude-sonnet-4",
+			expectedSurface: "openai-chat-completions",
+			requireKeys:     []string{"OPENROUTER_API_KEY"},
+		},
+		{
+			// pc-roll is currently broken upstream — picoclaw's gateway binary
+			// rejects a port=0 config pre-check that the clawdapus picoclaw
+			// driver does not populate. Reproduces on master, tracked in #137.
+			// Gated behind CLAW_SPIKE_ENABLE_PICOCLAW so it skips by default
+			// instead of failing the suite while #137 is open.
+			name:            "pc-roll",
+			runtime:         "picoclaw",
+			proxyFormat:     "anthropic",
+			proxyModel:      "anthropic/claude-sonnet-4-6",
+			expectedSurface: "anthropic-messages",
+			requireKeys:     []string{"ANTHROPIC_API_KEY", "CLAW_SPIKE_ENABLE_PICOCLAW"},
+		},
+		{
+			name:            "hm-roll",
+			runtime:         "hermes",
+			modelOverride:   "openrouter/anthropic/claude-sonnet-4",
+			expectedSurface: "openai-chat-completions",
+			requireKeys:     []string{"OPENROUTER_API_KEY"},
+		},
+	}
+
+	if !rollcallMatrixHasUsableEntry(allAgents, availableKeys) {
+		t.Skip("no API keys available for any rollcall matrix entry — skipping")
+	}
+
+	var (
+		exercisedSurfacesMu sync.Mutex
+		exercisedSurfaces   = make(map[string]bool)
+	)
+	markSurfaceExercised := func(surface string) {
+		if surface == "" {
+			return
+		}
+		exercisedSurfacesMu.Lock()
+		exercisedSurfaces[surface] = true
+		exercisedSurfacesMu.Unlock()
 	}
 
 	// ── Expand env vars in pod YAML ─────────────────────────────────────
@@ -184,9 +292,18 @@ func TestSpikeRollCall(t *testing.T) {
 
 	for _, agent := range allAgents {
 		agent := agent
-		t.Run(agent.runtime, func(t *testing.T) {
+		subtest := agent.subtestName
+		if subtest == "" {
+			subtest = agent.runtime
+		}
+		t.Run(subtest, func(t *testing.T) {
+			missing := rollcallMissingKeys(agent.requireKeys, availableKeys)
+			if len(missing) > 0 {
+				t.Skipf("missing API keys for %s: %v", subtest, missing)
+			}
+			proxyRequest := rollcallProxyRequestForEntry(agent, availableKeys)
 			const composeProject = "rollcall"
-			podPath := filepath.Join(dir, fmt.Sprintf("spike-%s-pod.yml", agent.name))
+			podPath := filepath.Join(dir, fmt.Sprintf("spike-%s-pod.yml", subtest))
 			podYAML := rollcallSingleServicePod(t, expandedPod, agent.name, proxyRequest)
 			if err := os.WriteFile(podPath, []byte(podYAML), 0o644); err != nil {
 				t.Fatalf("write %s: %v", filepath.Base(podPath), err)
@@ -245,25 +362,61 @@ func TestSpikeRollCall(t *testing.T) {
 			rollcallAssertSessionHistory(t, sessionHistoryDir, agent.name)
 
 			// oc-roll has memory configured — confirm memory_op telemetry fired.
-			if agent.name == "oc-roll" {
+			// Only assert memory on the openai-surface oc-roll variant so we don't
+			// double-spend the assertion against memory state from a prior subtest.
+			if agent.name == "oc-roll" && agent.subtestName == "openclaw_openai_surface" {
 				rollcallAssertMemoryTelemetry(t, cllamaContainerID, agent.name)
 			}
 
 			spikeWaitRunning(t, clawdashContainerID, 30*time.Second)
 			t.Log("clawdash sidecar confirmed running")
+
+			// Reaching this line means the runtime successfully completed an LLM
+			// call through cllama using its assigned ingress surface.
+			markSurfaceExercised(agent.expectedSurface)
 		})
 	}
 
 	// Verify session history survived every teardown — each agent's JSONL must
-	// still exist after all seven runtimes ran and tore down in sequence.
+	// still exist after all matrix entries ran and tore down in sequence.
+	// Skip entries that never ran (missing API keys) so partial coverage runs
+	// don't fail this check.
 	t.Run("session_history_persistence", func(t *testing.T) {
+		seen := make(map[string]bool)
 		for _, agent := range allAgents {
+			if len(rollcallMissingKeys(agent.requireKeys, availableKeys)) > 0 {
+				continue
+			}
+			if seen[agent.name] {
+				continue
+			}
+			seen[agent.name] = true
 			histFile := filepath.Join(sessionHistoryDir, agent.name, "history.jsonl")
 			if _, err := os.Stat(histFile); os.IsNotExist(err) {
 				t.Errorf("session history for %s (%s) missing after all runtimes completed — did not survive teardown", agent.name, agent.runtime)
 			}
 		}
-		t.Logf("session history confirmed persistent for all %d agents", len(allAgents))
+		t.Logf("session history confirmed persistent for %d distinct agents across %d matrix entries", len(seen), len(allAgents))
+	})
+
+	// Confirm the matrix exercised both canonical cllama ingress surfaces.
+	// This is the spike-level regression for ADR-023: if a future change
+	// reroutes everything to a single surface (or breaks one of them) it
+	// will be caught here even if individual subtests still pass.
+	t.Run("ingress_surface_coverage", func(t *testing.T) {
+		exercisedSurfacesMu.Lock()
+		defer exercisedSurfacesMu.Unlock()
+		required := []string{"anthropic-messages", "openai-chat-completions"}
+		for _, surface := range required {
+			if !exercisedSurfaces[surface] {
+				t.Errorf("cllama ingress surface %q was not exercised by any successful subtest", surface)
+			}
+		}
+		exercised := make([]string, 0, len(exercisedSurfaces))
+		for surface := range exercisedSurfaces {
+			exercised = append(exercised, surface)
+		}
+		t.Logf("exercised cllama ingress surfaces: %v", exercised)
 	})
 }
 
@@ -282,41 +435,65 @@ type rollcallDiscordAuthor struct {
 }
 
 type rollcallProxyRequest struct {
-	APIFormat string
-	Model     string
-	CllamaEnv map[string]string
+	APIFormat     string
+	Model         string
+	ModelOverride string // sets x-claw.models.primary for real runners (oc-roll, hm-roll)
+	CllamaEnv     map[string]string
 }
 
-func chooseRollcallProxyRequest(t *testing.T, xaiKey, anthropicKey, openrouterKey string) rollcallProxyRequest {
-	t.Helper()
+// rollcallAgentEntry describes a single matrix row in the rollcall spike. It
+// pins both the model that the runtime should request and the cllama ingress
+// surface that request is expected to traverse.
+type rollcallAgentEntry struct {
+	name            string   // agent service name in the rollcall pod
+	runtime         string   // runtime label, used for log/keyword matching
+	subtestName     string   // optional t.Run name; defaults to runtime
+	modelOverride   string   // x-claw.models.primary override (real runners)
+	proxyFormat     string   // ROLLCALL_CLLAMA_API_FORMAT (stub runners)
+	proxyModel      string   // ROLLCALL_CLLAMA_MODEL (stub runners)
+	expectedSurface string   // "anthropic-messages" | "openai-chat-completions"
+	requireKeys     []string // env keys required for this entry to run
+}
 
-	cfg := rollcallProxyRequest{
-		CllamaEnv: make(map[string]string),
-	}
-	if xaiKey != "" {
-		cfg.CllamaEnv["XAI_API_KEY"] = xaiKey
-	}
-	if anthropicKey != "" {
-		cfg.CllamaEnv["ANTHROPIC_API_KEY"] = anthropicKey
-	}
-	if openrouterKey != "" {
-		cfg.CllamaEnv["OPENROUTER_API_KEY"] = openrouterKey
+// rollcallMatrixHasUsableEntry returns true if at least one matrix entry has
+// every required key present in availableKeys.
+func rollcallMatrixHasUsableEntry(entries []rollcallAgentEntry, availableKeys map[string]string) bool {
+	for _, entry := range entries {
+		if len(rollcallMissingKeys(entry.requireKeys, availableKeys)) == 0 {
+			return true
+		}
 	}
+	return false
+}
 
-	switch {
-	case xaiKey != "":
-		cfg.APIFormat = "openai"
-		cfg.Model = "xai/grok-4-1-fast-reasoning"
-	case anthropicKey != "":
-		cfg.APIFormat = "anthropic"
-		cfg.Model = "claude-sonnet-4"
-	case openrouterKey != "":
-		cfg.APIFormat = "openai"
-		cfg.Model = "openrouter/anthropic/claude-sonnet-4"
-	default:
-		t.Fatal("rollcall proxy request requires at least one real provider key")
+// rollcallMissingKeys reports which of required are absent or empty in
+// availableKeys.
+func rollcallMissingKeys(required []string, availableKeys map[string]string) []string {
+	var missing []string
+	for _, key := range required {
+		if strings.TrimSpace(availableKeys[key]) == "" {
+			missing = append(missing, key)
+		}
 	}
+	return missing
+}
 
+// rollcallProxyRequestForEntry builds the inbound proxy-request and cllama-env
+// configuration for a single matrix entry.
+func rollcallProxyRequestForEntry(entry rollcallAgentEntry, availableKeys map[string]string) rollcallProxyRequest {
+	cfg := rollcallProxyRequest{
+		APIFormat:     entry.proxyFormat,
+		Model:         entry.proxyModel,
+		ModelOverride: entry.modelOverride,
+		CllamaEnv:     make(map[string]string),
+	}
+	// Forward only the keys this entry actually needs to the cllama sidecar.
+	// We deliberately do not flood every container with every API key.
+	for _, key := range entry.requireKeys {
+		if v := strings.TrimSpace(availableKeys[key]); v != "" {
+			cfg.CllamaEnv[key] = v
+		}
+	}
 	return cfg
 }
 
@@ -381,8 +558,12 @@ func rollcallInjectProxyRequest(t *testing.T, service map[string]interface{}, pr
 			rawEnv[k] = v
 		}
 	}
-	rawEnv["ROLLCALL_CLLAMA_API_FORMAT"] = proxyRequest.APIFormat
-	rawEnv["ROLLCALL_CLLAMA_MODEL"] = proxyRequest.Model
+	if proxyRequest.APIFormat != "" {
+		rawEnv["ROLLCALL_CLLAMA_API_FORMAT"] = proxyRequest.APIFormat
+	}
+	if proxyRequest.Model != "" {
+		rawEnv["ROLLCALL_CLLAMA_MODEL"] = proxyRequest.Model
+	}
 	service["environment"] = rawEnv
 
 	rawClaw, ok := service["x-claw"].(map[string]interface{})
@@ -406,6 +587,32 @@ func rollcallInjectProxyRequest(t *testing.T, service map[string]interface{}, pr
 		}
 	}
 	rawClaw["cllama-env"] = rawCllamaEnv
+
+	// Apply x-claw.models.primary override for both real runners and stubs.
+	// Pod-level model slots overlay image MODEL labels at compile time, which
+	// also seeds cllama's per-agent model policy. Stubs that send a model the
+	// policy doesn't allow get clamped via cllama's "disallowed_clamped"
+	// intervention back to the policy default — so the policy must allow the
+	// model the request will actually carry. For real runners we use
+	// proxyRequest.ModelOverride; for stubs we mirror proxyRequest.Model.
+	policyModel := strings.TrimSpace(proxyRequest.ModelOverride)
+	if policyModel == "" {
+		policyModel = strings.TrimSpace(proxyRequest.Model)
+	}
+	if policyModel != "" {
+		rawModels := make(map[string]interface{})
+		if existing, ok := rawClaw["models"]; ok && existing != nil {
+			existingMap, ok := existing.(map[string]interface{})
+			if !ok {
+				t.Fatalf("rollcall x-claw.models is not a map: %T", existing)
+			}
+			for k, v := range existingMap {
+				rawModels[k] = v
+			}
+		}
+		rawModels["primary"] = policyModel
+		rawClaw["models"] = rawModels
+	}
 }
 
 func rollcallResolveContainerID(t *testing.T, composePath, serviceName string) string {