From 1e03ba9eab0cd5eb34890b84b22e0f1f155aaacd Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 15:08:25 -0400 Subject: [PATCH 01/19] fix: preserve healthy gateway across sandbox lifecycle --- bin/lib/onboard.js | 36 +++++++++++++-- bin/nemoclaw.js | 105 +++++++++++++++++++++++++++++++++++++++---- test/cli.test.js | 100 ++++++++++++++++++++++++++++++++++++++++- test/onboard.test.js | 84 ++++++++++++++++++++++++++++++++++ 4 files changed, 312 insertions(+), 13 deletions(-) diff --git a/bin/lib/onboard.js b/bin/lib/onboard.js index 48a4cb2411..1d5efcc69f 100644 --- a/bin/lib/onboard.js +++ b/bin/lib/onboard.js @@ -185,6 +185,11 @@ function hasStaleGateway(gwInfoOutput) { return typeof gwInfoOutput === "string" && gwInfoOutput.length > 0 && gwInfoOutput.includes(GATEWAY_NAME); } +function isGatewayHealthy(statusOutput = "", gwInfoOutput = "") { + const connected = typeof statusOutput === "string" && statusOutput.includes("Connected"); + return connected && hasStaleGateway(gwInfoOutput); +} + function streamSandboxCreate(command, env = process.env, options = {}) { const child = spawn("bash", ["-lc", command], { cwd: ROOT, @@ -1237,8 +1242,16 @@ async function preflight() { // A previous onboard run may have left the gateway container and port // forward running. If a NemoClaw-owned gateway is still present, tear // it down so the port check below doesn't fail on our own leftovers. + const gatewayStatus = runCaptureOpenshell(["status"], { ignoreError: true }); const gwInfo = runCaptureOpenshell(["gateway", "info", "-g", GATEWAY_NAME], { ignoreError: true }); - if (hasStaleGateway(gwInfo)) { + const healthyGateway = isGatewayHealthy(gatewayStatus, gwInfo); + if (healthyGateway) { + console.log(" Reusing existing NemoClaw gateway..."); + runOpenshell(["forward", "stop", "18789"], { ignoreError: true }); + runOpenshell(["gateway", "select", GATEWAY_NAME], { ignoreError: true }); + process.env.OPENSHELL_GATEWAY = GATEWAY_NAME; + console.log(" ✓ Existing gateway selected"); + } else if (hasStaleGateway(gwInfo)) { console.log(" Cleaning up previous NemoClaw session..."); runOpenshell(["forward", "stop", "18789"], { ignoreError: true }); runOpenshell(["gateway", "destroy", "-g", GATEWAY_NAME], { ignoreError: true }); @@ -1251,6 +1264,10 @@ async function preflight() { { port: 18789, label: "NemoClaw dashboard" }, ]; for (const { port, label } of requiredPorts) { + if (port === 8080 && healthyGateway) { + console.log(` ✓ Port ${port} already in use by active NemoClaw gateway (${label})`); + continue; + } const portCheck = await checkPortAvailable(port); if (!portCheck.ok) { console.error(""); @@ -1299,8 +1316,18 @@ async function preflight() { async function startGateway(_gpu) { step(3, 7, "Starting OpenShell gateway"); - // Destroy old gateway - runOpenshell(["gateway", "destroy", "-g", GATEWAY_NAME], { ignoreError: true }); + const gatewayStatus = runCaptureOpenshell(["status"], { ignoreError: true }); + const gwInfo = runCaptureOpenshell(["gateway", "info", "-g", GATEWAY_NAME], { ignoreError: true }); + if (isGatewayHealthy(gatewayStatus, gwInfo)) { + console.log(" ✓ Reusing existing gateway"); + runOpenshell(["gateway", "select", GATEWAY_NAME], { ignoreError: true }); + process.env.OPENSHELL_GATEWAY = GATEWAY_NAME; + return; + } + + if (hasStaleGateway(gwInfo)) { + runOpenshell(["gateway", "destroy", "-g", GATEWAY_NAME], { ignoreError: true }); + } const gwArgs = ["--name", GATEWAY_NAME]; // Do NOT pass --gpu here. On DGX Spark (and most GPU hosts), inference is @@ -2204,12 +2231,15 @@ module.exports = { getInstalledOpenshellVersion, getStableGatewayImageRef, hasStaleGateway, + isGatewayHealthy, isSandboxReady, onboard, + preflight, pruneStaleSandboxEntry, runCaptureOpenshell, setupInference, setupNim, + startGateway, writeSandboxConfigSyncFile, patchStagedDockerfile, }; diff --git a/bin/nemoclaw.js b/bin/nemoclaw.js index 737c59c160..bae21f421c 100755 --- a/bin/nemoclaw.js +++ b/bin/nemoclaw.js @@ -21,6 +21,7 @@ const _RD = _useColor ? "\x1b[1;31m" : ""; const YW = _useColor ? "\x1b[1;33m" : ""; const { ROOT, SCRIPTS, run, runCapture: _runCapture, runInteractive, shellQuote, validateName } = require("./lib/runner"); +const { resolveOpenshell } = require("./lib/resolve-openshell"); const { ensureApiKey, ensureGithubToken, @@ -40,6 +41,72 @@ const GLOBAL_COMMANDS = new Set([ ]); const REMOTE_UNINSTALL_URL = "https://raw.githubusercontent.com/NVIDIA/NemoClaw/refs/heads/main/uninstall.sh"; +let OPENSHELL_BIN = null; + +function getOpenshellBinary() { + if (!OPENSHELL_BIN) { + OPENSHELL_BIN = resolveOpenshell(); + } + if (!OPENSHELL_BIN) { + console.error("openshell CLI not found. Install OpenShell before using sandbox commands."); + process.exit(1); + } + return OPENSHELL_BIN; +} + +function runOpenshell(args, opts = {}) { + const result = spawnSync(getOpenshellBinary(), args, { + cwd: ROOT, + env: { ...process.env, ...opts.env }, + encoding: "utf-8", + stdio: opts.stdio ?? "inherit", + }); + if (result.status !== 0 && !opts.ignoreError) { + console.error(` Command failed (exit ${result.status}): openshell ${args.join(" ")}`); + process.exit(result.status || 1); + } + return result; +} + +function getSandboxGatewayState(sandboxName) { + const result = spawnSync(getOpenshellBinary(), ["sandbox", "get", sandboxName], { + cwd: ROOT, + env: process.env, + encoding: "utf-8", + stdio: ["ignore", "pipe", "pipe"], + }); + const output = `${result.stdout || ""}${result.stderr || ""}`.trim(); + if (result.status === 0) { + return { state: "present", output }; + } + if (/NotFound|sandbox not found/i.test(output)) { + return { state: "missing", output }; + } + if (/transport error|Connection refused|handshake verification failed|Missing gateway auth token|device identity required/i.test(output)) { + return { state: "gateway_error", output }; + } + return { state: "unknown_error", output }; +} + +function ensureLiveSandboxOrExit(sandboxName) { + const lookup = getSandboxGatewayState(sandboxName); + if (lookup.state === "present") { + return lookup; + } + if (lookup.state === "missing") { + registry.removeSandbox(sandboxName); + console.error(` Sandbox '${sandboxName}' is not present in the live OpenShell gateway.`); + console.error(" Removed stale local registry entry."); + console.error(" Run `nemoclaw list` to confirm the remaining sandboxes, or `nemoclaw onboard` to create a new one."); + process.exit(1); + } + console.error(` Unable to verify sandbox '${sandboxName}' against the live OpenShell gateway.`); + if (lookup.output) { + console.error(lookup.output); + } + console.error(" Check `openshell status` and the active gateway, then retry."); + process.exit(1); +} function resolveUninstallScript() { const candidates = [ @@ -298,10 +365,15 @@ function listSandboxes() { // ── Sandbox-scoped actions ─────────────────────────────────────── function sandboxConnect(sandboxName) { - const qn = shellQuote(sandboxName); + ensureLiveSandboxOrExit(sandboxName); // Ensure port forward is alive before connecting - run(`openshell forward start --background 18789 ${qn} 2>/dev/null || true`, { ignoreError: true }); - runInteractive(`openshell sandbox connect ${qn}`); + runOpenshell(["forward", "start", "--background", "18789", sandboxName], { ignoreError: true }); + const result = spawnSync(getOpenshellBinary(), ["sandbox", "connect", sandboxName], { + stdio: "inherit", + cwd: ROOT, + env: process.env, + }); + exitWithSpawnResult(result); } function sandboxStatus(sandboxName) { @@ -312,11 +384,25 @@ function sandboxStatus(sandboxName) { console.log(` Model: ${sb.model || "unknown"}`); console.log(` Provider: ${sb.provider || "unknown"}`); console.log(` GPU: ${sb.gpuEnabled ? "yes" : "no"}`); - console.log(` Policies: ${(sb.policies || []).join(", ") || "none"}`); + console.log(` Policies: ${(sb.policies || []).join(", ") || "none"}`); } - // openshell info - run(`openshell sandbox get ${shellQuote(sandboxName)} 2>/dev/null || true`, { ignoreError: true }); + const lookup = getSandboxGatewayState(sandboxName); + if (lookup.state === "present") { + console.log(""); + console.log(lookup.output); + } else if (lookup.state === "missing") { + registry.removeSandbox(sandboxName); + console.log(""); + console.log(` Sandbox '${sandboxName}' is not present in the live OpenShell gateway.`); + console.log(" Removed stale local registry entry."); + } else { + console.log(""); + console.log(` Could not verify sandbox '${sandboxName}' against the live OpenShell gateway.`); + if (lookup.output) { + console.log(lookup.output); + } + } // NIM health const nimStat = sb && sb.nimContainer ? nim.nimStatusByName(sb.nimContainer) : nim.nimStatus(sandboxName); @@ -328,8 +414,9 @@ function sandboxStatus(sandboxName) { } function sandboxLogs(sandboxName, follow) { - const followFlag = follow ? " --tail" : ""; - run(`openshell logs ${shellQuote(sandboxName)}${followFlag}`); + const args = ["logs", sandboxName]; + if (follow) args.push("--tail"); + runOpenshell(args); } async function sandboxPolicyAdd(sandboxName) { @@ -386,7 +473,7 @@ async function sandboxDestroy(sandboxName, args = []) { else nim.stopNimContainer(sandboxName); console.log(` Deleting sandbox '${sandboxName}'...`); - run(`openshell sandbox delete ${shellQuote(sandboxName)} 2>/dev/null || true`, { ignoreError: true }); + runOpenshell(["sandbox", "delete", sandboxName], { ignoreError: true }); registry.removeSandbox(sandboxName); console.log(` ${G}✓${R} Sandbox '${sandboxName}' destroyed`); diff --git a/test/cli.test.js b/test/cli.test.js index 82dd5ee649..42e3ff2a5d 100644 --- a/test/cli.test.js +++ b/test/cli.test.js @@ -3,16 +3,22 @@ import { describe, it, expect } from "vitest"; import { execSync } from "node:child_process"; +import fs from "node:fs"; +import os from "node:os"; import path from "node:path"; const CLI = path.join(import.meta.dirname, "..", "bin", "nemoclaw.js"); function run(args) { + return runWithEnv(args); +} + +function runWithEnv(args, env = {}) { try { const out = execSync(`node "${CLI}" ${args}`, { encoding: "utf-8", timeout: 10000, - env: { ...process.env, HOME: "/tmp/nemoclaw-cli-test-" + Date.now() }, + env: { ...process.env, HOME: "/tmp/nemoclaw-cli-test-" + Date.now(), ...env }, }); return { code: 0, out }; } catch (err) { @@ -90,4 +96,96 @@ describe("CLI dispatch", () => { expect(r.out.includes("Troubleshooting")).toBeTruthy(); expect(r.out.includes("nemoclaw debug")).toBeTruthy(); }); + + it("removes stale registry entries when connect targets a missing live sandbox", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-stale-connect-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " echo 'Error: status: NotFound, message: \"sandbox not found\"' >&2", + " exit 1", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const r = runWithEnv("alpha connect", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + expect(r.out.includes("Removed stale local registry entry")).toBeTruthy(); + const saved = JSON.parse(fs.readFileSync(path.join(registryDir, "sandboxes.json"), "utf8")); + expect(saved.sandboxes.alpha).toBeUndefined(); + }); + + it("keeps registry entries when status hits a gateway-level transport error", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-gateway-error-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " echo 'Error: transport error: handshake verification failed' >&2", + " exit 1", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(0); + expect(r.out.includes("Could not verify sandbox 'alpha'")).toBeTruthy(); + const saved = JSON.parse(fs.readFileSync(path.join(registryDir, "sandboxes.json"), "utf8")); + expect(saved.sandboxes.alpha).toBeTruthy(); + }); }); diff --git a/test/onboard.test.js b/test/onboard.test.js index f1240a9ed4..841bc76e8d 100644 --- a/test/onboard.test.js +++ b/test/onboard.test.js @@ -12,6 +12,7 @@ import { buildSandboxConfigSyncScript, getFutureShellPathHint, getInstalledOpenshellVersion, + isGatewayHealthy, getSandboxInferenceConfig, getStableGatewayImageRef, patchStagedDockerfile, @@ -152,6 +153,27 @@ describe("onboard helpers", () => { expect(getStableGatewayImageRef("bogus")).toBe(null); }); + it("recognizes only a connected named NemoClaw gateway as healthy", () => { + expect( + isGatewayHealthy( + "Server Status\n\n Gateway: nemoclaw\n Status: Connected", + "Gateway Info\n\n Gateway: nemoclaw\n Gateway endpoint: https://127.0.0.1:8080" + ) + ).toBe(true); + expect( + isGatewayHealthy( + "Server Status\n\n Gateway: openshell\n Status: Connected", + "Error: no gateway metadata found" + ) + ).toBe(false); + expect( + isGatewayHealthy( + "Server Status\n\n Gateway: nemoclaw\n Status: Disconnected", + "Gateway Info\n\n Gateway: nemoclaw\n Gateway endpoint: https://127.0.0.1:8080" + ) + ).toBe(false); + }); + it("returns a future-shell PATH hint for user-local openshell installs", () => { expect(getFutureShellPathHint("/home/test/.local/bin", "/usr/local/bin:/usr/bin")).toBe( 'export PATH="/home/test/.local/bin:$PATH"' @@ -436,6 +458,68 @@ console.log(JSON.stringify({ liveExists, sandbox: registry.getSandbox("my-assist assert.equal(payload.sandbox, null); }); + it("reuses an existing healthy gateway instead of destroying it", () => { + const repoRoot = path.join(import.meta.dirname, ".."); + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-onboard-gateway-reuse-")); + const fakeBin = path.join(tmpDir, "bin"); + const scriptPath = path.join(tmpDir, "gateway-reuse-check.js"); + const onboardPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "onboard.js")); + const runnerPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "runner.js")); + + fs.mkdirSync(fakeBin, { recursive: true }); + fs.writeFileSync(path.join(fakeBin, "openshell"), "#!/usr/bin/env bash\nexit 0\n", { mode: 0o755 }); + + const script = String.raw` +const runner = require(${runnerPath}); +const commands = []; + +runner.run = (command, opts = {}) => { + commands.push(command); + return { status: 0 }; +}; +runner.runCapture = (command) => { + if (command.includes("'status'")) { + return "Server Status\n\n Gateway: nemoclaw\n Status: Connected"; + } + if (command.includes("'gateway' 'info' '-g' 'nemoclaw'")) { + return "Gateway Info\n\n Gateway: nemoclaw\n Gateway endpoint: https://127.0.0.1:8080"; + } + if (command.includes("'--version'")) { + return "openshell 0.0.12"; + } + return ""; +}; + +const { startGateway } = require(${onboardPath}); + +(async () => { + await startGateway(null); + console.log(JSON.stringify(commands)); +})().catch((error) => { + console.error(error); + process.exit(1); +}); +`; + fs.writeFileSync(scriptPath, script); + + const result = spawnSync(process.execPath, [scriptPath], { + cwd: repoRoot, + encoding: "utf-8", + env: { + ...process.env, + HOME: tmpDir, + PATH: `${fakeBin}:${process.env.PATH || ""}`, + }, + }); + + assert.equal(result.status, 0, result.stderr); + const commands = JSON.parse(result.stdout.trim().split("\n").pop()); + assert.equal(commands.length, 1); + assert.match(commands[0], /gateway' 'select' 'nemoclaw'/); + assert.doesNotMatch(commands[0], /gateway' 'destroy'/); + assert.doesNotMatch(commands[0], /gateway' 'start'/); + }); + it("builds the sandbox without uploading an external OpenClaw config file", async () => { const repoRoot = path.join(import.meta.dirname, ".."); const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-onboard-create-sandbox-")); From 4f625d66b63411e9e161c75144d479271e76bc3c Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 15:09:49 -0400 Subject: [PATCH 02/19] fix: reconcile live sandbox state during connect --- bin/nemoclaw.js | 20 ++++++++++++++++++++ test/cli.test.js | 1 + 2 files changed, 21 insertions(+) diff --git a/bin/nemoclaw.js b/bin/nemoclaw.js index bae21f421c..3a71eadafd 100755 --- a/bin/nemoclaw.js +++ b/bin/nemoclaw.js @@ -88,6 +88,24 @@ function getSandboxGatewayState(sandboxName) { return { state: "unknown_error", output }; } +function printGatewayLifecycleHint(output = "", sandboxName = "", writer = console.error) { + if (/handshake verification failed/i.test(output)) { + writer(" This looks like gateway identity drift after restart."); + writer(" Existing sandboxes may still be recorded locally, but the current gateway no longer trusts their prior connection state."); + writer(" Try re-establishing the NemoClaw gateway/runtime first. If the sandbox is still unreachable, recreate it with `nemoclaw onboard`."); + return; + } + if (/Connection refused|transport error/i.test(output)) { + writer(` The sandbox '${sandboxName}' may still exist, but the current gateway/runtime is not reachable.`); + writer(" Check `openshell status`, verify the active gateway, and retry."); + return; + } + if (/Missing gateway auth token|device identity required/i.test(output)) { + writer(" The gateway is reachable, but the current auth or device identity state is not usable."); + writer(" Verify the active gateway and retry after re-establishing the runtime."); + } +} + function ensureLiveSandboxOrExit(sandboxName) { const lookup = getSandboxGatewayState(sandboxName); if (lookup.state === "present") { @@ -104,6 +122,7 @@ function ensureLiveSandboxOrExit(sandboxName) { if (lookup.output) { console.error(lookup.output); } + printGatewayLifecycleHint(lookup.output, sandboxName); console.error(" Check `openshell status` and the active gateway, then retry."); process.exit(1); } @@ -402,6 +421,7 @@ function sandboxStatus(sandboxName) { if (lookup.output) { console.log(lookup.output); } + printGatewayLifecycleHint(lookup.output, sandboxName, console.log); } // NIM health diff --git a/test/cli.test.js b/test/cli.test.js index 42e3ff2a5d..8e1f66bf05 100644 --- a/test/cli.test.js +++ b/test/cli.test.js @@ -185,6 +185,7 @@ describe("CLI dispatch", () => { expect(r.code).toBe(0); expect(r.out.includes("Could not verify sandbox 'alpha'")).toBeTruthy(); + expect(r.out.includes("gateway identity drift after restart")).toBeTruthy(); const saved = JSON.parse(fs.readFileSync(path.join(registryDir, "sandboxes.json"), "utf8")); expect(saved.sandboxes.alpha).toBeTruthy(); }); From b67a72fedb81048ae2bd6e4e3c14443c5a58b93e Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 15:10:30 -0400 Subject: [PATCH 03/19] test: cover gateway reuse across double onboard --- test/e2e/test-double-onboard.sh | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/test/e2e/test-double-onboard.sh b/test/e2e/test-double-onboard.sh index f70d6533e7..f28821f5f2 100755 --- a/test/e2e/test-double-onboard.sh +++ b/test/e2e/test-double-onboard.sh @@ -2,11 +2,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Double onboard: verify that consecutive `nemoclaw onboard` runs recover -# automatically from stale state (gateway, port forward, registry entries) -# left behind by a previous run. +# Double onboard: verify that consecutive `nemoclaw onboard` runs can reuse +# the shared NemoClaw gateway safely and preserve existing sandboxes instead of +# destroying the prior session on every repeat run. # -# Regression test for issues #21, #22, #140, #152, #397. +# Regression test for issues #21, #22, #140, #152, #397, and #849. # # Key insight: running onboard without NVIDIA_API_KEY in non-interactive # mode causes process.exit(1) at step 4, but steps 1-3 (preflight, @@ -172,10 +172,10 @@ else fail "Second onboard exited $exit2 (expected 1)" fi -if grep -q "Cleaning up previous NemoClaw session" <<<"$output2"; then - pass "Stale session cleanup fired on second onboard" +if grep -q "Reusing existing NemoClaw gateway" <<<"$output2"; then + pass "Healthy gateway reused on second onboard" else - fail "Stale session cleanup did NOT fire (regression: #397)" + fail "Healthy gateway was not reused on second onboard" fi if grep -q "Port 8080 is not available" <<<"$output2"; then @@ -223,10 +223,10 @@ else fail "Third onboard exited $exit3 (expected 1)" fi -if grep -q "Cleaning up previous NemoClaw session" <<<"$output3"; then - pass "Stale session cleanup fired on third onboard" +if grep -q "Reusing existing NemoClaw gateway" <<<"$output3"; then + pass "Healthy gateway reused on third onboard" else - fail "Stale session cleanup did NOT fire on third onboard" + fail "Healthy gateway was not reused on third onboard" fi if grep -q "Port 8080 is not available" <<<"$output3"; then @@ -247,6 +247,12 @@ else fail "Sandbox '$SANDBOX_B' was not created" fi +if openshell sandbox get "$SANDBOX_A" >/dev/null 2>&1; then + pass "First sandbox '$SANDBOX_A' still exists after creating '$SANDBOX_B'" +else + fail "First sandbox '$SANDBOX_A' disappeared after creating '$SANDBOX_B' (regression: #849)" +fi + # ══════════════════════════════════════════════════════════════════ # Phase 5: Final cleanup # ══════════════════════════════════════════════════════════════════ From 55626e7974b9ee5632f945deb914fa176d08c2a2 Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 15:38:34 -0400 Subject: [PATCH 04/19] fix: classify gateway trust rotation on reconnect --- bin/nemoclaw.js | 125 +++++++++++++++++++++++++++++++++++++++++++--- test/cli.test.js | 126 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 244 insertions(+), 7 deletions(-) diff --git a/bin/nemoclaw.js b/bin/nemoclaw.js index 3a71eadafd..27b13dd92b 100755 --- a/bin/nemoclaw.js +++ b/bin/nemoclaw.js @@ -68,14 +68,72 @@ function runOpenshell(args, opts = {}) { return result; } -function getSandboxGatewayState(sandboxName) { - const result = spawnSync(getOpenshellBinary(), ["sandbox", "get", sandboxName], { +function captureOpenshell(args, opts = {}) { + const result = spawnSync(getOpenshellBinary(), args, { cwd: ROOT, - env: process.env, + env: { ...process.env, ...opts.env }, encoding: "utf-8", stdio: ["ignore", "pipe", "pipe"], }); - const output = `${result.stdout || ""}${result.stderr || ""}`.trim(); + return { + status: result.status ?? 1, + output: `${result.stdout || ""}${result.stderr || ""}`.trim(), + }; +} + +function stripAnsi(value = "") { + // eslint-disable-next-line no-control-regex + return String(value).replace(/\x1b\[[0-9;]*m/g, ""); +} + +function hasNamedGateway(output = "") { + return stripAnsi(output).includes("Gateway: nemoclaw"); +} + +function getNamedGatewayLifecycleState() { + const status = captureOpenshell(["status"]); + const gatewayInfo = captureOpenshell(["gateway", "info", "-g", "nemoclaw"], { ignoreError: true }); + const connected = /Connected/i.test(stripAnsi(status.output)); + const named = hasNamedGateway(gatewayInfo.output); + if (connected && named) { + return { state: "healthy_named", status: status.output, gatewayInfo: gatewayInfo.output }; + } + if (named) { + return { state: "named_unhealthy", status: status.output, gatewayInfo: gatewayInfo.output }; + } + if (connected) { + return { state: "connected_other", status: status.output, gatewayInfo: gatewayInfo.output }; + } + return { state: "missing_named", status: status.output, gatewayInfo: gatewayInfo.output }; +} + +function recoverNamedGatewayRuntime() { + const before = getNamedGatewayLifecycleState(); + if (before.state === "healthy_named") { + return { recovered: true, before, after: before, attempted: false }; + } + + runOpenshell(["gateway", "select", "nemoclaw"], { ignoreError: true }); + let after = getNamedGatewayLifecycleState(); + if (after.state === "healthy_named") { + process.env.OPENSHELL_GATEWAY = "nemoclaw"; + return { recovered: true, before, after, attempted: true, via: "select" }; + } + + runOpenshell(["gateway", "start", "--name", "nemoclaw"], { ignoreError: true }); + runOpenshell(["gateway", "select", "nemoclaw"], { ignoreError: true }); + after = getNamedGatewayLifecycleState(); + if (after.state === "healthy_named") { + process.env.OPENSHELL_GATEWAY = "nemoclaw"; + return { recovered: true, before, after, attempted: true, via: "start" }; + } + + return { recovered: false, before, after, attempted: true }; +} + +function getSandboxGatewayState(sandboxName) { + const result = captureOpenshell(["sandbox", "get", sandboxName]); + const output = result.output; if (result.status === 0) { return { state: "present", output }; } @@ -92,7 +150,7 @@ function printGatewayLifecycleHint(output = "", sandboxName = "", writer = conso if (/handshake verification failed/i.test(output)) { writer(" This looks like gateway identity drift after restart."); writer(" Existing sandboxes may still be recorded locally, but the current gateway no longer trusts their prior connection state."); - writer(" Try re-establishing the NemoClaw gateway/runtime first. If the sandbox is still unreachable, recreate it with `nemoclaw onboard`."); + writer(" Try re-establishing the NemoClaw gateway/runtime first. If the sandbox is still unreachable, recreate just that sandbox with `nemoclaw onboard`."); return; } if (/Connection refused|transport error/i.test(output)) { @@ -106,8 +164,40 @@ function printGatewayLifecycleHint(output = "", sandboxName = "", writer = conso } } +function getReconciledSandboxGatewayState(sandboxName) { + let lookup = getSandboxGatewayState(sandboxName); + if (lookup.state === "present") { + return lookup; + } + if (lookup.state === "missing") { + return lookup; + } + + if (lookup.state === "gateway_error") { + const recovery = recoverNamedGatewayRuntime(); + if (recovery.recovered) { + const retried = getSandboxGatewayState(sandboxName); + if (retried.state === "present" || retried.state === "missing") { + return { ...retried, recoveredGateway: true, recoveryVia: recovery.via || null }; + } + if (/handshake verification failed/i.test(retried.output)) { + return { + state: "identity_drift", + output: retried.output, + recoveredGateway: true, + recoveryVia: recovery.via || null, + }; + } + return { ...retried, recoveredGateway: true, recoveryVia: recovery.via || null }; + } + return { ...lookup, gatewayRecoveryFailed: true }; + } + + return lookup; +} + function ensureLiveSandboxOrExit(sandboxName) { - const lookup = getSandboxGatewayState(sandboxName); + const lookup = getReconciledSandboxGatewayState(sandboxName); if (lookup.state === "present") { return lookup; } @@ -118,6 +208,15 @@ function ensureLiveSandboxOrExit(sandboxName) { console.error(" Run `nemoclaw list` to confirm the remaining sandboxes, or `nemoclaw onboard` to create a new one."); process.exit(1); } + if (lookup.state === "identity_drift") { + console.error(` Sandbox '${sandboxName}' is recorded locally, but the gateway trust material rotated after restart.`); + if (lookup.output) { + console.error(lookup.output); + } + console.error(" Existing sandbox connections cannot be reattached safely after this gateway identity change."); + console.error(" Recreate this sandbox with `nemoclaw onboard` once the gateway runtime is stable."); + process.exit(1); + } console.error(` Unable to verify sandbox '${sandboxName}' against the live OpenShell gateway.`); if (lookup.output) { console.error(lookup.output); @@ -406,15 +505,27 @@ function sandboxStatus(sandboxName) { console.log(` Policies: ${(sb.policies || []).join(", ") || "none"}`); } - const lookup = getSandboxGatewayState(sandboxName); + const lookup = getReconciledSandboxGatewayState(sandboxName); if (lookup.state === "present") { console.log(""); + if (lookup.recoveredGateway) { + console.log(` Recovered NemoClaw gateway runtime via ${lookup.recoveryVia || "gateway reattach"}.`); + console.log(""); + } console.log(lookup.output); } else if (lookup.state === "missing") { registry.removeSandbox(sandboxName); console.log(""); console.log(` Sandbox '${sandboxName}' is not present in the live OpenShell gateway.`); console.log(" Removed stale local registry entry."); + } else if (lookup.state === "identity_drift") { + console.log(""); + console.log(` Sandbox '${sandboxName}' is recorded locally, but the gateway trust material rotated after restart.`); + if (lookup.output) { + console.log(lookup.output); + } + console.log(" Existing sandbox connections cannot be reattached safely after this gateway identity change."); + console.log(" Recreate this sandbox with `nemoclaw onboard` once the gateway runtime is stable."); } else { console.log(""); console.log(` Could not verify sandbox '${sandboxName}' against the live OpenShell gateway.`); diff --git a/test/cli.test.js b/test/cli.test.js index 8e1f66bf05..c66af859be 100644 --- a/test/cli.test.js +++ b/test/cli.test.js @@ -189,4 +189,130 @@ describe("CLI dispatch", () => { const saved = JSON.parse(fs.readFileSync(path.join(registryDir, "sandboxes.json"), "utf8")); expect(saved.sandboxes.alpha).toBeTruthy(); }); + + it("recovers status after gateway runtime is reattached", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-recover-status-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + const stateFile = path.join(home, "sandbox-get-count"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `state_file=${JSON.stringify(stateFile)}`, + "count=$(cat \"$state_file\" 2>/dev/null || echo 0)", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " count=$((count + 1))", + " echo \"$count\" > \"$state_file\"", + " if [ \"$count\" -eq 1 ]; then", + " echo 'Error: transport error: Connection refused' >&2", + " exit 1", + " fi", + " echo 'Sandbox: alpha'", + " exit 0", + "fi", + "if [ \"$1\" = \"status\" ]; then", + " echo 'Connected'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ \"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " echo 'Gateway Info'", + " echo", + " echo ' Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(0); + expect(r.out.includes("Recovered NemoClaw gateway runtime")).toBeTruthy(); + expect(r.out.includes("Sandbox: alpha")).toBeTruthy(); + }); + + it("explains unrecoverable gateway trust rotation after restart", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-identity-drift-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " echo 'Error: transport error: handshake verification failed' >&2", + " exit 1", + "fi", + "if [ \"$1\" = \"status\" ]; then", + " echo 'Connected'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ \"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " echo 'Gateway Info'", + " echo", + " echo ' Gateway: nemoclaw'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const statusResult = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + expect(statusResult.code).toBe(0); + expect(statusResult.out.includes("gateway trust material rotated after restart")).toBeTruthy(); + expect(statusResult.out.includes("cannot be reattached safely")).toBeTruthy(); + + const connectResult = runWithEnv("alpha connect", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + expect(connectResult.code).toBe(1); + expect(connectResult.out.includes("gateway trust material rotated after restart")).toBeTruthy(); + expect(connectResult.out.includes("Recreate this sandbox")).toBeTruthy(); + }); }); From 3f265082df2f8645d2919557c398add6bcb57df1 Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 16:44:21 -0400 Subject: [PATCH 05/19] fix: classify unreachable gateway after restart --- bin/nemoclaw.js | 37 +++++++++++++++++++++++- test/cli.test.js | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 1 deletion(-) diff --git a/bin/nemoclaw.js b/bin/nemoclaw.js index 27b13dd92b..40ba3cb0fd 100755 --- a/bin/nemoclaw.js +++ b/bin/nemoclaw.js @@ -93,11 +93,16 @@ function hasNamedGateway(output = "") { function getNamedGatewayLifecycleState() { const status = captureOpenshell(["status"]); const gatewayInfo = captureOpenshell(["gateway", "info", "-g", "nemoclaw"], { ignoreError: true }); - const connected = /Connected/i.test(stripAnsi(status.output)); + const cleanStatus = stripAnsi(status.output); + const connected = /Connected/i.test(cleanStatus); const named = hasNamedGateway(gatewayInfo.output); + const refusing = /Connection refused|client error \(Connect\)|tcp connect error/i.test(cleanStatus); if (connected && named) { return { state: "healthy_named", status: status.output, gatewayInfo: gatewayInfo.output }; } + if (named && refusing) { + return { state: "named_unreachable", status: status.output, gatewayInfo: gatewayInfo.output }; + } if (named) { return { state: "named_unhealthy", status: status.output, gatewayInfo: gatewayInfo.output }; } @@ -147,6 +152,13 @@ function getSandboxGatewayState(sandboxName) { } function printGatewayLifecycleHint(output = "", sandboxName = "", writer = console.error) { + const cleanOutput = stripAnsi(output); + if (/Connection refused|client error \(Connect\)|tcp connect error/i.test(cleanOutput) && /Gateway:\s+nemoclaw/i.test(cleanOutput)) { + writer(" The selected NemoClaw gateway exists in metadata, but its API is refusing connections after restart."); + writer(" This usually means the gateway runtime did not come back cleanly after the restart."); + writer(" Retry `openshell gateway start --name nemoclaw`; if it stays in this state, rebuild the gateway before expecting existing sandboxes to reconnect."); + return; + } if (/handshake verification failed/i.test(output)) { writer(" This looks like gateway identity drift after restart."); writer(" Existing sandboxes may still be recorded locally, but the current gateway no longer trusts their prior connection state."); @@ -190,6 +202,12 @@ function getReconciledSandboxGatewayState(sandboxName) { } return { ...retried, recoveredGateway: true, recoveryVia: recovery.via || null }; } + if (recovery.after?.state === "named_unreachable" || recovery.before?.state === "named_unreachable") { + return { + state: "gateway_unreachable_after_restart", + output: recovery.after?.status || recovery.before?.status || lookup.output, + }; + } return { ...lookup, gatewayRecoveryFailed: true }; } @@ -217,6 +235,15 @@ function ensureLiveSandboxOrExit(sandboxName) { console.error(" Recreate this sandbox with `nemoclaw onboard` once the gateway runtime is stable."); process.exit(1); } + if (lookup.state === "gateway_unreachable_after_restart") { + console.error(` Sandbox '${sandboxName}' may still exist, but the selected NemoClaw gateway is still refusing connections after restart.`); + if (lookup.output) { + console.error(lookup.output); + } + console.error(" Retry `openshell gateway start --name nemoclaw` and verify `openshell status` is healthy before reconnecting."); + console.error(" If the gateway never becomes healthy, rebuild the gateway and then recreate the affected sandbox."); + process.exit(1); + } console.error(` Unable to verify sandbox '${sandboxName}' against the live OpenShell gateway.`); if (lookup.output) { console.error(lookup.output); @@ -526,6 +553,14 @@ function sandboxStatus(sandboxName) { } console.log(" Existing sandbox connections cannot be reattached safely after this gateway identity change."); console.log(" Recreate this sandbox with `nemoclaw onboard` once the gateway runtime is stable."); + } else if (lookup.state === "gateway_unreachable_after_restart") { + console.log(""); + console.log(` Sandbox '${sandboxName}' may still exist, but the selected NemoClaw gateway is still refusing connections after restart.`); + if (lookup.output) { + console.log(lookup.output); + } + console.log(" Retry `openshell gateway start --name nemoclaw` and verify `openshell status` is healthy before reconnecting."); + console.log(" If the gateway never becomes healthy, rebuild the gateway and then recreate the affected sandbox."); } else { console.log(""); console.log(` Could not verify sandbox '${sandboxName}' against the live OpenShell gateway.`); diff --git a/test/cli.test.js b/test/cli.test.js index c66af859be..e5d3a985d7 100644 --- a/test/cli.test.js +++ b/test/cli.test.js @@ -315,4 +315,77 @@ describe("CLI dispatch", () => { expect(connectResult.out.includes("gateway trust material rotated after restart")).toBeTruthy(); expect(connectResult.out.includes("Recreate this sandbox")).toBeTruthy(); }); + + it("explains when gateway metadata exists but the restarted API is still refusing connections", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-gateway-unreachable-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " echo 'Error: transport error: Connection refused' >&2", + " exit 1", + "fi", + "if [ \"$1\" = \"status\" ]; then", + " echo 'Server Status'", + " echo", + " echo ' Gateway: nemoclaw'", + " echo ' Server: https://127.0.0.1:8080'", + " echo 'Error: client error (Connect)' >&2", + " echo 'Connection refused (os error 111)' >&2", + " exit 1", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ \"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " echo 'Gateway Info'", + " echo", + " echo ' Gateway: nemoclaw'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"select\" ] && [ \"$3\" = \"nemoclaw\" ]; then", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"start\" ] && [ \"$3\" = \"--name\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const statusResult = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + expect(statusResult.code).toBe(0); + expect(statusResult.out.includes("gateway is still refusing connections after restart")).toBeTruthy(); + expect(statusResult.out.includes("Retry `openshell gateway start --name nemoclaw`")).toBeTruthy(); + + const connectResult = runWithEnv("alpha connect", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + expect(connectResult.code).toBe(1); + expect(connectResult.out.includes("gateway is still refusing connections after restart")).toBeTruthy(); + expect(connectResult.out.includes("If the gateway never becomes healthy")).toBeTruthy(); + }); }); From c840cb267763e5d6993efeaecb0ceb7d00fdf9a2 Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 17:00:45 -0400 Subject: [PATCH 06/19] fix: detect unreachable restarted gateway from status --- bin/nemoclaw.js | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/bin/nemoclaw.js b/bin/nemoclaw.js index 40ba3cb0fd..9afc8b34f4 100755 --- a/bin/nemoclaw.js +++ b/bin/nemoclaw.js @@ -202,6 +202,14 @@ function getReconciledSandboxGatewayState(sandboxName) { } return { ...retried, recoveredGateway: true, recoveryVia: recovery.via || null }; } + const latestLifecycle = getNamedGatewayLifecycleState(); + const latestStatus = stripAnsi(latestLifecycle.status || ""); + if (/Connection refused|client error \(Connect\)|tcp connect error/i.test(latestStatus) && /Gateway:\s+nemoclaw/i.test(latestStatus)) { + return { + state: "gateway_unreachable_after_restart", + output: latestLifecycle.status || lookup.output, + }; + } if (recovery.after?.state === "named_unreachable" || recovery.before?.state === "named_unreachable") { return { state: "gateway_unreachable_after_restart", From 8cff01f401a51d819d2b81d02b8e3d0c1dc28934 Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 17:04:47 -0400 Subject: [PATCH 07/19] fix: distinguish missing gateway after rebuild --- bin/nemoclaw.js | 29 ++++++++++++++++++++++++ test/cli.test.js | 59 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/bin/nemoclaw.js b/bin/nemoclaw.js index 9afc8b34f4..2c3e210b27 100755 --- a/bin/nemoclaw.js +++ b/bin/nemoclaw.js @@ -153,6 +153,12 @@ function getSandboxGatewayState(sandboxName) { function printGatewayLifecycleHint(output = "", sandboxName = "", writer = console.error) { const cleanOutput = stripAnsi(output); + if (/No gateway configured/i.test(cleanOutput)) { + writer(" The selected NemoClaw gateway is no longer configured or its metadata/runtime has been lost."); + writer(" Start the gateway again with `openshell gateway start --name nemoclaw` before expecting existing sandboxes to reconnect."); + writer(" If the gateway has to be rebuilt from scratch, recreate the affected sandbox afterward."); + return; + } if (/Connection refused|client error \(Connect\)|tcp connect error/i.test(cleanOutput) && /Gateway:\s+nemoclaw/i.test(cleanOutput)) { writer(" The selected NemoClaw gateway exists in metadata, but its API is refusing connections after restart."); writer(" This usually means the gateway runtime did not come back cleanly after the restart."); @@ -204,6 +210,12 @@ function getReconciledSandboxGatewayState(sandboxName) { } const latestLifecycle = getNamedGatewayLifecycleState(); const latestStatus = stripAnsi(latestLifecycle.status || ""); + if (/No gateway configured/i.test(latestStatus)) { + return { + state: "gateway_missing_after_restart", + output: latestLifecycle.status || lookup.output, + }; + } if (/Connection refused|client error \(Connect\)|tcp connect error/i.test(latestStatus) && /Gateway:\s+nemoclaw/i.test(latestStatus)) { return { state: "gateway_unreachable_after_restart", @@ -252,6 +264,15 @@ function ensureLiveSandboxOrExit(sandboxName) { console.error(" If the gateway never becomes healthy, rebuild the gateway and then recreate the affected sandbox."); process.exit(1); } + if (lookup.state === "gateway_missing_after_restart") { + console.error(` Sandbox '${sandboxName}' may still exist locally, but the NemoClaw gateway is no longer configured after restart/rebuild.`); + if (lookup.output) { + console.error(lookup.output); + } + console.error(" Start the gateway again with `openshell gateway start --name nemoclaw` before retrying."); + console.error(" If the gateway had to be rebuilt from scratch, recreate the affected sandbox afterward."); + process.exit(1); + } console.error(` Unable to verify sandbox '${sandboxName}' against the live OpenShell gateway.`); if (lookup.output) { console.error(lookup.output); @@ -569,6 +590,14 @@ function sandboxStatus(sandboxName) { } console.log(" Retry `openshell gateway start --name nemoclaw` and verify `openshell status` is healthy before reconnecting."); console.log(" If the gateway never becomes healthy, rebuild the gateway and then recreate the affected sandbox."); + } else if (lookup.state === "gateway_missing_after_restart") { + console.log(""); + console.log(` Sandbox '${sandboxName}' may still exist locally, but the NemoClaw gateway is no longer configured after restart/rebuild.`); + if (lookup.output) { + console.log(lookup.output); + } + console.log(" Start the gateway again with `openshell gateway start --name nemoclaw` before retrying."); + console.log(" If the gateway had to be rebuilt from scratch, recreate the affected sandbox afterward."); } else { console.log(""); console.log(` Could not verify sandbox '${sandboxName}' against the live OpenShell gateway.`); diff --git a/test/cli.test.js b/test/cli.test.js index e5d3a985d7..b64e801910 100644 --- a/test/cli.test.js +++ b/test/cli.test.js @@ -388,4 +388,63 @@ describe("CLI dispatch", () => { expect(connectResult.out.includes("gateway is still refusing connections after restart")).toBeTruthy(); expect(connectResult.out.includes("If the gateway never becomes healthy")).toBeTruthy(); }); + + it("explains when the named gateway is no longer configured after restart or rebuild", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-gateway-missing-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " echo 'Error: transport error: Connection refused' >&2", + " exit 1", + "fi", + "if [ \"$1\" = \"status\" ]; then", + " echo 'Gateway Status'", + " echo", + " echo ' Status: No gateway configured.'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ \"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " exit 1", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"select\" ] && [ \"$3\" = \"nemoclaw\" ]; then", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"start\" ] && [ \"$3\" = \"--name\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " exit 1", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const statusResult = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + expect(statusResult.code).toBe(0); + expect(statusResult.out.includes("gateway is no longer configured after restart/rebuild")).toBeTruthy(); + expect(statusResult.out.includes("Start the gateway again")).toBeTruthy(); + }); }); From e9a98e76463c9cd2a6d247734c7cc65751a9f73d Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 18:23:26 -0400 Subject: [PATCH 08/19] refactor: reuse shared gateway startup recovery --- bin/lib/onboard.js | 16 ++++++++++++++-- bin/nemoclaw.js | 46 +++++++++++++++++++++++++++++----------------- test/cli.test.js | 16 ++++++++-------- 3 files changed, 51 insertions(+), 27 deletions(-) diff --git a/bin/lib/onboard.js b/bin/lib/onboard.js index 3642d358e0..f9d3ecd58a 100644 --- a/bin/lib/onboard.js +++ b/bin/lib/onboard.js @@ -1313,7 +1313,7 @@ async function preflight() { // ── Step 2: Gateway ────────────────────────────────────────────── -async function startGateway(_gpu) { +async function startGatewayWithOptions(_gpu, { exitOnFailure = true } = {}) { step(3, 7, "Starting OpenShell gateway"); const gatewayStatus = runCaptureOpenshell(["status"], { ignoreError: true }); @@ -1357,7 +1357,10 @@ async function startGateway(_gpu) { } if (i === 4) { console.error(" Gateway failed to start. Run: openshell gateway info"); - process.exit(1); + if (exitOnFailure) { + process.exit(1); + } + throw new Error("Gateway failed to start"); } sleep(2); } @@ -1374,6 +1377,14 @@ async function startGateway(_gpu) { process.env.OPENSHELL_GATEWAY = GATEWAY_NAME; } +async function startGateway(_gpu) { + return startGatewayWithOptions(_gpu, { exitOnFailure: true }); +} + +async function startGatewayForRecovery(_gpu) { + return startGatewayWithOptions(_gpu, { exitOnFailure: false }); +} + // ── Step 3: Sandbox ────────────────────────────────────────────── async function createSandbox(gpu, model, provider, preferredInferenceApi = null) { @@ -2316,6 +2327,7 @@ module.exports = { setupInference, setupNim, startGateway, + startGatewayForRecovery, writeSandboxConfigSyncFile, patchStagedDockerfile, }; diff --git a/bin/nemoclaw.js b/bin/nemoclaw.js index 3770136245..3d00c894ee 100755 --- a/bin/nemoclaw.js +++ b/bin/nemoclaw.js @@ -22,6 +22,7 @@ const YW = _useColor ? "\x1b[1;33m" : ""; const { ROOT, SCRIPTS, run, runCapture: _runCapture, runInteractive, shellQuote, validateName } = require("./lib/runner"); const { resolveOpenshell } = require("./lib/resolve-openshell"); +const { startGatewayForRecovery } = require("./lib/onboard"); const { ensureApiKey, ensureGithubToken, @@ -113,7 +114,7 @@ function getNamedGatewayLifecycleState() { return { state: "missing_named", status: status.output, gatewayInfo: gatewayInfo.output }; } -function recoverNamedGatewayRuntime() { +async function recoverNamedGatewayRuntime() { const before = getNamedGatewayLifecycleState(); if (before.state === "healthy_named") { return { recovered: true, before, after: before, attempted: false }; @@ -126,12 +127,23 @@ function recoverNamedGatewayRuntime() { return { recovered: true, before, after, attempted: true, via: "select" }; } - runOpenshell(["gateway", "start", "--name", "nemoclaw"], { ignoreError: true }); - runOpenshell(["gateway", "select", "nemoclaw"], { ignoreError: true }); - after = getNamedGatewayLifecycleState(); - if (after.state === "healthy_named") { - process.env.OPENSHELL_GATEWAY = "nemoclaw"; - return { recovered: true, before, after, attempted: true, via: "start" }; + const shouldStartGateway = [before.state, after.state].some((state) => + ["named_unhealthy", "named_unreachable", "connected_other"].includes(state) + ); + + if (shouldStartGateway) { + try { + await startGatewayForRecovery(); + } catch { + // Fall through to the lifecycle re-check below so we preserve the + // existing recovery result shape and emit the correct classification. + } + runOpenshell(["gateway", "select", "nemoclaw"], { ignoreError: true }); + after = getNamedGatewayLifecycleState(); + if (after.state === "healthy_named") { + process.env.OPENSHELL_GATEWAY = "nemoclaw"; + return { recovered: true, before, after, attempted: true, via: "start" }; + } } return { recovered: false, before, after, attempted: true }; @@ -183,7 +195,7 @@ function printGatewayLifecycleHint(output = "", sandboxName = "", writer = conso } } -function getReconciledSandboxGatewayState(sandboxName) { +async function getReconciledSandboxGatewayState(sandboxName) { let lookup = getSandboxGatewayState(sandboxName); if (lookup.state === "present") { return lookup; @@ -193,7 +205,7 @@ function getReconciledSandboxGatewayState(sandboxName) { } if (lookup.state === "gateway_error") { - const recovery = recoverNamedGatewayRuntime(); + const recovery = await recoverNamedGatewayRuntime(); if (recovery.recovered) { const retried = getSandboxGatewayState(sandboxName); if (retried.state === "present" || retried.state === "missing") { @@ -235,8 +247,8 @@ function getReconciledSandboxGatewayState(sandboxName) { return lookup; } -function ensureLiveSandboxOrExit(sandboxName) { - const lookup = getReconciledSandboxGatewayState(sandboxName); +async function ensureLiveSandboxOrExit(sandboxName) { + const lookup = await getReconciledSandboxGatewayState(sandboxName); if (lookup.state === "present") { return lookup; } @@ -539,8 +551,8 @@ function listSandboxes() { // ── Sandbox-scoped actions ─────────────────────────────────────── -function sandboxConnect(sandboxName) { - ensureLiveSandboxOrExit(sandboxName); +async function sandboxConnect(sandboxName) { + await ensureLiveSandboxOrExit(sandboxName); // Ensure port forward is alive before connecting runOpenshell(["forward", "start", "--background", "18789", sandboxName], { ignoreError: true }); const result = spawnSync(getOpenshellBinary(), ["sandbox", "connect", sandboxName], { @@ -551,7 +563,7 @@ function sandboxConnect(sandboxName) { exitWithSpawnResult(result); } -function sandboxStatus(sandboxName) { +async function sandboxStatus(sandboxName) { const sb = registry.getSandbox(sandboxName); const live = parseGatewayInference( _runCapture("openshell inference get 2>/dev/null", { ignoreError: true }) @@ -565,7 +577,7 @@ function sandboxStatus(sandboxName) { console.log(` Policies: ${(sb.policies || []).join(", ") || "none"}`); } - const lookup = getReconciledSandboxGatewayState(sandboxName); + const lookup = await getReconciledSandboxGatewayState(sandboxName); if (lookup.state === "present") { console.log(""); if (lookup.recoveredGateway) { @@ -778,8 +790,8 @@ const [cmd, ...args] = process.argv.slice(2); const actionArgs = args.slice(1); switch (action) { - case "connect": sandboxConnect(cmd); break; - case "status": sandboxStatus(cmd); break; + case "connect": await sandboxConnect(cmd); break; + case "status": await sandboxStatus(cmd); break; case "logs": sandboxLogs(cmd, actionArgs.includes("--follow")); break; case "policy-add": await sandboxPolicyAdd(cmd); break; case "policy-list": sandboxPolicyList(cmd); break; diff --git a/test/cli.test.js b/test/cli.test.js index b64e801910..1bb7715f2d 100644 --- a/test/cli.test.js +++ b/test/cli.test.js @@ -13,11 +13,11 @@ function run(args) { return runWithEnv(args); } -function runWithEnv(args, env = {}) { +function runWithEnv(args, env = {}, timeout = 10000) { try { const out = execSync(`node "${CLI}" ${args}`, { encoding: "utf-8", - timeout: 10000, + timeout, env: { ...process.env, HOME: "/tmp/nemoclaw-cli-test-" + Date.now(), ...env }, }); return { code: 0, out }; @@ -181,14 +181,14 @@ describe("CLI dispatch", () => { const r = runWithEnv("alpha status", { HOME: home, PATH: `${localBin}:${process.env.PATH || ""}`, - }); + }, 25000); expect(r.code).toBe(0); expect(r.out.includes("Could not verify sandbox 'alpha'")).toBeTruthy(); expect(r.out.includes("gateway identity drift after restart")).toBeTruthy(); const saved = JSON.parse(fs.readFileSync(path.join(registryDir, "sandboxes.json"), "utf8")); expect(saved.sandboxes.alpha).toBeTruthy(); - }); + }, 25000); it("recovers status after gateway runtime is reattached", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-recover-status-")); @@ -302,7 +302,7 @@ describe("CLI dispatch", () => { const statusResult = runWithEnv("alpha status", { HOME: home, PATH: `${localBin}:${process.env.PATH || ""}`, - }); + }, 25000); expect(statusResult.code).toBe(0); expect(statusResult.out.includes("gateway trust material rotated after restart")).toBeTruthy(); expect(statusResult.out.includes("cannot be reattached safely")).toBeTruthy(); @@ -375,7 +375,7 @@ describe("CLI dispatch", () => { const statusResult = runWithEnv("alpha status", { HOME: home, PATH: `${localBin}:${process.env.PATH || ""}`, - }); + }, 25000); expect(statusResult.code).toBe(0); expect(statusResult.out.includes("gateway is still refusing connections after restart")).toBeTruthy(); expect(statusResult.out.includes("Retry `openshell gateway start --name nemoclaw`")).toBeTruthy(); @@ -387,7 +387,7 @@ describe("CLI dispatch", () => { expect(connectResult.code).toBe(1); expect(connectResult.out.includes("gateway is still refusing connections after restart")).toBeTruthy(); expect(connectResult.out.includes("If the gateway never becomes healthy")).toBeTruthy(); - }); + }, 25000); it("explains when the named gateway is no longer configured after restart or rebuild", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-gateway-missing-")); @@ -446,5 +446,5 @@ describe("CLI dispatch", () => { expect(statusResult.code).toBe(0); expect(statusResult.out.includes("gateway is no longer configured after restart/rebuild")).toBeTruthy(); expect(statusResult.out.includes("Start the gateway again")).toBeTruthy(); - }); + }, 25000); }); From f73f38813e8aab63582a7a2c6338fffa23f0615f Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 18:31:41 -0400 Subject: [PATCH 09/19] fix: pass follow flag through to logs --- bin/nemoclaw.js | 2 +- test/cli.test.js | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/bin/nemoclaw.js b/bin/nemoclaw.js index 3d00c894ee..1a383d4cb3 100755 --- a/bin/nemoclaw.js +++ b/bin/nemoclaw.js @@ -634,7 +634,7 @@ async function sandboxStatus(sandboxName) { function sandboxLogs(sandboxName, follow) { const args = ["logs", sandboxName]; - if (follow) args.push("--tail"); + if (follow) args.push("--follow"); runOpenshell(args); } diff --git a/test/cli.test.js b/test/cli.test.js index 1bb7715f2d..7c7ed82b07 100644 --- a/test/cli.test.js +++ b/test/cli.test.js @@ -97,6 +97,49 @@ describe("CLI dispatch", () => { expect(r.out.includes("nemoclaw debug")).toBeTruthy(); }); + it("passes --follow through to openshell logs", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-logs-follow-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + const markerFile = path.join(home, "logs-args"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `marker_file=${JSON.stringify(markerFile)}`, + "printf '%s ' \"$@\" > \"$marker_file\"", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const r = runWithEnv("alpha logs --follow", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(0); + expect(fs.readFileSync(markerFile, "utf8")).toContain("logs alpha --follow"); + }); + it("removes stale registry entries when connect targets a missing live sandbox", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-stale-connect-")); const localBin = path.join(home, "bin"); From 2e1d01923df0c004ccd4050afdd619f60ea2e2eb Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 18:38:51 -0400 Subject: [PATCH 10/19] fix: tighten gateway cli capture handling --- bin/nemoclaw.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/nemoclaw.js b/bin/nemoclaw.js index 1a383d4cb3..3d8e72a641 100755 --- a/bin/nemoclaw.js +++ b/bin/nemoclaw.js @@ -79,7 +79,7 @@ function captureOpenshell(args, opts = {}) { }); return { status: result.status ?? 1, - output: `${result.stdout || ""}${result.stderr || ""}`.trim(), + output: `${result.stdout || ""}${opts.ignoreError ? "" : result.stderr || ""}`.trim(), }; } @@ -94,7 +94,7 @@ function hasNamedGateway(output = "") { function getNamedGatewayLifecycleState() { const status = captureOpenshell(["status"]); - const gatewayInfo = captureOpenshell(["gateway", "info", "-g", "nemoclaw"], { ignoreError: true }); + const gatewayInfo = captureOpenshell(["gateway", "info", "-g", "nemoclaw"]); const cleanStatus = stripAnsi(status.output); const connected = /Connected/i.test(cleanStatus); const named = hasNamedGateway(gatewayInfo.output); @@ -178,7 +178,7 @@ function printGatewayLifecycleHint(output = "", sandboxName = "", writer = conso writer(" Retry `openshell gateway start --name nemoclaw`; if it stays in this state, rebuild the gateway before expecting existing sandboxes to reconnect."); return; } - if (/handshake verification failed/i.test(output)) { + if (/handshake verification failed/i.test(cleanOutput)) { writer(" This looks like gateway identity drift after restart."); writer(" Existing sandboxes may still be recorded locally, but the current gateway no longer trusts their prior connection state."); writer(" Try re-establishing the NemoClaw gateway/runtime first. If the sandbox is still unreachable, recreate just that sandbox with `nemoclaw onboard`."); @@ -566,7 +566,7 @@ async function sandboxConnect(sandboxName) { async function sandboxStatus(sandboxName) { const sb = registry.getSandbox(sandboxName); const live = parseGatewayInference( - _runCapture("openshell inference get 2>/dev/null", { ignoreError: true }) + captureOpenshell(["inference", "get"], { ignoreError: true }).output ); if (sb) { console.log(""); @@ -574,7 +574,7 @@ async function sandboxStatus(sandboxName) { console.log(` Model: ${(live && live.model) || sb.model || "unknown"}`); console.log(` Provider: ${(live && live.provider) || sb.provider || "unknown"}`); console.log(` GPU: ${sb.gpuEnabled ? "yes" : "no"}`); - console.log(` Policies: ${(sb.policies || []).join(", ") || "none"}`); + console.log(` Policies: ${(sb.policies || []).join(", ") || "none"}`); } const lookup = await getReconciledSandboxGatewayState(sandboxName); From 0955ff0aebf44c4894f148f25f789eca3d7004ed Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 19:13:03 -0400 Subject: [PATCH 11/19] fix: require named gateway identity for recovery --- bin/lib/onboard.js | 32 ++++++++++++++++-- bin/nemoclaw.js | 12 +++++-- test/cli.test.js | 78 ++++++++++++++++++++++++++++++++++++++++++-- test/onboard.test.js | 6 ++++ 4 files changed, 121 insertions(+), 7 deletions(-) diff --git a/bin/lib/onboard.js b/bin/lib/onboard.js index f9d3ecd58a..7d17685c45 100644 --- a/bin/lib/onboard.js +++ b/bin/lib/onboard.js @@ -185,9 +185,36 @@ function hasStaleGateway(gwInfoOutput) { return typeof gwInfoOutput === "string" && gwInfoOutput.length > 0 && gwInfoOutput.includes(GATEWAY_NAME); } +function stripAnsi(value = "") { + let cleaned = ""; + for (let i = 0; i < value.length; i += 1) { + if (value.charCodeAt(i) === 27 && value[i + 1] === "[") { + i += 2; + while (i < value.length && /[0-9;]/.test(value[i])) { + i += 1; + } + if (value[i] === "m") { + continue; + } + } + cleaned += value[i] || ""; + } + return cleaned; +} + +function getActiveGatewayName(statusOutput = "") { + if (typeof statusOutput !== "string" || statusOutput.length === 0) { + return ""; + } + const match = stripAnsi(statusOutput) + .match(/^\s*Gateway:\s+(.+?)\s*$/m); + return match ? match[1].trim() : ""; +} + function isGatewayHealthy(statusOutput = "", gwInfoOutput = "") { const connected = typeof statusOutput === "string" && statusOutput.includes("Connected"); - return connected && hasStaleGateway(gwInfoOutput); + const activeGateway = getActiveGatewayName(statusOutput); + return connected && activeGateway === GATEWAY_NAME && hasStaleGateway(gwInfoOutput); } function streamSandboxCreate(command, env = process.env, options = {}) { @@ -1351,7 +1378,8 @@ async function startGatewayWithOptions(_gpu, { exitOnFailure = true } = {}) { // Verify health for (let i = 0; i < 5; i++) { const status = runCaptureOpenshell(["status"], { ignoreError: true }); - if (status.includes("Connected")) { + const gwInfo = runCaptureOpenshell(["gateway", "info", "-g", GATEWAY_NAME], { ignoreError: true }); + if (isGatewayHealthy(status, gwInfo)) { console.log(" ✓ Gateway is healthy"); break; } diff --git a/bin/nemoclaw.js b/bin/nemoclaw.js index 3d8e72a641..128ab6fade 100755 --- a/bin/nemoclaw.js +++ b/bin/nemoclaw.js @@ -92,20 +92,26 @@ function hasNamedGateway(output = "") { return stripAnsi(output).includes("Gateway: nemoclaw"); } +function getActiveGatewayName(output = "") { + const match = stripAnsi(output).match(/^\s*Gateway:\s+(.+?)\s*$/m); + return match ? match[1].trim() : ""; +} + function getNamedGatewayLifecycleState() { const status = captureOpenshell(["status"]); const gatewayInfo = captureOpenshell(["gateway", "info", "-g", "nemoclaw"]); const cleanStatus = stripAnsi(status.output); + const activeGateway = getActiveGatewayName(status.output); const connected = /Connected/i.test(cleanStatus); const named = hasNamedGateway(gatewayInfo.output); const refusing = /Connection refused|client error \(Connect\)|tcp connect error/i.test(cleanStatus); - if (connected && named) { + if (connected && activeGateway === "nemoclaw" && named) { return { state: "healthy_named", status: status.output, gatewayInfo: gatewayInfo.output }; } - if (named && refusing) { + if (activeGateway === "nemoclaw" && named && refusing) { return { state: "named_unreachable", status: status.output, gatewayInfo: gatewayInfo.output }; } - if (named) { + if (activeGateway === "nemoclaw" && named) { return { state: "named_unhealthy", status: status.output, gatewayInfo: gatewayInfo.output }; } if (connected) { diff --git a/test/cli.test.js b/test/cli.test.js index 7c7ed82b07..cf13c3ab4a 100644 --- a/test/cli.test.js +++ b/test/cli.test.js @@ -273,7 +273,10 @@ describe("CLI dispatch", () => { " exit 0", "fi", "if [ \"$1\" = \"status\" ]; then", - " echo 'Connected'", + " echo 'Server Status'", + " echo", + " echo ' Gateway: nemoclaw'", + " echo ' Status: Connected'", " exit 0", "fi", "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ \"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", @@ -297,6 +300,74 @@ describe("CLI dispatch", () => { expect(r.out.includes("Sandbox: alpha")).toBeTruthy(); }); + it("does not treat a different connected gateway as a healthy nemoclaw gateway", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-mixed-gateway-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " echo 'Error: transport error: Connection refused' >&2", + " exit 1", + "fi", + "if [ \"$1\" = \"status\" ]; then", + " echo 'Server Status'", + " echo", + " echo ' Gateway: openshell'", + " echo ' Status: Connected'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ \"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " echo 'Gateway Info'", + " echo", + " echo ' Gateway: nemoclaw'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"select\" ] && [ \"$3\" = \"nemoclaw\" ]; then", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"start\" ] && [ \"$3\" = \"--name\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " exit 0", + "fi", + "if [ \"$1\" = \"inference\" ] && [ \"$2\" = \"get\" ]; then", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }, 25000); + + expect(r.code).toBe(0); + expect(r.out.includes("Recovered NemoClaw gateway runtime")).toBeFalsy(); + expect(r.out.includes("Could not verify sandbox 'alpha'")).toBeTruthy(); + expect(r.out.includes("verify the active gateway")).toBeTruthy(); + }, 25000); + it("explains unrecoverable gateway trust rotation after restart", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-identity-drift-")); const localBin = path.join(home, "bin"); @@ -328,7 +399,10 @@ describe("CLI dispatch", () => { " exit 1", "fi", "if [ \"$1\" = \"status\" ]; then", - " echo 'Connected'", + " echo 'Server Status'", + " echo", + " echo ' Gateway: nemoclaw'", + " echo ' Status: Connected'", " exit 0", "fi", "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ \"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", diff --git a/test/onboard.test.js b/test/onboard.test.js index 841bc76e8d..8a8046b528 100644 --- a/test/onboard.test.js +++ b/test/onboard.test.js @@ -160,6 +160,12 @@ describe("onboard helpers", () => { "Gateway Info\n\n Gateway: nemoclaw\n Gateway endpoint: https://127.0.0.1:8080" ) ).toBe(true); + expect( + isGatewayHealthy( + "Server Status\n\n Gateway: openshell\n Status: Connected", + "Gateway Info\n\n Gateway: nemoclaw\n Gateway endpoint: https://127.0.0.1:8080" + ) + ).toBe(false); expect( isGatewayHealthy( "Server Status\n\n Gateway: openshell\n Status: Connected", From 46b506e6dc7bdbc2c105c882827d20348f31a792 Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 19:27:46 -0400 Subject: [PATCH 12/19] test: modernize double onboard e2e setup --- test/e2e/test-double-onboard.sh | 344 +++++++++++++++++++++++--------- 1 file changed, 245 insertions(+), 99 deletions(-) diff --git a/test/e2e/test-double-onboard.sh b/test/e2e/test-double-onboard.sh index f28821f5f2..8e8b248b00 100755 --- a/test/e2e/test-double-onboard.sh +++ b/test/e2e/test-double-onboard.sh @@ -2,28 +2,25 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Double onboard: verify that consecutive `nemoclaw onboard` runs can reuse -# the shared NemoClaw gateway safely and preserve existing sandboxes instead of -# destroying the prior session on every repeat run. +# Double onboard / lifecycle recovery: +# - prove repeat onboard reuses the healthy shared NemoClaw gateway +# - prove onboarding a second sandbox does not destroy the first sandbox +# - prove stale registry entries are reconciled against live OpenShell state +# - prove gateway rebuilds surface the expected lifecycle guidance # -# Regression test for issues #21, #22, #140, #152, #397, and #849. -# -# Key insight: running onboard without NVIDIA_API_KEY in non-interactive -# mode causes process.exit(1) at step 4, but steps 1-3 (preflight, -# gateway, sandbox) complete first — naturally simulating an unclean exit. -# -# Prerequisites: -# - Docker running -# - openshell CLI installed -# - nemoclaw CLI installed -# - NVIDIA_API_KEY must NOT be set -# -# Usage: -# unset NVIDIA_API_KEY -# bash test/e2e/test-double-onboard.sh +# This script intentionally uses a local fake OpenAI-compatible endpoint so it +# matches the current onboarding flow. Older versions of this test relied on a +# missing/invalid NVIDIA_API_KEY causing a late failure after sandbox creation; +# that no longer reflects current non-interactive onboarding behavior. set -uo pipefail +if [ -z "${NEMOCLAW_E2E_NO_TIMEOUT:-}" ]; then + export NEMOCLAW_E2E_NO_TIMEOUT=1 + TIMEOUT_SECONDS="${NEMOCLAW_E2E_TIMEOUT_SECONDS:-900}" + exec timeout -s TERM "$TIMEOUT_SECONDS" "$0" "$@" +fi + PASS=0 FAIL=0 TOTAL=0 @@ -47,16 +44,121 @@ info() { printf '\033[1;34m [info]\033[0m %s\n' "$1"; } SANDBOX_A="e2e-double-a" SANDBOX_B="e2e-double-b" REGISTRY="$HOME/.nemoclaw/sandboxes.json" +FAKE_HOST="127.0.0.1" +FAKE_PORT="${NEMOCLAW_FAKE_PORT:-18080}" +FAKE_BASE_URL="http://${FAKE_HOST}:${FAKE_PORT}/v1" +FAKE_LOG="$(mktemp)" +FAKE_PID="" + +# shellcheck disable=SC2329 +cleanup() { + if [ -n "$FAKE_PID" ] && kill -0 "$FAKE_PID" 2>/dev/null; then + kill "$FAKE_PID" 2>/dev/null || true + wait "$FAKE_PID" 2>/dev/null || true + fi + rm -f "$FAKE_LOG" +} +trap cleanup EXIT + +start_fake_openai() { + python3 - "$FAKE_HOST" "$FAKE_PORT" >"$FAKE_LOG" 2>&1 <<'PY' & +import json +import sys +from http.server import BaseHTTPRequestHandler, HTTPServer + +HOST = sys.argv[1] +PORT = int(sys.argv[2]) + + +class Handler(BaseHTTPRequestHandler): + def _send(self, status, payload): + body = json.dumps(payload).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, format, *args): + return + + def do_GET(self): + if self.path in ("/v1/models", "/models"): + self._send(200, {"data": [{"id": "test-model", "object": "model"}]}) + return + self._send(404, {"error": {"message": "not found"}}) + + def do_POST(self): + length = int(self.headers.get("Content-Length", "0")) + if length: + self.rfile.read(length) + if self.path in ("/v1/chat/completions", "/chat/completions"): + self._send( + 200, + { + "id": "chatcmpl-test", + "object": "chat.completion", + "choices": [{"index": 0, "message": {"role": "assistant", "content": "ok"}, "finish_reason": "stop"}], + }, + ) + return + if self.path in ("/v1/responses", "/responses"): + self._send( + 200, + { + "id": "resp-test", + "object": "response", + "output": [{"type": "message", "role": "assistant", "content": [{"type": "output_text", "text": "ok"}]}], + }, + ) + return + self._send(404, {"error": {"message": "not found"}}) + + +HTTPServer((HOST, PORT), Handler).serve_forever() +PY + FAKE_PID=$! + + for _ in $(seq 1 20); do + if curl -sf "${FAKE_BASE_URL}/models" >/dev/null 2>&1; then + return 0 + fi + sleep 1 + done + + return 1 +} + +run_onboard() { + local sandbox_name="$1" + local recreate="${2:-0}" + local log_file + log_file="$(mktemp)" + + local -a env_args=( + "COMPATIBLE_API_KEY=dummy" + "NEMOCLAW_NON_INTERACTIVE=1" + "NEMOCLAW_PROVIDER=custom" + "NEMOCLAW_ENDPOINT_URL=${FAKE_BASE_URL}" + "NEMOCLAW_MODEL=test-model" + "NEMOCLAW_SANDBOX_NAME=${sandbox_name}" + "NEMOCLAW_POLICY_MODE=skip" + ) + if [ "$recreate" = "1" ]; then + env_args+=("NEMOCLAW_RECREATE_SANDBOX=1") + fi + + env "${env_args[@]}" nemoclaw onboard --non-interactive >"$log_file" 2>&1 + RUN_ONBOARD_EXIT=$? + RUN_ONBOARD_OUTPUT="$(cat "$log_file")" + rm -f "$log_file" +} # ══════════════════════════════════════════════════════════════════ # Phase 0: Pre-cleanup # ══════════════════════════════════════════════════════════════════ section "Phase 0: Pre-cleanup" info "Destroying any leftover test sandboxes/gateway from previous runs..." -# Use nemoclaw destroy (not just openshell sandbox delete) to also clean -# the nemoclaw registry at ~/.nemoclaw/sandboxes.json. Stale registry -# entries from a previous run would cause Phase 2 to exit with -# "Sandbox already exists" before the test even starts. if command -v nemoclaw >/dev/null 2>&1; then nemoclaw "$SANDBOX_A" destroy --yes 2>/dev/null || true nemoclaw "$SANDBOX_B" destroy --yes 2>/dev/null || true @@ -68,7 +170,7 @@ openshell gateway destroy -g nemoclaw 2>/dev/null || true pass "Pre-cleanup complete" # ══════════════════════════════════════════════════════════════════ -# Phase 1: Prerequisites +# Phase 1: Prerequisites + fake endpoint # ══════════════════════════════════════════════════════════════════ section "Phase 1: Prerequisites" @@ -93,44 +195,46 @@ else exit 1 fi -if [ -n "${NVIDIA_API_KEY:-}" ]; then - fail "NVIDIA_API_KEY is set — this test requires it UNSET (unset NVIDIA_API_KEY)" +if command -v python3 >/dev/null 2>&1; then + pass "python3 installed" +else + fail "python3 not found — cannot continue" exit 1 +fi + +if start_fake_openai; then + pass "Fake OpenAI-compatible endpoint started at ${FAKE_BASE_URL}" else - pass "NVIDIA_API_KEY is not set (required for controlled step-4 exit)" + fail "Failed to start fake OpenAI-compatible endpoint" + info "Fake server log:" + sed 's/^/ /' "$FAKE_LOG" + exit 1 fi # ══════════════════════════════════════════════════════════════════ -# Phase 2: First onboard (e2e-double-a) — leaves stale state +# Phase 2: First onboard (e2e-double-a) # ══════════════════════════════════════════════════════════════════ section "Phase 2: First onboard ($SANDBOX_A)" -info "Running nemoclaw onboard — expect exit 1 (no API key)..." - -# Write to temp file to avoid openshell FD inheritance blocking $() -ONBOARD_LOG="$(mktemp)" -NEMOCLAW_NON_INTERACTIVE=1 \ - NEMOCLAW_SANDBOX_NAME="$SANDBOX_A" \ - NEMOCLAW_POLICY_MODE=skip \ - nemoclaw onboard --non-interactive >"$ONBOARD_LOG" 2>&1 -exit1=$? -output1="$(cat "$ONBOARD_LOG")" -rm -f "$ONBOARD_LOG" - -if [ $exit1 -eq 1 ]; then - pass "First onboard exited 1 (step 4 failed as expected)" +info "Running successful non-interactive onboard against local compatible endpoint..." + +run_onboard "$SANDBOX_A" +output1="$RUN_ONBOARD_OUTPUT" +exit1="$RUN_ONBOARD_EXIT" + +if [ "$exit1" -eq 0 ]; then + pass "First onboard completed successfully" else - fail "First onboard exited $exit1 (expected 1)" + fail "First onboard exited $exit1 (expected 0)" fi if grep -q "Sandbox '${SANDBOX_A}' created" <<<"$output1"; then - pass "Sandbox '$SANDBOX_A' created (step 3 completed)" + pass "Sandbox '$SANDBOX_A' created" else - fail "Sandbox creation not confirmed in output" + fail "Sandbox '$SANDBOX_A' creation not confirmed in output" fi -# Verify stale state was left behind if openshell gateway info -g nemoclaw 2>/dev/null | grep -q "nemoclaw"; then - pass "Gateway is still running (stale state)" + pass "Gateway is running after first onboard" else fail "Gateway is not running after first onboard" fi @@ -147,29 +251,20 @@ else fail "Registry does not contain '$SANDBOX_A'" fi -info "Stale state confirmed — NOT cleaning up before next onboard" - # ══════════════════════════════════════════════════════════════════ -# Phase 3: Second onboard — SAME name (e2e-double-a) +# Phase 3: Second onboard — SAME name (recreate) # ══════════════════════════════════════════════════════════════════ -section "Phase 3: Second onboard ($SANDBOX_A — same name, stale state)" +section "Phase 3: Second onboard ($SANDBOX_A — same name, recreate)" info "Running nemoclaw onboard with NEMOCLAW_RECREATE_SANDBOX=1..." -ONBOARD_LOG="$(mktemp)" -NEMOCLAW_NON_INTERACTIVE=1 \ - NEMOCLAW_SANDBOX_NAME="$SANDBOX_A" \ - NEMOCLAW_RECREATE_SANDBOX=1 \ - NEMOCLAW_POLICY_MODE=skip \ - nemoclaw onboard --non-interactive >"$ONBOARD_LOG" 2>&1 -exit2=$? -output2="$(cat "$ONBOARD_LOG")" -rm -f "$ONBOARD_LOG" - -# Step 4 still fails (no API key), but steps 1-3 should succeed -if [ $exit2 -eq 1 ]; then - pass "Second onboard exited 1 (step 4 failed as expected)" +run_onboard "$SANDBOX_A" "1" +output2="$RUN_ONBOARD_OUTPUT" +exit2="$RUN_ONBOARD_EXIT" + +if [ "$exit2" -eq 0 ]; then + pass "Second onboard completed successfully" else - fail "Second onboard exited $exit2 (expected 1)" + fail "Second onboard exited $exit2 (expected 0)" fi if grep -q "Reusing existing NemoClaw gateway" <<<"$output2"; then @@ -179,48 +274,37 @@ else fi if grep -q "Port 8080 is not available" <<<"$output2"; then - fail "Port 8080 conflict detected (regression: #21)" + fail "Port 8080 conflict detected (regression)" else - pass "No port 8080 conflict" + pass "No port 8080 conflict on second onboard" fi if grep -q "Port 18789 is not available" <<<"$output2"; then - fail "Port 18789 conflict detected" -else - pass "No port 18789 conflict" -fi - -if grep -q "Sandbox '${SANDBOX_A}' created" <<<"$output2"; then - pass "Sandbox '$SANDBOX_A' recreated" + fail "Port 18789 conflict detected on second onboard" else - fail "Sandbox '$SANDBOX_A' was not recreated" + pass "No port 18789 conflict on second onboard" fi -if openshell gateway info -g nemoclaw 2>/dev/null | grep -q "nemoclaw"; then - pass "Gateway running after second onboard" +if openshell sandbox get "$SANDBOX_A" >/dev/null 2>&1; then + pass "Sandbox '$SANDBOX_A' still exists after recreate" else - fail "Gateway not running after second onboard" + fail "Sandbox '$SANDBOX_A' missing after recreate" fi # ══════════════════════════════════════════════════════════════════ -# Phase 4: Third onboard — DIFFERENT name (e2e-double-b) +# Phase 4: Third onboard — DIFFERENT name # ══════════════════════════════════════════════════════════════════ -section "Phase 4: Third onboard ($SANDBOX_B — different name, stale state)" +section "Phase 4: Third onboard ($SANDBOX_B — different name)" info "Running nemoclaw onboard with new sandbox name..." -ONBOARD_LOG="$(mktemp)" -NEMOCLAW_NON_INTERACTIVE=1 \ - NEMOCLAW_SANDBOX_NAME="$SANDBOX_B" \ - NEMOCLAW_POLICY_MODE=skip \ - nemoclaw onboard --non-interactive >"$ONBOARD_LOG" 2>&1 -exit3=$? -output3="$(cat "$ONBOARD_LOG")" -rm -f "$ONBOARD_LOG" - -if [ $exit3 -eq 1 ]; then - pass "Third onboard exited 1 (step 4 failed as expected)" +run_onboard "$SANDBOX_B" +output3="$RUN_ONBOARD_OUTPUT" +exit3="$RUN_ONBOARD_EXIT" + +if [ "$exit3" -eq 0 ]; then + pass "Third onboard completed successfully" else - fail "Third onboard exited $exit3 (expected 1)" + fail "Third onboard exited $exit3 (expected 0)" fi if grep -q "Reusing existing NemoClaw gateway" <<<"$output3"; then @@ -230,7 +314,7 @@ else fi if grep -q "Port 8080 is not available" <<<"$output3"; then - fail "Port 8080 conflict on third onboard (regression)" + fail "Port 8080 conflict on third onboard" else pass "No port 8080 conflict on third onboard" fi @@ -241,7 +325,7 @@ else pass "No port 18789 conflict on third onboard" fi -if grep -q "Sandbox '${SANDBOX_B}' created" <<<"$output3"; then +if openshell sandbox get "$SANDBOX_B" >/dev/null 2>&1; then pass "Sandbox '$SANDBOX_B' created" else fail "Sandbox '$SANDBOX_B' was not created" @@ -254,9 +338,74 @@ else fi # ══════════════════════════════════════════════════════════════════ -# Phase 5: Final cleanup +# Phase 5: Stale registry reconciliation # ══════════════════════════════════════════════════════════════════ -section "Phase 5: Final cleanup" +section "Phase 5: Stale registry reconciliation" +info "Deleting '$SANDBOX_A' directly in OpenShell to leave a stale NemoClaw registry entry..." + +openshell sandbox delete "$SANDBOX_A" 2>/dev/null || true + +if [ -f "$REGISTRY" ] && grep -q "$SANDBOX_A" "$REGISTRY"; then + pass "Registry still contains stale '$SANDBOX_A' entry" +else + fail "Registry was unexpectedly cleaned before status reconciliation" +fi + +STATUS_LOG="$(mktemp)" +nemoclaw "$SANDBOX_A" status >"$STATUS_LOG" 2>&1 +status_exit=$? +status_output="$(cat "$STATUS_LOG")" +rm -f "$STATUS_LOG" + +if [ "$status_exit" -eq 0 ]; then + pass "Stale sandbox status exited 0" +else + fail "Stale sandbox status exited $status_exit (expected 0)" +fi + +if grep -q "Removed stale local registry entry" <<<"$status_output"; then + pass "Stale registry entry was reconciled during status" +else + fail "Stale registry reconciliation message missing" +fi + +if [ -f "$REGISTRY" ] && grep -q "$SANDBOX_A" "$REGISTRY"; then + fail "Registry still contains '$SANDBOX_A' after status reconciliation" +else + pass "Registry entry for '$SANDBOX_A' removed after status reconciliation" +fi + +# ══════════════════════════════════════════════════════════════════ +# Phase 6: Gateway rebuild lifecycle messaging +# ══════════════════════════════════════════════════════════════════ +section "Phase 6: Gateway rebuild lifecycle messaging" +info "Destroying the NemoClaw gateway to verify current lifecycle guidance..." + +openshell forward stop 18789 2>/dev/null || true +openshell gateway destroy -g nemoclaw 2>/dev/null || true + +GATEWAY_LOG="$(mktemp)" +nemoclaw "$SANDBOX_B" status >"$GATEWAY_LOG" 2>&1 +gateway_status_exit=$? +gateway_status_output="$(cat "$GATEWAY_LOG")" +rm -f "$GATEWAY_LOG" + +if [ "$gateway_status_exit" -eq 0 ]; then + pass "Post-destroy status exited 0" +else + fail "Post-destroy status exited $gateway_status_exit (expected 0)" +fi + +if grep -q "gateway is no longer configured after restart/rebuild" <<<"$gateway_status_output"; then + pass "Gateway rebuild guidance surfaced after destroying gateway" +else + fail "Gateway rebuild guidance missing after destroying gateway" +fi + +# ══════════════════════════════════════════════════════════════════ +# Phase 7: Final cleanup +# ══════════════════════════════════════════════════════════════════ +section "Phase 7: Final cleanup" nemoclaw "$SANDBOX_A" destroy --yes 2>/dev/null || true nemoclaw "$SANDBOX_B" destroy --yes 2>/dev/null || true @@ -285,9 +434,6 @@ fi pass "Final cleanup complete" -# ══════════════════════════════════════════════════════════════════ -# Summary -# ══════════════════════════════════════════════════════════════════ echo "" echo "========================================" echo " Double Onboard E2E Results:" @@ -297,7 +443,7 @@ echo " Total: $TOTAL" echo "========================================" if [ "$FAIL" -eq 0 ]; then - printf '\n\033[1;32m Double onboard PASSED — stale state recovery verified.\033[0m\n' + printf '\n\033[1;32m Double onboard and lifecycle recovery PASSED.\033[0m\n' exit 0 else printf '\n\033[1;31m %d test(s) failed.\033[0m\n' "$FAIL" From ad8ff9aa426743ecf08f8667eec477a91f544c0e Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 20:07:19 -0400 Subject: [PATCH 13/19] test: exercise gateway stop recovery in e2e --- test/e2e/test-double-onboard.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/e2e/test-double-onboard.sh b/test/e2e/test-double-onboard.sh index 8e8b248b00..9ec685c8c5 100755 --- a/test/e2e/test-double-onboard.sh +++ b/test/e2e/test-double-onboard.sh @@ -376,13 +376,13 @@ else fi # ══════════════════════════════════════════════════════════════════ -# Phase 6: Gateway rebuild lifecycle messaging +# Phase 6: Gateway runtime recovery # ══════════════════════════════════════════════════════════════════ -section "Phase 6: Gateway rebuild lifecycle messaging" -info "Destroying the NemoClaw gateway to verify current lifecycle guidance..." +section "Phase 6: Gateway runtime recovery" +info "Stopping the NemoClaw gateway runtime to verify current recovery behavior..." openshell forward stop 18789 2>/dev/null || true -openshell gateway destroy -g nemoclaw 2>/dev/null || true +openshell gateway stop -g nemoclaw 2>/dev/null || true GATEWAY_LOG="$(mktemp)" nemoclaw "$SANDBOX_B" status >"$GATEWAY_LOG" 2>&1 @@ -393,13 +393,13 @@ rm -f "$GATEWAY_LOG" if [ "$gateway_status_exit" -eq 0 ]; then pass "Post-destroy status exited 0" else - fail "Post-destroy status exited $gateway_status_exit (expected 0)" + fail "Post-stop status exited $gateway_status_exit (expected 0)" fi -if grep -q "gateway is no longer configured after restart/rebuild" <<<"$gateway_status_output"; then - pass "Gateway rebuild guidance surfaced after destroying gateway" +if grep -q "Recovered NemoClaw gateway runtime" <<<"$gateway_status_output"; then + pass "Gateway runtime recovered during status after stop" else - fail "Gateway rebuild guidance missing after destroying gateway" + fail "Gateway runtime recovery message missing after gateway stop" fi # ══════════════════════════════════════════════════════════════════ From c8e7ebb466cc52744e8ea45df54e1945c75803a0 Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 20:25:24 -0400 Subject: [PATCH 14/19] fix: harden gateway lifecycle ansi matching and path isolation test --- bin/nemoclaw.js | 4 +- test/cli.test.js | 116 ++++++++++++++++++++++++++++++++++ test/e2e-gateway-isolation.sh | 6 +- 3 files changed, 122 insertions(+), 4 deletions(-) diff --git a/bin/nemoclaw.js b/bin/nemoclaw.js index 128ab6fade..f173f852f5 100755 --- a/bin/nemoclaw.js +++ b/bin/nemoclaw.js @@ -190,12 +190,12 @@ function printGatewayLifecycleHint(output = "", sandboxName = "", writer = conso writer(" Try re-establishing the NemoClaw gateway/runtime first. If the sandbox is still unreachable, recreate just that sandbox with `nemoclaw onboard`."); return; } - if (/Connection refused|transport error/i.test(output)) { + if (/Connection refused|transport error/i.test(cleanOutput)) { writer(` The sandbox '${sandboxName}' may still exist, but the current gateway/runtime is not reachable.`); writer(" Check `openshell status`, verify the active gateway, and retry."); return; } - if (/Missing gateway auth token|device identity required/i.test(output)) { + if (/Missing gateway auth token|device identity required/i.test(cleanOutput)) { writer(" The gateway is reachable, but the current auth or device identity state is not usable."); writer(" Verify the active gateway and retry after re-establishing the runtime."); } diff --git a/test/cli.test.js b/test/cli.test.js index cf13c3ab4a..7cfb06e0dd 100644 --- a/test/cli.test.js +++ b/test/cli.test.js @@ -368,6 +368,122 @@ describe("CLI dispatch", () => { expect(r.out.includes("verify the active gateway")).toBeTruthy(); }, 25000); + it("matches ANSI-decorated gateway transport errors when printing lifecycle hints", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-ansi-transport-hint-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " printf '\\033[31mError: trans\\033[0mport error: Connec\\033[33mtion refused\\033[0m\\n' >&2", + " exit 1", + "fi", + "if [ \"$1\" = \"status\" ]; then", + " echo 'Server Status'", + " echo", + " echo ' Gateway: openshell'", + " echo ' Status: Disconnected'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ \"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " printf 'Gateway Info\\n\\n Gateway: openshell\\n'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"select\" ] && [ \"$3\" = \"nemoclaw\" ]; then", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }, 25000); + + expect(r.code).toBe(0); + expect(r.out.includes("current gateway/runtime is not reachable")).toBeTruthy(); + }, 25000); + + it("matches ANSI-decorated gateway auth errors when printing lifecycle hints", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-ansi-auth-hint-")); + const localBin = path.join(home, "bin"); + const registryDir = path.join(home, ".nemoclaw"); + fs.mkdirSync(localBin, { recursive: true }); + fs.mkdirSync(registryDir, { recursive: true }); + fs.writeFileSync( + path.join(registryDir, "sandboxes.json"), + JSON.stringify({ + sandboxes: { + alpha: { + name: "alpha", + model: "test-model", + provider: "nvidia-prod", + gpuEnabled: false, + policies: [], + }, + }, + defaultSandbox: "alpha", + }), + { mode: 0o600 } + ); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + "if [ \"$1\" = \"sandbox\" ] && [ \"$2\" = \"get\" ] && [ \"$3\" = \"alpha\" ]; then", + " printf '\\033[31mMissing gateway auth\\033[0m token\\n' >&2", + " exit 1", + "fi", + "if [ \"$1\" = \"status\" ]; then", + " echo 'Server Status'", + " echo", + " echo ' Gateway: openshell'", + " echo ' Status: Disconnected'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"info\" ] && [ \"$3\" = \"-g\" ] && [ \"$4\" = \"nemoclaw\" ]; then", + " printf 'Gateway Info\\n\\n Gateway: openshell\\n'", + " exit 0", + "fi", + "if [ \"$1\" = \"gateway\" ] && [ \"$2\" = \"select\" ] && [ \"$3\" = \"nemoclaw\" ]; then", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 } + ); + + const r = runWithEnv("alpha status", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }, 25000); + + expect(r.code).toBe(0); + expect(r.out.includes("Verify the active gateway and retry after re-establishing the runtime.")).toBeTruthy(); + }, 25000); + it("explains unrecoverable gateway trust rotation after restart", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-identity-drift-")); const localBin = path.join(home, "bin"); diff --git a/test/e2e-gateway-isolation.sh b/test/e2e-gateway-isolation.sh index afaf90aad1..101b97130b 100755 --- a/test/e2e-gateway-isolation.sh +++ b/test/e2e-gateway-isolation.sh @@ -125,8 +125,10 @@ fi # ── Test 7: Entrypoint PATH is locked to system dirs ───────────── info "7. Entrypoint locks PATH to system directories" -# Run the entrypoint preamble (up to the PATH export) and verify the result -OUT=$(run_as_root "bash -c 'source <(head -21 /usr/local/bin/nemoclaw-start) 2>/dev/null; echo \$PATH'") +# Run the entrypoint preamble up to the PATH export and verify the result. +# Match by content, not by line count, so later header/comment edits do not +# break the test contract. +OUT=$(run_as_root "bash -c 'source <(sed -n \"1,/^export PATH=/p\" /usr/local/bin/nemoclaw-start) 2>/dev/null; echo \$PATH'") if echo "$OUT" | grep -q "^/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin$"; then pass "PATH is locked to system directories" else From 385fe6664988b6adcd09174977638004030e8b93 Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 20:26:58 -0400 Subject: [PATCH 15/19] fix: remove recommended tag from NVIDIA Endpoints onboarding option --- bin/lib/onboard.js | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bin/lib/onboard.js b/bin/lib/onboard.js index 7d17685c45..7b2a5a2b65 100644 --- a/bin/lib/onboard.js +++ b/bin/lib/onboard.js @@ -1600,9 +1600,7 @@ async function setupNim(gpu) { const options = []; options.push({ key: "build", - label: - "NVIDIA Endpoints" + - (!ollamaRunning && !(EXPERIMENTAL && vllmRunning) ? " (recommended)" : ""), + label: "NVIDIA Endpoints", }); options.push({ key: "openai", label: "OpenAI" }); options.push({ key: "custom", label: "Other OpenAI-compatible endpoint" }); From 6f370b72c3f2fefc4fa6d62a9773a9b6559cd90d Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 20:34:57 -0400 Subject: [PATCH 16/19] test: accept explicit gateway lifecycle outcomes --- test/e2e/test-double-onboard.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/test/e2e/test-double-onboard.sh b/test/e2e/test-double-onboard.sh index 9ec685c8c5..6f7ede5a96 100755 --- a/test/e2e/test-double-onboard.sh +++ b/test/e2e/test-double-onboard.sh @@ -376,10 +376,10 @@ else fi # ══════════════════════════════════════════════════════════════════ -# Phase 6: Gateway runtime recovery +# Phase 6: Gateway lifecycle response # ══════════════════════════════════════════════════════════════════ -section "Phase 6: Gateway runtime recovery" -info "Stopping the NemoClaw gateway runtime to verify current recovery behavior..." +section "Phase 6: Gateway lifecycle response" +info "Stopping the NemoClaw gateway runtime to verify current lifecycle behavior..." openshell forward stop 18789 2>/dev/null || true openshell gateway stop -g nemoclaw 2>/dev/null || true @@ -391,15 +391,19 @@ gateway_status_output="$(cat "$GATEWAY_LOG")" rm -f "$GATEWAY_LOG" if [ "$gateway_status_exit" -eq 0 ]; then - pass "Post-destroy status exited 0" + pass "Post-stop status exited 0" else fail "Post-stop status exited $gateway_status_exit (expected 0)" fi -if grep -q "Recovered NemoClaw gateway runtime" <<<"$gateway_status_output"; then - pass "Gateway runtime recovered during status after stop" +if grep -qE \ + "Recovered NemoClaw gateway runtime|Removed stale local registry entry|gateway is no longer configured after restart/rebuild|gateway is still refusing connections after restart|gateway trust material rotated after restart" \ + <<<"$gateway_status_output"; then + pass "Gateway lifecycle response was explicit after gateway stop" else - fail "Gateway runtime recovery message missing after gateway stop" + fail "Gateway lifecycle response was not explicit after gateway stop" + info "Observed status output:" + printf '%s\n' "$gateway_status_output" | sed 's/^/ /' fi # ══════════════════════════════════════════════════════════════════ From 489ffc5d6b05348113802e453be8dd2a783a86da Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 20:39:51 -0400 Subject: [PATCH 17/19] fix: strip local python artifacts from sandbox build context --- bin/lib/onboard.js | 2 ++ scripts/setup.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/bin/lib/onboard.js b/bin/lib/onboard.js index 30fa7fb70a..472324d741 100644 --- a/bin/lib/onboard.js +++ b/bin/lib/onboard.js @@ -1484,6 +1484,8 @@ async function createSandbox(gpu, model, provider, preferredInferenceApi = null) run(`cp -r "${path.join(ROOT, "nemoclaw-blueprint")}" "${buildCtx}/nemoclaw-blueprint"`); run(`cp -r "${path.join(ROOT, "scripts")}" "${buildCtx}/scripts"`); run(`rm -rf "${buildCtx}/nemoclaw/node_modules"`, { ignoreError: true }); + run(`rm -rf "${buildCtx}/nemoclaw-blueprint/.venv" "${buildCtx}/nemoclaw-blueprint/.pytest_cache"`, { ignoreError: true }); + run(`find "${buildCtx}/nemoclaw-blueprint" -type d -name __pycache__ -prune -exec rm -rf {} +`, { ignoreError: true }); // Create sandbox (use -- echo to avoid dropping into interactive shell) // Pass the base policy so sandbox starts in proxy mode (required for policy updates later) diff --git a/scripts/setup.sh b/scripts/setup.sh index 99cd40f2ff..1e4f281210 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -201,6 +201,8 @@ cp -r "$REPO_DIR/nemoclaw" "$BUILD_CTX/nemoclaw" cp -r "$REPO_DIR/nemoclaw-blueprint" "$BUILD_CTX/nemoclaw-blueprint" cp -r "$REPO_DIR/scripts" "$BUILD_CTX/scripts" rm -rf "$BUILD_CTX/nemoclaw/node_modules" +rm -rf "$BUILD_CTX/nemoclaw-blueprint/.venv" "$BUILD_CTX/nemoclaw-blueprint/.pytest_cache" +find "$BUILD_CTX/nemoclaw-blueprint" -type d -name __pycache__ -prune -exec rm -rf {} + 2>/dev/null || true # Capture full output to a temp file so we can filter for display but still # detect failures. The raw log is kept on failure for debugging. From fa5e7d99d3ca90ec0a6b4496110578b6af593fe8 Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 20:45:32 -0400 Subject: [PATCH 18/19] test: align gateway cleanup assertions with merged recovery flow --- test/gateway-cleanup.test.js | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/test/gateway-cleanup.test.js b/test/gateway-cleanup.test.js index 5043a23737..799680048a 100644 --- a/test/gateway-cleanup.test.js +++ b/test/gateway-cleanup.test.js @@ -23,16 +23,17 @@ describe("gateway cleanup: Docker volumes removed on failure (#17)", () => { it("onboard.js: volume cleanup runs on gateway start failure", () => { const content = fs.readFileSync(path.join(ROOT, "bin/lib/onboard.js"), "utf-8"); - // The startGateway function should call destroyGateway after a failed start - const startGwBlock = content.match(/async function startGateway[\s\S]*?^}/m); + const startGwBlock = content.match(/async function startGatewayWithOptions[\s\S]*?^}/m); expect(startGwBlock).toBeTruthy(); - // Count calls to destroyGateway — should be at least 3: - // 1. pre-cleanup before start - // 2. after start failure - // 3. after health check failure - const calls = (startGwBlock[0].match(/destroyGateway\(\)/g) || []).length; - expect(calls).toBeGreaterThanOrEqual(3); + // Current behavior: + // 1. stale gateway metadata is destroyed directly before start, if present + // 2. destroyGateway() runs after start failure + // 3. destroyGateway() runs after health check failure + expect(startGwBlock[0].includes('if (hasStaleGateway(gwInfo))')).toBe(true); + expect(startGwBlock[0].includes('runOpenshell(["gateway", "destroy", "-g", GATEWAY_NAME]')).toBe(true); + const destroyCalls = (startGwBlock[0].match(/destroyGateway\(\)/g) || []).length; + expect(destroyCalls).toBeGreaterThanOrEqual(2); }); it("uninstall.sh: includes Docker volume cleanup", () => { From 51bbca0cc2430f9b6c9968fa90b5b1fd58c59618 Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 25 Mar 2026 20:58:55 -0400 Subject: [PATCH 19/19] test: run double onboard e2e against repo cli --- test/e2e/test-double-onboard.sh | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/test/e2e/test-double-onboard.sh b/test/e2e/test-double-onboard.sh index 6f7ede5a96..6b96462f23 100755 --- a/test/e2e/test-double-onboard.sh +++ b/test/e2e/test-double-onboard.sh @@ -44,12 +44,20 @@ info() { printf '\033[1;34m [info]\033[0m %s\n' "$1"; } SANDBOX_A="e2e-double-a" SANDBOX_B="e2e-double-b" REGISTRY="$HOME/.nemoclaw/sandboxes.json" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" FAKE_HOST="127.0.0.1" FAKE_PORT="${NEMOCLAW_FAKE_PORT:-18080}" FAKE_BASE_URL="http://${FAKE_HOST}:${FAKE_PORT}/v1" FAKE_LOG="$(mktemp)" FAKE_PID="" +if command -v node >/dev/null 2>&1 && [ -f "$REPO_ROOT/bin/nemoclaw.js" ]; then + NEMOCLAW_CMD=(node "$REPO_ROOT/bin/nemoclaw.js") +else + NEMOCLAW_CMD=(nemoclaw) +fi + # shellcheck disable=SC2329 cleanup() { if [ -n "$FAKE_PID" ] && kill -0 "$FAKE_PID" 2>/dev/null; then @@ -148,20 +156,24 @@ run_onboard() { env_args+=("NEMOCLAW_RECREATE_SANDBOX=1") fi - env "${env_args[@]}" nemoclaw onboard --non-interactive >"$log_file" 2>&1 + env "${env_args[@]}" "${NEMOCLAW_CMD[@]}" onboard --non-interactive >"$log_file" 2>&1 RUN_ONBOARD_EXIT=$? RUN_ONBOARD_OUTPUT="$(cat "$log_file")" rm -f "$log_file" } +run_nemoclaw() { + "${NEMOCLAW_CMD[@]}" "$@" +} + # ══════════════════════════════════════════════════════════════════ # Phase 0: Pre-cleanup # ══════════════════════════════════════════════════════════════════ section "Phase 0: Pre-cleanup" info "Destroying any leftover test sandboxes/gateway from previous runs..." -if command -v nemoclaw >/dev/null 2>&1; then - nemoclaw "$SANDBOX_A" destroy --yes 2>/dev/null || true - nemoclaw "$SANDBOX_B" destroy --yes 2>/dev/null || true +if [ -x "$REPO_ROOT/bin/nemoclaw.js" ] || command -v nemoclaw >/dev/null 2>&1; then + run_nemoclaw "$SANDBOX_A" destroy --yes 2>/dev/null || true + run_nemoclaw "$SANDBOX_B" destroy --yes 2>/dev/null || true fi openshell sandbox delete "$SANDBOX_A" 2>/dev/null || true openshell sandbox delete "$SANDBOX_B" 2>/dev/null || true @@ -188,8 +200,8 @@ else exit 1 fi -if command -v nemoclaw >/dev/null 2>&1; then - pass "nemoclaw CLI installed" +if [ -x "$REPO_ROOT/bin/nemoclaw.js" ] || command -v nemoclaw >/dev/null 2>&1; then + pass "nemoclaw CLI available" else fail "nemoclaw CLI not found — cannot continue" exit 1 @@ -352,7 +364,7 @@ else fi STATUS_LOG="$(mktemp)" -nemoclaw "$SANDBOX_A" status >"$STATUS_LOG" 2>&1 +run_nemoclaw "$SANDBOX_A" status >"$STATUS_LOG" 2>&1 status_exit=$? status_output="$(cat "$STATUS_LOG")" rm -f "$STATUS_LOG" @@ -385,7 +397,7 @@ openshell forward stop 18789 2>/dev/null || true openshell gateway stop -g nemoclaw 2>/dev/null || true GATEWAY_LOG="$(mktemp)" -nemoclaw "$SANDBOX_B" status >"$GATEWAY_LOG" 2>&1 +run_nemoclaw "$SANDBOX_B" status >"$GATEWAY_LOG" 2>&1 gateway_status_exit=$? gateway_status_output="$(cat "$GATEWAY_LOG")" rm -f "$GATEWAY_LOG" @@ -411,8 +423,8 @@ fi # ══════════════════════════════════════════════════════════════════ section "Phase 7: Final cleanup" -nemoclaw "$SANDBOX_A" destroy --yes 2>/dev/null || true -nemoclaw "$SANDBOX_B" destroy --yes 2>/dev/null || true +run_nemoclaw "$SANDBOX_A" destroy --yes 2>/dev/null || true +run_nemoclaw "$SANDBOX_B" destroy --yes 2>/dev/null || true openshell sandbox delete "$SANDBOX_A" 2>/dev/null || true openshell sandbox delete "$SANDBOX_B" 2>/dev/null || true openshell forward stop 18789 2>/dev/null || true