diff --git a/bin/lib/onboard.js b/bin/lib/onboard.js
index be320ad8fa..650f572bd2 100644
--- a/bin/lib/onboard.js
+++ b/bin/lib/onboard.js
@@ -12,6 +12,7 @@ const nim = require("./nim");
 const policies = require("./policies");
 const HOST_GATEWAY_URL = "http://host.openshell.internal";
 const EXPERIMENTAL = process.env.NEMOCLAW_EXPERIMENTAL === "1";
+const DEFAULT_GATEWAY_NAME = "nemoclaw";
 
 // ── Helpers ──────────────────────────────────────────────────────
 
@@ -21,6 +22,16 @@ function step(n, total, msg) {
   console.log(` ${"─".repeat(50)}`);
 }
 
+/**
+ * Wrap a value in single quotes so a POSIX shell treats it as one literal word.
+ * Each embedded single quote is emitted as '\'' (close quote, escaped quote, reopen).
+ * @param {*} value - Value to quote; coerced with String().
+ * @returns {string} The single-quoted shell word.
+ */
+function shellQuote(value) {
+  return `'${String(value).replace(/'/g, `'\\''`)}'`;
+}
+
 function isDockerRunning() {
   try {
     runCapture("docker info", { ignoreError: false });
@@ -82,22 +93,103 @@ async function preflight() {
   return gpu;
 }
 
+/**
+ * List the Docker volume names that may hold state for a gateway.
+ * @param {string} [gatewayName] - Gateway name used to derive the volume names.
+ * @returns {string[]} Candidate Docker volume names.
+ */
+function gatewayVolumeCandidates(gatewayName = DEFAULT_GATEWAY_NAME) {
+  return [`openshell-cluster-${gatewayName}`];
+}
+
+/**
+ * Remove each candidate gateway volume that Docker reports as existing.
+ * Volumes for which `docker volume inspect` exits non-zero are skipped.
+ * @param {Function} [runFn] - Runs a shell command and returns an object with `status`.
+ * @param {string} [gatewayName] - Gateway whose volumes should be removed.
+ * @returns {{removedVolumes: string[], failedVolumes: string[]}} Names removed and names that failed.
+ */
+function cleanupGatewayVolumes(runFn = run, gatewayName = DEFAULT_GATEWAY_NAME) {
+  const removedVolumes = [];
+  const failedVolumes = [];
+
+  for (const volumeName of gatewayVolumeCandidates(gatewayName)) {
+    const inspectResult = runFn(`docker volume inspect ${shellQuote(volumeName)} >/dev/null 2>&1`, {
+      ignoreError: true,
+      stdio: "ignore",
+    });
+    if (inspectResult.status !== 0) continue;
+
+    const removeResult = runFn(`docker volume rm -f ${shellQuote(volumeName)} >/dev/null 2>&1`, {
+      ignoreError: true,
+      stdio: "ignore",
+    });
+    if (removeResult.status === 0) removedVolumes.push(volumeName);
+    else failedVolumes.push(volumeName);
+  }
+
+  return { removedVolumes, failedVolumes };
+}
+
+/**
+ * Build the command a user can run by hand to remove the given volumes.
+ * @param {string[]} volumeNames - Docker volume names to remove.
+ * @returns {string} A `docker volume rm -f` command with each name shell-quoted.
+ */
+function manualGatewayVolumeCleanupCommand(volumeNames) {
+  return `docker volume rm -f ${volumeNames.map(shellQuote).join(" ")}`;
+}
+
+/**
+ * Print the outcome of a gateway cleanup to stderr.
+ * @param {{failedVolumes: string[]}} cleanupResult - Result of cleanupGatewayVolumes().
+ */
+function reportGatewayCleanupResult(cleanupResult) {
+  if (cleanupResult.failedVolumes.length > 0) {
+    console.error(" Automatic cleanup could not remove all stale Docker volumes.");
+    console.error(` Run: ${manualGatewayVolumeCleanupCommand(cleanupResult.failedVolumes)}`);
+    return;
+  }
+
+  console.error(" Stale state removed. Please rerun the installer.");
+}
+
+/**
+ * Fully tear down the gateway including Docker volumes that can cause
+ * "Corrupted cluster state" on subsequent runs.
+ * @param {Function} [runFn] - Runs a shell command and returns an object with `status`.
+ * @param {string} [gatewayName] - Gateway to destroy.
+ * @returns {{removedVolumes: string[], failedVolumes: string[]}} Result of the volume cleanup.
+ */
+function destroyGateway(runFn = run, gatewayName = DEFAULT_GATEWAY_NAME) {
+  // Quote the name, as the volume commands do, so it always reaches the shell as one word.
+  runFn(`openshell gateway destroy -g ${shellQuote(gatewayName)} 2>/dev/null || true`, { ignoreError: true });
+  return cleanupGatewayVolumes(runFn, gatewayName);
+}
+
 // ── Step 2: Gateway ──────────────────────────────────────────────
 
 async function startGateway(gpu) {
   step(2, 7, "Starting OpenShell gateway");
 
-  // Destroy old gateway
-  run("openshell gateway destroy -g nemoclaw 2>/dev/null || true", { ignoreError: true });
+  // Destroy old gateway and clean up any leftover Docker state from previous failures
+  destroyGateway();
 
-  const gwArgs = ["--name", "nemoclaw"];
+  const gwArgs = ["--name", DEFAULT_GATEWAY_NAME];
   // Do NOT pass --gpu here. On DGX Spark (and most GPU hosts), inference is
   // routed through a host-side provider (Ollama, vLLM, or cloud API) — the
   // sandbox itself does not need direct GPU access. Passing --gpu causes
   // FailedPrecondition errors when the gateway's k3s device plugin cannot
   // allocate GPUs. See: https://build.nvidia.com/spark/nemoclaw/instructions
 
-  run(`openshell gateway start ${gwArgs.join(" ")}`, { ignoreError: false });
+  const startResult = run(`openshell gateway start ${gwArgs.join(" ")}`, { ignoreError: true });
+  if (startResult.status !== 0) {
+    console.error(" Gateway failed to start. Cleaning up stale state...");
+    const cleanupResult = destroyGateway();
+    reportGatewayCleanupResult(cleanupResult);
+    console.error(" If the error persists, run: openshell gateway info");
+    process.exit(1);
+  }
 
   // Verify health
   for (let i = 0; i < 5; i++) {
@@ -107,7 +199,10 @@ async function startGateway(gpu) {
       break;
     }
     if (i === 4) {
-      console.error(" Gateway failed to start. Run: openshell gateway info");
+      console.error(" Gateway health check failed. Cleaning up...");
+      const cleanupResult = destroyGateway();
+      reportGatewayCleanupResult(cleanupResult);
+      console.error(" If the error persists, run: openshell gateway info");
       process.exit(1);
     }
     require("child_process").spawnSync("sleep", ["2"]);
@@ -501,4 +596,11 @@ async function onboard() {
   printDashboard(sandboxName, model, provider);
 }
 
-module.exports = { onboard };
+module.exports = {
+  cleanupGatewayVolumes,
+  destroyGateway,
+  gatewayVolumeCandidates,
+  manualGatewayVolumeCleanupCommand,
+  onboard,
+  reportGatewayCleanupResult,
+};
diff --git a/test/onboard.test.js b/test/onboard.test.js
new file mode 100644
index 0000000000..57605ab074
--- /dev/null
+++ b/test/onboard.test.js
@@ -0,0 +1,66 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+const { describe, it } = require("node:test");
+const assert = require("node:assert/strict");
+const {
+  cleanupGatewayVolumes,
+  destroyGateway,
+  gatewayVolumeCandidates,
+  manualGatewayVolumeCleanupCommand,
+} = require("../bin/lib/onboard");
+
+describe("gateway cleanup helpers", () => {
+  it("uses the known OpenShell volume name for the default gateway", () => {
+    assert.deepEqual(gatewayVolumeCandidates(), ["openshell-cluster-nemoclaw"]);
+  });
+
+  it("removes known gateway volumes when they exist", () => {
+    const commands = [];
+    const runFn = (cmd) => {
+      commands.push(cmd);
+      if (cmd.includes("docker volume inspect")) return { status: 0 };
+      if (cmd.includes("docker volume rm -f")) return { status: 0 };
+      return { status: 1 };
+    };
+
+    const result = cleanupGatewayVolumes(runFn);
+
+    assert.deepEqual(result, {
+      removedVolumes: ["openshell-cluster-nemoclaw"],
+      failedVolumes: [],
+    });
+    assert.equal(commands.length, 2);
+  });
+
+  it("returns the exact manual recovery command when automatic cleanup fails", () => {
+    const runFn = (cmd) => {
+      if (cmd.includes("docker volume inspect")) return { status: 0 };
+      if (cmd.includes("docker volume rm -f")) return { status: 1 };
+      return { status: 1 };
+    };
+
+    const result = cleanupGatewayVolumes(runFn);
+
+    assert.deepEqual(result, {
+      removedVolumes: [],
+      failedVolumes: ["openshell-cluster-nemoclaw"],
+    });
+    assert.equal(
+      manualGatewayVolumeCleanupCommand(result.failedVolumes),
+      "docker volume rm -f 'openshell-cluster-nemoclaw'"
+    );
+  });
+
+  it("shell-quotes the gateway name in the destroy command", () => {
+    const commands = [];
+    const runFn = (cmd) => {
+      commands.push(cmd);
+      return { status: 1 };
+    };
+
+    destroyGateway(runFn, "my gateway");
+
+    assert.equal(commands[0], "openshell gateway destroy -g 'my gateway' 2>/dev/null || true");
+  });
+});