From 228baa9314a4331fee2df004b5a58d9bcac3bda1 Mon Sep 17 00:00:00 2001 From: peteryuqin Date: Mon, 16 Mar 2026 21:50:14 -0400 Subject: [PATCH 1/2] fix: clean up Docker volumes after failed gateway start MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After a failed gateway start, leftover Docker volumes from openshell-cluster-nemoclaw cause "Corrupted cluster state" errors on subsequent runs, requiring manual `docker volume rm` to recover. Extract destroyGateway() that runs both `openshell gateway destroy` and removes orphaned Docker volumes. Call it: 1. Before starting the gateway (existing pre-cleanup) 2. After a failed gateway start (new — ensures clean retry) 3. After a failed health check (new — same cleanup path) The error messages now tell the user to simply rerun the installer instead of requiring manual Docker volume management. Fixes #17 Co-Authored-By: Claude Opus 4.6 Signed-off-by: peteryuqin --- bin/lib/onboard.js | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/bin/lib/onboard.js b/bin/lib/onboard.js index be320ad8fa..998a5649dd 100644 --- a/bin/lib/onboard.js +++ b/bin/lib/onboard.js @@ -82,13 +82,23 @@ async function preflight() { return gpu; } +/** + * Fully tear down the gateway including Docker volumes that can cause + * "Corrupted cluster state" on subsequent runs. + */ +function destroyGateway() { + run("openshell gateway destroy -g nemoclaw 2>/dev/null || true", { ignoreError: true }); + // Remove leftover Docker volumes that openshell gateway destroy may miss + run('docker volume ls -q --filter "name=openshell-cluster-nemoclaw" | xargs -r docker volume rm 2>/dev/null || true', { ignoreError: true }); +} + // ── Step 2: Gateway ────────────────────────────────────────────── async function startGateway(gpu) { step(2, 7, "Starting OpenShell gateway"); - // Destroy old gateway - run("openshell gateway destroy -g nemoclaw 2>/dev/null || true", { ignoreError: true }); + // Destroy old gateway and clean up any leftover Docker state from previous failures + destroyGateway(); const gwArgs = ["--name", "nemoclaw"]; // Do NOT pass --gpu here. On DGX Spark (and most GPU hosts), inference is @@ -97,7 +107,14 @@ async function startGateway(gpu) { // FailedPrecondition errors when the gateway's k3s device plugin cannot // allocate GPUs. See: https://build.nvidia.com/spark/nemoclaw/instructions - run(`openshell gateway start ${gwArgs.join(" ")}`, { ignoreError: false }); + const startResult = run(`openshell gateway start ${gwArgs.join(" ")}`, { ignoreError: true }); + if (startResult.status !== 0) { + console.error(" Gateway failed to start. Cleaning up stale state..."); + destroyGateway(); + console.error(" Stale state removed. Please rerun the installer."); + console.error(" If the error persists, run: openshell gateway info"); + process.exit(1); + } // Verify health for (let i = 0; i < 5; i++) { @@ -107,7 +124,9 @@ async function startGateway(gpu) { break; } if (i === 4) { - console.error(" Gateway failed to start. Run: openshell gateway info"); + console.error(" Gateway health check failed. Cleaning up..."); + destroyGateway(); + console.error(" Run the installer again. If the error persists: openshell gateway info"); process.exit(1); } require("child_process").spawnSync("sleep", ["2"]); From 4b123ef2ddf5499821d45d6bca51ba29b7825113 Mon Sep 17 00:00:00 2001 From: Kevin Jones Date: Wed, 18 Mar 2026 10:36:20 -0400 Subject: [PATCH 2/2] fix gateway cleanup fallback and tests --- bin/lib/onboard.js | 72 ++++++++++++++++++++++++++++++++++++++------ test/onboard.test.js | 53 ++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 10 deletions(-) create mode 100644 test/onboard.test.js diff --git a/bin/lib/onboard.js b/bin/lib/onboard.js index 998a5649dd..650f572bd2 100644 --- a/bin/lib/onboard.js +++ b/bin/lib/onboard.js @@ -12,6 +12,7 @@ const nim = require("./nim"); const policies = require("./policies"); const HOST_GATEWAY_URL = "http://host.openshell.internal"; const EXPERIMENTAL = process.env.NEMOCLAW_EXPERIMENTAL === "1"; +const DEFAULT_GATEWAY_NAME = "nemoclaw"; // ── Helpers ────────────────────────────────────────────────────── @@ -21,6 +22,10 @@ function step(n, total, msg) { console.log(` ${"─".repeat(50)}`); } +function shellQuote(value) { + return `'${String(value).replace(/'/g, `'\\''`)}'`; +} + function isDockerRunning() { try { runCapture("docker info", { ignoreError: false }); @@ -86,10 +91,49 @@ async function preflight() { * Fully tear down the gateway including Docker volumes that can cause * "Corrupted cluster state" on subsequent runs. */ -function destroyGateway() { - run("openshell gateway destroy -g nemoclaw 2>/dev/null || true", { ignoreError: true }); - // Remove leftover Docker volumes that openshell gateway destroy may miss - run('docker volume ls -q --filter "name=openshell-cluster-nemoclaw" | xargs -r docker volume rm 2>/dev/null || true', { ignoreError: true }); +function gatewayVolumeCandidates(gatewayName = DEFAULT_GATEWAY_NAME) { + return [`openshell-cluster-${gatewayName}`]; +} + +function cleanupGatewayVolumes(runFn = run, gatewayName = DEFAULT_GATEWAY_NAME) { + const removedVolumes = []; + const failedVolumes = []; + + for (const volumeName of gatewayVolumeCandidates(gatewayName)) { + const inspectResult = runFn(`docker volume inspect ${shellQuote(volumeName)} >/dev/null 2>&1`, { + ignoreError: true, + stdio: "ignore", + }); + if (inspectResult.status !== 0) continue; + + const removeResult = runFn(`docker volume rm -f ${shellQuote(volumeName)} >/dev/null 2>&1`, { + ignoreError: true, + stdio: "ignore", + }); + if (removeResult.status === 0) removedVolumes.push(volumeName); + else failedVolumes.push(volumeName); + } + + return { removedVolumes, failedVolumes }; +} + +function manualGatewayVolumeCleanupCommand(volumeNames) { + return `docker volume rm -f ${volumeNames.map(shellQuote).join(" ")}`; +} + +function reportGatewayCleanupResult(cleanupResult) { + if (cleanupResult.failedVolumes.length > 0) { + console.error(" Automatic cleanup could not remove all stale Docker volumes."); + console.error(` Run: ${manualGatewayVolumeCleanupCommand(cleanupResult.failedVolumes)}`); + return; + } + + console.error(" Stale state removed. Please rerun the installer."); +} + +function destroyGateway(runFn = run, gatewayName = DEFAULT_GATEWAY_NAME) { + runFn(`openshell gateway destroy -g ${gatewayName} 2>/dev/null || true`, { ignoreError: true }); + return cleanupGatewayVolumes(runFn, gatewayName); } // ── Step 2: Gateway ────────────────────────────────────────────── @@ -100,7 +144,7 @@ async function startGateway(gpu) { // Destroy old gateway and clean up any leftover Docker state from previous failures destroyGateway(); - const gwArgs = ["--name", "nemoclaw"]; + const gwArgs = ["--name", DEFAULT_GATEWAY_NAME]; // Do NOT pass --gpu here. On DGX Spark (and most GPU hosts), inference is // routed through a host-side provider (Ollama, vLLM, or cloud API) — the // sandbox itself does not need direct GPU access. Passing --gpu causes @@ -110,8 +154,8 @@ async function startGateway(gpu) { const startResult = run(`openshell gateway start ${gwArgs.join(" ")}`, { ignoreError: true }); if (startResult.status !== 0) { console.error(" Gateway failed to start. Cleaning up stale state..."); - destroyGateway(); - console.error(" Stale state removed. Please rerun the installer."); + const cleanupResult = destroyGateway(); + reportGatewayCleanupResult(cleanupResult); console.error(" If the error persists, run: openshell gateway info"); process.exit(1); } @@ -125,8 +169,9 @@ async function startGateway(gpu) { } if (i === 4) { console.error(" Gateway health check failed. Cleaning up..."); - destroyGateway(); - console.error(" Run the installer again. If the error persists: openshell gateway info"); + const cleanupResult = destroyGateway(); + reportGatewayCleanupResult(cleanupResult); + console.error(" If the error persists, run: openshell gateway info"); process.exit(1); } require("child_process").spawnSync("sleep", ["2"]); @@ -520,4 +565,11 @@ async function onboard() { printDashboard(sandboxName, model, provider); } -module.exports = { onboard }; +module.exports = { + cleanupGatewayVolumes, + destroyGateway, + gatewayVolumeCandidates, + manualGatewayVolumeCleanupCommand, + onboard, + reportGatewayCleanupResult, +}; diff --git a/test/onboard.test.js b/test/onboard.test.js new file mode 100644 index 0000000000..57605ab074 --- /dev/null +++ b/test/onboard.test.js @@ -0,0 +1,53 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +const { describe, it } = require("node:test"); +const assert = require("node:assert/strict"); +const { + cleanupGatewayVolumes, + gatewayVolumeCandidates, + manualGatewayVolumeCleanupCommand, +} = require("../bin/lib/onboard"); + +describe("gateway cleanup helpers", () => { + it("uses the known OpenShell volume name for the default gateway", () => { + assert.deepEqual(gatewayVolumeCandidates(), ["openshell-cluster-nemoclaw"]); + }); + + it("removes known gateway volumes when they exist", () => { + const commands = []; + const runFn = (cmd) => { + commands.push(cmd); + if (cmd.includes("docker volume inspect")) return { status: 0 }; + if (cmd.includes("docker volume rm -f")) return { status: 0 }; + return { status: 1 }; + }; + + const result = cleanupGatewayVolumes(runFn); + + assert.deepEqual(result, { + removedVolumes: ["openshell-cluster-nemoclaw"], + failedVolumes: [], + }); + assert.equal(commands.length, 2); + }); + + it("returns the exact manual recovery command when automatic cleanup fails", () => { + const runFn = (cmd) => { + if (cmd.includes("docker volume inspect")) return { status: 0 }; + if (cmd.includes("docker volume rm -f")) return { status: 1 }; + return { status: 1 }; + }; + + const result = cleanupGatewayVolumes(runFn); + + assert.deepEqual(result, { + removedVolumes: [], + failedVolumes: ["openshell-cluster-nemoclaw"], + }); + assert.equal( + manualGatewayVolumeCleanupCommand(result.failedVolumes), + "docker volume rm -f 'openshell-cluster-nemoclaw'" + ); + }); +});