Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 77 additions & 6 deletions bin/lib/onboard.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ const nim = require("./nim");
const policies = require("./policies");
const HOST_GATEWAY_URL = "http://host.openshell.internal";
const EXPERIMENTAL = process.env.NEMOCLAW_EXPERIMENTAL === "1";
const DEFAULT_GATEWAY_NAME = "nemoclaw";

// ── Helpers ──────────────────────────────────────────────────────

Expand All @@ -21,6 +22,10 @@ function step(n, total, msg) {
console.log(` ${"─".repeat(50)}`);
}

function shellQuote(value) {
return `'${String(value).replace(/'/g, `'\\''`)}'`;
}

function isDockerRunning() {
try {
runCapture("docker info", { ignoreError: false });
Expand Down Expand Up @@ -82,22 +87,78 @@ async function preflight() {
return gpu;
}

/**
* Fully tear down the gateway including Docker volumes that can cause
* "Corrupted cluster state" on subsequent runs.
*/
function gatewayVolumeCandidates(gatewayName = DEFAULT_GATEWAY_NAME) {
return [`openshell-cluster-${gatewayName}`];
}

function cleanupGatewayVolumes(runFn = run, gatewayName = DEFAULT_GATEWAY_NAME) {
const removedVolumes = [];
const failedVolumes = [];

for (const volumeName of gatewayVolumeCandidates(gatewayName)) {
const inspectResult = runFn(`docker volume inspect ${shellQuote(volumeName)} >/dev/null 2>&1`, {
ignoreError: true,
stdio: "ignore",
});
if (inspectResult.status !== 0) continue;
Comment on lines +98 to +107
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Handle Docker-unavailable cases explicitly before reporting cleanup success.

Right now, a non-zero docker volume inspect is treated as "volume absent" (Line 107), so Docker daemon/CLI failures can end up reported as successful stale-state cleanup.

🔧 Proposed fix
 function cleanupGatewayVolumes(runFn = run, gatewayName = DEFAULT_GATEWAY_NAME) {
   const removedVolumes = [];
   const failedVolumes = [];
+  const dockerReady = runFn("docker info >/dev/null 2>&1", {
+    ignoreError: true,
+    stdio: "ignore",
+  });
+  if (dockerReady.status !== 0) {
+    return { removedVolumes, failedVolumes: gatewayVolumeCandidates(gatewayName) };
+  }
 
   for (const volumeName of gatewayVolumeCandidates(gatewayName)) {
     const inspectResult = runFn(`docker volume inspect ${shellQuote(volumeName)} >/dev/null 2>&1`, {
       ignoreError: true,
       stdio: "ignore",
     });
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@bin/lib/onboard.js` around lines 98 - 107, cleanupGatewayVolumes currently
treats any non-zero result from runFn(`docker volume inspect ...`) as "volume
absent", which hides Docker-daemon/CLI failures; change the logic in
cleanupGatewayVolumes (and its use of runFn and
gatewayVolumeCandidates/DEFAULT_GATEWAY_NAME) so that when inspectResult.status
!== 0 you first verify Docker is reachable (e.g., call runFn('docker info' or
'docker version') and check its status/output) and if Docker is unavailable
return/throw an error (or propagate failure) instead of continuing; only treat
non-zero inspect as "absent" when Docker is confirmed working. Ensure callers
handle the propagated error or failure status.


const removeResult = runFn(`docker volume rm -f ${shellQuote(volumeName)} >/dev/null 2>&1`, {
ignoreError: true,
stdio: "ignore",
});
if (removeResult.status === 0) removedVolumes.push(volumeName);
else failedVolumes.push(volumeName);
}

return { removedVolumes, failedVolumes };
}

function manualGatewayVolumeCleanupCommand(volumeNames) {
return `docker volume rm -f ${volumeNames.map(shellQuote).join(" ")}`;
}

function reportGatewayCleanupResult(cleanupResult) {
if (cleanupResult.failedVolumes.length > 0) {
console.error(" Automatic cleanup could not remove all stale Docker volumes.");
console.error(` Run: ${manualGatewayVolumeCleanupCommand(cleanupResult.failedVolumes)}`);
return;
}

console.error(" Stale state removed. Please rerun the installer.");
}

function destroyGateway(runFn = run, gatewayName = DEFAULT_GATEWAY_NAME) {
runFn(`openshell gateway destroy -g ${gatewayName} 2>/dev/null || true`, { ignoreError: true });
return cleanupGatewayVolumes(runFn, gatewayName);
Comment on lines +134 to +136
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Quote gatewayName in destroyGateway command construction.

gatewayName is interpolated directly into a shell command on Line 135. Even if current callers use a constant, this exported helper should be safe by default.

🔧 Proposed fix
 function destroyGateway(runFn = run, gatewayName = DEFAULT_GATEWAY_NAME) {
-  runFn(`openshell gateway destroy -g ${gatewayName} 2>/dev/null || true`, { ignoreError: true });
+  runFn(`openshell gateway destroy -g ${shellQuote(gatewayName)} 2>/dev/null || true`, { ignoreError: true });
   return cleanupGatewayVolumes(runFn, gatewayName);
 }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
function destroyGateway(runFn = run, gatewayName = DEFAULT_GATEWAY_NAME) {
runFn(`openshell gateway destroy -g ${gatewayName} 2>/dev/null || true`, { ignoreError: true });
return cleanupGatewayVolumes(runFn, gatewayName);
function destroyGateway(runFn = run, gatewayName = DEFAULT_GATEWAY_NAME) {
runFn(`openshell gateway destroy -g ${shellQuote(gatewayName)} 2>/dev/null || true`, { ignoreError: true });
return cleanupGatewayVolumes(runFn, gatewayName);
}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@bin/lib/onboard.js` around lines 134 - 136, The command string in
destroyGateway interpolates gatewayName directly; change it to shell-quote
gatewayName before interpolation to avoid injection — e.g., escape any single
quotes in gatewayName and wrap it in single quotes when building the command
passed to runFn (reference function destroyGateway and runFn); leave the call to
cleanupGatewayVolumes(gatewayName) unchanged.

}

// ── Step 2: Gateway ──────────────────────────────────────────────

async function startGateway(gpu) {
step(2, 7, "Starting OpenShell gateway");

// Destroy old gateway
run("openshell gateway destroy -g nemoclaw 2>/dev/null || true", { ignoreError: true });
// Destroy old gateway and clean up any leftover Docker state from previous failures
destroyGateway();

const gwArgs = ["--name", "nemoclaw"];
const gwArgs = ["--name", DEFAULT_GATEWAY_NAME];
// Do NOT pass --gpu here. On DGX Spark (and most GPU hosts), inference is
// routed through a host-side provider (Ollama, vLLM, or cloud API) — the
// sandbox itself does not need direct GPU access. Passing --gpu causes
// FailedPrecondition errors when the gateway's k3s device plugin cannot
// allocate GPUs. See: https://build.nvidia.com/spark/nemoclaw/instructions

run(`openshell gateway start ${gwArgs.join(" ")}`, { ignoreError: false });
const startResult = run(`openshell gateway start ${gwArgs.join(" ")}`, { ignoreError: true });
if (startResult.status !== 0) {
console.error(" Gateway failed to start. Cleaning up stale state...");
const cleanupResult = destroyGateway();
reportGatewayCleanupResult(cleanupResult);
console.error(" If the error persists, run: openshell gateway info");
process.exit(1);
}

// Verify health
for (let i = 0; i < 5; i++) {
Expand All @@ -107,7 +168,10 @@ async function startGateway(gpu) {
break;
}
if (i === 4) {
console.error(" Gateway failed to start. Run: openshell gateway info");
console.error(" Gateway health check failed. Cleaning up...");
const cleanupResult = destroyGateway();
reportGatewayCleanupResult(cleanupResult);
console.error(" If the error persists, run: openshell gateway info");
process.exit(1);
}
require("child_process").spawnSync("sleep", ["2"]);
Expand Down Expand Up @@ -501,4 +565,11 @@ async function onboard() {
printDashboard(sandboxName, model, provider);
}

module.exports = { onboard };
module.exports = {
cleanupGatewayVolumes,
destroyGateway,
gatewayVolumeCandidates,
manualGatewayVolumeCleanupCommand,
onboard,
reportGatewayCleanupResult,
};
53 changes: 53 additions & 0 deletions test/onboard.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

const { describe, it } = require("node:test");
const assert = require("node:assert/strict");
const {
cleanupGatewayVolumes,
gatewayVolumeCandidates,
manualGatewayVolumeCleanupCommand,
} = require("../bin/lib/onboard");

describe("gateway cleanup helpers", () => {
it("uses the known OpenShell volume name for the default gateway", () => {
assert.deepEqual(gatewayVolumeCandidates(), ["openshell-cluster-nemoclaw"]);
});

it("removes known gateway volumes when they exist", () => {
const commands = [];
const runFn = (cmd) => {
commands.push(cmd);
if (cmd.includes("docker volume inspect")) return { status: 0 };
if (cmd.includes("docker volume rm -f")) return { status: 0 };
return { status: 1 };
};

const result = cleanupGatewayVolumes(runFn);

assert.deepEqual(result, {
removedVolumes: ["openshell-cluster-nemoclaw"],
failedVolumes: [],
});
assert.equal(commands.length, 2);
});

it("returns the exact manual recovery command when automatic cleanup fails", () => {
const runFn = (cmd) => {
if (cmd.includes("docker volume inspect")) return { status: 0 };
if (cmd.includes("docker volume rm -f")) return { status: 1 };
return { status: 1 };
};

const result = cleanupGatewayVolumes(runFn);

assert.deepEqual(result, {
removedVolumes: [],
failedVolumes: ["openshell-cluster-nemoclaw"],
});
assert.equal(
manualGatewayVolumeCleanupCommand(result.failedVolumes),
"docker volume rm -f 'openshell-cluster-nemoclaw'"
);
});
});