-
Notifications
You must be signed in to change notification settings - Fork 2.4k
fix: clean up Docker volumes after failed gateway start #95
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -12,6 +12,7 @@ const nim = require("./nim"); | |||||||||||||||
| const policies = require("./policies"); | ||||||||||||||||
| const HOST_GATEWAY_URL = "http://host.openshell.internal"; | ||||||||||||||||
| const EXPERIMENTAL = process.env.NEMOCLAW_EXPERIMENTAL === "1"; | ||||||||||||||||
| const DEFAULT_GATEWAY_NAME = "nemoclaw"; | ||||||||||||||||
|
|
||||||||||||||||
| // ── Helpers ────────────────────────────────────────────────────── | ||||||||||||||||
|
|
||||||||||||||||
|
|
@@ -21,6 +22,10 @@ function step(n, total, msg) { | |||||||||||||||
| console.log(` ${"─".repeat(50)}`); | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| function shellQuote(value) { | ||||||||||||||||
| return `'${String(value).replace(/'/g, `'\\''`)}'`; | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| function isDockerRunning() { | ||||||||||||||||
| try { | ||||||||||||||||
| runCapture("docker info", { ignoreError: false }); | ||||||||||||||||
|
|
@@ -82,22 +87,78 @@ async function preflight() { | |||||||||||||||
| return gpu; | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| /** | ||||||||||||||||
| * Fully tear down the gateway including Docker volumes that can cause | ||||||||||||||||
| * "Corrupted cluster state" on subsequent runs. | ||||||||||||||||
| */ | ||||||||||||||||
| function gatewayVolumeCandidates(gatewayName = DEFAULT_GATEWAY_NAME) { | ||||||||||||||||
| return [`openshell-cluster-${gatewayName}`]; | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| function cleanupGatewayVolumes(runFn = run, gatewayName = DEFAULT_GATEWAY_NAME) { | ||||||||||||||||
| const removedVolumes = []; | ||||||||||||||||
| const failedVolumes = []; | ||||||||||||||||
|
|
||||||||||||||||
| for (const volumeName of gatewayVolumeCandidates(gatewayName)) { | ||||||||||||||||
| const inspectResult = runFn(`docker volume inspect ${shellQuote(volumeName)} >/dev/null 2>&1`, { | ||||||||||||||||
| ignoreError: true, | ||||||||||||||||
| stdio: "ignore", | ||||||||||||||||
| }); | ||||||||||||||||
| if (inspectResult.status !== 0) continue; | ||||||||||||||||
|
|
||||||||||||||||
| const removeResult = runFn(`docker volume rm -f ${shellQuote(volumeName)} >/dev/null 2>&1`, { | ||||||||||||||||
| ignoreError: true, | ||||||||||||||||
| stdio: "ignore", | ||||||||||||||||
| }); | ||||||||||||||||
| if (removeResult.status === 0) removedVolumes.push(volumeName); | ||||||||||||||||
| else failedVolumes.push(volumeName); | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| return { removedVolumes, failedVolumes }; | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| function manualGatewayVolumeCleanupCommand(volumeNames) { | ||||||||||||||||
| return `docker volume rm -f ${volumeNames.map(shellQuote).join(" ")}`; | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| function reportGatewayCleanupResult(cleanupResult) { | ||||||||||||||||
| if (cleanupResult.failedVolumes.length > 0) { | ||||||||||||||||
| console.error(" Automatic cleanup could not remove all stale Docker volumes."); | ||||||||||||||||
| console.error(` Run: ${manualGatewayVolumeCleanupCommand(cleanupResult.failedVolumes)}`); | ||||||||||||||||
| return; | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| console.error(" Stale state removed. Please rerun the installer."); | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| function destroyGateway(runFn = run, gatewayName = DEFAULT_GATEWAY_NAME) { | ||||||||||||||||
| runFn(`openshell gateway destroy -g ${gatewayName} 2>/dev/null || true`, { ignoreError: true }); | ||||||||||||||||
| return cleanupGatewayVolumes(runFn, gatewayName); | ||||||||||||||||
|
Comment on lines
+134
to
+136
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Quote
🔧 Proposed fix function destroyGateway(runFn = run, gatewayName = DEFAULT_GATEWAY_NAME) {
- runFn(`openshell gateway destroy -g ${gatewayName} 2>/dev/null || true`, { ignoreError: true });
+ runFn(`openshell gateway destroy -g ${shellQuote(gatewayName)} 2>/dev/null || true`, { ignoreError: true });
return cleanupGatewayVolumes(runFn, gatewayName);
}📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| // ── Step 2: Gateway ────────────────────────────────────────────── | ||||||||||||||||
|
|
||||||||||||||||
| async function startGateway(gpu) { | ||||||||||||||||
| step(2, 7, "Starting OpenShell gateway"); | ||||||||||||||||
|
|
||||||||||||||||
| // Destroy old gateway | ||||||||||||||||
| run("openshell gateway destroy -g nemoclaw 2>/dev/null || true", { ignoreError: true }); | ||||||||||||||||
| // Destroy old gateway and clean up any leftover Docker state from previous failures | ||||||||||||||||
| destroyGateway(); | ||||||||||||||||
|
|
||||||||||||||||
| const gwArgs = ["--name", "nemoclaw"]; | ||||||||||||||||
| const gwArgs = ["--name", DEFAULT_GATEWAY_NAME]; | ||||||||||||||||
| // Do NOT pass --gpu here. On DGX Spark (and most GPU hosts), inference is | ||||||||||||||||
| // routed through a host-side provider (Ollama, vLLM, or cloud API) — the | ||||||||||||||||
| // sandbox itself does not need direct GPU access. Passing --gpu causes | ||||||||||||||||
| // FailedPrecondition errors when the gateway's k3s device plugin cannot | ||||||||||||||||
| // allocate GPUs. See: https://build.nvidia.com/spark/nemoclaw/instructions | ||||||||||||||||
|
|
||||||||||||||||
| run(`openshell gateway start ${gwArgs.join(" ")}`, { ignoreError: false }); | ||||||||||||||||
| const startResult = run(`openshell gateway start ${gwArgs.join(" ")}`, { ignoreError: true }); | ||||||||||||||||
| if (startResult.status !== 0) { | ||||||||||||||||
| console.error(" Gateway failed to start. Cleaning up stale state..."); | ||||||||||||||||
| const cleanupResult = destroyGateway(); | ||||||||||||||||
| reportGatewayCleanupResult(cleanupResult); | ||||||||||||||||
| console.error(" If the error persists, run: openshell gateway info"); | ||||||||||||||||
| process.exit(1); | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| // Verify health | ||||||||||||||||
| for (let i = 0; i < 5; i++) { | ||||||||||||||||
|
|
@@ -107,7 +168,10 @@ async function startGateway(gpu) { | |||||||||||||||
| break; | ||||||||||||||||
| } | ||||||||||||||||
| if (i === 4) { | ||||||||||||||||
| console.error(" Gateway failed to start. Run: openshell gateway info"); | ||||||||||||||||
| console.error(" Gateway health check failed. Cleaning up..."); | ||||||||||||||||
| const cleanupResult = destroyGateway(); | ||||||||||||||||
| reportGatewayCleanupResult(cleanupResult); | ||||||||||||||||
| console.error(" If the error persists, run: openshell gateway info"); | ||||||||||||||||
| process.exit(1); | ||||||||||||||||
| } | ||||||||||||||||
| require("child_process").spawnSync("sleep", ["2"]); | ||||||||||||||||
|
|
@@ -501,4 +565,11 @@ async function onboard() { | |||||||||||||||
| printDashboard(sandboxName, model, provider); | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| module.exports = { onboard }; | ||||||||||||||||
| module.exports = { | ||||||||||||||||
| cleanupGatewayVolumes, | ||||||||||||||||
| destroyGateway, | ||||||||||||||||
| gatewayVolumeCandidates, | ||||||||||||||||
| manualGatewayVolumeCleanupCommand, | ||||||||||||||||
| onboard, | ||||||||||||||||
| reportGatewayCleanupResult, | ||||||||||||||||
| }; | ||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| const { describe, it } = require("node:test"); | ||
| const assert = require("node:assert/strict"); | ||
| const { | ||
| cleanupGatewayVolumes, | ||
| gatewayVolumeCandidates, | ||
| manualGatewayVolumeCleanupCommand, | ||
| } = require("../bin/lib/onboard"); | ||
|
|
||
| describe("gateway cleanup helpers", () => { | ||
| it("uses the known OpenShell volume name for the default gateway", () => { | ||
| assert.deepEqual(gatewayVolumeCandidates(), ["openshell-cluster-nemoclaw"]); | ||
| }); | ||
|
|
||
| it("removes known gateway volumes when they exist", () => { | ||
| const commands = []; | ||
| const runFn = (cmd) => { | ||
| commands.push(cmd); | ||
| if (cmd.includes("docker volume inspect")) return { status: 0 }; | ||
| if (cmd.includes("docker volume rm -f")) return { status: 0 }; | ||
| return { status: 1 }; | ||
| }; | ||
|
|
||
| const result = cleanupGatewayVolumes(runFn); | ||
|
|
||
| assert.deepEqual(result, { | ||
| removedVolumes: ["openshell-cluster-nemoclaw"], | ||
| failedVolumes: [], | ||
| }); | ||
| assert.equal(commands.length, 2); | ||
| }); | ||
|
|
||
| it("returns the exact manual recovery command when automatic cleanup fails", () => { | ||
| const runFn = (cmd) => { | ||
| if (cmd.includes("docker volume inspect")) return { status: 0 }; | ||
| if (cmd.includes("docker volume rm -f")) return { status: 1 }; | ||
| return { status: 1 }; | ||
| }; | ||
|
|
||
| const result = cleanupGatewayVolumes(runFn); | ||
|
|
||
| assert.deepEqual(result, { | ||
| removedVolumes: [], | ||
| failedVolumes: ["openshell-cluster-nemoclaw"], | ||
| }); | ||
| assert.equal( | ||
| manualGatewayVolumeCleanupCommand(result.failedVolumes), | ||
| "docker volume rm -f 'openshell-cluster-nemoclaw'" | ||
| ); | ||
| }); | ||
| }); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Handle Docker-unavailable cases explicitly before reporting cleanup success.
Right now, a non-zero
docker volume inspectis treated as "volume absent" (Line 107), so Docker daemon/CLI failures can end up reported as successful stale-state cleanup.🔧 Proposed fix
function cleanupGatewayVolumes(runFn = run, gatewayName = DEFAULT_GATEWAY_NAME) { const removedVolumes = []; const failedVolumes = []; + const dockerReady = runFn("docker info >/dev/null 2>&1", { + ignoreError: true, + stdio: "ignore", + }); + if (dockerReady.status !== 0) { + return { removedVolumes, failedVolumes: gatewayVolumeCandidates(gatewayName) }; + } for (const volumeName of gatewayVolumeCandidates(gatewayName)) { const inspectResult = runFn(`docker volume inspect ${shellQuote(volumeName)} >/dev/null 2>&1`, { ignoreError: true, stdio: "ignore", });🤖 Prompt for AI Agents