Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 61 additions & 1 deletion cmd/cli/create/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@
package create

import (
"bufio"
"fmt"
"os"
"strings"

"github.com/NVIDIA/holodeck/api/holodeck/v1alpha1"
"github.com/NVIDIA/holodeck/internal/instances"
Expand Down Expand Up @@ -167,14 +169,72 @@ func (m command) run(c *cli.Context, opts *options) error {
if opts.provision {
err := runProvision(m.log, opts)
if err != nil {
return fmt.Errorf("failed to provision: %v", err)
// Handle provisioning failure with user interaction
return m.handleProvisionFailure(instanceID, opts.cachePath, err)
}
}

m.log.Info("\nCreated instance %s", instanceID)
return nil
}

// handleProvisionFailure reacts to a provisioning error that occurred after
// the instance was already created. In non-interactive environments (CI=true
// or HOLODECK_NONINTERACTIVE=true) it prints cleanup commands and returns;
// in an interactive session it asks the user whether the failed instance
// should be deleted immediately.
//
// The returned error always wraps provisionErr, so callers can still inspect
// the underlying failure with errors.Is / errors.As.
func (m *command) handleProvisionFailure(instanceID, cachePath string, provisionErr error) error {
	m.log.Info("\n❌ Provisioning failed: %v\n", provisionErr)

	// Never block on stdin in a non-interactive environment.
	if os.Getenv("CI") == "true" || os.Getenv("HOLODECK_NONINTERACTIVE") == "true" {
		m.log.Info("\n💡 To clean up the failed instance, run:")
		m.log.Info(" holodeck delete %s\n", instanceID)
		m.log.Info("💡 To list all instances:")
		m.log.Info(" holodeck list\n")
		return fmt.Errorf("provisioning failed: %w", provisionErr)
	}

	// Ask the user whether the failed instance should be deleted.
	reader := bufio.NewReader(os.Stdin)
	m.log.Info("\n❓ Would you like to delete the failed instance? (y/N): ")

	response, err := reader.ReadString('\n')
	// ReadString returns any data read before the error. An EOF after some
	// input (e.g. Ctrl-D without a trailing newline) still carries the
	// user's answer, so only fall back to the cleanup instructions when
	// nothing at all was read.
	if err != nil && response == "" {
		m.log.Info("Failed to read user input: %v", err)
		return m.provideCleanupInstructions(instanceID, provisionErr)
	}

	switch strings.TrimSpace(strings.ToLower(response)) {
	case "y", "yes":
		// Delete the instance on the user's behalf.
		manager := instances.NewManager(m.log, cachePath)
		if err := manager.DeleteInstance(instanceID); err != nil {
			m.log.Info("Failed to delete instance: %v", err)
			return m.provideCleanupInstructions(instanceID, provisionErr)
		}

		m.log.Info("✅ Successfully deleted failed instance %s\n", instanceID)
		return fmt.Errorf("provisioning failed and instance was deleted: %w", provisionErr)
	}

	// Default (anything other than y/yes): leave the instance in place and
	// tell the user how to clean up manually.
	return m.provideCleanupInstructions(instanceID, provisionErr)
}

// provideCleanupInstructions prints manual cleanup and debugging guidance for
// an instance whose provisioning failed, then returns an error that wraps
// provisionErr for the caller.
func (m *command) provideCleanupInstructions(instanceID string, provisionErr error) error {
	// Each entry is a log line; withID marks lines that interpolate the
	// instance ID into the format string.
	type msg struct {
		format string
		withID bool
	}

	for _, l := range []msg{
		{"\n💡 The instance was created but provisioning failed.", false},
		{" You can manually investigate or clean up using the following commands:\n", false},
		{" To delete this specific instance:", false},
		{" holodeck delete %s\n", true},
		{" To list all instances:", false},
		{" holodeck list\n", false},
		{" To see instance details:", false},
		{" holodeck status %s\n", true},
		{"\n💡 Additional debugging tips:", false},
		{" - Review the provisioning logs above for specific errors", false},
		{" - Check cloud provider console for instance status", false},
		{" - SSH into the instance to investigate further", false},
	} {
		if l.withID {
			m.log.Info(l.format, instanceID)
		} else {
			m.log.Info(l.format)
		}
	}

	return fmt.Errorf("provisioning failed: %w", provisionErr)
}

func runProvision(log *logger.FunLogger, opts *options) error {
var hostUrl string

Expand Down
2 changes: 1 addition & 1 deletion cmd/cli/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ Examples:
holodeck status <instance-id>

# Delete an environment
holodeck delete -i <instance-id>
holodeck delete <instance-id>

# Use a custom cache directory
holodeck --cachepath /path/to/cache create -f env.yaml`
Expand Down
12 changes: 12 additions & 0 deletions pkg/provisioner/templates/container-toolkit.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,18 @@ install_packages_with_retry nvidia-container-toolkit nvidia-container-toolkit-ba

# Configure container runtime
sudo nvidia-ctk runtime configure --runtime={{.ContainerRuntime}} --set-as-default --enable-cdi={{.EnableCDI}}

# Verify CNI configuration is preserved after nvidia-ctk
echo "Verifying CNI configuration after nvidia-ctk..."
if [ "{{.ContainerRuntime}}" = "containerd" ]; then
if ! sudo grep -q 'bin_dir = "/opt/cni/bin:/usr/libexec/cni"' /etc/containerd/config.toml; then
echo "WARNING: CNI bin_dir configuration may have been modified by nvidia-ctk"
echo "Restoring CNI paths..."
# This is a safeguard, but nvidia-ctk should preserve existing CNI config
Copy link

Copilot AI Jul 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The sed command on line 51 is complex and error-prone. Consider using a more robust configuration management approach or validating that the sed pattern correctly targets only the intended configuration section.

Copilot uses AI. Check for mistakes.
sudo sed -i '/\[plugins."io.containerd.grpc.v1.cri".cni\]/,/\[/{s|bin_dir = .*|bin_dir = "/opt/cni/bin:/usr/libexec/cni"|g}' /etc/containerd/config.toml
fi
fi

sudo systemctl restart {{.ContainerRuntime}}
`

Expand Down
15 changes: 15 additions & 0 deletions pkg/provisioner/templates/container-toolkit_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package templates

import (
Expand Down
107 changes: 64 additions & 43 deletions pkg/provisioner/templates/containerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,26 +199,19 @@ sudo tar Cxzvf /opt/cni/bin ${CNI_TAR}
# Configure containerd
sudo mkdir -p /etc/containerd

# Generate base configuration
sudo containerd config default | sudo tee /etc/containerd/config.toml > /dev/null

# Configure based on version
if [ "$MAJOR_VERSION" = "2" ]; then
# Containerd 2.x configuration
cat <<EOF | sudo tee /etc/containerd/config.toml > /dev/null
# Create unified configuration that works for both 1.x and 2.x
# Start with a minimal config and add only what's needed
cat <<'EOF' | sudo tee /etc/containerd/config.toml > /dev/null
# /etc/containerd/config.toml (managed by Holodeck)
version = 2
root = "/var/lib/containerd"
state = "/run/containerd"

[grpc]
address = "/run/containerd/containerd.sock"
uid = 0
gid = 0

[plugins]
[plugins."io.containerd.grpc.v1.cri"]
sandbox_image = "registry.k8s.io/pause:3.9"
systemd_cgroup = true
[plugins."io.containerd.grpc.v1.cri".cni]
Copy link

Copilot AI Jul 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The systemd_cgroup configuration has been removed from the CRI plugin configuration. This is likely needed for proper cgroup management with systemd-based systems and should be preserved.

Copilot uses AI. Check for mistakes.
# Include both locations to survive distro variance
bin_dir = "/opt/cni/bin:/usr/libexec/cni"
conf_dir = "/etc/cni/net.d"
[plugins."io.containerd.grpc.v1.cri".containerd]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
Expand All @@ -229,35 +222,18 @@ state = "/run/containerd"
[plugins."io.containerd.grpc.v1.cri".registry.mirrors]
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
endpoint = ["https://registry-1.docker.io"]
EOF
else
# Containerd 1.x configuration
cat <<EOF | sudo tee /etc/containerd/config.toml > /dev/null
version = 1
root = "/var/lib/containerd"
state = "/run/containerd"

[grpc]
address = "/run/containerd/containerd.sock"
uid = 0
gid = 0

[plugins]
[plugins."io.containerd.grpc.v1.cri"]
sandbox_image = "registry.k8s.io/pause:3.9"
systemd_cgroup = true
[plugins."io.containerd.grpc.v1.cri".containerd]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runtime.v1.linux"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
SystemdCgroup = true
[plugins."io.containerd.grpc.v1.cri".registry]
[plugins."io.containerd.grpc.v1.cri".registry.mirrors]
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
endpoint = ["https://registry-1.docker.io"]
EOF
fi

# Ensure CNI directories exist
sudo mkdir -p /etc/cni/net.d
sudo mkdir -p /opt/cni/bin

# Ensure containerd directories exist
Comment on lines 227 to +234
Copy link

Copilot AI Jul 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The unified configuration removes version-specific handling but still uses hardcoded paths. Consider adding validation that these paths are appropriate for the target system, especially since the comment mentions 'distro variance'.

Copilot uses AI. Check for mistakes.
sudo mkdir -p /var/lib/containerd
sudo mkdir -p /run/containerd

# Set up systemd service for containerd
sudo curl -fsSL "https://raw.githubusercontent.com/containerd/containerd/main/containerd.service" -o /etc/systemd/system/containerd.service
Expand All @@ -284,9 +260,22 @@ ExecStartPre=/bin/mkdir -p /run/containerd
ExecStartPre=/bin/chmod 711 /run/containerd
EOF

# Ensure containerd is not running with stale config
sudo systemctl stop containerd || true

# Reload systemd and start containerd
sudo systemctl daemon-reload
sudo systemctl enable --now containerd
echo "Starting containerd service..."
if ! sudo systemctl enable --now containerd; then
echo "ERROR: Failed to start containerd service"
echo "Checking service status..."
sudo systemctl status containerd || true
echo "Checking journal logs..."
sudo journalctl -xeu containerd -n 50 || true
echo "Checking config file syntax..."
sudo containerd config dump || true
exit 1
fi

# Wait for containerd to be ready
timeout=60
Expand All @@ -307,11 +296,43 @@ containerd --version
runc --version
sudo ctr version

# Verify CNI configuration
echo "Verifying containerd CNI configuration..."
if ! sudo grep -q 'bin_dir = "/opt/cni/bin:/usr/libexec/cni"' /etc/containerd/config.toml; then
echo "ERROR: CNI bin_dir not properly configured in containerd"
exit 1
fi

if ! sudo grep -q 'conf_dir = "/etc/cni/net.d"' /etc/containerd/config.toml; then
echo "ERROR: CNI conf_dir not properly configured in containerd"
exit 1
fi

if ! sudo grep -q 'SystemdCgroup = true' /etc/containerd/config.toml; then
echo "ERROR: SystemdCgroup not enabled in containerd config"
exit 1
fi
Comment on lines +311 to +314
Copy link

Copilot AI Jul 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The verification checks for 'SystemdCgroup = true' but this configuration is not present in the generated config.toml template above. The SystemdCgroup setting was removed from the configuration but the verification still expects it.

Suggested change
if ! sudo grep -q 'SystemdCgroup = true' /etc/containerd/config.toml; then
echo "ERROR: SystemdCgroup not enabled in containerd config"
exit 1
fi
# Removed outdated verification logic for SystemdCgroup = true

Copilot uses AI. Check for mistakes.

# Verify with crictl
if command -v crictl &> /dev/null; then
echo "Checking CRI configuration..."
sudo crictl info | grep -E "cni|Cni" || true
fi

# Note about nvidia-container-toolkit compatibility
echo ""
echo "Note: This containerd configuration is designed to be compatible with nvidia-container-toolkit."
echo "When nvidia-ctk runtime configure is run later, it will:"
echo " - Add nvidia runtime configuration"
echo " - Preserve our CNI settings (bin_dir and conf_dir)"
echo " - May change default_runtime_name to 'nvidia'"
echo "This is expected and will not affect CNI functionality."

# Test containerd functionality
sudo ctr images pull docker.io/library/hello-world:latest
sudo ctr run --rm docker.io/library/hello-world:latest test

# Containerd installation completed successfully!
echo "Containerd installation and CNI configuration completed successfully!"
`

type Containerd struct {
Expand All @@ -322,7 +343,7 @@ func NewContainerd(env v1alpha1.Environment) *Containerd {
var version string

if env.Spec.ContainerRuntime.Version == "" {
version = "1.7.26"
version = "1.7.28"
} else {
// remove the 'v' prefix from the version if it exists
version = strings.TrimPrefix(env.Spec.ContainerRuntime.Version, "v")
Expand Down
Loading
Loading