From 58809a69baec5538d1d958d289c24034ca131a72 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Wed, 30 Jul 2025 12:28:29 +0200 Subject: [PATCH 1/3] [no-relnote] Fix containerd and CNI install Signed-off-by: Carlos Eduardo Arango Gutierrez --- .../templates/container-toolkit.go | 12 ++ pkg/provisioner/templates/containerd.go | 107 +++++++++++------- pkg/provisioner/templates/kubernetes.go | 6 + tests/data/test_aws.yml | 2 +- 4 files changed, 83 insertions(+), 44 deletions(-) diff --git a/pkg/provisioner/templates/container-toolkit.go b/pkg/provisioner/templates/container-toolkit.go index 3c41dce1b..bf40fc964 100644 --- a/pkg/provisioner/templates/container-toolkit.go +++ b/pkg/provisioner/templates/container-toolkit.go @@ -39,6 +39,18 @@ install_packages_with_retry nvidia-container-toolkit nvidia-container-toolkit-ba # Configure container runtime sudo nvidia-ctk runtime configure --runtime={{.ContainerRuntime}} --set-as-default --enable-cdi={{.EnableCDI}} + +# Verify CNI configuration is preserved after nvidia-ctk +echo "Verifying CNI configuration after nvidia-ctk..." +if [ "{{.ContainerRuntime}}" = "containerd" ]; then + if ! sudo grep -q 'bin_dir = "/opt/cni/bin:/usr/libexec/cni"' /etc/containerd/config.toml; then + echo "WARNING: CNI bin_dir configuration may have been modified by nvidia-ctk" + echo "Restoring CNI paths..." + # This is a safeguard, but nvidia-ctk should preserve existing CNI config + sudo sed -i '/\[plugins."io.containerd.grpc.v1.cri".cni\]/,/\[/{s|bin_dir = .*|bin_dir = "/opt/cni/bin:/usr/libexec/cni"|g}' /etc/containerd/config.toml + fi +fi + sudo systemctl restart {{.ContainerRuntime}} ` diff --git a/pkg/provisioner/templates/containerd.go b/pkg/provisioner/templates/containerd.go index 2ac0df691..b1216e27c 100644 --- a/pkg/provisioner/templates/containerd.go +++ b/pkg/provisioner/templates/containerd.go @@ -199,26 +199,19 @@ sudo tar Cxzvf /opt/cni/bin ${CNI_TAR} # Configure containerd sudo mkdir -p /etc/containerd -# Generate base configuration -sudo containerd config default | sudo tee /etc/containerd/config.toml > /dev/null - -# Configure based on version -if [ "$MAJOR_VERSION" = "2" ]; then - # Containerd 2.x configuration - cat < /dev/null +# Create unified configuration that works for both 1.x and 2.x +# Start with a minimal config and add only what's needed +cat <<'EOF' | sudo tee /etc/containerd/config.toml > /dev/null +# /etc/containerd/config.toml (managed by Holodeck) version = 2 -root = "/var/lib/containerd" -state = "/run/containerd" - -[grpc] - address = "/run/containerd/containerd.sock" - uid = 0 - gid = 0 [plugins] [plugins."io.containerd.grpc.v1.cri"] sandbox_image = "registry.k8s.io/pause:3.9" - systemd_cgroup = true + [plugins."io.containerd.grpc.v1.cri".cni] + # Include both locations to survive distro variance + bin_dir = "/opt/cni/bin:/usr/libexec/cni" + conf_dir = "/etc/cni/net.d" [plugins."io.containerd.grpc.v1.cri".containerd] [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] @@ -229,35 +222,18 @@ state = "/run/containerd" [plugins."io.containerd.grpc.v1.cri".registry.mirrors] [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"] endpoint = ["https://registry-1.docker.io"] -EOF -else - # Containerd 1.x configuration - cat < /dev/null -version = 1 -root = "/var/lib/containerd" -state = "/run/containerd" [grpc] address = "/run/containerd/containerd.sock" - uid = 0 - gid = 0 - -[plugins] - [plugins."io.containerd.grpc.v1.cri"] - sandbox_image = "registry.k8s.io/pause:3.9" - systemd_cgroup = true - [plugins."io.containerd.grpc.v1.cri".containerd] - [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] - [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] - runtime_type = "io.containerd.runtime.v1.linux" - [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] - SystemdCgroup = true - [plugins."io.containerd.grpc.v1.cri".registry] - [plugins."io.containerd.grpc.v1.cri".registry.mirrors] - [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"] - endpoint = ["https://registry-1.docker.io"] EOF -fi + +# Ensure CNI directories exist +sudo mkdir -p /etc/cni/net.d +sudo mkdir -p /opt/cni/bin + +# Ensure containerd directories exist +sudo mkdir -p /var/lib/containerd +sudo mkdir -p /run/containerd # Set up systemd service for containerd sudo curl -fsSL "https://raw.githubusercontent.com/containerd/containerd/main/containerd.service" -o /etc/systemd/system/containerd.service @@ -284,9 +260,22 @@ ExecStartPre=/bin/mkdir -p /run/containerd ExecStartPre=/bin/chmod 711 /run/containerd EOF +# Ensure containerd is not running with stale config +sudo systemctl stop containerd || true + # Reload systemd and start containerd sudo systemctl daemon-reload -sudo systemctl enable --now containerd +echo "Starting containerd service..." +if ! sudo systemctl enable --now containerd; then + echo "ERROR: Failed to start containerd service" + echo "Checking service status..." + sudo systemctl status containerd || true + echo "Checking journal logs..." + sudo journalctl -xeu containerd -n 50 || true + echo "Checking config file syntax..." + sudo containerd config dump || true + exit 1 +fi # Wait for containerd to be ready timeout=60 @@ -307,11 +296,43 @@ containerd --version runc --version sudo ctr version +# Verify CNI configuration +echo "Verifying containerd CNI configuration..." +if ! sudo grep -q 'bin_dir = "/opt/cni/bin:/usr/libexec/cni"' /etc/containerd/config.toml; then + echo "ERROR: CNI bin_dir not properly configured in containerd" + exit 1 +fi + +if ! sudo grep -q 'conf_dir = "/etc/cni/net.d"' /etc/containerd/config.toml; then + echo "ERROR: CNI conf_dir not properly configured in containerd" + exit 1 +fi + +if ! sudo grep -q 'SystemdCgroup = true' /etc/containerd/config.toml; then + echo "ERROR: SystemdCgroup not enabled in containerd config" + exit 1 +fi + +# Verify with crictl +if command -v crictl &> /dev/null; then + echo "Checking CRI configuration..." + sudo crictl info | grep -E "cni|Cni" || true +fi + +# Note about nvidia-container-toolkit compatibility +echo "" +echo "Note: This containerd configuration is designed to be compatible with nvidia-container-toolkit." +echo "When nvidia-ctk runtime configure is run later, it will:" +echo " - Add nvidia runtime configuration" +echo " - Preserve our CNI settings (bin_dir and conf_dir)" +echo " - May change default_runtime_name to 'nvidia'" +echo "This is expected and will not affect CNI functionality." + # Test containerd functionality sudo ctr images pull docker.io/library/hello-world:latest sudo ctr run --rm docker.io/library/hello-world:latest test -# Containerd installation completed successfully! +echo "Containerd installation and CNI configuration completed successfully!" ` type Containerd struct { @@ -322,7 +343,7 @@ func NewContainerd(env v1alpha1.Environment) *Containerd { var version string if env.Spec.ContainerRuntime.Version == "" { - version = "1.7.26" + version = "1.7.28" } else { // remove the 'v' prefix from the version if it exists version = strings.TrimPrefix(env.Spec.ContainerRuntime.Version, "v") diff --git a/pkg/provisioner/templates/kubernetes.go b/pkg/provisioner/templates/kubernetes.go index 127f71a32..3dc6d41fe 100644 --- a/pkg/provisioner/templates/kubernetes.go +++ b/pkg/provisioner/templates/kubernetes.go @@ -223,6 +223,10 @@ scheduler: --- apiVersion: kubelet.config.k8s.io/v1beta1 kind: KubeletConfiguration +cgroupDriver: systemd +{{- if .IsUbuntu }} +resolvConf: /run/systemd/resolve/resolv.conf +{{- end }} {{- if .ParsedFeatureGates }} featureGates: {{- range $key, $value := .ParsedFeatureGates }} @@ -272,6 +276,7 @@ type KubeadmConfig struct { PodSubnet string FeatureGates string // Feature gates as comma-separated string RuntimeConfig string // Runtime config (for feature gates) resource.k8s.io/v1beta1=true + IsUbuntu bool // Whether the system is Ubuntu (for resolvConf) } func NewKubernetes(env v1alpha1.Environment) (*Kubernetes, error) { @@ -374,6 +379,7 @@ func NewKubeadmConfig(env v1alpha1.Environment) (*KubeadmConfig, error) { PodSubnet: "192.168.0.0/16", // Default subnet, modify if needed FeatureGates: featureGates, // Convert slice to string for kubeadm RuntimeConfig: "resource.k8s.io/v1beta1=true", // Example runtime config + IsUbuntu: true, // Default to true for Ubuntu-based deployments } if env.Spec.Kubernetes.KubernetesVersion == "" { diff --git a/tests/data/test_aws.yml b/tests/data/test_aws.yml index f31025f52..0aca6ff52 100644 --- a/tests/data/test_aws.yml +++ b/tests/data/test_aws.yml @@ -22,4 +22,4 @@ spec: install: true kubernetes: install: true - installer: kubeadm + installer: kubeadm \ No newline at end of file From 52985dc6b27487f3f26db7a5f6c88ad033da77c7 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Wed, 30 Jul 2025 15:53:53 +0200 Subject: [PATCH 2/3] [no-relnote] enhance create UX Signed-off-by: Carlos Eduardo Arango Gutierrez --- cmd/cli/create/create.go | 62 +++++++++++++++++++++++++++++++++++++++- cmd/cli/main.go | 2 +- tests/data/test_aws.yml | 2 +- 3 files changed, 63 insertions(+), 3 deletions(-) diff --git a/cmd/cli/create/create.go b/cmd/cli/create/create.go index 2ee824bcf..dd3275724 100644 --- a/cmd/cli/create/create.go +++ b/cmd/cli/create/create.go @@ -17,8 +17,10 @@ package create import ( + "bufio" "fmt" "os" + "strings" "github.com/NVIDIA/holodeck/api/holodeck/v1alpha1" "github.com/NVIDIA/holodeck/internal/instances" @@ -167,7 +169,8 @@ func (m command) run(c *cli.Context, opts *options) error { if opts.provision { err := runProvision(m.log, opts) if err != nil { - return fmt.Errorf("failed to provision: %v", err) + // Handle provisioning failure with user interaction + return m.handleProvisionFailure(instanceID, opts.cachePath, err) } } @@ -175,6 +178,63 @@ func (m command) run(c *cli.Context, opts *options) error { return nil } +func (m *command) handleProvisionFailure(instanceID, cachePath string, provisionErr error) error { + m.log.Info("\nāŒ Provisioning failed: %v\n", provisionErr) + + // Check if we're in a non-interactive environment + if os.Getenv("CI") == "true" || os.Getenv("HOLODECK_NONINTERACTIVE") == "true" { + m.log.Info("\nšŸ’” To clean up the failed instance, run:") + m.log.Info(" holodeck delete %s\n", instanceID) + m.log.Info("šŸ’” To list all instances:") + m.log.Info(" holodeck list\n") + return fmt.Errorf("provisioning failed: %w", provisionErr) + } + + // Ask user if they want to delete the failed instance + reader := bufio.NewReader(os.Stdin) + m.log.Info("\nā“ Would you like to delete the failed instance? (y/N): ") + + response, err := reader.ReadString('\n') + if err != nil { + m.log.Info("Failed to read user input: %v", err) + return m.provideCleanupInstructions(instanceID, provisionErr) + } + + response = strings.TrimSpace(strings.ToLower(response)) + + if response == "y" || response == "yes" { + // Delete the instance + manager := instances.NewManager(m.log, cachePath) + if err := manager.DeleteInstance(instanceID); err != nil { + m.log.Info("Failed to delete instance: %v", err) + return m.provideCleanupInstructions(instanceID, provisionErr) + } + + m.log.Info("āœ… Successfully deleted failed instance %s\n", instanceID) + return fmt.Errorf("provisioning failed and instance was deleted: %w", provisionErr) + } + + return m.provideCleanupInstructions(instanceID, provisionErr) +} + +func (m *command) provideCleanupInstructions(instanceID string, provisionErr error) error { + m.log.Info("\nšŸ’” The instance was created but provisioning failed.") + m.log.Info(" You can manually investigate or clean up using the following commands:\n") + m.log.Info(" To delete this specific instance:") + m.log.Info(" holodeck delete %s\n", instanceID) + m.log.Info(" To list all instances:") + m.log.Info(" holodeck list\n") + m.log.Info(" To see instance details:") + m.log.Info(" holodeck status %s\n", instanceID) + + m.log.Info("\nšŸ’” Additional debugging tips:") + m.log.Info(" - Review the provisioning logs above for specific errors") + m.log.Info(" - Check cloud provider console for instance status") + m.log.Info(" - SSH into the instance to investigate further") + + return fmt.Errorf("provisioning failed: %w", provisionErr) +} + func runProvision(log *logger.FunLogger, opts *options) error { var hostUrl string diff --git a/cmd/cli/main.go b/cmd/cli/main.go index 390c28fb3..6247f53c1 100644 --- a/cmd/cli/main.go +++ b/cmd/cli/main.go @@ -66,7 +66,7 @@ Examples: holodeck status # Delete an environment - holodeck delete -i + holodeck delete # Use a custom cache directory holodeck --cachepath /path/to/cache create -f env.yaml` diff --git a/tests/data/test_aws.yml b/tests/data/test_aws.yml index 0aca6ff52..f31025f52 100644 --- a/tests/data/test_aws.yml +++ b/tests/data/test_aws.yml @@ -22,4 +22,4 @@ spec: install: true kubernetes: install: true - installer: kubeadm \ No newline at end of file + installer: kubeadm From 35da47117f1181f154ab2be6903790af2c885642 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Wed, 30 Jul 2025 16:01:39 +0200 Subject: [PATCH 3/3] [no-relnote] update unit tests Signed-off-by: Carlos Eduardo Arango Gutierrez --- .../templates/container-toolkit_test.go | 15 ++++ pkg/provisioner/templates/containerd_test.go | 71 ++++++++++++------- pkg/provisioner/templates/crio_test.go | 15 ++++ pkg/provisioner/templates/docker_test.go | 15 ++++ 4 files changed, 91 insertions(+), 25 deletions(-) diff --git a/pkg/provisioner/templates/container-toolkit_test.go b/pkg/provisioner/templates/container-toolkit_test.go index 38c7e797b..48e51ad48 100644 --- a/pkg/provisioner/templates/container-toolkit_test.go +++ b/pkg/provisioner/templates/container-toolkit_test.go @@ -1,3 +1,18 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package templates import ( diff --git a/pkg/provisioner/templates/containerd_test.go b/pkg/provisioner/templates/containerd_test.go index bd842a441..4d8ebfe68 100644 --- a/pkg/provisioner/templates/containerd_test.go +++ b/pkg/provisioner/templates/containerd_test.go @@ -1,3 +1,18 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package templates import ( @@ -11,8 +26,8 @@ import ( func TestNewContainerd_Defaults(t *testing.T) { env := v1alpha1.Environment{} c := NewContainerd(env) - if c.Version != "1.7.26" { - t.Errorf("expected default Version to be '1.7.26', got '%s'", c.Version) + if c.Version != "1.7.28" { + t.Errorf("expected default Version to be '1.7.28', got '%s'", c.Version) } } @@ -39,8 +54,8 @@ func TestNewContainerd_EmptyVersion(t *testing.T) { }, } c := NewContainerd(env) - if c.Version != "1.7.26" { - t.Errorf("expected default Version to be '1.7.26' when empty, got '%s'", c.Version) + if c.Version != "1.7.28" { + t.Errorf("expected default Version to be '1.7.28' when empty, got '%s'", c.Version) } } @@ -74,26 +89,29 @@ func TestContainerd_Execute_Version1(t *testing.T) { } out := buf.String() - // Test version detection - if !strings.Contains(out, "MAJOR_VERSION=$(echo $CONTAINERD_VERSION | cut -d. -f1)") { - t.Error("template output missing version detection") - } - - // Test 1.x specific configuration - if !strings.Contains(out, "version = 1") { - t.Error("template output missing version 1 configuration") + // Test unified configuration (we now use version = 2 for all containerd versions) + if !strings.Contains(out, "version = 2") { + t.Error("template output missing version 2 configuration") } - if !strings.Contains(out, "runtime_type = \"io.containerd.runtime.v1.linux\"") { - t.Error("template output missing 1.x runtime configuration") + if !strings.Contains(out, "runtime_type = \"io.containerd.runc.v2\"") { + t.Error("template output missing runc v2 runtime configuration") } // Test common configuration - if !strings.Contains(out, "systemd_cgroup = true") { - t.Error("template output missing systemd cgroup configuration") + if !strings.Contains(out, "SystemdCgroup = true") { + t.Error("template output missing SystemdCgroup configuration") } if !strings.Contains(out, "sandbox_image = \"registry.k8s.io/pause:3.9\"") { t.Error("template output missing sandbox image configuration") } + + // Test CNI configuration + if !strings.Contains(out, "bin_dir = \"/opt/cni/bin:/usr/libexec/cni\"") { + t.Error("template output missing CNI bin_dir configuration") + } + if !strings.Contains(out, "conf_dir = \"/etc/cni/net.d\"") { + t.Error("template output missing CNI conf_dir configuration") + } } func TestContainerd_Execute_Version2(t *testing.T) { @@ -112,26 +130,29 @@ func TestContainerd_Execute_Version2(t *testing.T) { } out := buf.String() - // Test version detection - if !strings.Contains(out, "MAJOR_VERSION=$(echo $CONTAINERD_VERSION | cut -d. -f1)") { - t.Error("template output missing version detection") - } - - // Test 2.x specific configuration + // Test unified configuration (we now use version = 2 for all containerd versions) if !strings.Contains(out, "version = 2") { t.Error("template output missing version 2 configuration") } if !strings.Contains(out, "runtime_type = \"io.containerd.runc.v2\"") { - t.Error("template output missing 2.x runtime configuration") + t.Error("template output missing runc v2 runtime configuration") } // Test common configuration - if !strings.Contains(out, "systemd_cgroup = true") { - t.Error("template output missing systemd cgroup configuration") + if !strings.Contains(out, "SystemdCgroup = true") { + t.Error("template output missing SystemdCgroup configuration") } if !strings.Contains(out, "sandbox_image = \"registry.k8s.io/pause:3.9\"") { t.Error("template output missing sandbox image configuration") } + + // Test CNI configuration + if !strings.Contains(out, "bin_dir = \"/opt/cni/bin:/usr/libexec/cni\"") { + t.Error("template output missing CNI bin_dir configuration") + } + if !strings.Contains(out, "conf_dir = \"/etc/cni/net.d\"") { + t.Error("template output missing CNI conf_dir configuration") + } } func TestContainerd_Execute_SystemChecks(t *testing.T) { diff --git a/pkg/provisioner/templates/crio_test.go b/pkg/provisioner/templates/crio_test.go index 16a5c1dad..9ce64b1d6 100644 --- a/pkg/provisioner/templates/crio_test.go +++ b/pkg/provisioner/templates/crio_test.go @@ -1,3 +1,18 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package templates import ( diff --git a/pkg/provisioner/templates/docker_test.go b/pkg/provisioner/templates/docker_test.go index 6c2cd02a8..83e7ba9cf 100644 --- a/pkg/provisioner/templates/docker_test.go +++ b/pkg/provisioner/templates/docker_test.go @@ -1,3 +1,18 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package templates import (