diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index e4a0989f2..e3aba35ad 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -182,9 +182,19 @@ func (p *Provisioner) Run(env v1alpha1.Environment) error { // resetConnection resets the ssh connection, and retries if it fails to connect func (p *Provisioner) resetConnection() error { - // Close the current ssh connection - if err := p.Client.Close(); err != nil { - return fmt.Errorf("failed to close ssh client: %v", err) + // Check if the connection is still active before closing + if p.Client != nil { + // Try to create a new session to check if connection is alive + session, err := p.Client.NewSession() + if err == nil { + session.Close() // nolint:errcheck, gosec + // Connection is alive, close it + if err := p.Client.Close(); err != nil { + return fmt.Errorf("failed to close ssh client: %w", err) + } + } + // If we get here, either the connection was already closed or we couldn't create a session + p.Client = nil } return nil diff --git a/pkg/provisioner/templates/container-toolkit.go b/pkg/provisioner/templates/container-toolkit.go index e5145accd..ddf377381 100644 --- a/pkg/provisioner/templates/container-toolkit.go +++ b/pkg/provisioner/templates/container-toolkit.go @@ -39,9 +39,6 @@ install_packages_with_retry nvidia-container-toolkit # Configure container runtime sudo nvidia-ctk runtime configure --runtime={{.ContainerRuntime}} --set-as-default --enable-cdi={{.EnableCDI}} sudo systemctl restart {{.ContainerRuntime}} - -# safely close the ssh connection -exit 0 ` type ContainerToolkit struct { diff --git a/pkg/provisioner/templates/container-toolkit_test.go b/pkg/provisioner/templates/container-toolkit_test.go index 17396b24a..38c7e797b 100644 --- a/pkg/provisioner/templates/container-toolkit_test.go +++ b/pkg/provisioner/templates/container-toolkit_test.go @@ -60,9 +60,4 @@ func TestContainerToolkit_Execute(t *testing.T) { if !strings.Contains(out, "nvidia-ctk runtime configure --runtime=containerd --set-as-default --enable-cdi=true") { t.Errorf("template output missing expected runtime config: %s", out) } - - // Test safe exit - if !strings.Contains(out, "exit 0") { - t.Errorf("template output missing safe exit: %s", out) - } } diff --git a/pkg/provisioner/templates/containerd.go b/pkg/provisioner/templates/containerd.go index 17751b8ad..d1cd222b5 100644 --- a/pkg/provisioner/templates/containerd.go +++ b/pkg/provisioner/templates/containerd.go @@ -289,7 +289,6 @@ sudo systemctl daemon-reload sudo systemctl enable --now containerd # Wait for containerd to be ready -echo "Waiting for containerd to be ready..." timeout=60 while ! sudo ctr version &>/dev/null; do if [ $timeout -le 0 ]; then @@ -304,20 +303,15 @@ done sudo chmod 666 /run/containerd/containerd.sock # Verify installation -echo "Verifying installation..." containerd --version runc --version sudo ctr version # Test containerd functionality -echo "Testing containerd functionality..." sudo ctr images pull docker.io/library/hello-world:latest sudo ctr run --rm docker.io/library/hello-world:latest test -echo "Containerd installation completed successfully!" - -# safely close the ssh connection -exit 0 +# Containerd installation completed successfully! ` type Containerd struct { diff --git a/pkg/provisioner/templates/containerd_test.go b/pkg/provisioner/templates/containerd_test.go index 9a80013cd..839fbf55f 100644 --- a/pkg/provisioner/templates/containerd_test.go +++ b/pkg/provisioner/templates/containerd_test.go @@ -199,11 +199,6 @@ func TestContainerd_Execute_SystemChecks(t *testing.T) { t.Error("template output missing temporary directory creation") } - // Test safe exit - if !strings.Contains(out, "exit 0") { - t.Error("template output missing safe exit") - } - // Test error handling if !strings.Contains(out, "Error: Failed to download containerd tarball") { t.Error("template output missing download error handling") diff --git a/pkg/provisioner/templates/crio.go b/pkg/provisioner/templates/crio.go index 2858bc097..baf57f23b 100644 --- a/pkg/provisioner/templates/crio.go +++ b/pkg/provisioner/templates/crio.go @@ -38,9 +38,6 @@ apt install -y cri-o # Start and enable Service systemctl daemon-reload systemctl start crio.service - -# safely close the ssh connection -exit 0 ` type CriO struct { diff --git a/pkg/provisioner/templates/crio_test.go b/pkg/provisioner/templates/crio_test.go index 55c8bb893..16a5c1dad 100644 --- a/pkg/provisioner/templates/crio_test.go +++ b/pkg/provisioner/templates/crio_test.go @@ -43,9 +43,4 @@ func TestCriO_Execute(t *testing.T) { if !strings.Contains(out, "systemctl start crio.service") { t.Errorf("template output missing crio start: %s", out) } - - // Test safe exit - if !strings.Contains(out, "exit 0") { - t.Errorf("template output missing safe exit: %s", out) - } } diff --git a/pkg/provisioner/templates/docker.go b/pkg/provisioner/templates/docker.go index 8b1163871..cffdfeb68 100644 --- a/pkg/provisioner/templates/docker.go +++ b/pkg/provisioner/templates/docker.go @@ -74,8 +74,64 @@ sudo systemctl restart docker sudo usermod -aG docker $USER newgrp docker -# safely close the ssh connection -exit 0 +# Install cri-dockerd +CRI_DOCKERD_VERSION="0.3.17" +CRI_DOCKERD_ARCH="amd64" +CRI_DOCKERD_URL="https://github.com/Mirantis/cri-dockerd/releases/download/v${CRI_DOCKERD_VERSION}/cri-dockerd-${CRI_DOCKERD_VERSION}.${CRI_DOCKERD_ARCH}.tgz" + +# Download and install cri-dockerd +curl -L ${CRI_DOCKERD_URL} | sudo tar xzv -C /usr/local/bin --strip-components=1 + +# Create systemd service file for cri-dockerd +sudo tee /etc/systemd/system/cri-docker.service < ubuntu@${INSTANCE_ENDPOINT_HOST}" - -# safely close the ssh connection -exit 0 ` const microk8sTemplate = ` - : ${INSTANCE_ENDPOINT_HOST:={{.K8sEndpointHost}}} # Install microk8s @@ -179,9 +181,6 @@ sudo snap alias microk8s.kubectl kubectl echo "Microk8s {{.Version}} installed successfully" echo "you can now access the cluster with:" echo "ssh -i ubuntu@${INSTANCE_ENDPOINT_HOST}" - -# safely close the ssh connection -exit 0 ` const kubeadmTemplate = `apiVersion: kubeadm.k8s.io/v1beta4 diff --git a/pkg/provisioner/templates/kubernetes_test.go b/pkg/provisioner/templates/kubernetes_test.go index 41bf7dd82..87065b432 100644 --- a/pkg/provisioner/templates/kubernetes_test.go +++ b/pkg/provisioner/templates/kubernetes_test.go @@ -286,10 +286,6 @@ func TestKubernetes_Execute(t *testing.T) { if tt.checkTemplate { assert.Contains(t, out, tt.expectedString) } - - if tt.checkSafeExit { - assert.Contains(t, out, "exit 0", "template output missing safe exit") - } }) } } diff --git a/pkg/provisioner/templates/nv-driver.go b/pkg/provisioner/templates/nv-driver.go index fce3d3417..639351ff7 100644 --- a/pkg/provisioner/templates/nv-driver.go +++ b/pkg/provisioner/templates/nv-driver.go @@ -34,7 +34,7 @@ wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_ sudo dpkg -i cuda-keyring_1.1-1_all.deb with_retry 3 10s sudo apt-get update -install_packages_with_retry nvidia-driver{{if .Version}}={{.Version}}{{else if .Branch}}-{{.Branch}}{{end}} +install_packages_with_retry cuda-drivers{{if .Version}}={{.Version}}{{else if .Branch}}-{{.Branch}}{{end}} # Check if NVIDIA module is loaded, if not load it if ! lsmod | grep -q "^nvidia "; then @@ -46,9 +46,6 @@ sudo nvidia-persistenced --persistence-mode # Quick check to see if the driver is installed nvidia-smi - -# safely close the ssh connection -exit 0 ` type NvDriver v1alpha1.NVIDIADriver diff --git a/pkg/provisioner/templates/nv-driver_test.go b/pkg/provisioner/templates/nv-driver_test.go index 502bfed9a..0dec801e7 100644 --- a/pkg/provisioner/templates/nv-driver_test.go +++ b/pkg/provisioner/templates/nv-driver_test.go @@ -48,7 +48,7 @@ wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_ sudo dpkg -i cuda-keyring_1.1-1_all.deb with_retry 3 10s sudo apt-get update -install_packages_with_retry nvidia-driver=123.4.5 +install_packages_with_retry cuda-drivers=123.4.5 # Check if NVIDIA module is loaded, if not load it if ! lsmod | grep -q "^nvidia "; then @@ -60,9 +60,6 @@ sudo nvidia-persistenced --persistence-mode # Quick check to see if the driver is installed nvidia-smi - -# safely close the ssh connection -exit 0 `, }, { @@ -79,7 +76,7 @@ wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_ sudo dpkg -i cuda-keyring_1.1-1_all.deb with_retry 3 10s sudo apt-get update -install_packages_with_retry nvidia-driver-550 +install_packages_with_retry cuda-drivers-550 # Check if NVIDIA module is loaded, if not load it if ! lsmod | grep -q "^nvidia "; then @@ -91,9 +88,6 @@ sudo nvidia-persistenced --persistence-mode # Quick check to see if the driver is installed nvidia-smi - -# safely close the ssh connection -exit 0 `, }, { @@ -111,7 +105,7 @@ wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_ sudo dpkg -i cuda-keyring_1.1-1_all.deb with_retry 3 10s sudo apt-get update -install_packages_with_retry nvidia-driver=123.4.5 +install_packages_with_retry cuda-drivers=123.4.5 # Check if NVIDIA module is loaded, if not load it if ! lsmod | grep -q "^nvidia "; then @@ -123,9 +117,6 @@ sudo nvidia-persistenced --persistence-mode # Quick check to see if the driver is installed nvidia-smi - -# safely close the ssh connection -exit 0 `, }, } @@ -140,9 +131,6 @@ exit 0 // Compare trimmed strings to avoid whitespace issues require.EqualValues(t, strings.TrimSpace(tc.expectedOutput), strings.TrimSpace(output.String())) - - // Test safe exit - require.Contains(t, output.String(), "exit 0", "template output missing safe exit") }) } diff --git a/tests/aws_test.go b/tests/aws_test.go index fa34d669f..ed0c66035 100644 --- a/tests/aws_test.go +++ b/tests/aws_test.go @@ -196,8 +196,18 @@ var _ = Describe("AWS Environment", func() { // Ensure client is properly closed after test defer func() { - if err := p.Client.Close(); err != nil { - state.log.Error(err) + if p.Client != nil { + // Try to create a new session to check if connection is alive + session, err := p.Client.NewSession() + if err == nil { + session.Close() // nolint:errcheck, gosec + // Connection is alive, close it + if err := p.Client.Close(); err != nil { + Expect(err).NotTo(HaveOccurred(), "Failed to close ssh client") + } + } + // If we get here, either the connection was already closed or we couldn't create a session + p.Client = nil } }() diff --git a/tests/data/test_aws.yml b/tests/data/test_aws.yml index e159c7bef..6e04b6507 100644 --- a/tests/data/test_aws.yml +++ b/tests/data/test_aws.yml @@ -9,7 +9,7 @@ spec: keyName: cnt-ci privateKey: /home/runner/.cache/key instance: - type: m4.xlarge + type: g4dn.xlarge region: us-west-1 ingressIpRanges: - 18.190.12.32/32 @@ -22,7 +22,11 @@ spec: architecture: amd64 containerRuntime: install: true - name: containerd + name: docker + nvidiaContainerToolkit: + install: true + nvidiaDriver: + install: true kubernetes: install: true installer: kubeadm