Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions pkg/provisioner/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,19 @@ func (p *Provisioner) Run(env v1alpha1.Environment) error {

// resetConnection resets the ssh connection, and retries if it fails to connect
func (p *Provisioner) resetConnection() error {
// Close the current ssh connection
if err := p.Client.Close(); err != nil {
return fmt.Errorf("failed to close ssh client: %v", err)
// Check if the connection is still active before closing
if p.Client != nil {
// Try to create a new session to check if connection is alive
session, err := p.Client.NewSession()
if err == nil {
session.Close() // nolint:errcheck, gosec
// Connection is alive, close it
if err := p.Client.Close(); err != nil {
return fmt.Errorf("failed to close ssh client: %w", err)
}
}
// If we get here, either the connection was already closed or we couldn't create a session
p.Client = nil
}

return nil
Expand Down
3 changes: 0 additions & 3 deletions pkg/provisioner/templates/container-toolkit.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,6 @@ install_packages_with_retry nvidia-container-toolkit
# Configure container runtime
sudo nvidia-ctk runtime configure --runtime={{.ContainerRuntime}} --set-as-default --enable-cdi={{.EnableCDI}}
sudo systemctl restart {{.ContainerRuntime}}

# safely close the ssh connection
exit 0
`

type ContainerToolkit struct {
Expand Down
5 changes: 0 additions & 5 deletions pkg/provisioner/templates/container-toolkit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,4 @@ func TestContainerToolkit_Execute(t *testing.T) {
if !strings.Contains(out, "nvidia-ctk runtime configure --runtime=containerd --set-as-default --enable-cdi=true") {
t.Errorf("template output missing expected runtime config: %s", out)
}

// Test safe exit
if !strings.Contains(out, "exit 0") {
t.Errorf("template output missing safe exit: %s", out)
}
}
8 changes: 1 addition & 7 deletions pkg/provisioner/templates/containerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,6 @@ sudo systemctl daemon-reload
sudo systemctl enable --now containerd

# Wait for containerd to be ready
echo "Waiting for containerd to be ready..."
timeout=60
while ! sudo ctr version &>/dev/null; do
if [ $timeout -le 0 ]; then
Expand All @@ -304,20 +303,15 @@ done
sudo chmod 666 /run/containerd/containerd.sock

# Verify installation
echo "Verifying installation..."
containerd --version
runc --version
sudo ctr version

# Test containerd functionality
echo "Testing containerd functionality..."
sudo ctr images pull docker.io/library/hello-world:latest
sudo ctr run --rm docker.io/library/hello-world:latest test

echo "Containerd installation completed successfully!"

# safely close the ssh connection
exit 0
# Containerd installation completed successfully!
`

type Containerd struct {
Expand Down
5 changes: 0 additions & 5 deletions pkg/provisioner/templates/containerd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,11 +199,6 @@ func TestContainerd_Execute_SystemChecks(t *testing.T) {
t.Error("template output missing temporary directory creation")
}

// Test safe exit
if !strings.Contains(out, "exit 0") {
t.Error("template output missing safe exit")
}

// Test error handling
if !strings.Contains(out, "Error: Failed to download containerd tarball") {
t.Error("template output missing download error handling")
Expand Down
3 changes: 0 additions & 3 deletions pkg/provisioner/templates/crio.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@ apt install -y cri-o
# Start and enable Service
systemctl daemon-reload
systemctl start crio.service

# safely close the ssh connection
exit 0
`

type CriO struct {
Expand Down
5 changes: 0 additions & 5 deletions pkg/provisioner/templates/crio_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,4 @@ func TestCriO_Execute(t *testing.T) {
if !strings.Contains(out, "systemctl start crio.service") {
t.Errorf("template output missing crio start: %s", out)
}

// Test safe exit
if !strings.Contains(out, "exit 0") {
t.Errorf("template output missing safe exit: %s", out)
}
}
60 changes: 58 additions & 2 deletions pkg/provisioner/templates/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,64 @@ sudo systemctl restart docker
sudo usermod -aG docker $USER
newgrp docker

# safely close the ssh connection
exit 0
# Install cri-dockerd
CRI_DOCKERD_VERSION="0.3.17"
CRI_DOCKERD_ARCH="amd64"
CRI_DOCKERD_URL="https://github.com/Mirantis/cri-dockerd/releases/download/v${CRI_DOCKERD_VERSION}/cri-dockerd-${CRI_DOCKERD_VERSION}.${CRI_DOCKERD_ARCH}.tgz"

# Download and install cri-dockerd
curl -L ${CRI_DOCKERD_URL} | sudo tar xzv -C /usr/local/bin --strip-components=1
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Problem: The cri-dockerd archive is downloaded with curl and piped straight into `sudo tar` without any integrity check, so a tampered or corrupted download would be extracted into /usr/local/bin with root privileges.

Suggested Change: Add checksum verification before extracting the downloaded package to prevent potential supply chain attacks.

Severity (1 - 4): 4 - CRITICAL

Line: 83

Suggested change
curl -L ${CRI_DOCKERD_URL} | sudo tar xzv -C /usr/local/bin --strip-components=1
curl -L ${CRI_DOCKERD_URL} -o /tmp/cri-dockerd.tgz && echo "${CRI_DOCKERD_SHA256} /tmp/cri-dockerd.tgz" | sha256sum --check && sudo tar xzv -f /tmp/cri-dockerd.tgz -C /usr/local/bin --strip-components=1 && rm /tmp/cri-dockerd.tgz

Generated by Claude 3.5 Sonnet

Was this helpful? 👍 👎


# Create systemd service file for cri-dockerd
sudo tee /etc/systemd/system/cri-docker.service <<EOF
[Unit]
Description=CRI Interface for Docker Application Container Engine
Documentation=https://docs.mirantis.com
After=network-online.target firewalld.service docker.service
Wants=network-online.target
Requires=cri-docker.socket

[Service]
Type=notify
ExecStart=/usr/local/bin/cri-dockerd --container-runtime-endpoint fd://
ExecReload=/bin/kill -s HUP $MAINPID
TimeoutSec=0
RestartSec=2
Restart=always
StartLimitBurst=3
StartLimitInterval=60s
LimitNOFILE=infinity
LimitNPROC=infinity
LimitCORE=infinity
TasksMax=infinity
Delegate=yes
KillMode=process

[Install]
WantedBy=multi-user.target
EOF

# Create socket file for cri-dockerd
sudo tee /etc/systemd/system/cri-docker.socket <<EOF
[Unit]
Description=CRI Docker Socket for the API
PartOf=cri-docker.service

[Socket]
ListenStream=/run/cri-dockerd.sock
SocketMode=0660
SocketUser=root
SocketGroup=docker

[Install]
WantedBy=sockets.target
EOF

# Enable and start cri-dockerd
sudo systemctl daemon-reload
sudo systemctl enable cri-docker.service
sudo systemctl enable cri-docker.socket
sudo systemctl start cri-docker.service
`

type Docker struct {
Expand Down
20 changes: 17 additions & 3 deletions pkg/provisioner/templates/docker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ func TestDocker_Execute(t *testing.T) {
t.Fatalf("Execute failed: %v", err)
}
out := buf.String()

// Test Docker installation
if !strings.Contains(out, "docker-ce=$DOCKER_VERSION") {
t.Errorf("template output missing expected docker version install command: %s", out)
}
Expand All @@ -55,8 +57,20 @@ func TestDocker_Execute(t *testing.T) {
t.Errorf("template output missing enable docker: %s", out)
}

// Test safe exit
if !strings.Contains(out, "exit 0") {
t.Errorf("template output missing safe exit: %s", out)
// Test cri-dockerd installation
if !strings.Contains(out, "CRI_DOCKERD_VERSION=\"0.3.17\"") {
t.Errorf("template output missing cri-dockerd version: %s", out)
}
if !strings.Contains(out, "curl -L ${CRI_DOCKERD_URL} | sudo tar xzv -C /usr/local/bin --strip-components=1") {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

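Problem: This test asserts that the template contains the unverified `curl | sudo tar` command, which downloads an archive from the internet and extracts it with root privileges without verifying its integrity.

Suggested Change: Once the template verifies a checksum of the downloaded archive before extracting it, this assertion should expect that verified command instead.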

Severity (1 - 4): 4 - CRITICAL

Line: 64

Suggested change
if !strings.Contains(out, "curl -L ${CRI_DOCKERD_URL} | sudo tar xzv -C /usr/local/bin --strip-components=1") {
if !strings.Contains(out, "curl -L ${CRI_DOCKERD_URL} -o /tmp/cri-dockerd.tgz && echo \"${CRI_DOCKERD_SHA256} /tmp/cri-dockerd.tgz\" | sha256sum --check && sudo tar xzv -f /tmp/cri-dockerd.tgz -C /usr/local/bin --strip-components=1 && rm /tmp/cri-dockerd.tgz") {

Generated by Claude 3.5 Sonnet

Was this helpful? 👍 👎

t.Errorf("template output missing cri-dockerd installation command: %s", out)
}
if !strings.Contains(out, "systemctl enable cri-docker.service") {
t.Errorf("template output missing enable cri-docker service: %s", out)
}
if !strings.Contains(out, "systemctl enable cri-docker.socket") {
t.Errorf("template output missing enable cri-docker socket: %s", out)
}
if !strings.Contains(out, "systemctl start cri-docker.service") {
t.Errorf("template output missing start cri-docker service: %s", out)
}
}
3 changes: 0 additions & 3 deletions pkg/provisioner/templates/kernel.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,6 @@ if [ "${CURRENT_KERNEL}" != "${KERNEL_VERSION}" ]; then
# Run the reboot command with nohup to avoid abrupt SSH closure issues
nohup sudo reboot &
fi

# safely close the ssh connection
exit 0
{{- end }}
`

Expand Down
4 changes: 0 additions & 4 deletions pkg/provisioner/templates/kernel_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,10 +155,6 @@ func TestKernelTemplateContent(t *testing.T) {
name: "reboot command",
contains: "nohup sudo reboot",
},
{
name: "safe exit",
contains: "# safely close the ssh connection\nexit 0",
},
}

for _, tt := range tests {
Expand Down
25 changes: 12 additions & 13 deletions pkg/provisioner/templates/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ import (
)

const KubeadmTemplate = `

# Install kubeadm, kubectl, and k8s-cni
: ${K8S_VERSION:={{.Version}}}
: ${CNI_PLUGINS_VERSION:={{.CniPluginsVersion}}}
Expand Down Expand Up @@ -105,18 +104,25 @@ export KUBECONFIG="${HOME}/.kube/config"
# Install Calico
# based on https://docs.tigera.io/calico/latest/getting-started/kubernetes/quickstart
with_retry 5 10s kubectl --kubeconfig $KUBECONFIG create -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/tigera-operator.yaml
with_retry 5 10s kubectl --kubeconfig $KUBECONFIG create -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/custom-resources.yaml

# Wait for Tigera operator to be ready
with_retry 5 10s kubectl --kubeconfig $KUBECONFIG wait --for=condition=available --timeout=300s deployment/tigera-operator -n tigera-operator

# Wait for all necessary CRDs to be established
with_retry 5 10s kubectl --kubeconfig $KUBECONFIG wait --for=condition=established --timeout=300s crd/installations.operator.tigera.io
with_retry 5 10s kubectl --kubeconfig $KUBECONFIG wait --for=condition=established --timeout=300s crd/apiservers.operator.tigera.io
with_retry 5 10s kubectl --kubeconfig $KUBECONFIG wait --for=condition=established --timeout=300s crd/tigerastatuses.operator.tigera.io

# Apply custom resources with increased retry attempts
with_retry 10 15s kubectl --kubeconfig $KUBECONFIG apply -f https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/custom-resources.yaml

# Make single-node cluster schedulable
kubectl taint nodes --all node-role.kubernetes.io/control-plane:NoSchedule-
kubectl label node --all node-role.kubernetes.io/worker=
kubectl label node --all nvidia.com/holodeck.managed=true

# safely close the ssh connection
exit 0
`

const KindTemplate = `

: ${INSTANCE_ENDPOINT_HOST:={{.K8sEndpointHost}}}

KIND_CONFIG=""
Expand Down Expand Up @@ -155,13 +161,9 @@ with_retry 3 10s kind create cluster --name holodeck $KIND_CONFIG --kubeconfig="
echo "KIND installed successfully"
echo "you can now access the cluster with:"
echo "ssh -i <your-private-key> ubuntu@${INSTANCE_ENDPOINT_HOST}"

# safely close the ssh connection
exit 0
`

const microk8sTemplate = `

: ${INSTANCE_ENDPOINT_HOST:={{.K8sEndpointHost}}}

# Install microk8s
Expand All @@ -179,9 +181,6 @@ sudo snap alias microk8s.kubectl kubectl
echo "Microk8s {{.Version}} installed successfully"
echo "you can now access the cluster with:"
echo "ssh -i <your-private-key> ubuntu@${INSTANCE_ENDPOINT_HOST}"

# safely close the ssh connection
exit 0
`

const kubeadmTemplate = `apiVersion: kubeadm.k8s.io/v1beta4
Expand Down
4 changes: 0 additions & 4 deletions pkg/provisioner/templates/kubernetes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -286,10 +286,6 @@ func TestKubernetes_Execute(t *testing.T) {
if tt.checkTemplate {
assert.Contains(t, out, tt.expectedString)
}

if tt.checkSafeExit {
assert.Contains(t, out, "exit 0", "template output missing safe exit")
}
})
}
}
Expand Down
5 changes: 1 addition & 4 deletions pkg/provisioner/templates/nv-driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_
sudo dpkg -i cuda-keyring_1.1-1_all.deb

with_retry 3 10s sudo apt-get update
install_packages_with_retry nvidia-driver{{if .Version}}={{.Version}}{{else if .Branch}}-{{.Branch}}{{end}}
install_packages_with_retry cuda-drivers{{if .Version}}={{.Version}}{{else if .Branch}}-{{.Branch}}{{end}}

# Check if NVIDIA module is loaded, if not load it
if ! lsmod | grep -q "^nvidia "; then
Expand All @@ -46,9 +46,6 @@ sudo nvidia-persistenced --persistence-mode

# Quick check to see if the driver is installed
nvidia-smi

# safely close the ssh connection
exit 0
`

type NvDriver v1alpha1.NVIDIADriver
Expand Down
18 changes: 3 additions & 15 deletions pkg/provisioner/templates/nv-driver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_
sudo dpkg -i cuda-keyring_1.1-1_all.deb

with_retry 3 10s sudo apt-get update
install_packages_with_retry nvidia-driver=123.4.5
install_packages_with_retry cuda-drivers=123.4.5

# Check if NVIDIA module is loaded, if not load it
if ! lsmod | grep -q "^nvidia "; then
Expand All @@ -60,9 +60,6 @@ sudo nvidia-persistenced --persistence-mode

# Quick check to see if the driver is installed
nvidia-smi

# safely close the ssh connection
exit 0
`,
},
{
Expand All @@ -79,7 +76,7 @@ wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_
sudo dpkg -i cuda-keyring_1.1-1_all.deb

with_retry 3 10s sudo apt-get update
install_packages_with_retry nvidia-driver-550
install_packages_with_retry cuda-drivers-550

# Check if NVIDIA module is loaded, if not load it
if ! lsmod | grep -q "^nvidia "; then
Expand All @@ -91,9 +88,6 @@ sudo nvidia-persistenced --persistence-mode

# Quick check to see if the driver is installed
nvidia-smi

# safely close the ssh connection
exit 0
`,
},
{
Expand All @@ -111,7 +105,7 @@ wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_
sudo dpkg -i cuda-keyring_1.1-1_all.deb

with_retry 3 10s sudo apt-get update
install_packages_with_retry nvidia-driver=123.4.5
install_packages_with_retry cuda-drivers=123.4.5

# Check if NVIDIA module is loaded, if not load it
if ! lsmod | grep -q "^nvidia "; then
Expand All @@ -123,9 +117,6 @@ sudo nvidia-persistenced --persistence-mode

# Quick check to see if the driver is installed
nvidia-smi

# safely close the ssh connection
exit 0
`,
},
}
Expand All @@ -140,9 +131,6 @@ exit 0

// Compare trimmed strings to avoid whitespace issues
require.EqualValues(t, strings.TrimSpace(tc.expectedOutput), strings.TrimSpace(output.String()))

// Test safe exit
require.Contains(t, output.String(), "exit 0", "template output missing safe exit")
})

}
Expand Down
Loading
Loading