From 1f20b93d227897034f5c11daebc56cdbc006591a Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Sun, 15 Feb 2026 11:30:40 +0100 Subject: [PATCH 1/3] test(e2e): add ARM64 GPU end-to-end test on merge to main Add an E2E test that exercises the full GPU stack (driver, CTK, Docker, Kubernetes) on an ARM64 g5g.xlarge instance (Graviton2 + T4g GPU). The test intentionally omits image.architecture to validate that the architecture inference from instance type (added in #669) works end-to-end in production. The g5g instance type is arm64-only, so holodeck must infer arm64 and resolve the correct AMI automatically. This test only runs on merge to main (not on PRs) since g5g instances are more expensive than the standard x86_64 test fleet. The periodic cleanup workflow already covers us-east-1 where g5g is available. Changes: - tests/data/test_aws_arm64.yml: g5g.xlarge config, no explicit arch - tests/aws_test.go: new "arm64" labeled test entry - .github/workflows/e2e.yaml: e2e-test-arm64 job gated on main Signed-off-by: Carlos Eduardo Arango Gutierrez --- .github/workflows/e2e.yaml | 34 ++++++++++++++++++++++++++++++++++ tests/aws_test.go | 5 +++++ tests/data/test_aws_arm64.yml | 24 ++++++++++++++++++++++++ 3 files changed, 63 insertions(+) create mode 100644 tests/data/test_aws_arm64.yml diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 2433bef2b..12881191c 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -72,6 +72,40 @@ jobs: path: ginkgo.json retention-days: 15 + # ARM64 GPU E2E test — runs only on merge to main (g5g instances are expensive) + e2e-test-arm64: + runs-on: linux-amd64-cpu4 + if: github.ref == 'refs/heads/main' + name: E2E Test (arm64) + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Install Go + uses: actions/setup-go@v6 + with: + go-version: 'stable' + check-latest: true + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y make + + - name: Run ARM64 GPU e2e test + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }} + LOG_ARTIFACT_DIR: e2e_logs + run: | + e2e_ssh_key=$(mktemp) + echo "${{ secrets.AWS_SSH_KEY }}" > "$e2e_ssh_key" + chmod 600 "$e2e_ssh_key" + export E2E_SSH_KEY="$e2e_ssh_key" + make -f tests/Makefile test GINKGO_ARGS="--label-filter='arm64'" + integration-test: runs-on: linux-amd64-cpu4 if: ${{ github.event.workflow_run.conclusion == 'success' }} && ${{ github.event.workflow_run.event == 'push' }} diff --git a/tests/aws_test.go b/tests/aws_test.go index 90f499476..a0716b29b 100644 --- a/tests/aws_test.go +++ b/tests/aws_test.go @@ -210,6 +210,11 @@ var _ = DescribeTable("AWS Environment E2E", filePath: filepath.Join(packagePath, "data", "test_aws_k8s_latest.yml"), description: "Tests AWS environment with Kubernetes tracking master branch", }, Label("k8s-latest")), + Entry("ARM64 GPU Test", testConfig{ + name: "ARM64 GPU Test", + filePath: filepath.Join(packagePath, "data", "test_aws_arm64.yml"), + description: "Tests full GPU stack on ARM64 (g5g Graviton) with architecture inferred from instance type", + }, Label("arm64")), ) // Note: To run tests in parallel, use: ginkgo -p or --procs=N diff --git a/tests/data/test_aws_arm64.yml b/tests/data/test_aws_arm64.yml new file mode 100644 index 000000000..f207b2c43 --- /dev/null +++ b/tests/data/test_aws_arm64.yml @@ -0,0 +1,24 @@ +apiVersion: holodeck.nvidia.com/v1alpha1 +kind: Environment +metadata: + name: holodeck-aws-e2e-test-arm64 + description: "end-to-end test infrastructure for ARM64 (Graviton + GPU)" +spec: + provider: aws + auth: + keyName: cnt-ci + privateKey: /home/runner/.cache/key + instance: + type: g5g.xlarge + region: us-east-1 + # architecture intentionally omitted to exercise inference from instance type + containerRuntime: + install: true + name: docker + nvidiaContainerToolkit: + install: true + nvidiaDriver: + install: true + kubernetes: + install: true + installer: kubeadm From bcf592c17060a7ff6934b60dd97ad7cfa09e64e7 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Sun, 15 Feb 2026 13:38:23 +0100 Subject: [PATCH 2/3] fix(docker): detect host architecture for cri-dockerd download The Docker package-install template hardcoded CRI_DOCKERD_ARCH="amd64", causing an x86_64 binary to be downloaded on arm64 hosts. This results in "Exec format error" when systemd tries to start cri-docker.service. Replace the hardcoded value with runtime detection using uname -m, the same pattern already used by the git-source install path in the same template and by all other templates (containerd, kubernetes, CRI-O). Validated manually: full ARM64 stack (g5g.xlarge, NVIDIA T4G driver 575.57.08, Docker 29.2.1, cri-dockerd arm64 binary, CTK 1.18.2, Kubernetes v1.33.3) provisioned successfully. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/templates/docker.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/provisioner/templates/docker.go b/pkg/provisioner/templates/docker.go index 7b869ba9d..e992e6a4d 100644 --- a/pkg/provisioner/templates/docker.go +++ b/pkg/provisioner/templates/docker.go @@ -149,7 +149,12 @@ holodeck_progress "$COMPONENT" 5 6 "Installing cri-dockerd" # Install cri-dockerd (idempotent) CRI_DOCKERD_VERSION="0.3.17" -CRI_DOCKERD_ARCH="amd64" +CRI_DOCKERD_ARCH="$(uname -m)" +case "${CRI_DOCKERD_ARCH}" in + x86_64|amd64) CRI_DOCKERD_ARCH="amd64" ;; + aarch64|arm64) CRI_DOCKERD_ARCH="arm64" ;; + *) holodeck_log "ERROR" "$COMPONENT" "Unsupported arch for cri-dockerd: ${CRI_DOCKERD_ARCH}"; exit 1 ;; +esac if [[ ! -f /usr/local/bin/cri-dockerd ]]; then CRI_DOCKERD_URL="https://github.com/Mirantis/cri-dockerd/releases/download/v${CRI_DOCKERD_VERSION}/cri-dockerd-${CRI_DOCKERD_VERSION}.${CRI_DOCKERD_ARCH}.tgz" From 4f44eed99f8aed9f24f2d5fcffda05568cb7ede7 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Sun, 15 Feb 2026 14:16:50 +0100 Subject: [PATCH 3/3] fix(crio): migrate to new CRI-O package repository and default version CRI-O migrated from pkgs.k8s.io/addons:/cri-o to the independent download.opensuse.org/repositories/isv:/cri-o repository. The old URL returns 403, breaking all CRI-O installations. Additionally, when no version is specified, the template produced a malformed URL with an empty version component. Now defaults to v1.33 and normalizes the version to vX.Y format. Reference: https://github.com/cri-o/packaging#readme Signed-off-by: Eduardo Arango Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/templates/crio.go | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/pkg/provisioner/templates/crio.go b/pkg/provisioner/templates/crio.go index 4923eedb6..1626bc963 100644 --- a/pkg/provisioner/templates/crio.go +++ b/pkg/provisioner/templates/crio.go @@ -59,18 +59,32 @@ holodeck_progress "$COMPONENT" 2 4 "Adding CRI-O repository" CRIO_VERSION="${DESIRED_VERSION}" +# Default to latest stable CRI-O if no version specified +if [[ -z "$CRIO_VERSION" ]]; then + CRIO_VERSION="v1.33" + holodeck_log "INFO" "$COMPONENT" "No version specified, defaulting to ${CRIO_VERSION}" +fi + +# Ensure version starts with 'v' and is in vX.Y format (strip patch if present) +CRIO_VERSION="${CRIO_VERSION#v}" +CRIO_VERSION="v$(echo "$CRIO_VERSION" | cut -d. -f1,2)" + +# CRI-O migrated from pkgs.k8s.io to download.opensuse.org +# See: https://github.com/cri-o/packaging#readme +CRIO_REPO_URL="https://download.opensuse.org/repositories/isv:/cri-o:/stable:/${CRIO_VERSION}" + # Add CRI-O repo (idempotent) if [[ ! -f /etc/apt/keyrings/cri-o-apt-keyring.gpg ]]; then sudo mkdir -p /etc/apt/keyrings holodeck_retry 3 "$COMPONENT" curl -fsSL \ - "https://pkgs.k8s.io/addons:/cri-o:/stable:/${CRIO_VERSION}/deb/Release.key" | \ + "${CRIO_REPO_URL}/deb/Release.key" | \ sudo gpg --dearmor -o /etc/apt/keyrings/cri-o-apt-keyring.gpg else holodeck_log "INFO" "$COMPONENT" "CRI-O GPG key already present" fi if [[ ! -f /etc/apt/sources.list.d/cri-o.list ]]; then - echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] https://pkgs.k8s.io/addons:/cri-o:/stable:/${CRIO_VERSION}/deb/ /" | \ + echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] ${CRIO_REPO_URL}/deb/ /" | \ sudo tee /etc/apt/sources.list.d/cri-o.list > /dev/null else holodeck_log "INFO" "$COMPONENT" "CRI-O repository already configured"