diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
index 2433bef2b..12881191c 100644
--- a/.github/workflows/e2e.yaml
+++ b/.github/workflows/e2e.yaml
@@ -72,6 +72,41 @@ jobs:
           path: ginkgo.json
           retention-days: 15
 
+  # ARM64 GPU E2E test — runs only on merge to main (g5g instances are expensive)
+  e2e-test-arm64:
+    runs-on: linux-amd64-cpu4
+    if: github.ref == 'refs/heads/main'
+    name: E2E Test (arm64)
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+
+      - name: Install Go
+        uses: actions/setup-go@v6
+        with:
+          go-version: 'stable'
+          check-latest: true
+
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y make
+
+      - name: Run ARM64 GPU e2e test
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
+          LOG_ARTIFACT_DIR: e2e_logs
+        run: |
+          e2e_ssh_key=$(mktemp)
+          # Read the key from the env var above; never expand the secret into the script text
+          printf '%s\n' "$AWS_SSH_KEY" > "$e2e_ssh_key"
+          chmod 600 "$e2e_ssh_key"
+          export E2E_SSH_KEY="$e2e_ssh_key"
+          make -f tests/Makefile test GINKGO_ARGS="--label-filter='arm64'"
+
   integration-test:
     runs-on: linux-amd64-cpu4
     if: ${{ github.event.workflow_run.conclusion == 'success' }} && ${{ github.event.workflow_run.event == 'push' }}
diff --git a/pkg/provisioner/templates/crio.go b/pkg/provisioner/templates/crio.go
index 4923eedb6..1626bc963 100644
--- a/pkg/provisioner/templates/crio.go
+++ b/pkg/provisioner/templates/crio.go
@@ -59,18 +59,32 @@ holodeck_progress "$COMPONENT" 2 4 "Adding CRI-O repository"
 
 CRIO_VERSION="${DESIRED_VERSION}"
 
+# Default to latest stable CRI-O if no version specified
+if [[ -z "$CRIO_VERSION" ]]; then
+    CRIO_VERSION="v1.33"
+    holodeck_log "INFO" "$COMPONENT" "No version specified, defaulting to ${CRIO_VERSION}"
+fi
+
+# Ensure version starts with 'v' and is in vX.Y format (strip patch if present)
+CRIO_VERSION="${CRIO_VERSION#v}"
+CRIO_VERSION="v$(echo "$CRIO_VERSION" | cut -d. -f1,2)"
+
+# CRI-O migrated from pkgs.k8s.io to download.opensuse.org
+# See: https://github.com/cri-o/packaging#readme
+CRIO_REPO_URL="https://download.opensuse.org/repositories/isv:/cri-o:/stable:/${CRIO_VERSION}"
+
 # Add CRI-O repo (idempotent)
 if [[ ! -f /etc/apt/keyrings/cri-o-apt-keyring.gpg ]]; then
     sudo mkdir -p /etc/apt/keyrings
     holodeck_retry 3 "$COMPONENT" curl -fsSL \
-        "https://pkgs.k8s.io/addons:/cri-o:/stable:/${CRIO_VERSION}/deb/Release.key" | \
+        "${CRIO_REPO_URL}/deb/Release.key" | \
         sudo gpg --dearmor -o /etc/apt/keyrings/cri-o-apt-keyring.gpg
 else
     holodeck_log "INFO" "$COMPONENT" "CRI-O GPG key already present"
 fi
 
 if [[ ! -f /etc/apt/sources.list.d/cri-o.list ]]; then
-    echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] https://pkgs.k8s.io/addons:/cri-o:/stable:/${CRIO_VERSION}/deb/ /" | \
+    echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] ${CRIO_REPO_URL}/deb/ /" | \
         sudo tee /etc/apt/sources.list.d/cri-o.list > /dev/null
 else
     holodeck_log "INFO" "$COMPONENT" "CRI-O repository already configured"
diff --git a/pkg/provisioner/templates/docker.go b/pkg/provisioner/templates/docker.go
index 7b869ba9d..e992e6a4d 100644
--- a/pkg/provisioner/templates/docker.go
+++ b/pkg/provisioner/templates/docker.go
@@ -149,7 +149,12 @@ holodeck_progress "$COMPONENT" 5 6 "Installing cri-dockerd"
 
 # Install cri-dockerd (idempotent)
 CRI_DOCKERD_VERSION="0.3.17"
-CRI_DOCKERD_ARCH="amd64"
+CRI_DOCKERD_ARCH="$(uname -m)"
+case "${CRI_DOCKERD_ARCH}" in
+    x86_64|amd64) CRI_DOCKERD_ARCH="amd64" ;;
+    aarch64|arm64) CRI_DOCKERD_ARCH="arm64" ;;
+    *) holodeck_log "ERROR" "$COMPONENT" "Unsupported arch for cri-dockerd: ${CRI_DOCKERD_ARCH}"; exit 1 ;;
+esac
 
 if [[ ! -f /usr/local/bin/cri-dockerd ]]; then
     CRI_DOCKERD_URL="https://github.com/Mirantis/cri-dockerd/releases/download/v${CRI_DOCKERD_VERSION}/cri-dockerd-${CRI_DOCKERD_VERSION}.${CRI_DOCKERD_ARCH}.tgz"
diff --git a/tests/aws_test.go b/tests/aws_test.go
index 90f499476..a0716b29b 100644
--- a/tests/aws_test.go
+++ b/tests/aws_test.go
@@ -210,6 +210,11 @@ var _ = DescribeTable("AWS Environment E2E",
 		filePath:    filepath.Join(packagePath, "data", "test_aws_k8s_latest.yml"),
 		description: "Tests AWS environment with Kubernetes tracking master branch",
 	}, Label("k8s-latest")),
+	Entry("ARM64 GPU Test", testConfig{
+		name:        "ARM64 GPU Test",
+		filePath:    filepath.Join(packagePath, "data", "test_aws_arm64.yml"),
+		description: "Tests full GPU stack on ARM64 (g5g Graviton) with architecture inferred from instance type",
+	}, Label("arm64")),
 )
 
 // Note: To run tests in parallel, use: ginkgo -p or --procs=N
diff --git a/tests/data/test_aws_arm64.yml b/tests/data/test_aws_arm64.yml
new file mode 100644
index 000000000..f207b2c43
--- /dev/null
+++ b/tests/data/test_aws_arm64.yml
@@ -0,0 +1,24 @@
+apiVersion: holodeck.nvidia.com/v1alpha1
+kind: Environment
+metadata:
+  name: holodeck-aws-e2e-test-arm64
+  description: "end-to-end test infrastructure for ARM64 (Graviton + GPU)"
+spec:
+  provider: aws
+  auth:
+    keyName: cnt-ci
+    privateKey: /home/runner/.cache/key
+  instance:
+    type: g5g.xlarge
+    region: us-east-1
+    # architecture intentionally omitted to exercise inference from instance type
+  containerRuntime:
+    install: true
+    name: docker
+  nvidiaContainerToolkit:
+    install: true
+  nvidiaDriver:
+    install: true
+  kubernetes:
+    install: true
+    installer: kubeadm