From 45292b3865773ea1cab5c75ab38935f8472e9864 Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Mon, 18 Aug 2025 12:48:06 -0400
Subject: [PATCH] Add conda CUDA 13 workflows that projects can opt into.

We need to roll out CUDA 13 on a per-project basis, so the workflows
are opt-in for the time being.
---
 .github/workflows/conda-cpp-build-with13.yaml | 209 +++++++++++++++
 .github/workflows/conda-cpp-tests-with13.yaml | 237 ++++++++++++++++++
 2 files changed, 446 insertions(+)
 create mode 100644 .github/workflows/conda-cpp-build-with13.yaml
 create mode 100644 .github/workflows/conda-cpp-tests-with13.yaml

diff --git a/.github/workflows/conda-cpp-build-with13.yaml b/.github/workflows/conda-cpp-build-with13.yaml
new file mode 100644
index 00000000..6a3afdc2
--- /dev/null
+++ b/.github/workflows/conda-cpp-build-with13.yaml
@@ -0,0 +1,209 @@
+on:
+  workflow_call:
+    inputs:
+      build_type:
+        description: "One of: [branch, nightly, pull-request]"
+        required: true
+        type: string
+      branch:
+        description: |
+          Git branch the workflow run targets.
+          This is required even when 'sha' is provided because it is also used for organizing artifacts.
+        type: string
+      date:
+        description: "Date (YYYY-MM-DD) this run is for. Used to organize artifacts produced by nightly builds"
+        type: string
+      sha:
+        description: "Full git commit SHA to check out"
+        type: string
+      repo:
+        description: "Git repo to check out, in '{org}/{repo}' form, e.g. 'rapidsai/cudf'"
+        type: string
+      node_type:
+        description: |
+          Suffix, without leading '-', indicating the type of machine to run jobs on (e.g., 'cpu4' or 'gpu-l4-latest-1').
+          Runner labels are of the form '{operating_system}-{arch}-{node_type}'.
+          See https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md for a list
+          of valid values.
+        type: string
+        default: "cpu8"
+      script:
+        type: string
+        required: true
+        description: "Shell code to be executed in a step. Ideally this should just invoke a script managed in the repo the workflow runs from, like 'ci/build_cpp.sh'."
+      upload-artifacts:
+        type: boolean
+        default: true
+        required: false
+        description: "One of [true, false], true if artifacts should be uploaded to GitHub's artifact store"
+      matrix_filter:
+        description: |
+          jq expression which modifies the matrix.
+          For example, 'map(select(.ARCH == "amd64"))' to achieve "only run amd64 jobs".
+        type: string
+        default: "."
+      alternative-gh-token-secret-name:
+        type: string
+        required: false
+        description: |
+          If provided, should contain the name of a secret in the repo which holds a GitHub API token.
+          When this is non-empty, that secret's value is used in place of the default repo-level token
+          anywhere that environment variable GH_TOKEN is set. This is especially useful for downloading
+          artifacts from other private repos, which repo tokens do not have access to.
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  actions: read
+  checks: none
+  contents: read
+  deployments: none
+  discussions: none
+  id-token: write
+  issues: none
+  packages: read
+  pages: none
+  pull-requests: read
+  repository-projects: none
+  security-events: none
+  statuses: none
+
+jobs:
+  compute-matrix:  # emits the build matrix as JSON; the 'build' job reads it via fromJSON()
+    runs-on: ubuntu-latest
+    outputs:
+      MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }}
+    steps:
+      - name: Compute Build Matrix
+        id: compute-matrix
+        env:
+          MATRIX_FILTER: ${{ inputs.matrix_filter }}
+        run: |
+          set -eo pipefail
+
+          # please keep the matrices sorted in ascending order by the following:
+          #
+          #     [ARCH, PY_VER, CUDA_VER, LINUX_VER]
+          #
+          # MATRIX below is YAML text: yq converts it to JSON, jq applies MATRIX_FILTER and aborts on an empty result.
+          export MATRIX="
+          # amd64
+          - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.9.1', LINUX_VER: 'rockylinux8' }
+          - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.0.0', LINUX_VER: 'rockylinux8' }
+          # arm64
+          - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.9.1', LINUX_VER: 'rockylinux8' }
+          - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.0.0', LINUX_VER: 'rockylinux8' }
+          "
+          MATRIX="$(
+            yq -n -o json 'env(MATRIX)' | \
+            jq -c "${MATRIX_FILTER} | if (. | length) > 0 then {include: .} else \"Error: Empty matrix\n\" | halt_error(1) end"
+          )"
+
+          echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"
+  build:  # one job per matrix entry; runs inputs.script inside the rapidsai/ci-conda container
+    name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, ${{ matrix.ARCH }}, ${{ matrix.LINUX_VER }}
+    needs: compute-matrix
+    timeout-minutes: 480  # 8 hours
+    strategy:
+      fail-fast: false  # a failing matrix entry does not cancel the others
+      matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
+    runs-on: "linux-${{ matrix.ARCH }}-${{ inputs.node_type }}"
+    env:
+      RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts
+    container:
+      image: rapidsai/ci-conda:25.10-cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }}
+      env:
+        RAPIDS_BUILD_TYPE: ${{ inputs.build_type }}
+    steps:
+      - uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
+        with:
+          role-to-assume: ${{ vars.AWS_ROLE_ARN }}
+          aws-region: ${{ vars.AWS_REGION }}
+          role-duration-seconds: 43200 # 12h
+      - uses: actions/checkout@v4
+        with:
+          repository: ${{ inputs.repo }}
+          ref: ${{ inputs.sha }}
+          fetch-depth: 0  # 0 = fetch full history rather than a shallow clone
+          persist-credentials: true
+      - name: Standardize repository information
+        env:
+          RAPIDS_REPOSITORY: ${{ inputs.repo || github.repository }}
+          RAPIDS_REF_NAME: ${{ inputs.branch || github.ref_name }}
+          RAPIDS_NIGHTLY_DATE: ${{ inputs.date }}
+        run: |
+          {
+            echo "RAPIDS_REPOSITORY=${RAPIDS_REPOSITORY}"
+            echo "RAPIDS_SHA=$(git rev-parse HEAD)"
+            echo "RAPIDS_REF_NAME=${RAPIDS_REF_NAME}"
+            echo "RAPIDS_NIGHTLY_DATE=${RAPIDS_NIGHTLY_DATE}"
+          } >> "${GITHUB_ENV}"
+
+      - name: Setup proxy cache
+        uses: nv-gha-runners/setup-proxy-cache@main
+        continue-on-error: true
+
+      # This has to be AFTER the checkout step. It creates a telemetry-artifacts directory,
+      # and the checkout step would destroy it.
+      - name: Telemetry setup
+        uses: rapidsai/shared-actions/telemetry-dispatch-setup@main
+        continue-on-error: true
+        if: ${{ vars.TELEMETRY_ENABLED == 'true' }}
+        env:
+          # DOES NOT NEED alternative-gh-token-secret-name - github.token is enough and more limited
+          GH_TOKEN: ${{ github.token }}
+        with:
+          extra_attributes: "rapids.PACKAGER=conda,rapids.CUDA_VER=${{ matrix.CUDA_VER }},rapids.PY_VER=${{ matrix.PY_VER }},rapids.ARCH=${{ matrix.ARCH }},rapids.LINUX_VER=${{ matrix.LINUX_VER }}"
+
+
+      # Per the docs at https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user,
+      # checking '/rate_limit | jq .' should not itself count against any rate limits.
+      #
+      # gh CLI is pre-installed on Github-hosted runners, but may not be on self-hosted runners.
+      - name: Check GitHub API rate limits
+        run: |
+          if ! type gh >/dev/null; then
+            echo "'gh' CLI is not installed... skipping rate-limits check"
+          else
+            gh api /rate_limit | jq .
+          fi
+        env:
+          # NEEDS alternative-gh-token-secret-name - API limits need to be for whatever token is used for upload/download. Repo token may be a different pool for rate limits.
+          GH_TOKEN: ${{ inputs.alternative-gh-token-secret-name && secrets[inputs.alternative-gh-token-secret-name] || github.token }} # zizmor: ignore[overprovisioned-secrets]
+      - name: C++ build
+        run: ${{ inputs.script }} # zizmor: ignore[template-injection]
+        env:
+          STEP_NAME: "C++ build"
+          # NEEDS alternative-gh-token-secret-name - may require a token with more permissions
+          GH_TOKEN: ${{ inputs.alternative-gh-token-secret-name && secrets[inputs.alternative-gh-token-secret-name] || github.token }} # zizmor: ignore[overprovisioned-secrets]
+      - name: Get Package Name and Location
+        if: ${{ inputs.upload-artifacts }}
+        run: |
+          echo "RAPIDS_PACKAGE_NAME=$(RAPIDS_NO_PKG_EXTENSION=true rapids-package-name conda_cpp)" >> "${GITHUB_OUTPUT}"
+          echo "CONDA_OUTPUT_DIR=${RAPIDS_CONDA_BLD_OUTPUT_DIR}" >> "${GITHUB_OUTPUT}"
+        id: package-name  # later steps read these values as steps.package-name.outputs.*
+      - name: Show files to be uploaded
+        if: ${{ inputs.upload-artifacts }}
+        env:
+          CONDA_OUTPUT_DIR: ${{ steps.package-name.outputs.CONDA_OUTPUT_DIR }}
+        run: |
+          echo "Contents of directory to be uploaded:"
+          ls -R "${CONDA_OUTPUT_DIR}"
+      - uses: actions/upload-artifact@v4
+        if: ${{ inputs.upload-artifacts }}
+        with:
+          if-no-files-found: 'error'
+          name: ${{ steps.package-name.outputs.RAPIDS_PACKAGE_NAME }}
+          path: ${{ steps.package-name.outputs.CONDA_OUTPUT_DIR }}
+      - name: Upload additional artifacts
+        if: "!cancelled()"  # quoted because a plain YAML scalar cannot begin with '!'
+        run: rapids-upload-artifacts-dir "cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch)"
+      - name: Telemetry upload attributes
+        uses: rapidsai/shared-actions/telemetry-dispatch-stash-job-artifacts@main
+        continue-on-error: true
+        if: ${{ vars.TELEMETRY_ENABLED == 'true' }}
+        env:
+          # DOES NOT NEED alternative-gh-token-secret-name - github.token is enough and more limited
+          GH_TOKEN: ${{ github.token }}
diff --git a/.github/workflows/conda-cpp-tests-with13.yaml b/.github/workflows/conda-cpp-tests-with13.yaml
new file mode 100644
index 00000000..5b1efe81
--- /dev/null
+++ b/.github/workflows/conda-cpp-tests-with13.yaml
@@ -0,0 +1,237 @@
+on:
+  workflow_call:
+    inputs:
+      build_type:
+        description: "One of: [branch, nightly, pull-request]"
+        required: true
+        type: string
+      matrix_type:
+        description: "One of: [auto, nightly, pull-request]. 'auto' means 'choose a value based on what's provided via build_type'."
+        required: false
+        type: string
+        default: "auto"
+      branch:
+        description: |
+          Git branch the workflow run targets.
+          This is required even when 'sha' is provided because it is also used for organizing artifacts.
+        type: string
+      date:
+        description: "Date (YYYY-MM-DD) this run is for. Used to organize artifacts produced by nightly builds"
+        type: string
+      sha:
+        description: "Full git commit SHA to check out"
+        type: string
+      repo:
+        description: "Git repo to check out, in '{org}/{repo}' form, e.g. 'rapidsai/cudf'"
+        type: string
+      script:
+        type: string
+        required: true
+        description: "Shell code to be executed in a step. Ideally this should just invoke a script managed in the repo the workflow runs from, like 'ci/test_cpp.sh'."
+      matrix_filter:
+        description: |
+          jq expression which modifies the matrix.
+          For example, 'map(select(.ARCH == "amd64"))' to achieve "only run amd64 jobs".
+        type: string
+        default: "."
+      container-options:
+        description: |
+          Command-line arguments passed to 'docker run' when starting the container this workflow runs in.
+          This should be provided as a single string to be inlined into 'docker run', not an array.
+          For example, '--quiet --ulimit nofile=2048'.
+        required: false
+        type: string
+        default: "-e _NOOP"  # NOTE(review): presumably a harmless placeholder so 'options' is never empty - confirm
+      build_workflow_name:
+        description: |
+          Name of a workflow file that produced artifacts to be downloaded in this run.
+          If not set (the default), artifact-handling scripts use RAPIDS-conventional defaults (like "build.yaml" when "build_type == nightly").
+        required: false
+        type: string
+      alternative-gh-token-secret-name:
+        type: string
+        required: false
+        description: |
+          If provided, should contain the name of a secret in the repo which holds a GitHub API token.
+          When this is non-empty, that secret's value is used in place of the default repo-level token
+          anywhere that environment variable GH_TOKEN is set. This is especially useful for downloading
+          artifacts from other private repos, which repo tokens do not have access to.
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  actions: read
+  checks: none
+  contents: read
+  deployments: none
+  discussions: none
+  id-token: write
+  issues: none
+  packages: read
+  pages: none
+  pull-requests: read
+  repository-projects: none
+  security-events: none
+  statuses: none
+
+jobs:
+  compute-matrix:  # validates build_type/matrix_type, then emits the test matrix as JSON for the 'tests' job
+    runs-on: ubuntu-latest
+    env:
+      BUILD_TYPE: ${{ inputs.build_type }}
+      MATRIX_TYPE: ${{ inputs.matrix_type }}
+    outputs:
+      MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }}
+    steps:
+      - name: Validate Inputs
+        run: |
+          if [[ "$BUILD_TYPE" != "branch" ]] && [[ "$BUILD_TYPE" != "nightly" ]] && [[ "$BUILD_TYPE" != "pull-request" ]]; then
+              echo "Invalid build_type! Must be one of 'branch', 'nightly', or 'pull-request'."
+              exit 1
+          fi
+          if [[ "$MATRIX_TYPE" != "auto" ]] && [[ "$MATRIX_TYPE" != "nightly" ]] && [[ "$MATRIX_TYPE" != "pull-request" ]]; then
+              echo "Invalid matrix_type! Must be one of 'auto', 'nightly', or 'pull-request'."
+              exit 1
+          fi
+      - name: Compute C++ Test Matrix
+        id: compute-matrix
+        env:
+          MATRIX_FILTER: ${{ inputs.matrix_filter }}
+        run: |
+          set -eo pipefail
+
+          # please keep the matrices sorted in ascending order by the following:
+          #
+          #     [ARCH, PY_VER, CUDA_VER, LINUX_VER, GPU, DRIVER, DEPENDENCIES]
+          # NOTE(review): CUDA 13.0.0 is listed only under 'pull-request'; 'nightly' (which 'auto' also selects for branch builds) has none - confirm this is intended.
+          export MATRICES="
+            pull-request:
+              # amd64
+              - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.2.2', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'oldest' }
+              - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LINUX_VER: 'ubuntu24.04', GPU: 'h100', DRIVER: 'latest', DEPENDENCIES: 'latest' }
+              - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.0', LINUX_VER: 'ubuntu24.04', GPU: 'h100', DRIVER: 'latest', DEPENDENCIES: 'latest' }
+              # arm64
+              - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.0.1', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' }
+              - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.0', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' }
+            nightly:
+              # amd64
+              - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.0.1', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'oldest' }
+              - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.2.2', LINUX_VER: 'ubuntu22.04', GPU: 'h100', DRIVER: 'latest', DEPENDENCIES: 'latest' }
+              - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.0.1', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' }
+              - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LINUX_VER: 'ubuntu24.04', GPU: 'h100', DRIVER: 'latest', DEPENDENCIES: 'latest' }
+              # arm64
+              - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.2.2', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'oldest' }
+              - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.2.2', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' }
+              - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.9.1', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' }
+          "
+
+          # only overwrite MATRIX_TYPE if it was set to 'auto'
+          if [[ "${MATRIX_TYPE}" == "auto" ]]; then
+            if [[ "${BUILD_TYPE}" == "branch" ]]; then
+              # Use the nightly matrix for branch tests
+              MATRIX_TYPE="nightly"
+            else
+              MATRIX_TYPE="${BUILD_TYPE}"
+            fi
+          fi
+          export MATRIX_TYPE
+          TEST_MATRIX=$(yq -n 'env(MATRICES) | .[strenv(MATRIX_TYPE)]')
+          export TEST_MATRIX
+
+          MATRIX="$(
+            yq -n -o json 'env(TEST_MATRIX)' | \
+            jq -c "${MATRIX_FILTER} | if (. | length) > 0 then {include: .} else \"Error: Empty matrix\n\" | halt_error(1) end"
+          )"
+
+          echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"
+  tests:  # NOTE(review): no timeout-minutes is set here (the build workflow in this patch sets 480) - confirm the default is acceptable
+    name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, ${{ matrix.ARCH }}, ${{ matrix.LINUX_VER }}, ${{ matrix.GPU }}, ${{ matrix.DRIVER }}-driver, ${{ matrix.DEPENDENCIES }}-deps
+    needs: compute-matrix
+    strategy:
+      fail-fast: false  # a failing matrix entry does not cancel the others
+      matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
+    runs-on: "linux-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-1"
+    env:
+      RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts
+      RAPIDS_DEPENDENCIES: ${{ matrix.DEPENDENCIES }}
+      RAPIDS_TESTS_DIR: ${{ github.workspace }}/test-results
+    container:
+      image: rapidsai/ci-conda:25.10-cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }}
+      options: ${{ inputs.container-options }}
+      env:
+        RAPIDS_BUILD_TYPE: ${{ inputs.build_type }}
+        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
+    steps:
+      - uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
+        with:
+          role-to-assume: ${{ vars.AWS_ROLE_ARN }}
+          aws-region: ${{ vars.AWS_REGION }}
+          role-duration-seconds: 43200 # 12h
+
+      - uses: actions/checkout@v4
+        with:
+          repository: ${{ inputs.repo }}
+          ref: ${{ inputs.sha }}
+          fetch-depth: 0  # 0 = fetch full history rather than a shallow clone
+          persist-credentials: true
+      # This has to be AFTER the checkout step. It creates a telemetry-artifacts directory,
+      # and the checkout step would destroy it.
+      - name: Telemetry setup
+        uses: rapidsai/shared-actions/telemetry-dispatch-setup@main
+        continue-on-error: true
+        if: ${{ vars.TELEMETRY_ENABLED == 'true' }}
+        with:
+          extra_attributes: "rapids.PACKAGER=conda,rapids.CUDA_VER=${{ matrix.CUDA_VER }},rapids.PY_VER=${{ matrix.PY_VER }},rapids.ARCH=${{ matrix.ARCH }},rapids.LINUX_VER=${{ matrix.LINUX_VER }},rapids.GPU=${{ matrix.GPU }},rapids.DRIVER=${{ matrix.DRIVER }},rapids.DEPENDENCIES=${{ matrix.DEPENDENCIES }}"
+        env:
+          # DOES NOT NEED alternative-gh-token-secret-name - github.token is enough and more limited
+          GH_TOKEN: ${{ github.token }}
+
+      - name: Standardize repository information
+        uses: rapidsai/shared-actions/rapids-github-info@main
+        with:
+          repo: ${{ inputs.repo }}
+          branch: ${{ inputs.branch }}
+          date: ${{ inputs.date }}
+          sha: ${{ inputs.sha }}
+          build_workflow_name: ${{ inputs.build_workflow_name }}
+
+      - name: Setup proxy cache
+        uses: nv-gha-runners/setup-proxy-cache@main
+        continue-on-error: true
+
+      # Per the docs at https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user,
+      # checking '/rate_limit | jq .' should not itself count against any rate limits.
+      #
+      # gh CLI is pre-installed on Github-hosted runners, but may not be on self-hosted runners.
+      - name: Check GitHub API rate limits
+        run: |
+          if ! type gh >/dev/null; then
+            echo "'gh' CLI is not installed... skipping rate-limits check"
+          else
+            gh api /rate_limit | jq .
+          fi
+        env:
+          # NEEDS alternative-gh-token-secret-name - API limits need to be for whatever token is used for upload/download. Repo token may be a different pool for rate limits.
+          GH_TOKEN: ${{ inputs.alternative-gh-token-secret-name && secrets[inputs.alternative-gh-token-secret-name] || github.token }} # zizmor: ignore[overprovisioned-secrets]
+      - name: C++ tests
+        run: ${{ inputs.script }} # zizmor: ignore[template-injection]
+        env:
+          # NEEDS alternative-gh-token-secret-name - may require a token with more permissions
+          GH_TOKEN: ${{ inputs.alternative-gh-token-secret-name && secrets[inputs.alternative-gh-token-secret-name] || github.token }} # zizmor: ignore[overprovisioned-secrets]
+      - name: Generate test report
+        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
+        with:
+          paths: "${{ env.RAPIDS_TESTS_DIR }}/*.xml"
+        if: always()  # still runs when the test step above failed
+      - name: Upload additional artifacts
+        if: "!cancelled()"  # quoted because a plain YAML scalar cannot begin with '!'
+        run: rapids-upload-artifacts-dir "cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch)"
+      - name: Telemetry upload attributes
+        uses: rapidsai/shared-actions/telemetry-dispatch-stash-job-artifacts@main
+        continue-on-error: true
+        if: ${{ vars.TELEMETRY_ENABLED == 'true' }}
+        env:
+          # DOES NOT NEED alternative-gh-token-secret-name - github.token is enough and more limited
+          GH_TOKEN: ${{ github.token }}