From 271b0544be9a29ac7adacbbc51e99b472f142fca Mon Sep 17 00:00:00 2001
From: Xin Huang
Date: Thu, 23 Apr 2026 23:00:15 +0800
Subject: [PATCH] CI: add opt-in MI300X Triton jobs for PRs and main

Keep MI35X as the default Triton PR path, add ci:triton-300x to start
extra MI300X jobs on PRs, and run both architectures on main.

Update the PR welcome comment to document the new label and main-branch
behavior.

The MI300X result job is gated with !cancelled() instead of the implicit
success() so that a failed shard actually fails the aggregate check, it
reports every shard whose report is missing, and the MI300X Docker login
reads the password from stdin instead of the command line.
---
 .github/workflows/pr-welcome-comment.yaml |   1 +
 .github/workflows/triton-test.yaml        | 166 +++++++++++++++++++++-
 2 files changed, 161 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/pr-welcome-comment.yaml b/.github/workflows/pr-welcome-comment.yaml
index 84b462771a..dbb7dc7379 100644
--- a/.github/workflows/pr-welcome-comment.yaml
+++ b/.github/workflows/pr-welcome-comment.yaml
@@ -30,6 +30,7 @@ jobs:
 
             | Label | Tests |
             |-------|-------|
+            | \`ci:triton-300x\` | Run an additional Triton test job on MI300X in PRs; main branch always runs both MI35X and MI300X |
             | \`ci:sglang\` | SGLang integration tests |
             | \`ci:atom\` | ATOM benchmark (DeepSeek-R1 + GPT-OSS) |
             | \`ci:vllm\` | vLLM benchmark |
diff --git a/.github/workflows/triton-test.yaml b/.github/workflows/triton-test.yaml
index 236058d94a..3482ea0147 100644
--- a/.github/workflows/triton-test.yaml
+++ b/.github/workflows/triton-test.yaml
@@ -4,7 +4,8 @@ on:
   push:
     branches: [main]
   pull_request:
-    types: [opened, synchronize, reopened, ready_for_review]
+    # labeled: re-run when adding ci:triton-300x to start extra MI300X jobs
+    types: [opened, synchronize, reopened, ready_for_review, labeled]
     branches: [main]
     paths:
       - "aiter/ops/triton/**"
@@ -22,7 +23,7 @@ concurrency:
 
 jobs:
   check-signal:
-    if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
+    if: ${{ (!github.event.pull_request || github.event.pull_request.draft == false) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') }}
     runs-on: ubuntu-latest
     steps:
       - name: Checkout code
@@ -36,7 +37,7 @@ jobs:
 
   # Step 1: split triton tests into 8 shards, output triton_shard_0.list ... triton_shard_7.list
   split_triton_tests:
-    if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
+    if: ${{ (!github.event.pull_request || github.event.pull_request.draft == false) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') }}
     runs-on: ubuntu-latest
     needs: [check-signal]
     outputs:
@@ -57,7 +58,7 @@ jobs:
 
   # Build Triton wheel once, shared by all shard jobs via artifact
   build-triton:
-    if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
+    if: ${{ (!github.event.pull_request || github.event.pull_request.draft == false) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') }}
     name: Build Triton Wheel
     runs-on: linux-aiter-mi35x-1
     needs: [check-signal]
@@ -112,7 +113,7 @@ jobs:
 
   # Step 2: MI35X matrix jobs
   triton:
-    if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
+    if: ${{ (!github.event.pull_request || github.event.pull_request.draft == false) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') }}
     name: Triton Tests (MI35X) / Shard ${{ matrix.shard }}
     runs-on: linux-aiter-mi35x-1
     needs: [split_triton_tests, build-triton, check-signal]
@@ -243,8 +244,161 @@ jobs:
         run: |
           docker rm -f triton_test || true
 
+  # Step 2b: MI300X matrix jobs (opt-in via ci:triton-300x on PRs, always on main)
+  # Reuses the shard lists and Triton wheel produced by split_triton_tests and
+  # build-triton; each shard uploads its JUnit report only when its steps succeed.
+  # NOTE(review): every `labeled` event starts a run in this workflow's
+  # concurrency group, even when all jobs are then skipped; confirm that adding
+  # an unrelated label cannot cancel an in-flight run via cancel-in-progress.
+  triton-mi300x:
+    if: ${{ (!github.event.pull_request || github.event.pull_request.draft == false) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') && (github.ref == 'refs/heads/main' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'ci:triton-300x'))) }}
+    name: Triton Tests (MI300X) / Shard ${{ matrix.shard }}
+    runs-on: linux-aiter-mi300x-1
+    needs: [split_triton_tests, build-triton, check-signal]
+    strategy:
+      fail-fast: false
+      matrix:
+        shard: [0, 1, 2, 3, 4, 5, 6, 7]
+    env:
+      DOCKER_IMAGE: "rocm/pytorch:latest"
+      TRITON_TEST: "op_tests/triton_tests/"
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+          submodules: 'recursive'
+
+      - name: Download test shard lists
+        uses: actions/download-artifact@v4
+        with:
+          name: triton_shards
+
+      - name: Download Triton wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: triton-wheel
+          path: triton-wheels
+
+      - name: List test shard files
+        run: |
+          ls -l triton_shard_*.list
+
+      - name: Docker login
+        # Pass the secret on stdin rather than argv; login stays best-effort (|| true).
+        env:
+          DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
+        run: printf '%s' "$DOCKER_PASSWORD" | docker login -u rocmshared --password-stdin || true
+
+      - name: Export test file list for this shard as env
+        id: set_shard_files
+        run: |
+          TRITON_TEST=$(cat triton_shard_${{ matrix.shard }}.list)
+          echo "$TRITON_TEST"
+          echo "TRITON_TEST=$TRITON_TEST" >> $GITHUB_ENV
+
+      - name: Run the container
+        run: |
+          set -ex
+          echo "Starting container: triton_test"
+
+          if [ -f "/etc/podinfo/gha-render-devices" ]; then
+            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
+          else
+            DEVICE_FLAG="--device /dev/dri"
+          fi
+
+          docker run -dt \
+            --device=/dev/kfd $DEVICE_FLAG \
+            --shm-size=16G \
+            --group-add $(getent group render | cut -d: -f3) \
+            --group-add $(getent group video | cut -d: -f3) \
+            -v "${{ github.workspace }}:/workspace" \
+            -w /workspace \
+            --name triton_test \
+            ${{ env.DOCKER_IMAGE }}
+
+      - name: Setup pip config
+        run: |
+          docker exec -u root triton_test bash -c "pip config set global.default-timeout 60"
+          docker exec -u root triton_test bash -c "pip config set global.retries 10"
+
+      - name: Setup Aiter and Triton
+        run: |
+          set -ex
+          echo "Setting up Aiter and Triton..."
+          docker exec \
+            -e TRITON_WHEEL_DIR=/workspace/triton-wheels \
+            -w /workspace \
+            triton_test \
+            ./.github/scripts/build_aiter_triton.sh
+
+      - name: Install Pytest
+        run: |
+          set -ex
+          echo "Installing Pytest..."
+          docker exec \
+            -w /workspace \
+            triton_test \
+            pip install pytest
+
+      - name: Triton Tests
+        run: |
+          set -ex
+          echo "Running Triton Tests..."
+          docker exec -w /workspace triton_test mkdir -p test-reports
+          docker exec -w /workspace triton_test pytest -v ${TRITON_TEST} --junitxml=test-reports/triton.xml
+
+      - name: Upload test logs
+        uses: actions/upload-artifact@v4
+        if: success()
+        with:
+          name: triton-test-mi300x-shard-${{ matrix.shard }}
+          path: test-reports/triton.xml
+          retention-days: 7
+
+      - name: Cleanup container
+        if: always()
+        run: |
+          docker rm -f triton_test || true
+
+  # Aggregates the MI300X shards: fails unless every shard uploaded a report.
+  # !cancelled() replaces the implicit success() gate so this check still runs
+  # (and fails) when a shard job failed and therefore uploaded nothing.
+  triton-mi300x-test-finish:
+    if: ${{ !cancelled() && (!github.event.pull_request || github.event.pull_request.draft == false) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') && (github.ref == 'refs/heads/main' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'ci:triton-300x'))) }}
+    name: Triton MI300X Test Results
+    runs-on: ubuntu-latest
+    needs: [triton-mi300x]
+    steps:
+      - name: Download all MI300X test reports
+        uses: actions/download-artifact@v4
+        with:
+          pattern: triton-test-mi300x-shard-*
+          path: .
+
+      - name: Check Triton MI300X Test Results
+        run: |
+          set -ex
+          echo "Checking Triton MI300X Test Results..."
+          all_passed=true
+          # Scan all shards (no early exit) so every missing report is logged.
+          for shard in {0..7}; do
+            if [ ! -f triton-test-mi300x-shard-${shard}/triton.xml ]; then
+              echo "MI300X test report for shard ${shard} not found."
+              all_passed=false
+            fi
+          done
+          if [ "$all_passed" = true ]; then
+            echo "All MI300X tests passed."
+          else
+            echo "MI300X test failures or errors detected."
+            exit 1
+          fi
+
   triton-test-finish:
-    if: ${{ !github.event.pull_request || !github.event.pull_request.draft }}
+    if: ${{ (!github.event.pull_request || !github.event.pull_request.draft) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') }}
     name: Triton Test Results
     runs-on: ubuntu-latest
     needs: [triton]