Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/pr-welcome-comment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:

| Label | Tests |
|-------|-------|
| \`ci:triton-300x\` | Run an additional Triton test job on MI300X in PRs; main branch always runs both MI35X and MI300X |
| \`ci:sglang\` | SGLang integration tests |
| \`ci:atom\` | ATOM benchmark (DeepSeek-R1 + GPT-OSS) |
| \`ci:vllm\` | vLLM benchmark |
Expand Down
155 changes: 149 additions & 6 deletions .github/workflows/triton-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ on:
push:
branches: [main]
pull_request:
types: [opened, synchronize, reopened, ready_for_review]
# labeled: re-run when adding ci:triton-300x to start extra MI300X jobs
types: [opened, synchronize, reopened, ready_for_review, labeled]
Comment on lines 6 to +8
branches: [main]
paths:
- "aiter/ops/triton/**"
Expand All @@ -22,7 +23,7 @@ concurrency:

jobs:
check-signal:
if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
if: ${{ (!github.event.pull_request || github.event.pull_request.draft == false) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') }}
runs-on: ubuntu-latest
steps:
- name: Checkout code
Expand All @@ -36,7 +37,7 @@ jobs:

# Step 1: split triton tests into 8 shards, output triton_shard_0.list ... triton_shard_7.list
split_triton_tests:
if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
if: ${{ (!github.event.pull_request || github.event.pull_request.draft == false) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') }}
runs-on: ubuntu-latest
needs: [check-signal]
outputs:
Expand All @@ -57,7 +58,7 @@ jobs:

# Build Triton wheel once, shared by all shard jobs via artifact
build-triton:
if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
if: ${{ (!github.event.pull_request || github.event.pull_request.draft == false) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') }}
name: Build Triton Wheel
runs-on: linux-aiter-mi35x-1
needs: [check-signal]
Expand Down Expand Up @@ -112,7 +113,7 @@ jobs:

# Step 2: MI35X matrix jobs
triton:
if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }}
if: ${{ (!github.event.pull_request || github.event.pull_request.draft == false) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') }}
name: Triton Tests (MI35X) / Shard ${{ matrix.shard }}
runs-on: linux-aiter-mi35x-1
needs: [split_triton_tests, build-triton, check-signal]
Expand Down Expand Up @@ -243,8 +244,150 @@ jobs:
run: |
docker rm -f triton_test || true

# Step 2b: MI300X matrix jobs (opt-in via ci:triton-300x on PRs, always on main)
triton-mi300x:
  # Runs the sharded Triton test suite inside a ROCm PyTorch container on an
  # MI300X runner. The gate below requires all three clauses to hold:
  #   1. the event is not a pull request, or the pull request is not a draft;
  #   2. a `labeled` pull_request event only counts when the label that was
  #      just added is `ci:triton-300x`;
  #   3. the ref is `refs/heads/main`, or the pull request carries the
  #      `ci:triton-300x` label.
  if: ${{ (!github.event.pull_request || github.event.pull_request.draft == false) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') && (github.ref == 'refs/heads/main' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'ci:triton-300x'))) }}
  name: Triton Tests (MI300X) / Shard ${{ matrix.shard }}
  runs-on: linux-aiter-mi300x-1
  needs: [split_triton_tests, build-triton, check-signal]
  strategy:
    # Let every shard run to completion even if another shard fails.
    fail-fast: false
    matrix:
      shard: [0, 1, 2, 3, 4, 5, 6, 7]
  env:
    DOCKER_IMAGE: "rocm/pytorch:latest"
    # Default test target; replaced by the shard's file list in the
    # "Export test file list" step below.
    TRITON_TEST: "op_tests/triton_tests/"

  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        fetch-depth: 1
        submodules: 'recursive'

    - name: Download test shard lists
      uses: actions/download-artifact@v4
      with:
        name: triton_shards

    - name: Download Triton wheel
      uses: actions/download-artifact@v4
      with:
        name: triton-wheel
        path: triton-wheels

    - name: List test shard files
      run: |
        ls -l triton_shard_*.list

    - name: Docker login
      # The secret is handed over through the environment and stdin rather
      # than being interpolated into the command line, so it never appears in
      # the process argument list and cannot be re-parsed by the shell.
      # Login stays best-effort (`|| true`), as before.
      env:
        DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
      run: |
        printf '%s' "$DOCKER_PASSWORD" | docker login -u rocmshared --password-stdin || true

    - name: Export test file list for this shard as env
      id: set_shard_files
      run: |
        # NOTE(review): the single-line NAME=value form written to GITHUB_ENV
        # assumes the shard list is one line - TODO confirm against the
        # split_triton_tests job.
        TRITON_TEST=$(cat "triton_shard_${{ matrix.shard }}.list")
        echo "$TRITON_TEST"
        echo "TRITON_TEST=$TRITON_TEST" >> "$GITHUB_ENV"

    - name: Run the container
      run: |
        set -ex
        echo "Starting container: triton_test"

        # Use the device flags supplied in /etc/podinfo when that file exists;
        # otherwise fall back to exposing /dev/dri.
        if [ -f "/etc/podinfo/gha-render-devices" ]; then
          DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
        else
          DEVICE_FLAG="--device /dev/dri"
        fi

        # DEVICE_FLAG is intentionally unquoted: it holds a flag plus its
        # argument and must be word-split.
        docker run -dt \
          --device=/dev/kfd $DEVICE_FLAG \
          --shm-size=16G \
          --group-add $(getent group render | cut -d: -f3) \
          --group-add $(getent group video | cut -d: -f3) \
          -v "${{ github.workspace }}:/workspace" \
          -w /workspace \
          --name triton_test \
          ${{ env.DOCKER_IMAGE }}

    - name: Setup pip config
      run: |
        docker exec -u root triton_test bash -c "pip config set global.default-timeout 60"
        docker exec -u root triton_test bash -c "pip config set global.retries 10"

    - name: Setup Aiter and Triton
      run: |
        set -ex
        echo "Setting up Aiter and Triton..."
        docker exec \
          -e TRITON_WHEEL_DIR=/workspace/triton-wheels \
          -w /workspace \
          triton_test \
          ./.github/scripts/build_aiter_triton.sh

    - name: Install Pytest
      run: |
        set -ex
        echo "Installing Pytest..."
        docker exec \
          -w /workspace \
          triton_test \
          pip install pytest

    - name: Triton Tests
      run: |
        set -ex
        echo "Running Triton Tests..."
        docker exec -w /workspace triton_test mkdir -p test-reports
        # TRITON_TEST is intentionally unquoted so each listed test path
        # becomes its own pytest argument.
        docker exec -w /workspace triton_test pytest -v ${TRITON_TEST} --junitxml=test-reports/triton.xml

    - name: Upload test logs
      uses: actions/upload-artifact@v4
      # Upload only when the shard passed: the results job treats a missing
      # per-shard report as a failure.
      if: success()
      with:
        name: triton-test-mi300x-shard-${{ matrix.shard }}
        path: test-reports/triton.xml
        retention-days: 7

    - name: Cleanup container
      # Always remove the container, even after a failed or cancelled step.
      if: always()
      run: |
        docker rm -f triton_test || true

triton-mi300x-test-finish:
  # Aggregates the MI300X shard results into a single pass/fail job.
  #
  # `!cancelled()` is required: without a status-check function GitHub applies
  # an implicit `success()`, so this job would be skipped (not failed) whenever
  # a shard failed, and the missing-report check below could never fire.
  # `needs.triton-mi300x.result != 'skipped'` keeps this job skipped when the
  # MI300X shards themselves did not run. The remaining clauses are the same
  # draft / label / main-branch gate used by the shard job.
  if: ${{ !cancelled() && needs.triton-mi300x.result != 'skipped' && (!github.event.pull_request || github.event.pull_request.draft == false) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') && (github.ref == 'refs/heads/main' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'ci:triton-300x'))) }}
  name: Triton MI300X Test Results
  runs-on: ubuntu-latest
  needs: [triton-mi300x]
  steps:
    - name: Download all MI300X test reports
      uses: actions/download-artifact@v4
      with:
        # Each matching artifact is expected in its own directory named after
        # the artifact; the check below reads <artifact-name>/triton.xml.
        pattern: triton-test-mi300x-shard-*
        path: .

    - name: Check Triton MI300X Test Results
      run: |
        set -ex
        echo "Checking Triton MI300X Test Results..."
        all_passed=true
        # Shards upload their report only on success, so a missing report
        # means that shard failed or never finished. Check every shard (no
        # early exit) so the log lists all missing shards, not just the first.
        for shard in {0..7}; do
          if [ ! -f "triton-test-mi300x-shard-${shard}/triton.xml" ]; then
            echo "MI300X test report for shard ${shard} not found."
            all_passed=false
          fi
        done
        if [ "$all_passed" = true ]; then
          echo "All MI300X tests passed."
        else
          echo "MI300X test failures or errors detected."
          exit 1
        fi

triton-test-finish:
if: ${{ !github.event.pull_request || !github.event.pull_request.draft }}
if: ${{ (!github.event.pull_request || !github.event.pull_request.draft) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') }}
name: Triton Test Results
runs-on: ubuntu-latest
needs: [triton]
Expand Down
Loading