diff --git a/.github/benchmark/oot_benchmark_models.json b/.github/benchmark/oot_benchmark_models.json index 26e24bc85..bc290a028 100644 --- a/.github/benchmark/oot_benchmark_models.json +++ b/.github/benchmark/oot_benchmark_models.json @@ -43,8 +43,8 @@ { "display": "Kimi-K2-Thinking-MXFP4 TP4", "dashboard_model": "Kimi-K2-Thinking-MXFP4-tp4", - "source_path": "amd/Kimi-K2-Thinking-MXFP4", - "path": "amd/Kimi-K2-Thinking-MXFP4", + "source_path": "amd/Kimi-K2-Thinking-MXFP4-AttnFP8", + "path": "amd/Kimi-K2-Thinking-MXFP4-AttnFP8", "prefix": "kimi-k2-thinking-mxfp4-tp4", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 16384 --max-model-len 16384", "bench_args": "", @@ -54,8 +54,8 @@ { "display": "Kimi-K2-Thinking-MXFP4 TP8", "dashboard_model": "Kimi-K2-Thinking-MXFP4-tp8", - "source_path": "amd/Kimi-K2-Thinking-MXFP4", - "path": "amd/Kimi-K2-Thinking-MXFP4", + "source_path": "amd/Kimi-K2-Thinking-MXFP4-AttnFP8", + "path": "amd/Kimi-K2-Thinking-MXFP4-AttnFP8", "prefix": "kimi-k2-thinking-mxfp4-tp8", "extra_args": "--trust-remote-code --tensor-parallel-size 8 --max-num-batched-tokens 16384 --max-model-len 16384", "bench_args": "", @@ -65,8 +65,8 @@ { "display": "Kimi-K2.5-MXFP4 TP4", "dashboard_model": "Kimi-K2.5-MXFP4-tp4", - "source_path": "amd/Kimi-K2.5-MXFP4", - "path": "amd/Kimi-K2.5-MXFP4", + "source_path": "amd/Kimi-K2.5-MXFP4-AttnFP8", + "path": "amd/Kimi-K2.5-MXFP4-AttnFP8", "prefix": "kimi-k25-mxfp4-tp4", "extra_args": "--trust-remote-code --tensor-parallel-size 4", "bench_args": "", @@ -76,8 +76,8 @@ { "display": "Kimi-K2.5-MXFP4 TP8", "dashboard_model": "Kimi-K2.5-MXFP4", - "source_path": "amd/Kimi-K2.5-MXFP4", - "path": "amd/Kimi-K2.5-MXFP4", + "source_path": "amd/Kimi-K2.5-MXFP4-AttnFP8", + "path": "amd/Kimi-K2.5-MXFP4-AttnFP8", "prefix": "kimi-k25-mxfp4-tp8", "extra_args": "--trust-remote-code --tensor-parallel-size 8", "bench_args": "", @@ -170,7 +170,7 @@ "extra_args": "--trust-remote-code --tensor-parallel-size 1 
--max-num-batched-tokens 32768 --max-model-len 16384", "bench_args": "", "runner": "atom-mi355-8gpu-oot-benchmark", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0\nATOM_USE_FLYDSL_GDR=1" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_CUSTOM_ALL_GATHER=0" }, { "display": "Qwen3-Next-80B-A3B-Instruct-FP8 TP4", diff --git a/.github/workflows/atom-vllm-benchmark.yaml b/.github/workflows/atom-vllm-benchmark.yaml index fbac83d68..4339eadd0 100644 --- a/.github/workflows/atom-vllm-benchmark.yaml +++ b/.github/workflows/atom-vllm-benchmark.yaml @@ -2,7 +2,7 @@ name: ATOM vLLM Benchmark concurrency: group: ${{ github.workflow }}-${{ github.repository }}-${{ github.ref_name }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + cancel-in-progress: false on: workflow_dispatch: @@ -399,7 +399,16 @@ jobs: excluded_pairs = set(model.get("excluded_input_output_pairs", [])) model_params = [] seen = set() - extra_concurrency = (128, 256) if str(model.get("prefix", "")).startswith("qwen3-5-") else () + prefix = str(model.get("prefix", "")) + extra_concurrency = ( + (128, 256) + if prefix.startswith(( + "qwen3-5-", + "kimi-k2-thinking-mxfp4-", + "kimi-k25-mxfp4-", + )) + else () + ) for param in params: variants = [param] diff --git a/.github/workflows/docker-release.yaml b/.github/workflows/docker-release.yaml index c2ed6e2b6..1e66bb7c9 100644 --- a/.github/workflows/docker-release.yaml +++ b/.github/workflows/docker-release.yaml @@ -56,6 +56,9 @@ on: description: "Build OOT vLLM image for manual runs (scheduled nightly runs always build OOT; default enabled so manual releases also publish OOT)" type: boolean default: true + oot_tag_suffix: + description: "Optional suffix appended to the OOT vLLM nightly tag for manual runs. Example: custom -> rocm/atom-dev:vllm-v{version}-nightly_{YYYYMMDD}-custom. When set, vllm-latest is not updated."
+ default: "" build_sglang_image: description: "Build SGLang+ATOM image for manual runs (scheduled nightly runs always build SGLang+ATOM; default enabled so manual releases also publish SGLang+ATOM)" type: boolean @@ -112,6 +115,7 @@ jobs: echo "AITER_COMMIT: ${{ inputs.aiter_commit || env.AITER_COMMIT }}" echo "RCCL_REPO: ${{ inputs.rccl_repo || env.RCCL_REPO }}" echo "RCCL_BRANCH: ${{ inputs.rccl_branch || env.RCCL_BRANCH }}" + echo "OOT_TAG_SUFFIX: ${{ inputs.oot_tag_suffix || '' }}" echo "SGLANG_REPO: ${{ inputs.sglang_repo || env.SGLANG_REPO }}" echo "SGLANG_REF: ${{ inputs.sglang_ref || env.SGLANG_REF }}" echo "SGLANG_VERSION: ${{ inputs.sglang_version || env.SGLANG_VERSION }}" @@ -211,13 +215,26 @@ jobs: - name: Push OOT Docker image if: ${{ success() && (inputs.only_release_oot == true || (inputs.only_release_sglang != true && (github.event_name == 'schedule' || inputs.build_oot_image == true))) }} run: | + set -euo pipefail VLLM_VER="${{ env.VLLM_VERSION }}" + OOT_TAG_SUFFIX="${{ inputs.oot_tag_suffix || '' }}" OOT_TAG="vllm-v${VLLM_VER}-nightly_$(date +%Y%m%d)" OOT_LATEST_TAG="vllm-latest" + if [ -n "${OOT_TAG_SUFFIX}" ]; then + if [[ ! "${OOT_TAG_SUFFIX}" =~ ^[A-Za-z0-9_.-]+$ ]]; then + echo "Invalid oot_tag_suffix '${OOT_TAG_SUFFIX}'. Allowed characters: letters, numbers, '.', '_' and '-'." + exit 1 + fi + OOT_TAG="${OOT_TAG}-${OOT_TAG_SUFFIX}" + fi docker tag atom_oot_release:ci rocm/atom-dev:${OOT_TAG} docker push rocm/atom-dev:${OOT_TAG} - docker tag atom_oot_release:ci rocm/atom-dev:${OOT_LATEST_TAG} - docker push rocm/atom-dev:${OOT_LATEST_TAG} + if [ -z "${OOT_TAG_SUFFIX}" ]; then + docker tag atom_oot_release:ci rocm/atom-dev:${OOT_LATEST_TAG} + docker push rocm/atom-dev:${OOT_LATEST_TAG} + else + echo "Custom OOT tag suffix '${OOT_TAG_SUFFIX}' provided; skipping rocm/atom-dev:${OOT_LATEST_TAG} update." 
+ fi - name: Build SGLang Docker image if: ${{ success() && (inputs.only_release_sglang == true || (inputs.only_release_oot != true && (github.event_name == 'schedule' || inputs.build_sglang_image == true))) }} diff --git a/docker/Dockerfile b/docker/Dockerfile index 06e68495a..0abf0fe2f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -323,7 +323,7 @@ ARG MAX_JOBS RUN pip install --upgrade setuptools_scm RUN echo "========== [Parallel] Building Aiter ==========" && \ - git clone --depth 1 $AITER_REPO /app/aiter-test && \ + git clone $AITER_REPO /app/aiter-test && \ cd /app/aiter-test && \ pip install -r requirements.txt && \ git checkout $AITER_COMMIT && \