From 609e7a91b01f427c3be562af52e3f9942b7a93db Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 23 Apr 2026 17:03:46 +0530 Subject: [PATCH 01/10] AMD GLM5.1 FP8 MTP Support on MI355X Signed-off-by: ajith-sirra-amd --- .github/configs/amd-master.yaml | 20 +++++ .../single_node/glm5.1_fp8_mi355x_mtp.sh | 88 +++++++++++++++++++ perf-changelog.yaml | 7 ++ 3 files changed, 115 insertions(+) create mode 100644 benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9e1f9834e..554819b68 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -357,6 +357,26 @@ glm5.1-fp4-mi355x-sglang: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 16 } +glm5.1-fp8-mi355x-sglang-mtp: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 + model: zai-org/GLM-5-FP8 + model-prefix: glm5.1 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + kimik2.5-int4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 model: moonshotai/Kimi-K2.5 diff --git a/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh b/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh new file mode 100644 index 000000000..17e289114 --- /dev/null +++ b/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +set -x + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +hf download "$MODEL" + +# ROCm / SGLang performance tuning for MI355X +export SGLANG_ROCM_FUSED_DECODE_MLA=0 +export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export SAFETENSORS_FAST_GPU=1 +export SGLANG_ENABLE_SPEC_V2=1 + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} +CONTEXT_LENGTH=$((ISL + OSL + 32)) + +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +pip install -U transformers + +python3 -m sglang.launch_server \ + --model-path $MODEL \ + --host=0.0.0.0 \ + --port $PORT \ + --tensor-parallel-size $TP \ + --trust-remote-code \ + --cuda-graph-max-bs $CONC \ + --context-length $CONTEXT_LENGTH \ + --mem-fraction-static 0.85 \ + --tool-call-parser glm47 \ + --reasoning-parser glm45 \ + --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ + --nsa-prefill-backend tilelang \ + --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS \ + --kv-cache-dtype fp8_e4m3 \ + --speculative-algorithm EAGLE \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 1 \ + --speculative-num-draft-tokens 4 \ + --tokenizer-worker-num $((TP*2)) \ + --disable-radix-cache> $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+ +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e27a2511a..575886049 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,10 @@ +- config-keys: + - glm5.1-fp8-mi355x-sglang-mtp + description: + - "Add GLM5.1 FP8 MTP MI355X SGLang Support" + - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" + pr-link: TO BE UPDATE + - config-keys: - dsr1-fp8-h100-dynamo-trt - dsr1-fp8-h100-dynamo-sglang From b9e979a1eb7eca3b449fce3843ca493bfcf522da Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 23 Apr 2026 17:38:34 +0530 Subject: [PATCH 02/10] AMD GLM5.1 FP8 MTP Support on MI355X Signed-off-by: ajith-sirra-amd --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 575886049..18904c51f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3,7 +3,7 @@ description: - "Add GLM5.1 FP8 MTP MI355X SGLang Support" - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - pr-link: TO BE UPDATE + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122 - config-keys: - dsr1-fp8-h100-dynamo-trt From 5a9c06202e3c63a82819a0cf5ad81a9165d015d1 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 23 Apr 2026 17:50:12 +0530 Subject: [PATCH 03/10] AMD GLM5.1 FP8 MTP Support on MI355X Signed-off-by: ajith-sirra-amd --- benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh b/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh index 17e289114..504ba0184 100644 --- a/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh @@ -36,8 +36,6 @@ fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor -pip install -U transformers - python3 -m sglang.launch_server \ --model-path $MODEL \ --host=0.0.0.0 \ From 89764f7f17e213075ec3cae373c392f4ddf679c3 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Fri, 24 Apr 2026 13:25:42 +0530 Subject: [PATCH 04/10] AMD GLM5.1 FP8 MTP Support on MI355X - Merging with GLM5 Signed-off-by: ajith-sirra-amd --- .github/configs/amd-master.yaml | 69 ++++++++++++--- .../single_node/glm5.1_fp8_mi355x_mtp.sh | 86 ------------------- benchmarks/single_node/glm5_fp8_mi355x_mtp.sh | 15 ++-- perf-changelog.yaml | 2 +- 4 files changed, 63 insertions(+), 109 deletions(-) delete mode 100644 benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 554819b68..78b412281 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -245,6 +245,48 @@ qwen3.5-fp8-mi355x-sglang-mtp: - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp } - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp } 
+qwen3.5-fp8-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + +qwen3.5-fp8-mi355x-atom-mtp: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + qwen3.5-fp4-mi355x-sglang: image: rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413 model: amd/Qwen3.5-397B-A17B-MXFP4 @@ -302,8 +344,8 @@ glm5-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } glm5-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413 - model: zai-org/GLM-5-FP8 + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 + model: zai-org/GLM-5.1-FP8 model-prefix: glm5 runner: mi355x precision: fp8 @@ -313,11 +355,13 @@ glm5-fp8-mi355x-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } glm5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post @@ -357,25 +401,23 @@ glm5.1-fp4-mi355x-sglang: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 16 } -glm5.1-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 - model: zai-org/GLM-5-FP8 +glm5.1-fp4-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: amd/GLM-5.1-MXFP4 model-prefix: glm5.1 runner: mi355x - precision: fp8 - framework: sglang + precision: fp4 + framework: atom multinode: false seq-len-configs: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 256 } kimik2.5-int4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 @@ -1431,4 +1473,3 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: additional-settings: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - diff --git a/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh 
b/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh deleted file mode 100644 index 504ba0184..000000000 --- a/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env bash -set -x - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -hf download "$MODEL" - -# ROCm / SGLang performance tuning for MI355X -export SGLANG_ROCM_FUSED_DECODE_MLA=0 -export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 -export SAFETENSORS_FAST_GPU=1 -export SGLANG_ENABLE_SPEC_V2=1 - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} -CONTEXT_LENGTH=$((ISL + OSL + 32)) - -EVAL_CONTEXT_ARGS="" -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -fi -# Start GPU monitoring (power, temperature, clocks every second) -start_gpu_monitor - -python3 -m sglang.launch_server \ - --model-path $MODEL \ - --host=0.0.0.0 \ - --port $PORT \ - --tensor-parallel-size $TP \ - --trust-remote-code \ - --cuda-graph-max-bs $CONC \ - --context-length $CONTEXT_LENGTH \ - --mem-fraction-static 0.85 \ - --tool-call-parser glm47 \ - --reasoning-parser glm45 \ - --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ - --nsa-prefill-backend tilelang \ - --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS \ - --kv-cache-dtype fp8_e4m3 \ - --speculative-algorithm EAGLE \ - --speculative-num-steps 3 \ - --speculative-eagle-topk 1 \ - --speculative-num-draft-tokens 4 \ - --tokenizer-worker-num $((TP*2)) \ - --disable-radix-cache> $SERVER_LOG 2>&1 & - -SERVER_PID=$! - -# Wait for server to be ready -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -run_benchmark_serving \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ - -# After throughput, run evaluation only if RUN_EVAL is true -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -# Stop GPU monitoring -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh b/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh index f4b899011..504ba0184 100755 --- a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +set -x source "$(dirname "$0")/../benchmark_lib.sh" @@ -15,11 +16,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -# GLM-5 requires transformers with glm_moe_dsa model type support. -# However, the Image rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219 doesn't provide this support. 
-python3 -m pip install -U --no-cache-dir \ - "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" - hf download "$MODEL" # ROCm / SGLang performance tuning for MI355X @@ -30,6 +26,7 @@ export SGLANG_ENABLE_SPEC_V2=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +CONTEXT_LENGTH=$((ISL + OSL + 32)) EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -45,9 +42,11 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ + --cuda-graph-max-bs $CONC \ + --context-length $CONTEXT_LENGTH \ + --mem-fraction-static 0.85 \ --tool-call-parser glm47 \ --reasoning-parser glm45 \ - --mem-fraction-static 0.85 \ --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ --nsa-prefill-backend tilelang \ --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS \ @@ -56,6 +55,7 @@ python3 -m sglang.launch_server \ --speculative-num-steps 3 \ --speculative-eagle-topk 1 \ --speculative-num-draft-tokens 4 \ + --tokenizer-worker-num $((TP*2)) \ --disable-radix-cache> $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -73,8 +73,7 @@ run_benchmark_serving \ --num-prompts "$((CONC * 10))" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --use-chat-template + --result-dir /workspace/ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 18904c51f..78601b1fd 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,5 +1,5 @@ - config-keys: - - glm5.1-fp8-mi355x-sglang-mtp + - glm5-fp8-mi355x-sglang-mtp description: - "Add GLM5.1 FP8 MTP MI355X SGLang Support" - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" From 80a61dc5d94006420de3ce5d9755273eec4ca131 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Fri, 24 Apr 2026 13:36:01 +0530 Subject: [PATCH 05/10] AMD GLM5.1 FP8 MTP Support on MI355X - Merging with GLM5 Signed-off-by: ajith-sirra-amd --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 78b412281..9593a3147 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -355,12 +355,12 @@ glm5-fp8-mi355x-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } glm5-fp8-mi355x-atom: From 52172fdb8690daf3f4dec5e799db3e0857c536a2 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Fri, 24 Apr 2026 15:03:46 +0530 Subject: [PATCH 06/10] AMD GLM5.1 FP8 MTP Support on MI355X - Merging with GLM5 Signed-off-by: ajith-sirra-amd --- .github/configs/amd-master.yaml | 2 +- perf-changelog.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9593a3147..4c8e8d715 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -345,7 +345,7 @@ glm5-fp8-mi355x-sglang: glm5-fp8-mi355x-sglang-mtp: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 - model: zai-org/GLM-5.1-FP8 + 
model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi355x precision: fp8 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e5a17957f..1bea1067b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,7 +1,7 @@ - config-keys: - glm5-fp8-mi355x-sglang-mtp description: - - "Add GLM5.1 FP8 MTP MI355X SGLang Support" + - "Add GLM5 FP8 MTP MI355X SGLang Support" - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122 From 05ffd31a2bfb4abd04b77dfccd4b09660f9c51ad Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 30 Apr 2026 12:40:44 +0530 Subject: [PATCH 07/10] AMD GLM5 FP8 MTP Support on MI355X - Perf Change Log Signed-off-by: ajith-sirra-amd --- perf-changelog.yaml | 1633 +++++++++++++++++-------------------------- 1 file changed, 624 insertions(+), 1009 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 84a598aed..29c72bfe0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,10 +1,3 @@ -- config-keys: - - glm5-fp8-mi355x-sglang-mtp - description: - - "Add GLM5 FP8 MTP MI355X SGLang Support" - - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122 - - config-keys: - dsr1-fp8-h100-dynamo-trt - dsr1-fp8-h100-dynamo-sglang @@ -133,12 +126,6 @@ - "Extend concurrency to 128 for gptoss b200 TRT configurations" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/233 -- config-keys: - - gptoss-fp4-b200-trt - description: - - "Add benchmark script for GPTOSS FP4 B200 TRT-LLM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/256 - - config-keys: - "*gb200-dynamo-sglang" description: @@ -162,22 +149,18 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/273 - config-keys: - - dsr1-fp4-b200-sglang - - dsr1-fp8-b200-sglang - - dsr1-fp8-h200-sglang + - gptoss-fp4-b200-trt description: - - "Update NVIDIA DeepSeek sglang Docker image from v0.5.5 to v0.5.6" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/276 - + - "Add benchmark script for GPTOSS FP4 B200 TRT-LLM" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/256 - config-keys: - - gptoss-fp4-b200-vllm - - gptoss-fp4-h100-vllm - - gptoss-fp4-h200-vllm + - dsr1-fp4-gb200-dynamo-trt + - dsr1-fp4-gb200-dynamo-sglang + - dsr1-fp8-gb200-dynamo-sglang description: - - "Update vLLM image from v0.11.2 to v0.13.0" - - "Add VLLM_MXFP4_USE_MARLIN=1 to H100 and H200 benchmark scripts" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/327 + - "Add more configurations for GB200 SGLang DSR1" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/335 - config-keys: - dsr1-fp4-mi355x-sglang @@ -185,22 +168,6 @@ - "Update MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.6.post1" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/330 -- config-keys: - - dsr1-fp8-mi300x-sglang - - dsr1-fp8-mi325x-sglang - - dsr1-fp8-mi355x-sglang - description: - - Use upstream SGLang images on mi300, mi325 and mi355 for dsr1fp8 - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/332 - -- config-keys: - - dsr1-fp4-gb200-dynamo-trt - - dsr1-fp4-gb200-dynamo-sglang - - dsr1-fp8-gb200-dynamo-sglang - description: - - "Add more configurations for GB200 SGLang DSR1" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/335 - - config-keys: - dsr1-fp4-gb200-dynamo-sglang - dsr1-fp8-gb200-dynamo-sglang @@ -213,7 +180,31 @@ description: - "Updating MI355x Deepseek-R1 FP4 SGLang Image 
to upstream v0.5.6.post2" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/369 + +- config-keys: + - dsr1-fp4-b200-sglang + - dsr1-fp8-b200-sglang + - dsr1-fp8-h200-sglang + description: + - "Update NVIDIA DeepSeek sglang Docker image from v0.5.5 to v0.5.6" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/276 + +- config-keys: + - gptoss-fp4-b200-vllm + - gptoss-fp4-h100-vllm + - gptoss-fp4-h200-vllm + description: + - "Update vLLM image from v0.11.2 to v0.13.0" + - "Add VLLM_MXFP4_USE_MARLIN=1 to H100 and H200 benchmark scripts" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/327 +- config-keys: + - dsr1-fp8-mi300x-sglang + - dsr1-fp8-mi325x-sglang + - dsr1-fp8-mi355x-sglang + description: + - Use upstream SGLang images on mi300, mi325 and mi355 for dsr1fp8 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/332 - config-keys: - gptoss-fp4-gb200-dynamo-trt @@ -224,14 +215,11 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/387 - config-keys: - - dsr1-fp4-b200-trt-mtp - - dsr1-fp8-b200-trt-mtp - - dsr1-fp8-h200-trt-mtp + - dsr1-fp8-mi355x-sglang-disagg description: - - Add MTP (Multi-Token Prediction) support for single-node TRT configs - - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/392 - + - "Add PD disaggregation (1P2D) for Mi355X" + - "Includes with and without speculative decoding" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/348 - config-keys: - dsr1-fp4-mi355x-sglang @@ -239,20 +227,21 @@ - "Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.7" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/395 -- config-keys: - - dsr1-fp8-mi355x-sglang-disagg - description: - - "Add PD disaggregation (1P2D) for Mi355X" - - "Includes with and without speculative decoding" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/409 - - config-keys: - dsr1-fp8-b200-sglang description: - "Adds TP4 configurations to DSR1-FP8 B200 SGLang deployment experiments" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/411 - - + +- config-keys: + - dsr1-fp4-b200-trt-mtp + - dsr1-fp8-b200-trt-mtp + - dsr1-fp8-h200-trt-mtp + description: + - Add MTP (Multi-Token Prediction) support for single-node TRT configs + - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/392 + - config-keys: - dsr1-fp8-mi355x-atom - dsr1-fp4-mi355x-atom @@ -271,22 +260,6 @@ - "Add HIP_VISIBLE_DEVICES env var for Ray compatibility in vLLM 0.14+" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/496 -- config-keys: - - dsr1-fp4-gb200-dynamo-trt - description: - - "Update Dynamo TRT image from 0.5.1-rc0.pre3 to 0.8.1.post2" - - "Update TRT configurations" - - "Refactor configurations to use CONFIG_FILE-based recipes instead of inline parameter settings" - - "Introduce srt-slurm workflow for launching Dynamo jobs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/510 - -- config-keys: - - gptoss-fp4-mi300x-vllm - - gptoss-fp4-mi325x-vllm - description: - - "Fix AITER env vars for vLLM v0.14.0 on AMD MI300X and MI325X" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/535 - - config-keys: - dsr1-fp8-h200-sglang description: @@ -302,7 +275,6 @@ - "Set --attention-backend aiter for AMD aiter attention backend" - "Update chunked-prefill-size and max-prefill-tokens from 196608 to 131072" 
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/544 - - config-keys: - dsr1-fp8-mi325x-sglang description: @@ -315,14 +287,11 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/545 - config-keys: - - dsr1-fp8-h200-dynamo-trt + - gptoss-fp4-mi300x-vllm + - gptoss-fp4-mi325x-vllm description: - - "Add DSR1 FP8 H200 Dynamo TRT-LLM disaggregated multinode configuration" - - "Image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" - - "Runner: h200-dgxc with multinode and disagg enabled" - - "Includes MTP and STP configurations for 1k1k and 8k1k sequence lengths" - - "Concurrency levels: 4, 8, 16, 32, 64, 128, 256, 512" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/570 + - "Fix AITER env vars for vLLM v0.14.0 on AMD MI300X and MI325X" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/535 - config-keys: - dsr1-fp8-mi355x-sglang @@ -331,24 +300,6 @@ - "Key fix: Disables mla persistent kernel when not using fp8 kv_cache (https://github.com/sgl-project/sglang/pull/17327)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/572 -- config-keys: - - dsr1-fp8-h200-dynamo-sglang - description: - - "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration" - - "Image: lmsysorg/sglang:v0.5.8-cu130-runtime" - - "Runner: h200-multinode-slurm with multinode and disagg enabled" - - "Recipes sourced from srtslurm repo (recipes/h200/)" - - "1k1k configs: aggregated, low-latency (1P9D), high-throughput TEP (1P6D), DEP (1P6D)" - - "8k1k configs: aggregated, TEP configs (1P7D, 1P6D, 1P3D, 2P3D), DEP (1P1D)" - - "Concurrency levels range from 1 to 2048 depending on configuration" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/582 - -- config-keys: - - dsr1-fp4-b300-dynamo-trt - description: - - "Add DSR1 FP4 B300 Dynamo TRT configurations" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/585 - - config-keys: # NVIDIA single-node - dsr1-fp4-b200-sglang @@ -378,17 +329,27 @@ - gptoss-fp4-mi355x-atom description: - Add official GSM8k eval results to GPT-OSS and DeepSeek R1 scenarios - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/587 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/558 evals-only: true - config-keys: - - dsr1-fp4-b200-dynamo-trt + - dsr1-fp8-h200-sglang description: - - "Update DSR1 FP4 B200 Dynamo TRT configurations" - - "Update TRTLLM version to 1.2.0rc6.post2" - - "Transform to use srt-slurm recipes" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/588 + - "Update H200 DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.9" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + +- config-keys: + - dsr1-fp4-b300-dynamo-trt + description: + - "Add DSR1 FP4 B300 Dynamo TRT configurations" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/585 +- config-keys: + - dsr1-fp4-mi355x-sglang + description: + - "Update SGLang image from v0.5.7 to v0.5.8 for DeepSeek-R1 FP4 on MI355x" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/595 + - config-keys: - dsr1-fp8-b200-trt description: @@ -400,14 +361,33 @@ - "Update search space: remove EP=TP constraint, add TP=4 configurations, extend concurrency ranges" - "Add TLLM_OVERRIDE_LAYER_NUM=61 to avoid OOM errors" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/594 - + +- config-keys: + - dsr1-fp4-b200-dynamo-trt + description: + - "Update DSR1 FP4 B200 Dynamo TRT configurations" + - "Update TRTLLM version to 1.2.0rc6.post2" + - "Transform to use srt-slurm recipes" + pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/588 - config-keys: - - dsr1-fp4-mi355x-sglang + - dsr1-fp8-h200-dynamo-trt description: - - "Update SGLang image from v0.5.7 to v0.5.8 for DeepSeek-R1 FP4 on MI355x" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/595 + - "Add DSR1 FP8 H200 Dynamo TRT-LLM disaggregated multinode configuration" + - "Image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + - "Runner: h200-dgxc with multinode and disagg enabled" + - "Includes MTP and STP configurations for 1k1k and 8k1k sequence lengths" + - "Concurrency levels: 4, 8, 16, 32, 64, 128, 256, 512" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/570 +- config-keys: + - dsr1-fp4-gb200-dynamo-trt + description: + - "Update Dynamo TRT image from 0.5.1-rc0.pre3 to 0.8.1.post2" + - "Update TRT configurations" + - "Refactor configurations to use CONFIG_FILE-based recipes instead of inline parameter settings" + - "Introduce srt-slurm workflow for launching Dynamo jobs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/510 - config-keys: - dsr1-fp8-mi355x-sglang @@ -417,21 +397,25 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/613 - config-keys: - - dsr1-fp8-b200-dynamo-trt + - dsr1-fp8-h200-dynamo-sglang description: - - "Introduce new DSR1 FP8 B200 Dynamo TRT configurations for 8k1k and 1k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/616 + - "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration" + - "Image: lmsysorg/sglang:v0.5.8-cu130-runtime" + - "Runner: h200-multinode-slurm with multinode and disagg enabled" + - "Recipes sourced from srtslurm repo (recipes/h200/)" + - "1k1k configs: aggregated, low-latency (1P9D), high-throughput TEP (1P6D), DEP (1P6D)" + - "8k1k configs: aggregated, TEP configs (1P7D, 1P6D, 1P3D, 2P3D), DEP (1P1D)" + - "Concurrency levels range from 1 to 2048 depending on configuration" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/582 - config-keys: - - dsr1-fp8-gb200-dynamo-trt + - dsr1-fp4-b200-trt description: - - "Add DeepSeek R1 FP8 GB200 Dynamo TRT-LLM disaggregated multinode configurations" - - "Image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2" - - "1k1k: 14 scenarios (7 MTP, 7 STP) with varying DP attention/TEP modes" - - "1k8k: 10 scenarios (5 MTP, 5 STP) for long output generation" - - "8k1k: 14 scenarios (7 MTP, 7 STP) for long context workloads" - - "Prefill workers: 1-5P, Decode workers: 1-4D, TP/EP: 8/16/32" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/617 + - "Update TensorRT-LLM container from release:1.1.0rc2.post2 to release:1.2.0rc6.post2" + - "Change default MOE backend from DEEPGEMM to TRTLLM" + - "Add dynamic piecewise CUDA graphs for 1k1k (TEP8 and CONC64)" + - "Update search space: remove EP=TP constraint, add TP=4 configurations, extend concurrency ranges" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/620 - config-keys: - dsr1-fp4-gb300-dynamo-trt @@ -442,15 +426,6 @@ - "Add gb300-nv runner and launch script for srt-slurm integration" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/618 -- config-keys: - - dsr1-fp4-b200-trt - description: - - "Update TensorRT-LLM container from release:1.1.0rc2.post2 to release:1.2.0rc6.post2" - - "Change default MOE backend from DEEPGEMM to TRTLLM" - - "Add dynamic piecewise CUDA graphs for 1k1k (TEP8 and CONC64)" - - "Update search space: remove EP=TP constraint, add TP=4 configurations, extend concurrency ranges" - pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/620 - - config-keys: - dsr1-fp4-mi355x-sglang-disagg description: @@ -458,20 +433,21 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/622 - config-keys: - - dsr1-fp8-b200-sglang-mtp + - dsr1-fp8-gb200-dynamo-trt description: - - "Add MTP (Multi-Token Prediction) support for DeepSeek R1 FP8 B200 SGLang using EAGLE speculative decoding" - - "Image: lmsysorg/sglang:v0.5.8-cu130-amd64" - - "Add benchmark script dsr1_fp8_b200_mtp.sh with EAGLE speculative decoding (num-steps=2, draft-tokens=3, topk=1)" - - "Update launch_b200-dgxc.sh to support SPEC_SUFFIX for MTP script selection" - - "Configurations: TP=8, EP=1, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/626 + - "Add DeepSeek R1 FP8 GB200 Dynamo TRT-LLM disaggregated multinode configurations" + - "Image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2" + - "1k1k: 14 scenarios (7 MTP, 7 STP) with varying DP attention/TEP modes" + - "1k8k: 10 scenarios (5 MTP, 5 STP) for long output generation" + - "8k1k: 14 scenarios (7 MTP, 7 STP) for long context workloads" + - "Prefill workers: 1-5P, Decode workers: 1-4D, TP/EP: 8/16/32" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/617 - config-keys: - - dsr1-fp8-gb300-dynamo-trt + - dsr1-fp8-gb200-dynamo-trt description: - - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 8k1k and 1k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/627 + - "Fix model_prefix argument in yaml configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/646 - config-keys: - dsr1-fp8-b200-trt-mtp @@ -483,39 +459,10 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/632 - config-keys: - - dsr1-fp4-gb200-dynamo-sglang + - dsr1-fp8-gb300-dynamo-trt description: - - "Update SGLang image from v0.5.5.post2 to v0.5.8-cu130" - - "Add FP4 model path separation via SRT_SLURM_MODEL_PREFIX in launch script" - - "Refactor to use CONFIG_FILE-based srt-slurm recipes instead of inline parameters" - - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" - - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/633 - -- config-keys: - - dsr1-fp8-gb200-dynamo-sglang - - dsr1-fp8-gb300-dynamo-sglang - description: - - "Update GB200 and GB300 configs for DSR1 FP8 SGLANG STP mode" - - "Image: lmsysorg/sglang:v0.5.8-cu130" - - "Update prefill/decode worker counts, TP/EP parallelism, and dp-attn settings for 1k1k and 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/635 - -- config-keys: - - dsr1-fp4-gb300-dynamo-sglang - description: - - "Add GB300 FP4 Dynamo SGLang disaggregated multinode configuration" - - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime" - - "Recipes sourced from srt-slurm repo (recipes/gb300-fp4/ folder)" - - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" - - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/636 - -- config-keys: - - dsr1-fp8-b300-dynamo-trt - description: - - "New B300 FP8 Dynamo TRT configurations" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/638 + - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 8k1k and 1k1k" + pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/627 - config-keys: - gptoss-fp4-b200-trt @@ -525,12 +472,21 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/639 - config-keys: - - dsr1-fp8-h200-dynamo-sglang + - dsr1-fp8-mi355x-sglang-disagg description: - - "Add MTP (EAGLE speculative decoding) configs alongside STP" - - "Update container to lmsysorg/sglang:v0.5.8.post1-cu130" - - "Remove aggregated configs, keep only disaggregated" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/640 + - "Add --use-chat-template argument to benchmark_serving script" + - "Without this arg, MTP acceptance rates are artificially high for DeepSeek with MTP" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/647 + +- config-keys: + - dsr1-fp8-b200-sglang-mtp + description: + - "Add MTP (Multi-Token Prediction) support for DeepSeek R1 FP8 B200 SGLang using EAGLE speculative decoding" + - "Image: lmsysorg/sglang:v0.5.8-cu130-amd64" + - "Add benchmark script dsr1_fp8_b200_mtp.sh with EAGLE speculative decoding (num-steps=2, draft-tokens=3, topk=1)" + - "Update launch_b200-dgxc.sh to support SPEC_SUFFIX for MTP script selection" + - "Configurations: TP=8, EP=1, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/626 - config-keys: - dsr1-fp4-b200-trt-mtp @@ -541,35 +497,18 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/642 - config-keys: - - dsr1-fp8-h100-dynamo-sglang - description: - - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang STP disaggregated multinode configurations" - - "Image: lmsysorg/sglang:v0.5.8-cu130" - - "1k1k, 1k8k, 8k1k sequence lengths" - - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/643 - -- config-keys: - - dsr1-fp8-h100-dynamo-sglang - description: - - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang MTP disaggregated multinode configurations" - - "Image: lmsysorg/sglang:v0.5.8-cu130" - - "1k1k, 1k8k, 8k1k sequence lengths with MTP speculative decoding" - - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/644 - -- config-keys: - - dsr1-fp8-gb200-dynamo-trt + - dsr1-fp8-b200-dynamo-sglang description: - - "Fix model_prefix argument in yaml configs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/646 + - "Add DSR1 FP8 B200 disaggregated SGLang multinode configuration" + - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" + - "9 recipes: 4x 1k1k + 5x 8k1k, low-latency and max-throughput profiles" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/658 - config-keys: - - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-gb300-dynamo-trt description: - - "Add --use-chat-template argument to benchmark_serving script" - - "Without this arg, MTP acceptance rates are artificially high for DeepSeek with MTP" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/649 + - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 1k8k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/654 - config-keys: - dsr1-fp8-h100-dynamo-trt @@ -578,18 +517,18 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/651 - config-keys: - - dsr1-fp8-gb300-dynamo-trt + - dsr1-fp8-h200-dynamo-sglang description: - - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 1k8k" - pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/654 + - "Add MTP (EAGLE speculative decoding) configs alongside STP" + - "Update container to lmsysorg/sglang:v0.5.8.post1-cu130" + - "Remove aggregated configs, keep only disaggregated" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/640 - config-keys: - - dsr1-fp8-b200-dynamo-sglang + - dsr1-fp8-b300-dynamo-trt description: - - "Add DSR1 FP8 B200 disaggregated SGLang multinode configuration" - - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" - - "9 recipes: 4x 1k1k + 5x 8k1k, low-latency and max-throughput profiles" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/658 + - "New B300 FP8 Dynamo TRT configurations" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/638 - config-keys: - dsr1-fp8-h100-dynamo-trt @@ -598,6 +537,25 @@ - "fix model_prefix bug from https://github.com/SemiAnalysisAI/InferenceX/pull/651" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/663 +- config-keys: + - dsr1-fp4-gb200-dynamo-sglang + description: + - "Update SGLang image from v0.5.5.post2 to v0.5.8-cu130" + - "Add FP4 model path separation via SRT_SLURM_MODEL_PREFIX in launch script" + - "Refactor to use CONFIG_FILE-based srt-slurm recipes instead of inline parameters" + - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" + - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/633 + +- config-keys: + - dsr1-fp8-gb200-dynamo-sglang + - dsr1-fp8-gb300-dynamo-sglang + description: + - "Update GB200 and GB300 configs for DSR1 FP8 SGLANG STP mode" + - "Image: lmsysorg/sglang:v0.5.8-cu130" + - "Update prefill/decode worker counts, TP/EP parallelism, and dp-attn settings for 1k1k and 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/635 + - config-keys: - dsr1-fp8-b200-dynamo-sglang-mtp description: @@ -607,13 +565,22 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/667 - config-keys: - - dsr1-fp4-b200-dynamo-sglang + - dsr1-fp8-h100-dynamo-sglang description: - - "Add DSR1 FP4 B200 Dynamo SGLang STP mode" - - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime" - - "1k1k configs: low-latency DEP (1P5D, 1P6D), max-throughput DEP (1P1D, 1P2D)" - - "8k1k configs: low-latency DEP/TEP (1P1D, 1P5D, 2P5D), TEP (1P1D), max-throughput DEP (7P2D)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/672 + - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang STP disaggregated multinode configurations" + - "Image: lmsysorg/sglang:v0.5.8-cu130" + - "1k1k, 1k8k, 8k1k sequence lengths" + - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/643 + +- config-keys: + - dsr1-fp8-h100-dynamo-sglang + description: + - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang MTP disaggregated multinode configurations" + - "Image: lmsysorg/sglang:v0.5.8-cu130" + - "1k1k, 1k8k, 8k1k sequence lengths with MTP speculative decoding" + - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/644 - config-keys: - dsr1-fp8-mi355x-atom-mtp @@ -624,6 +591,21 @@ - "Deepseek R1 with speculative decoding: 1k1k, 1k8k, 8k1k" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/673 +- config-keys: + - dsr1-fp4-b200-dynamo-sglang + description: + - "Add DSR1 FP4 B200 Dynamo SGLang STP mode" + - "Image: 
lmsysorg/sglang:v0.5.8.post1-cu130-runtime" + - "1k1k configs: low-latency DEP (1P5D, 1P6D), max-throughput DEP (1P1D, 1P2D)" + - "8k1k configs: low-latency DEP/TEP (1P1D, 1P5D, 2P5D), TEP (1P1D), max-throughput DEP (7P2D)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/672 + +- config-keys: + - dsr1-fp8-b200-dynamo-trt + description: + - "Introduce new DSR1 FP8 B200 Dynamo TRT configurations for 8k1k and 1k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/616 + - config-keys: - dsr1-fp8-mi355x-sglang-disagg - dsr1-fp4-mi355x-sglang-disagg @@ -642,17 +624,22 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/683 - config-keys: - - dsr1-fp8-b200-dynamo-trt + - dsr1-fp4-gb300-dynamo-sglang description: - - "Update max_num_tokens and max_batch_size for min-latency decode workers" - - "See srt-slurm recipe changes: https://github.com/ishandhanani/srt-slurm/pull/173" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/686 + - "Add GB300 FP4 Dynamo SGLang disaggregated multinode configuration" + - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime" + - "Recipes sourced from srt-slurm repo (recipes/gb300-fp4/ folder)" + - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" + - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/636 - config-keys: - - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-b200-dynamo-sglang-mtp description: - - "Add more sweep points for DSR1 FP8 both MTP and non-MTP 1k1k, 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/689 + - "Patches one missing concurrency point for " + - "DSR1 FP8 B200 disaggregated SGLang MTP multinode configuration. " + - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/691 - config-keys: - dsr1-fp8-b300-dynamo-trt @@ -661,25 +648,23 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/690 - config-keys: - - dsr1-fp8-b200-dynamo-sglang-mtp + - dsr1-fp8-mi355x-sglang-disagg description: - - "Patches one missing concurrency point for " - - "DSR1 FP8 B200 disaggregated SGLang MTP multinode configuration. 
" - - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/691 + - "Add more sweep points for DSR1 FP8 both MTP and non-MTP 1k1k, 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/689 - config-keys: - - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp8-b200-dynamo-trt description: - - "Add more sweep points for DSR1 FP4 both MTP and non-MTP 1k1k, 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/692 - + - "Update max_num_tokens and max_batch_size for min-latency decode workers" + - "See srt-slurm recipe changes: https://github.com/ishandhanani/srt-slurm/pull/173" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/686 - config-keys: - dsr1-fp8-mi325x-sglang description: - "Update MI325X DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.8" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/695 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/692 - config-keys: - dsr1-fp8-mi300x-sglang @@ -687,6 +672,12 @@ - "Update MI300X DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.8" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/696 +- config-keys: + - dsr1-fp4-mi355x-sglang-disagg + description: + - "Add more sweep points for DSR1 FP4 both MTP and non-MTP 1k1k, 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/692 + - config-keys: - dsr1-fp8-b200-dynamo-sglang-mtp description: @@ -741,15 +732,6 @@ - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/734 -- config-keys: - - kimik2.5-int4-b200-vllm - description: - - "Add Kimi-K2.5 INT4 vLLM benchmark for B200" - - "Model: moonshotai/Kimi-K2.5 with --mm-encoder-tp-mode data and --trust-remote-code" - - "Image: vllm/vllm-openai:v0.15.1" - - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/735 - - config-keys: - minimaxm2.5-fp8-mi355x-vllm description: @@ -761,13 +743,12 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/755 - config-keys: - - minimaxm2.5-fp8-b200-vllm + - qwen3.5-fp8-mi355x-sglang description: - - "Add MiniMax-M2.5 FP8 vLLM benchmark for B200" - - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" - - "Image: vllm/vllm-openai:v0.17.0" - - "TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/757 + - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for MI355X" + - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218" + - "Uses triton attention backend, TP=8, concurrency 4-64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/768 - config-keys: - qwen3.5-bf16-b200-sglang @@ -779,30 +760,12 @@ - "Set cuda-graph-max-bs to match concurrency, scheduler-recv-interval based on concurrency" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/758 -- config-keys: - - glm5-fp8-mi355x-sglang - description: - - "Add GLM-5 FP8 SGLang benchmark for MI355X" - - "Model: zai-org/GLM-5-FP8 with NSA tilelang backends" - - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219" - - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/762 - -- config-keys: - - qwen3.5-fp8-mi355x-sglang - description: - - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for MI355X" - - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218" - - "Uses triton attention 
backend, TP=8, concurrency 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/768 - - config-keys: - dsr1-fp8-mi355x-sglang-disagg description: - "Add more configs for MI355X FP8 Disagg" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/770 - - + - config-keys: - gptoss-fp4-mi300x-vllm - gptoss-fp4-mi325x-vllm @@ -811,6 +774,15 @@ - "Gains: ROCm skinny GEMM dispatch fix, MoRI EP all2all backend, KV cache shuffle + paged attention for AITER" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/781 +- config-keys: + - kimik2.5-int4-b200-vllm + description: + - "Add Kimi-K2.5 INT4 vLLM benchmark for B200" + - "Model: moonshotai/Kimi-K2.5 with --mm-encoder-tp-mode data and --trust-remote-code" + - "Image: vllm/vllm-openai:v0.15.1" + - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/735 + - config-keys: - gptoss-fp4-b200-vllm - gptoss-fp4-h100-vllm @@ -821,8 +793,7 @@ - "Gains: CUTLASS MoE optimizations (~8% throughput), FP4 kernel improvements (~4% E2E on B200), torch.compile cold-start fix" - "v0.15.1 includes fix for prefix cache hit rate of 0% on GPT-OSS hybrid attention models" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/789 - - + - config-keys: - dsr1-fp4-mi355x-atom - dsr1-fp4-mi355x-atom-mtp @@ -831,16 +802,16 @@ - "Comment out TP=4 configs, consolidate to TP=8 only" - "Extend concurrency range to conc-end: 256 across all sequence lengths (1k1k, 1k8k, 8k1k)" - "Fix MTP 1k8k conc-start from 256 to 4 to enable full concurrency sweep" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/792 - + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/699 + - config-keys: - - qwen3.5-fp8-b200-sglang + - glm5-fp8-mi355x-sglang description: - - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for B200" - - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" - - "Uses trtllm_mha attention backend and flashinfer_trtllm MOE runner" - - "Enable SGLANG_ENABLE_FLASHINFER_GEMM=true, NCCL_NVLS_ENABLE=1" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/804 + - "Add GLM-5 FP8 SGLang benchmark for MI355X" + - "Model: zai-org/GLM-5-FP8 with NSA tilelang backends" + - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219" + - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX - config-keys: - gptoss-fp4-mi300x-vllm @@ -864,61 +835,11 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/816 - config-keys: - - qwen3.5-fp4-b200-sglang - description: - - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang benchmark config and launch script" - - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" - - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" - - "Configs: 1k1k (TP4 conc 4-128), 8k1k (TP4 conc 4-128)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/820 - -- config-keys: - - dsr1-fp8-mi355x-sglang-disagg - - dsr1-fp8-mi355x-sglang-disagg-mtp - - dsr1-fp4-mi355x-sglang-disagg - - dsr1-fp4-mi355x-sglang-disagg-mtp - description: - - "Add more sweep configs for MI355X FP8/FP4 Disagg" - - "Add TP/DP/EP size < 8 support " - - "Support DSR1-0528 MTP Disagg" - - "Bump SGL mori image to Feb 27" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823 - -- config-keys: - - kimik2.5-fp4-mi355x-vllm - description: - - "Add Kimi-K2.5 MXFP4 vLLM benchmark for MI355X" - - "Model: amd/Kimi-K2.5-MXFP4 with --mm-encoder-tp-mode data and --trust-remote-code" - - "Image: 
vllm/vllm-openai-rocm:v0.15.1" - - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/825 - -- config-keys: - - minimaxm2.5-fp4-mi355x-vllm - description: - - "Add MiniMax M2.5 MXFP4 vLLM benchmark for MI355X" - - "Model: amd/MiniMax-M2.5-MXFP4 with --trust-remote-code and --block-size=32" - - "Image: vllm/vllm-openai-rocm:v0.19.1" - - "Environment: VLLM_ROCM_USE_AITER=1" - - "Tp=1, TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/827 - -- config-keys: - - minimaxm2.5-fp8-h200-vllm + - minimaxm2.5-fp8-h200-vllm description: - "Add MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP4)" - "New benchmark script with --trust-remote-code for MiniMaxAI/MiniMax-M2.5" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/831 - -- config-keys: - - minimaxm2.5-fp8-h100-vllm - description: - - "Add MiniMax-M2.5 FP8 vLLM benchmark for H100" - - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" - - "Image: vllm/vllm-openai:v0.16.0" - - "Switch from TP=8/EP=8 to TP=4/EP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k" - - "Script uses conditional --enable-expert-parallel based on EP_SIZE env var" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/832 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX - config-keys: - minimaxm2.5-fp8-mi325x-vllm @@ -940,13 +861,22 @@ - "TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/837 +- config-keys: + - kimik2.5-fp4-mi355x-vllm + description: + - "Add Kimi-K2.5 MXFP4 vLLM benchmark for MI355X" + - "Model: amd/Kimi-K2.5-MXFP4 with --mm-encoder-tp-mode data and --trust-remote-code" + - "Image: vllm/vllm-openai-rocm:v0.15.1" + - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/825 + - config-keys: - qwen3.5-bf16-mi325x-sglang description: - "Add Qwen3.5-397B-A17B BF16 SGLang benchmark for MI325X" - "Image: lmsysorg/sglang:v0.5.9-rocm720-mi30x" - "Uses triton attention backend, TP=8, concurrency 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/842 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX - config-keys: - qwen3.5-bf16-mi300x-sglang @@ -957,14 +887,13 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/843 - config-keys: - - kimik2.5-int4-h200-vllm + - qwen3.5-fp8-mi325x-sglang description: - - "Add Kimi-K2.5 INT4 vLLM benchmark for H200" - - "Model: moonshotai/Kimi-K2.5 with --reasoning-parser kimi_k2 and --trust-remote-code" - - "Image: vllm/vllm-openai:v0.16.0" + - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark for MI325X" + - "Image: lmsysorg/sglang:v0.5.9-rocm720-mi30x" + - "Following AMD Andy Luo's recipe with triton attention backend" - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - - "following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/847 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX - config-keys: - qwen3.5-fp8-mi300x-sglang @@ -976,23 +905,47 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/850 - config-keys: - - qwen3.5-fp8-mi325x-sglang + - kimik2.5-int4-h200-vllm description: - - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark for MI325X" - - "Image: lmsysorg/sglang:v0.5.9-rocm720-mi30x" - - 
"Following AMD Andy Luo's recipe with triton attention backend" + - "Add Kimi-K2.5 INT4 vLLM benchmark for H200" + - "Model: moonshotai/Kimi-K2.5 with --reasoning-parser kimi_k2 and --trust-remote-code" + - "Image: vllm/vllm-openai:v0.16.0" - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/852 + - "following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/839 + +- config-keys: + - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-mi355x-sglang-disagg-mtp + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp + description: + - "Add more sweep configs for MI355X FP8/FP4 Disagg" + - "Add TP/DP/EP size < 8 support " + - "Support DSR1-0528 MTP Disagg" + - "Bump SGL mori image to Feb 27" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823 - config-keys: - - gptoss-fp4-h200-trt + - minimaxm2.5-fp8-h100-vllm description: - - "Upgrade TensorRT-LLM container from release:gpt-oss-dev to release:v1.3.0rc5" - - "Remove sed hack for TensorRT bug (fixed upstream in v1.3.0rc5)" - - "Remove enable_block_reuse: false from kv_cache_config (default true is now recommended)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/854 + - "Add MiniMax-M2.5 FP8 vLLM benchmark for H100" + - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" + - "Image: vllm/vllm-openai:v0.16.0" + - "Switch from TP=8/EP=8 to TP=4/EP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k" + - "Script uses conditional --enable-expert-parallel based on EP_SIZE env var" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/832 - config-keys: + - qwen3.5-fp8-b200-sglang + description: + - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for B200" + - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" + - "Uses trtllm_mha attention backend and flashinfer_trtllm MOE runner" + - "Enable SGLANG_ENABLE_FLASHINFER_GEMM=true, NCCL_NVLS_ENABLE=1" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/804 + +- config-keys: - qwen3.5-fp8-h200-sglang description: - "Add Qwen 3.5 FP8 H200 SGLang configuration" @@ -1001,13 +954,6 @@ - "Server: reasoning-parser qwen3, tool-call-parser qwen3_coder, enable-flashinfer-allreduce-fusion, mem-fraction-static 0.8" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/855 -- config-keys: - - kimik2.5-fp4-b200-vllm - description: - - "Add Kimi K2.5 FP4 B200 vLLM benchmark configuration" - - "Image: vllm/vllm-openai:v0.17.0" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/862 - - config-keys: - dsr1-fp8-mi355x-sglang description: @@ -1016,18 +962,20 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/865 - config-keys: - - minimaxm2.5-fp8-h200-vllm - description: - - "Extend MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP8)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/869 - - -- config-keys: - - dsr1-fp8-h200-sglang + - qwen3.5-bf16-b200-sglang + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-bf16-mi355x-sglang + - qwen3.5-fp8-b200-sglang + - qwen3.5-fp8-h200-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + - qwen3.5-fp8-mi355x-sglang description: - - "Update H200 DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.9" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/887 - + - "Redo qwen eval" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/892 + evals-only: true + - 
config-keys: - gptoss-fp4-mi300x-vllm - gptoss-fp4-mi325x-vllm @@ -1039,31 +987,23 @@ - "Add AMDGCN_USE_BUFFER_OPS=0 and VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 env vars" - "Switch to --attention-backend ROCM_AITER_UNIFIED_ATTN and add fuse_rope_kvcache compilation pass" - "Remove deprecated VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION/VLLM_ROCM_USE_AITER_MHA env vars and compilation-config cudagraph_mode" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/889 - + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/867 + - config-keys: - - qwen3.5-bf16-b200-sglang - - qwen3.5-bf16-mi300x-sglang - - qwen3.5-bf16-mi325x-sglang - - qwen3.5-bf16-mi355x-sglang - - qwen3.5-fp8-b200-sglang - - qwen3.5-fp8-h200-sglang - - qwen3.5-fp8-mi300x-sglang - - qwen3.5-fp8-mi325x-sglang - - qwen3.5-fp8-mi355x-sglang + - kimik2.5-fp4-b200-vllm description: - - "Redo qwen eval" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/892 - evals-only: true - + - "Add Kimi K2.5 FP4 B200 vLLM benchmark configuration" + - "Image: vllm/vllm-openai:v0.17.0" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/862 - config-keys: - - qwen3.5-fp8-b200-sglang-mtp + - minimaxm2.5-fp8-b200-vllm description: - - "Add Single Node Agg FP8 MTP config for Qwen3.5 B200 SGLang" - - "EAGLE speculative decoding: num-steps 3, draft-tokens 4, topk 1" - - "New script: benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/898 + - "Add MiniMax-M2.5 FP8 vLLM benchmark for B200" + - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" + - "Image: vllm/vllm-openai:v0.17.0" + - "TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/757 - config-keys: - dsr1-fp4-mi355x-sglang-disagg @@ -1074,12 +1014,11 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/899 - config-keys: - - kimik2.5-int4-mi325x-vllm + - minimaxm2.5-fp8-h200-vllm description: - - "Add Kimi K2.5 INT4 single-node MI325X vLLM benchmark (TP8)" - - "Uses vLLM ROCm v0.16.0 image following AMD Andy Luo's recipe" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/901 - + - "Extend MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP8)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/869 + - config-keys: - dsr1-fp8-b200-dynamo-sglang - dsr1-fp8-b200-dynamo-sglang-mtp @@ -1089,66 +1028,6 @@ - "14 variants: STP/MTP x low-latency/max-throughput with updated concurrencies and scale points" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/907 -- config-keys: - # NVIDIA single-node - - dsr1-fp4-b200-sglang - - dsr1-fp4-b200-trt - - dsr1-fp4-b200-trt-mtp - - dsr1-fp8-b200-sglang - - dsr1-fp8-b200-sglang-mtp - - dsr1-fp8-b200-trt - - dsr1-fp8-b200-trt-mtp - - dsr1-fp8-h200-sglang - - dsr1-fp8-h200-trt - - dsr1-fp8-h200-trt-mtp - - glm5-fp8-b200-sglang - - glm5-fp8-h200-sglang - - gptoss-fp4-b200-trt - - gptoss-fp4-b200-vllm - - gptoss-fp4-h100-vllm - - gptoss-fp4-h200-trt - - gptoss-fp4-h200-vllm - - kimik2.5-fp4-b200-vllm - - kimik2.5-int4-b200-vllm - - kimik2.5-int4-h200-vllm - - minimaxm2.5-fp8-b200-vllm - - minimaxm2.5-fp8-h100-vllm - - minimaxm2.5-fp8-h200-vllm - - qwen3.5-bf16-b200-sglang - - qwen3.5-fp8-b200-sglang - - qwen3.5-fp8-b200-sglang-mtp - - qwen3.5-fp8-h200-sglang - # AMD single-node - - dsr1-fp4-mi355x-atom - - dsr1-fp4-mi355x-atom-mtp - - dsr1-fp4-mi355x-sglang - - dsr1-fp8-mi325x-sglang - - dsr1-fp8-mi300x-sglang - - dsr1-fp8-mi355x-atom - - 
dsr1-fp8-mi355x-atom-mtp - - dsr1-fp8-mi355x-sglang - - glm5-fp8-mi355x-sglang - - gptoss-fp4-mi300x-vllm - - gptoss-fp4-mi325x-vllm - - gptoss-fp4-mi355x-atom - - gptoss-fp4-mi355x-vllm - - kimik2.5-fp4-mi355x-vllm - - kimik2.5-int4-mi325x-vllm - - kimik2.5-int4-mi355x-vllm - - minimaxm2.5-fp8-mi300x-vllm - - minimaxm2.5-fp8-mi325x-vllm - - minimaxm2.5-fp8-mi355x-vllm - - qwen3.5-bf16-mi300x-sglang - - qwen3.5-bf16-mi325x-sglang - - qwen3.5-bf16-mi355x-sglang - - qwen3.5-fp8-mi300x-sglang - - qwen3.5-fp8-mi325x-sglang - - qwen3.5-fp8-mi355x-sglang - description: - - "Separate evals, change to 8k1k, fail loudly, 5-shot, top of curve & middle of curve" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/911 - evals-only: true - - config-keys: - glm5-fp8-h200-sglang description: @@ -1164,40 +1043,17 @@ - "Add GLM-5 FP8 SGLang benchmark for B200" - "Supports TP8 (low latency) and DEP8 (high throughput) modes with NSA attention backend" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/915 - - + - config-keys: - - qwen3.5-fp8-b200-sglang + - qwen3.5-fp8-b200-sglang-mtp description: - - "Replace FP8 with combination of TP4 and TP8 config" - - "Add --enable-flashinfer-allreduce-fusion to TP8" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/918 + - "Add Single Node Agg FP8 MTP config for Qwen3.5 B200 SGLang" + - "EAGLE speculative decoding: num-steps 3, draft-tokens 4, topk 1" + - "New script: benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/898 - config-keys: - - dsr1-fp8-b200-dynamo-trt - - dsr1-fp8-h200-dynamo-trt - - dsr1-fp4-gb200-dynamo-trt - description: - - "Fix metadata inconsistencies in nvidia-master.yaml - TP/EP/DP-attn values now match actual recipe files" - - "B200 FP8 TRT 8K/1K: prefill_ep 8→1 (15 entries), prefill_dp_attn true→false (1 entry)" - - "H200 FP8 TRT 1K/1K: prefill_dp_attn false→true (9 entries)" - - "H200 FP8 TRT 8K/1K: prefill_dp_attn true→false (8 entries)" - - "GB200 FP4 TRT 8K/1K: decode_dp_attn true→false (2 entries)" - - "All fixes are metadata-only; no recipe files were modified" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/919 - -- config-keys: - - kimik2.5-int4-mi325x-vllm - - kimik2.5-int4-mi355x-vllm - - kimik2.5-int4-h200-vllm - - kimik2.5-fp4-mi355x-vllm - - kimik2.5-fp4-b200-vllm - description: - - "Disable prefix caching (--no-enable-prefix-caching) for all Kimi K2.5 benchmarks using random datasets" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/926 - -- config-keys: - - minimaxm2.5-fp8-mi355x-vllm + - minimaxm2.5-fp8-mi355x-vllm description: - "ADD minimax TP=8 with EP, in config of 1k1k, 1k8k, and 8k1k sequence lengths" - "Config concurrency: 32-256" @@ -1229,7 +1085,13 @@ - "Add --exclusive flag to MI355X single-node salloc and multi-node sbatch to prevent node sharing during benchmarks" - "Only non-TP8 configs listed; TP8 already uses all GPUs on the node" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/934 - + +- config-keys: + - qwen3.5-fp8-b200-sglang + description: + - "Replace FP8 with combination of TP4 and TP8 config" + - "Add --enable-flashinfer-allreduce-fusion to TP8" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/918 - config-keys: - kimik2.5-int4-b200-vllm @@ -1237,15 +1099,6 @@ - "Enable VLLM_USE_FLASHINFER_MOE_INT4=1 for Kimi K2.5 INT4 B200 benchmark" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/935 -- config-keys: - - kimik2.5-fp4-mi355x-vllm - description: - - "Upgrade vLLM 
ROCm image from v0.16.0 to v0.18.0" - - "Enable AITER with INT4 quick reduce; disable AITER RMSNorm for TP < 8 (accuracy)" - - "Add expert parallel, TP4, and TP4/EP4 search spaces" - - "Switch block-size 64 to 1 gpu-memory-utilization 0.95 to 0.90" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/936 - - config-keys: - dsr1-fp4-b200-sglang - dsr1-fp8-b200-sglang @@ -1258,15 +1111,22 @@ - "dsr1-fp8-b200-sglang-mtp: v0.5.8-cu130-amd64 → v0.5.9-cu130" - "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943 - + +- config-keys: + - minimaxm2.5-fp8-mi325x-vllm + description: + - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" + - "Replace TP4 with TP8/EP8, add conc range 4-256" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/953 - config-keys: - - minimaxm2.5-fp8-b200-vllm + - kimik2.5-fp4-mi355x-vllm description: - - "Update vLLM image from v0.17.0 to v0.19.0 for MiniMax-M2.5 FP8 B200" - - "Add tp4 ep4 search-space entries (conc 32-256) for all seq-len configs" - - "Remove ISL 1024 / OSL 8192 seq-len config" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947 + - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" + - "Enable AITER with INT4 quick reduce; disable AITER RMSNorm for TP < 8 (accuracy)" + - "Add expert parallel, TP4, and TP4/EP4 search spaces" + - "Switch block-size 64 to 1 gpu-memory-utilization 0.95 to 0.90" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/936 - config-keys: - kimik2.5-int4-mi355x-vllm @@ -1277,13 +1137,6 @@ - "Add --max-num-seqs 256, remove --disable-log-requests" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/950 -- config-keys: - - minimaxm2.5-fp8-mi325x-vllm - description: - - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" - - "Replace TP4 with TP8/EP8, add conc range 4-256" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/953 - - config-keys: - kimik2.5-int4-mi325x-vllm description: @@ -1293,13 +1146,6 @@ - "Add --max-num-seqs 256, remove --disable-log-requests" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/957 -- config-keys: - - minimaxm2.5-fp8-h100-vllm - - minimaxm2.5-fp8-h200-vllm - description: - - "Update vLLM image from v0.16.0 to v0.18.0 for minimax h100 and h200 configs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/958 - - config-keys: - gptoss-fp4-h100-vllm - gptoss-fp4-h200-vllm @@ -1307,6 +1153,16 @@ - "Update vLLM image from v0.15.1 to v0.18.0 for gptoss H100 and H200 configs" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/960 +- config-keys: + - kimik2.5-int4-mi325x-vllm + - kimik2.5-int4-mi355x-vllm + - kimik2.5-int4-h200-vllm + - kimik2.5-fp4-mi355x-vllm + - kimik2.5-fp4-b200-vllm + description: + - "Disable prefix caching (--no-enable-prefix-caching) for all Kimi K2.5 benchmarks using random datasets" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/926 + - config-keys: - minimaxm2.5-fp8-b200-vllm - minimaxm2.5-fp8-h100-vllm @@ -1318,6 +1174,66 @@ - "Disable prefix caching (--no-enable-prefix-caching) for all MiniMax benchmarks using random datasets" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/966 +- config-keys: + # NVIDIA single-node + - dsr1-fp4-b200-sglang + - dsr1-fp4-b200-trt + - dsr1-fp4-b200-trt-mtp + - dsr1-fp8-b200-sglang + - dsr1-fp8-b200-sglang-mtp + - dsr1-fp8-b200-trt + - dsr1-fp8-b200-trt-mtp + - dsr1-fp8-h200-sglang + - dsr1-fp8-h200-trt + - dsr1-fp8-h200-trt-mtp + - glm5-fp8-b200-sglang + - glm5-fp8-h200-sglang + - 
gptoss-fp4-b200-trt + - gptoss-fp4-b200-vllm + - gptoss-fp4-h100-vllm + - gptoss-fp4-h200-trt + - gptoss-fp4-h200-vllm + - kimik2.5-fp4-b200-vllm + - kimik2.5-int4-b200-vllm + - kimik2.5-int4-h200-vllm + - minimaxm2.5-fp8-b200-vllm + - minimaxm2.5-fp8-h100-vllm + - minimaxm2.5-fp8-h200-vllm + - qwen3.5-bf16-b200-sglang + - qwen3.5-fp8-b200-sglang + - qwen3.5-fp8-b200-sglang-mtp + - qwen3.5-fp8-h200-sglang + # AMD single-node + - dsr1-fp4-mi355x-atom + - dsr1-fp4-mi355x-atom-mtp + - dsr1-fp4-mi355x-sglang + - dsr1-fp8-mi325x-sglang + - dsr1-fp8-mi300x-sglang + - dsr1-fp8-mi355x-atom + - dsr1-fp8-mi355x-atom-mtp + - dsr1-fp8-mi355x-sglang + - glm5-fp8-mi355x-sglang + - gptoss-fp4-mi300x-vllm + - gptoss-fp4-mi325x-vllm + - gptoss-fp4-mi355x-atom + - gptoss-fp4-mi355x-vllm + - kimik2.5-fp4-mi355x-vllm + - kimik2.5-int4-mi325x-vllm + - kimik2.5-int4-mi355x-vllm + - minimaxm2.5-fp8-mi300x-vllm + - minimaxm2.5-fp8-mi325x-vllm + - minimaxm2.5-fp8-mi355x-vllm + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-bf16-mi355x-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + - qwen3.5-fp8-mi355x-sglang + description: + - "Separate evals, change to 8k1k, fail loudly, 5-shot, top of curve & middle of curve" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/911 + evals-only: true + - config-keys: - qwen3.5-bf16-mi300x-sglang - qwen3.5-bf16-mi325x-sglang @@ -1335,13 +1251,6 @@ - "Image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/973 -- config-keys: - - kimik2.5-int4-mi300x-vllm - description: - - "Add Kimi K2.5 INT4 single-node MI300X vLLM benchmark (TP8)" - - "Uses vLLM ROCm v0.18.0 image following AMD Andy Luo's recipe" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/975 - - config-keys: - dsr1-fp8-mi355x-atom-mtp description: @@ -1355,45 +1264,22 @@ description: - "New model support on ATOM framework" - "Kimi-K2.5 FP4, and MiniMax-M2.5 FP8 configs added for MI355X ATOM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/992 - -- config-keys: - - minimaxm2.5-fp4-b200-vllm - description: - - "Optimize MiniMax-M2.5 NVFP4 B200 vLLM search-space" - - "Expand from tp2/tp4 to tp1/tp2/tp4/tp8 with expert parallel and dp-attn variants" - - "Add ep2, ep4, and dp-attn configurations for higher concurrency sweeps" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/996 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/963 - config-keys: - - dsr1-fp4-b200-dynamo-trt - - dsr1-fp8-b200-dynamo-trt - - dsr1-fp4-b200-dynamo-sglang - - dsr1-fp8-b200-dynamo-sglang - - dsr1-fp8-b200-dynamo-sglang-mtp - - dsr1-fp4-b200-dynamo-sglang-mtp - - dsr1-fp4-b300-dynamo-trt - - dsr1-fp8-b300-dynamo-trt - - dsr1-fp4-gb300-dynamo-trt - - dsr1-fp8-gb300-dynamo-trt - - dsr1-fp4-gb300-dynamo-sglang - - dsr1-fp8-gb300-dynamo-sglang - - dsr1-fp8-mi355x-sglang-disagg - - dsr1-fp8-mi355x-sglang-disagg-mtp - - dsr1-fp4-mi355x-sglang-disagg - - dsr1-fp4-mi355x-sglang-disagg-mtp + - minimaxm2.5-fp8-b200-vllm description: - - "Add multi-node lm-eval accuracy runs" - - "Eval picks the config with highest max eligible concurrency per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) group on 8k1k" - - "Eval concurrency set to the median eligible conc (>= MIN_EVAL_CONC=16) of the selected config to avoid OOM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1000 - evals-only: true + - "Update vLLM image from v0.17.0 to v0.19.0 for 
MiniMax-M2.5 FP8 B200" + - "Add tp4 ep4 search-space entries (conc 32-256) for all seq-len configs" + - "Remove ISL 1024 / OSL 8192 seq-len config" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947 - config-keys: - - qwen3.5-fp8-h200-sglang-mtp + - minimaxm2.5-fp8-mi355x-vllm description: - - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1001 + - "Optimize MiniMax-M2.5 FP8 MI355X vLLM search-space" + - "Add tp2 ep2 search-space entries (conc 2-256) for all seq-len configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1002 - config-keys: - minimaxm2.5-fp8-mi355x-vllm @@ -1403,14 +1289,28 @@ - "Upgrade vLLM image to v0.19.0" - "Enable FP8 KV cache + AITER FA for minimaxm2.5-fp8-mi355x-vllm" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1003 - + +- config-keys: + - minimaxm2.5-fp4-mi355x-vllm + description: + - "Add MiniMax M2.5 MXFP4 vLLM benchmark for MI355X" + - "Model: amd/MiniMax-M2.5-MXFP4 with --trust-remote-code and --block-size=32" + - "Image: vllm/vllm-openai-rocm:v0.19.1" + - "Environment: VLLM_ROCM_USE_AITER=1" + - "Tp=1, TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/827 - config-keys: - - qwen3.5-fp4-mi355x-sglang + - qwen3.5-fp8-h200-sglang-mtp description: - - "Qwen3.5 fp4 support on SGL" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1006 + - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1001 +- config-keys: + - glm5-fp8-mi355x-atom + description: + - "GLM5 FP8 configs added for MI355X ATOM" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1009 - config-keys: - kimik2.5-fp4-gb200-dynamo-vllm @@ -1425,18 +1325,39 @@ - "Runner script updated to clone NVIDIA/srt-slurm and map vLLM container image" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1008 -- config-keys: - - glm5-fp8-mi355x-atom - description: - - "GLM5 FP8 configs added for MI355X ATOM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1009 - - config-keys: - minimaxm2.5-fp8-b200-vllm description: - "Update MiniMax-M2.5 FP8 B200 config with new search spaces" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1010 +- config-keys: + - minimaxm2.5-fp4-b200-vllm + description: + - "Optimize MiniMax-M2.5 NVFP4 B200 vLLM search-space" + - "Expand from tp2/tp4 to tp1/tp2/tp4/tp8 with expert parallel and dp-attn variants" + - "Add ep2, ep4, and dp-attn configurations for higher concurrency sweeps" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/996 + +- config-keys: + - qwen3.5-fp4-b200-sglang + description: + - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang benchmark config and launch script" + - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" + - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" + - "Configs: 1k1k (TP4 conc 4-128), 8k1k (TP4 conc 4-128)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/820 + +- config-keys: + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + description: + - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI300X and MI325X to achieve better performance" + - "Use lmsysorg/sglang:v0.5.10-rocm720-mi30x" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1063 + - config-keys: - glm5-fp4-b200-sglang description: @@ -1447,25 +1368,31 @@ 
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1011 - config-keys: - - glm5-fp8-b200-sglang + - qwen3.5-fp4-mi355x-sglang description: - - "Bump GLM-5 FP8 B200 SGLang concurrency from 128 to 256" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1012 - - + - "Qwen3.5 fp4 support on SGL" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1006 + - config-keys: - - qwen3.5-fp8-h200-sglang-mtp + - gptoss-fp4-h200-trt description: - - "Enable SGLANG_ENABLE_SPEC_V2=1 for Qwen3.5 FP8 H200 SGLang MTP" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1017 + - "Upgrade TensorRT-LLM container from release:gpt-oss-dev to release:v1.3.0rc5" + - "Remove sed hack for TensorRT bug (fixed upstream in v1.3.0rc5)" + - "Remove enable_block_reuse: false from kv_cache_config (default true is now recommended)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/854 - config-keys: - - qwen3.5-fp4-mi355x-sglang + - glm5-fp8-b200-sglang + description: + - "Bump GLM-5 FP8 B200 SGLang concurrency from 128 to 256" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1012 + +- config-keys: + - qwen3.5-fp4-mi355x-sglang description: - "TP2/TP4 seach space exploration for Qwen3.5 fp4 on SGL" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1022 - - + - config-keys: - glm5-fp8-mi355x-sglang description: @@ -1474,28 +1401,10 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1023 - config-keys: - - kimik2.5-fp4-gb200-dynamo-trt - description: - - "Add Kimi K2.5 NVFP4 GB200 disaggregated TRT-LLM benchmarks via Dynamo (14 STP configs)" - - "New framework: dynamo-trt (Dynamo frontend + TensorRT-LLM backend)" - - "Container: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" - - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" - - "Runner script updated to support kimik2.5 model prefix with dynamo-trt framework" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1026 - -- config-keys: - - glm5-fp4-b200-sglang - description: - - "Update SGLang image from nightly-dev-cu13-20260328-a27651d5 to v0.5.10.post1-cu130" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1031 - -- config-keys: - - qwen3.5-fp8-b300-sglang-mtp + - qwen3.5-fp8-h200-sglang-mtp description: - - "Add Qwen3.5-397B-A17B-FP8 B300 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "EAGLE speculative decoding with MTP, TP=4, concurrency 4-256 for 1k1k and 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1035 + - "Enable SGLANG_ENABLE_SPEC_V2=1 for Qwen3.5 FP8 H200 SGLang MTP" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1017 - config-keys: - qwen3.5-fp8-mi355x-sglang @@ -1508,33 +1417,18 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1036 - config-keys: - - qwen3.5-fp8-mi355x-atom - - qwen3.5-fp8-mi355x-atom-mtp - description: - - "Add Qwen3.5-397B-A17B FP8 MI355X ATOM benchmark configs with and without MTP" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1040 - - -- config-keys: - - qwen3.5-fp4-mi355x-sglang - description: - - "Update SGLang image from 'lmsysorg/sglang:v0.5.10-rocm720-mi35x' to 'rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413'" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1041 - -- config-keys: - - glm5.1-fp4-mi355x-atom + - glm5-fp4-b200-sglang description: - - "Add GLM-5.1 MXFP4 single-node MI355X ATOM benchmark" - - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post" - - "TP=2 and 
TP=4, concurrency 4-256 for 1k1k and 8k1k sequence lengths" - - "Add --max-num-seqs and --gpu-memory-utilization 0.9 to server launch" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1043 + - "Update SGLang image from nightly-dev-cu13-20260328-a27651d5 to v0.5.10.post1-cu130" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1031 - config-keys: - - kimik2.5-fp4-b200-vllm + - qwen3.5-fp8-b300-sglang-mtp description: - - "Add kv-cache-dtype fp8, max-cudagraph-capture-size 2048, max-num-batched-tokens, and stream-interval 20 to server launch args" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1047 + - "Add Qwen3.5-397B-A17B-FP8 B300 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "EAGLE speculative decoding with MTP, TP=4, concurrency 4-256 for 1k1k and 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1035 - config-keys: - qwen3.5-fp8-b300-sglang @@ -1568,22 +1462,6 @@ - "At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1 does not have a B300-specific recipe, so this reuses the existing GLM5 FP8 B200 SGLang recipe as-is" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1051 -- config-keys: - - minimaxm2.5-fp8-b300-vllm - description: - - "Add MiniMax-M2.5 FP8 B300 vLLM benchmark" - - "Image: vllm/vllm-openai:v0.19.0-cu130" - - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP8 B200 vLLM recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1054 - -- config-keys: - - minimaxm2.5-fp4-b300-vllm - description: - - "Add MiniMax-M2.5 FP4 (NVFP4) B300 vLLM benchmark" - - "Image: vllm/vllm-openai:v0.19.0-cu130" - - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP4 B200 vLLM recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1055 - - config-keys: - glm5-fp4-b300-sglang description: @@ -1602,33 +1480,59 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1059 - config-keys: - - gptoss-fp4-mi300x-vllm + - minimaxm2.5-fp4-b300-vllm description: - - "Expand GPT-OSS 120B FP4 MI300X TP=1 concurrency from 64 to 256 for 1k1k" - - "Higher concurrency improves MoE weight amortization: 8552 total TPS at conc=256 vs 4016 at conc=64 (2.1x)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1061 + - "Add MiniMax-M2.5 FP4 (NVFP4) B300 vLLM benchmark" + - "Image: vllm/vllm-openai:v0.19.0-cu130" + - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP4 B200 vLLM recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1055 - config-keys: - - qwen3.5-bf16-mi300x-sglang - - qwen3.5-bf16-mi325x-sglang - - qwen3.5-fp8-mi300x-sglang - - qwen3.5-fp8-mi325x-sglang + - minimaxm2.5-fp8-b300-vllm description: - - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI300X and MI325X to achieve better performance" - - "Use lmsysorg/sglang:v0.5.10-rocm720-mi30x" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1063 + - "Add MiniMax-M2.5 FP8 B300 vLLM benchmark" + - "Image: vllm/vllm-openai:v0.19.0-cu130" + - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not 
have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP8 B200 vLLM recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1054 - config-keys: - - minimaxm2.5-fp8-b200-vllm + - kimik2.5-fp4-b300-vllm description: - - "Add VLLM_FLOAT32_MATMUL_PRECISION=high, remove VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1068 + - "Add Kimi-K2.5 FP4 (NVFP4) B300 vLLM benchmark" + - "Image: vllm/vllm-openai:v0.19.0-cu130" + - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html does not have a B300-specific recipe, so this reuses the existing Kimi-K2.5 FP4 B200 vLLM recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1056 - config-keys: - - minimaxm2.5-fp4-b200-vllm + - gptoss-fp4-mi300x-vllm description: - - "Add VLLM_FLOAT32_MATMUL_PRECISION=high" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1069 + - "Expand GPT-OSS 120B FP4 MI300X TP=1 concurrency from 64 to 256 for 1k1k" + - "Higher concurrency improves MoE weight amortization: 8552 total TPS at conc=256 vs 4016 at conc=64 (2.1x)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1053 + +- config-keys: + - dsr1-fp4-b200-dynamo-trt + - dsr1-fp8-b200-dynamo-trt + - dsr1-fp4-b200-dynamo-sglang + - dsr1-fp8-b200-dynamo-sglang + - dsr1-fp8-b200-dynamo-sglang-mtp + - dsr1-fp4-b200-dynamo-sglang-mtp + - dsr1-fp4-b300-dynamo-trt + - dsr1-fp8-b300-dynamo-trt + - dsr1-fp4-gb300-dynamo-trt + - dsr1-fp8-gb300-dynamo-trt + - dsr1-fp4-gb300-dynamo-sglang + - dsr1-fp8-gb300-dynamo-sglang + - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-mi355x-sglang-disagg-mtp + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp + description: + - "Add multi-node lm-eval accuracy runs" + - "Eval picks the config with highest max eligible concurrency per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) group on 8k1k" + - "Eval concurrency set to the median eligible conc (>= MIN_EVAL_CONC=16) of the selected config to avoid OOM" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1000 + evals-only: true - config-keys: - qwen3.5-fp4-b300-sglang @@ -1639,47 +1543,7 @@ - "Follows the SGLang cookbook recipe at https://cookbook.sglang.io/autoregressive/Qwen/Qwen3.5 as of 2026-04-17" - "Mirrors the B200 FP4 recipe with mem-fraction-static lowered to 0.8 and an extra TP2/EP2 search-space entry" - "Configs: 1k1k and 8k1k, TP4/EP1 conc 4-128 + TP2/EP2 conc 4-128" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1072 - -- config-keys: - - qwen3.5-bf16-b200-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B BF16 B200 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e" - - "Model: Qwen/Qwen3.5-397B-A17B" - - "Mirrors the qwen3.5-bf16-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-64 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1074 - -- config-keys: - - qwen3.5-fp4-b200-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6" - - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" - - "Mirrors the qwen3.5-fp4-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k and 8k1k, TP=4/EP=1 conc 4-128 with 
spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1075 - -- config-keys: - - qwen3.5-fp8-mi355x-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B FP8 MI355X SGLang MTP benchmark" - - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414" - - "Model: Qwen/Qwen3.5-397B-A17B-FP8" - - "Mirrors the qwen3.5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k (TP8/EP1, TP8/EP8, TP2/EP2) and 8k1k (TP2/EP2, TP4/EP1) with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1076 - -- config-keys: - - qwen3.5-bf16-mi355x-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B BF16 MI355X SGLang MTP benchmark" - - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - - "Model: Qwen/Qwen3.5-397B-A17B" - - "Mirrors the qwen3.5-bf16-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1077 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - qwen3.5-bf16-b300-sglang @@ -1689,7 +1553,17 @@ - "Model: Qwen/Qwen3.5-397B-A17B" - "Mirrors the B200 BF16 recipe with an extra TP4/EP1 search-space entry alongside the existing TP8/EP1 sweep" - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-64 + TP4/EP1 conc 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1081 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - qwen3.5-bf16-b200-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B BF16 B200 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e" + - "Model: Qwen/Qwen3.5-397B-A17B" + - "Mirrors the qwen3.5-bf16-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-64 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - qwen3.5-bf16-b300-sglang-mtp @@ -1699,7 +1573,7 @@ - "Model: Qwen/Qwen3.5-397B-A17B" - "Mirrors the qwen3.5-bf16-b300-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-64 + TP4/EP1 conc 4-64, spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1082 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - qwen3.5-fp4-b300-sglang-mtp @@ -1709,7 +1583,7 @@ - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" - "Mirrors the qwen3.5-fp4-b300-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - "Configs: 1k1k and 8k1k, TP4/EP1 conc 4-128 + TP2/EP2 conc 4-128, spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1083 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - glm5-fp8-b300-sglang-mtp @@ -1719,7 +1593,17 @@ - "Model: zai-org/GLM-5-FP8" - "Mirrors the glm5-fp8-b300-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1084 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - 
qwen3.5-bf16-mi355x-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B BF16 MI355X SGLang MTP benchmark" + - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" + - "Model: Qwen/Qwen3.5-397B-A17B" + - "Mirrors the qwen3.5-bf16-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - glm5-fp8-b200-sglang-mtp @@ -1729,7 +1613,27 @@ - "Model: zai-org/GLM-5-FP8" - "Mirrors the glm5-fp8-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1085 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - glm5-fp4-b300-sglang-mtp + description: + - "Add GLM-5 NVFP4 B300 SGLang MTP benchmark (draft)" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "Model: nvidia/GLM-5-NVFP4" + - "Follows the glm5-fp8-b300-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" + - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - qwen3.5-fp8-mi355x-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B FP8 MI355X SGLang MTP benchmark" + - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414" + - "Model: Qwen/Qwen3.5-397B-A17B-FP8" + - "Mirrors the qwen3.5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k (TP8/EP1, TP8/EP8, TP2/EP2) and 8k1k (TP2/EP2, TP4/EP1) with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - glm5-fp8-mi355x-sglang-mtp @@ -1739,27 +1643,27 @@ - "Model: zai-org/GLM-5-FP8" - "Mirrors the glm5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP=8 conc 4-64 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1086 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - - glm5-fp4-b200-sglang-mtp + - qwen3.5-fp4-b200-sglang-mtp description: - - "Add GLM-5 NVFP4 B200 SGLang MTP benchmark (draft)" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "Model: nvidia/GLM-5-NVFP4" - - "Follows the glm5-fp8-b200-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1087 + - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6" + - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" + - "Mirrors the qwen3.5-fp4-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k and 8k1k, TP=4/EP=1 conc 4-128 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - - glm5-fp4-b300-sglang-mtp + - 
glm5-fp4-b200-sglang-mtp description: - - "Add GLM-5 NVFP4 B300 SGLang MTP benchmark (draft)" + - "Add GLM-5 NVFP4 B200 SGLang MTP benchmark (draft)" - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - "Model: nvidia/GLM-5-NVFP4" - - "Follows the glm5-fp8-b300-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" + - "Follows the glm5-fp8-b200-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1088 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - gptoss-fp4-mi300x-vllm @@ -1768,6 +1672,12 @@ - "low-latency endpoint for users prioritizing interactive single-user use cases (chat, copilot, agentic)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1092 +- config-keys: + - kimik2.5-fp4-b200-vllm + description: + - "Add kv-cache-dtype fp8, max-cudagraph-capture-size 2048, max-num-batched-tokens, and stream-interval 20 to server launch args" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1047 + - config-keys: - dsr1-fp8-h200-dynamo-trt - dsr1-fp8-h200-dynamo-sglang @@ -1776,21 +1686,6 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1094 evals-only: true -- config-keys: - - glm5.1-fp4-mi355x-sglang - description: - - "Add GLM5.1 MXFP4 (FP4) MI355X SGLang Support" - - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1098 - -- config-keys: - - kimik2.5-fp4-b300-vllm - description: - - "Add Kimi-K2.5 FP4 (NVFP4) B300 vLLM benchmark" - - "Image: vllm/vllm-openai:v0.19.0-cu130" - - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html does not have a B300-specific recipe, so this reuses the existing Kimi-K2.5 FP4 B200 vLLM recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1100 - - config-keys: - minimaxm2.5-fp8-b300-vllm description: @@ -1804,11 +1699,16 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1107 - config-keys: - - dsr1-fp8-h100-dynamo-trt - - dsr1-fp8-h100-dynamo-sglang + - minimaxm2.5-fp8-b200-vllm description: - - "Trigger H100 multinode evals after dist-timeout and health-check timeout fixes" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1119 + - "Add VLLM_FLOAT32_MATMUL_PRECISION=high, remove VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1068 + +- config-keys: + - minimaxm2.5-fp4-b200-vllm + description: + - "Add VLLM_FLOAT32_MATMUL_PRECISION=high" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1069 - config-keys: - dsr1-fp8-h100-dynamo-trt @@ -1816,289 +1716,4 @@ description: - "Trigger H100 multinode evals after NVSHEMM fixes" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1120 - evals-only: true - -- config-keys: - - dsv4-fp4-gb200-dynamo-vllm - description: - - "Add DeepSeek-V4-Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (1k/1k sweep; 8k/1k currently commented out)" - - "Container: vllm/vllm-openai:deepseekv4-cu130; model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)" - - "Topologies: low-conc 1p1d-dep8-tep8 (4 nodes, mirrored from NVIDIA srt-slurm PR #71 with offload kept and numa-bind 
dropped); mid 1p1d-dep8-dep16 (6 nodes) and high 3p1d-dep8-dep16 (10 nodes) hand-rolled, structurally derived from the kimi-k2.5 1k/1k pattern" - - "Recipes stored under benchmarks/multi_node/srt-slurm-recipes/ and overlaid onto the upstream srt-slurm checkout at runtime" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129 - - -- config-keys: - - dsv4-fp8-h200-vllm - description: - - "Add DeepSeek-V4-Pro vLLM H200 benchmark per https://vllm.ai/blog/deepseek-v4" - - "Image: vllm/vllm-openai:deepseekv4-cu129" - - "Model: deepseek-ai/DeepSeek-V4-Pro" - - "EP + DP=8, FP8 KV cache, block size 256, max-model-len 800000, prefix caching disabled" - - "H200 has no FP4 path, so --attention_config.use_fp4_indexer_cache is omitted" - - "VLLM_ENGINE_READY_TIMEOUT_S=3600 to accommodate large weight loading" - - "Configs: 1k1k conc 4-64, 8k1k conc 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1130 - -- config-keys: - - dsv4-fp4-b200-sglang - description: - - "Add DeepSeek-V4-Pro single-node B200 SGLang benchmark (TP8, EP8, dp-attention)" - - "Container: lmsysorg/sglang:deepseek-v4-blackwell" - - "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - - "Parallelism and sweep conc ranges match the dsv4-fp4-b200-vllm config" - - "Prefix caching and speculative decoding disabled for baseline numbers" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131 - -- config-keys: - - dsv4-fp8-mi355x-sglang - description: - - "Day 0 DeepSeek-V4-Pro FP8 MI355X SGLang benchmark" - - "Image: rocm/sgl-dev:deepseek-v4-mi35x (from sgl-project/sglang#23608)" - - "Model: sgl-project/DeepSeek-V4-Pro-FP8" - - "https://github.com/sgl-project/sglang/pull/23608#issuecomment-4311952977" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1134 - -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark (low-latency fallback)" - - "Image: lmsysorg/sglang:deepseek-v4-b300" - - "Model: deepseek-ai/DeepSeek-V4-Pro" - - "Low-latency only (TP=8, EP=1, no DP-attn, no DeepEP) — DeepEP FP8 weight-postprocess path is broken for this checkpoint on B300" - - "Prefix caching disabled, no speculative decoding" - - "Configs: 1k1k conc 4-1024, 8k1k conc 4-512" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1143 - -- config-keys: - - dsv4-fp4-b300-vllm - description: - - "Add DeepSeek-V4-Pro single-node B300 vLLM aggregate benchmark" - - "Image: vllm/vllm-openai:deepseekv4-cu130" - - "Model: deepseek-ai/DeepSeek-V4-Pro" - - "Uses the submitted B300 pareto schedule for both 1k1k and 8k1k, excluding conc 1: TP8 at conc 4/128, TP4 at conc 4/8/16/32/64/128, DP4 at conc 256/512" - - "Launch args match the provided vllm serve command, including FP4 indexer cache, FULL_AND_PIECEWISE cudagraph config, and max-num-batched-tokens 2048" - - "1k1k uses --max-model-len 4096; 8k1k uses the workflow-provided benchmark context length" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1144 - -- config-keys: - - dsv4-fp8-mi355x-sglang - description: - - "Bump MI355X SLURM allocation from --time=180 to --time=300 in runners/launch_mi355x-amds.sh" - - "DSv4-Pro on MI355X exceeded the 3h cap (STEP CANCELLED DUE TO TIME LIMIT) due to ~30min MoE JIT compile plus slow torch-fallback kernels (SGLANG_HACK_FLASHMLA_BACKEND=torch et al.) 
from sgl-project/sglang#23608" - - "300 minutes matches the GH Actions outer timeout-minutes cap in benchmark-tmpl.yml" - - "Retriggering dsv4-fp8-mi355x-sglang" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1148 - -- config-keys: - - dsv4-fp8-mi355x-sglang - description: - - "Drop --mem-fraction-static 0.88 and --max-total-tokens from dsv4_fp8_mi355x.sh" - - "Bump --chunked-prefill-size from 4096 to 8192" - - "Retrigger dsv4-fp8-mi355x-sglang" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1160 - -- config-keys: - - dsv4-fp4-mi355x-atom - description: - - "Add DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 marker (single-sequence, TP=8, conc=1)" - - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base); ROCm/ATOM#650 overlaid at runtime via pip install --no-deps -e . from a pinned PR SHA (cdbff35) inside the benchmark script" - - "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to ROCm/triton@e491726 (RI3.5.x), which has matmul_ogs.py and routing.py (PR #650 imports both — upstream triton-lang/triton refactored matmul_ogs into matmul.py and removed routing) plus CDNA4MXScaleLayout and a target_info.py compatible with the image's bundled triton" - - "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us" - - "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)" - - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1165 - -- config-keys: - - dsv4-fp4-mi355x-atom - description: - - "Add DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 marker (single-sequence, TP=8, conc=1)" - - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base); ROCm/ATOM#650 overlaid at runtime via pip install --no-deps -e . 
from a pinned PR SHA (cdbff35) inside the benchmark script" - - "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to ROCm/triton@e491726 (RI3.5.x), which has matmul_ogs.py and routing.py (PR #650 imports both — upstream triton-lang/triton refactored matmul_ogs into matmul.py and removed routing) plus CDNA4MXScaleLayout and a target_info.py compatible with the image's bundled triton" - - "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us" - - "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)" - - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1170 - -- config-keys: - - dsv4-fp4-b300-sglang-mtp - description: - - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" - - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)" - - "Model: deepseek-ai/DeepSeek-V4-Pro" - - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4" - - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" - - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" - - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166 - -- config-keys: - - dsv4-fp4-b300-vllm - description: - - "Update search space based on B300 pareto sweep results" - - "ISL=1024: TP4 conc 4-128; DP4 (dp-attn) conc 256-4096; DP8 (dp-attn) conc 2048-8192" - - "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155 - -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "Recipe-per-CONC split for DeepSeek-V4-Pro on B300: low-latency (TP=8, EP=1), balanced (TP=4, EP=1) at conc=32, max-throughput (TP=4, EP=4, DP-attn + DeepEP) at conc=512, for both 1k1k and 8k1k" - - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - - "Image pinned to lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3" - - "DP-attention path enables SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 for better SWA eviction behavior" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1185 - -- config-keys: - - dsv4-fp4-b200-sglang - description: - - "Two-recipe dispatch for DeepSeek-V4-Pro on B200, selected by DP_ATTENTION knob: low-latency (TP=8, EP=1, flashinfer_mxfp4) for conc 1-32, DP-attention (TP=8, EP=8, DP-attn + DeepEP + mega_moe) for conc 64-{512,1024}. The DP-attention recipe uses identical flags across balanced and max-throughput CONC ranges; only --max-running-requests scales with CONC." 
- - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - - "Image pinned to lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b" - - "Adds SGLANG_OPT_* env knobs (SWA_SPLIT_LEAF_ON_INSERT, USE_JIT_NORM, USE_JIT_INDEXER_METADATA, USE_TOPK_V2, USE_CUSTOM_ALL_REDUCE_V2)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1187 - -- config-keys: - - dsv4-fp4-b300-sglang-mtp - description: - - "Pass --dsv4 to run_benchmark_serving so MTP benchmarks use the DSv4 chat template (PR #1153)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1182 - -- config-keys: - - dsv4-fp4-b300-vllm - description: - - Add low-latency configs and remove non-pareto configs - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1193 - -- config-keys: - - dsv4-fp4-b200-vllm - description: - - "Add DeepSeek-V4-Pro single-node B200 vLLM benchmark derived from B200 pareto sweep" - - "ISL=1024: TP8 conc 4-128; DP8 (dp-attn) conc 256-4096" - - "ISL=8192: TP8 conc 4-32; DP8 (dp-attn) conc 64-1024" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1156 - -- config-keys: - - dsv4-fp4-b300-sglang-mtp - description: - - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" - - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)" - - "Model: deepseek-ai/DeepSeek-V4-Pro" - - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4" - - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" - - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" - - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180 - -- config-keys: - - dsv4-fp8-mi355x-vllm - description: - - "Add vLLM DeepSeek-V4-Pro FP8 benchmark for MI355X with AITER-accelerated MLA decode (vllm-project/vllm#40889, stacked on #40871)" - - "Base image rocm/atom:rocm7.2.2 (MI355X ROCm 7.2.2, aiter with MLA decode); vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime via --no-deps overlay" - - "Key flags: --enforce-eager, --moe-backend triton_unfused, --kv-cache-dtype fp8, VLLM_ROCM_USE_AITER=1" - - "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k" - - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188 - -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.075, tokenizer-workers 8" - - "1k1k conc=512/1024: add mega_moe deepep backend with cuda-graph-max-bs 550, chunked-prefill 16384, max-running-requests 768" - - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 ep=4 entries" - - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179 - -- config-keys: - - dsv4-fp4-mi355x-atom - description: - - "Use ROCm/aiter#2916 mhc_pre device-allocation fix instead of disabling ATOM mhc_pre" - - 
"Patch installed aiter/ops/mhc.py at runtime to allocate mhc_pre intermediates on residual.device, preserving the aiter MHC fast path without rebuilding aiter" - - "Remove the ATOM deepseek_v4.py sed workaround that forced mhc_pre to torch fallback" - - "Keep dsv4-fp4-mi355x-atom at CONC=1 only; run 24953107645 showed high-concurrency DSv4 ATOM OOMs in PR #650 torch sparse-attention fallbacks before upstream AITER sparse-attention support lands" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1202 - -- config-keys: - - dsv4-fp4-b300-vllm-mtp - description: - - "Add preliminary vLLM MTP configs for DeepSeek-V4-Pro on B300" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1210 - -- config-keys: - - dsv4-fp4-b200-vllm - description: - - "Pin image to vllm/vllm-openai:v0.20.0-cu130 (was floating deepseekv4-cu130 tag); DeepGEMM is preinstalled in this image" - - "Use --attention_config.use_fp4_indexer_cache=True and --compilation-config {\"cudagraph_mode\": \"FULL_AND_PIECEWISE\", \"custom_ops\": [\"all\"]} for all configs" - - "Gate --moe-backend deep_gemm_mega_moe and --gpu-memory-utilization 0.85 on DP_ATTENTION=true per the v0.20.0 recipe" - - "Drop --pipeline-parallel-size 1; keep --no-enable-prefix-caching and --max-cudagraph-capture-size 2048" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1204 - -- config-keys: - - minimaxm2.5-fp4-mi355x-atom - description: - - "Add MiniMax-M2.5 MXFP4 MI355X Atom benchmark (rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post)" - - "Single-node sweep: TP1–TP8, 1k/1k and 8k/1k ISL/OSL" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1042 - -- config-keys: - - dsv4-fp4-gb200-dynamo-vllm - description: - - "DSV4-Pro FP4 GB200 dynamo-vLLM disagg against srt-slurm aflowers/vllm-gb200-v0.20.0" - - "Keeps the three validated 8k/1k points: low-latency 1P/1D TP8 conc=1, mid-curve 1P/1D DEP8 conc=256, and max-tpt 3P/1D DEP8 conc=4096" - - "All three recipes run NATS/etcd on a dedicated infra node and use compute-node local NVMe model weights via /mnt/numa1/models/deepseek-v4-pro/" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1163 - -- config-keys: - - dsv4-fp4-gb200-dynamo-vllm - description: - - "Add GB200 Dynamo vLLM MegaMOE max-throughput recipe at conc=4096" - - "Topology matches max-tpt: 3 prefill DEP8 workers and 1 decode DEP8 worker with dedicated NATS/etcd" - - "Uses deep_gemm_mega_moe on prefill/decode, TORCH_SYMMMEM=NVSHMEM, and no offload" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218 - -- config-keys: - - dsv4-fp4-gb200-dynamo-vllm - description: - - "Add GB200 Dynamo vLLM low-middle curve recipe at conc=256/512" - - "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd" - - "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218 - -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "Add conc=8192 recipe for 1k1k: deepep mega_moe backend with cuda-graph-max-bs 1088, max-running-requests 8192, mem-fraction-static 0.80, swa-full-tokens-ratio 0.3, tokenizer-worker-num 16" - - "conc=8192 enables SGLANG_OPT_USE_ONLINE_COMPRESS=1 and --stream-interval 30" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1209 - -- config-keys: - - dsv4-fp4-b300-vllm - description: - - "Change image to vllm/vllm-openai:v0.20.0-cu130" - - "Use Mega MoE for DEP configs" - pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1221 - -- config-keys: - - dsv4-fp4-b200-vllm-mtp - description: - - "Add preliminary vLLM MTP configs for DeepSeek-V4-Pro on B200" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1230 - -- config-keys: - - dsv4-fp4-gb200-dynamo-vllm - description: - - "Keep the GB200 Dynamo vLLM MegaMOE max-throughput recipe at 3P/1D DEP8 conc=4096" - - "Add GB200 Dynamo vLLM MegaMOE high-throughput recipe at 2P/1D DEP8 conc=4096" - - "Add GB200 Dynamo vLLM MegaMOE mid-curve recipe at 1P/1D DEP8 conc=256/512/1024" - - "Remove stale offload recipe copies and the old no-MegaMOE mid/max-throughput points from the GB200 Dynamo vLLM matrix" - - "Disable FlashInfer autotune on GB200 decode workers for accuracy stability, matching the srt-slurm recipe fix" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1223 - -- config-keys: - - dsv4-fp4-gb300-dynamo-sglang - description: - - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated SGLang benchmarks via Dynamo (1k/1k sweep; 8k/1k recipes shipped but commented out)" - - "Container: lmsysorg/sglang:deepseek-v4-grace-blackwell (linux/arm64); model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)" - - "Topologies mirror the dsv4-fp4-gb300-dynamo-vllm sibling: low-conc 1p1d-dep8-tep8 (4 nodes), mid 1p1d-dep8-dep16 (6 nodes), high 3p1d-dep8-dep16 (10 nodes). 4096 overlap between mid and high gives a topology-crossover A/B" - - "No upstream GB300 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB300 DSV4 SGLang aggregated). Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. 
Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157 + evals-only: true \ No newline at end of file From ff116c2102ca09df55e78145c6f20bae49358c48 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 30 Apr 2026 12:41:57 +0530 Subject: [PATCH 08/10] AMD GLM5 FP8 MTP Support on MI355X - Perf Change Log Signed-off-by: ajith-sirra-amd --- perf-changelog.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 29c72bfe0..3d69bb58d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,10 @@ +- config-keys: + - glm5-fp8-mi355x-sglang-mtp + description: + - "Add GLM5 FP8 MTP MI355X SGLang Support" + - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122 + - config-keys: - dsr1-fp8-h100-dynamo-trt - dsr1-fp8-h100-dynamo-sglang From cad10bc1ccdeb2a9e4c6206182b0a4f45e3cf5f6 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 30 Apr 2026 12:52:44 +0530 Subject: [PATCH 09/10] Recover Perf Change Log Signed-off-by: ajith-sirra-amd --- perf-changelog.yaml | 1726 +++++++++++++++++++++++++------------------ 1 file changed, 1020 insertions(+), 706 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3d69bb58d..4e5926a56 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,74 +1,3 @@ -- config-keys: - - glm5-fp8-mi355x-sglang-mtp - description: - - "Add GLM5 FP8 MTP MI355X SGLang Support" - - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122 - -- config-keys: - - dsr1-fp8-h100-dynamo-trt - - dsr1-fp8-h100-dynamo-sglang - description: - - "Trigger H100 multinode evals after dist-timeout and health-check timeout fixes" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD - -- config-keys: - - glm5.1-fp4-mi355x-sglang - description: - - "Add GLM5.1 MXFP4 (FP4) MI355X SGLang Support" - - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1098 - -- config-keys: - - kimik2.5-fp4-gb200-dynamo-trt - description: - - "Add Kimi K2.5 NVFP4 GB200 disaggregated TRT-LLM benchmarks via Dynamo (14 STP configs)" - - "New framework: dynamo-trt (Dynamo frontend + TensorRT-LLM backend)" - - "Container: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" - - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" - - "Runner script updated to support kimik2.5 model prefix with dynamo-trt framework" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1026 - -- config-keys: - - qwen3.5-fp4-mi355x-sglang - description: - - "Update SGLang image from 'lmsysorg/sglang:v0.5.10-rocm720-mi35x' to 'rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413'" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1041 - -- config-keys: - - kimik2.5-int4-mi300x-vllm - description: - - "Add Kimi K2.5 INT4 single-node MI300X vLLM benchmark (TP8)" - - "Uses vLLM ROCm v0.18.0 image following AMD Andy Luo's recipe" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX - -- config-keys: - - minimaxm2.5-fp8-h100-vllm - - minimaxm2.5-fp8-h200-vllm - description: - - "Update vLLM image from v0.16.0 to v0.18.0 for minimax h100 and h200 configs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX 
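For orientation, the single-node vLLM entries above (the Kimi K2.5 INT4 MI300X benchmark and the MiniMax image bumps) all describe the same pattern: an OpenAI-compatible vLLM server launched at a fixed tensor-parallel size and then swept over concurrencies by a client. A minimal sketch of such a launch is below; the flags, model id, and TP size are taken from the entries' own descriptions, while the port is an illustrative assumption and this is not the repository's checked-in benchmark script.

    # Hypothetical single-node vLLM launch (TP8) in the style of the Kimi K2.5 entries.
    # Flag names mirror those quoted in the changelog; the port is a placeholder.
    vllm serve moonshotai/Kimi-K2.5 \
      --tensor-parallel-size 8 \
      --trust-remote-code \
      --reasoning-parser kimi_k2 \
      --no-enable-prefix-caching \
      --port 8888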
- -- config-keys: - - dsr1-fp8-b200-dynamo-trt - - dsr1-fp8-h200-dynamo-trt - - dsr1-fp4-gb200-dynamo-trt - description: - - "Fix metadata inconsistencies in nvidia-master.yaml - TP/EP/DP-attn values now match actual recipe files" - - "B200 FP8 TRT 8K/1K: prefill_ep 8→1 (15 entries), prefill_dp_attn true→false (1 entry)" - - "H200 FP8 TRT 1K/1K: prefill_dp_attn false→true (9 entries)" - - "H200 FP8 TRT 8K/1K: prefill_dp_attn true→false (8 entries)" - - "GB200 FP4 TRT 8K/1K: decode_dp_attn true→false (2 entries)" - - "All fixes are metadata-only; no recipe files were modified" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/919 - -- config-keys: - - kimik2.5-int4-mi325x-vllm - description: - - "Add Kimi K2.5 INT4 single-node MI325X vLLM benchmark (TP8)" - - "Uses vLLM ROCm v0.16.0 image following AMD Andy Luo's recipe" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/857 - - config-keys: - 70b-fp8-*-vllm description: @@ -133,6 +62,12 @@ - "Extend concurrency to 128 for gptoss b200 TRT configurations" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/233 +- config-keys: + - gptoss-fp4-b200-trt + description: + - "Add benchmark script for GPTOSS FP4 B200 TRT-LLM" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/256 + - config-keys: - "*gb200-dynamo-sglang" description: @@ -156,18 +91,22 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/273 - config-keys: - - gptoss-fp4-b200-trt + - dsr1-fp4-b200-sglang + - dsr1-fp8-b200-sglang + - dsr1-fp8-h200-sglang description: - - "Add benchmark script for GPTOSS FP4 B200 TRT-LLM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/256 + - "Update NVIDIA DeepSeek sglang Docker image from v0.5.5 to v0.5.6" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/276 + - config-keys: - - dsr1-fp4-gb200-dynamo-trt - - dsr1-fp4-gb200-dynamo-sglang - - dsr1-fp8-gb200-dynamo-sglang + - gptoss-fp4-b200-vllm + - gptoss-fp4-h100-vllm + - gptoss-fp4-h200-vllm description: - - "Add more configurations for GB200 SGLang DSR1" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/335 + - "Update vLLM image from v0.11.2 to v0.13.0" + - "Add VLLM_MXFP4_USE_MARLIN=1 to H100 and H200 benchmark scripts" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/327 - config-keys: - dsr1-fp4-mi355x-sglang @@ -175,6 +114,22 @@ - "Update MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.6.post1" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/330 +- config-keys: + - dsr1-fp8-mi300x-sglang + - dsr1-fp8-mi325x-sglang + - dsr1-fp8-mi355x-sglang + description: + - Use upstream SGLang images on mi300, mi325 and mi355 for dsr1fp8 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/332 + +- config-keys: + - dsr1-fp4-gb200-dynamo-trt + - dsr1-fp4-gb200-dynamo-sglang + - dsr1-fp8-gb200-dynamo-sglang + description: + - "Add more configurations for GB200 SGLang DSR1" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/335 + - config-keys: - dsr1-fp4-gb200-dynamo-sglang - dsr1-fp8-gb200-dynamo-sglang @@ -187,31 +142,7 @@ description: - "Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.6.post2" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/369 - -- config-keys: - - dsr1-fp4-b200-sglang - - dsr1-fp8-b200-sglang - - dsr1-fp8-h200-sglang - description: - - "Update NVIDIA DeepSeek sglang Docker image from v0.5.5 to v0.5.6" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/276 - -- config-keys: - - gptoss-fp4-b200-vllm - - gptoss-fp4-h100-vllm - - 
gptoss-fp4-h200-vllm - description: - - "Update vLLM image from v0.11.2 to v0.13.0" - - "Add VLLM_MXFP4_USE_MARLIN=1 to H100 and H200 benchmark scripts" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/327 -- config-keys: - - dsr1-fp8-mi300x-sglang - - dsr1-fp8-mi325x-sglang - - dsr1-fp8-mi355x-sglang - description: - - Use upstream SGLang images on mi300, mi325 and mi355 for dsr1fp8 - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/332 - config-keys: - gptoss-fp4-gb200-dynamo-trt @@ -222,11 +153,14 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/387 - config-keys: - - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp4-b200-trt-mtp + - dsr1-fp8-b200-trt-mtp + - dsr1-fp8-h200-trt-mtp description: - - "Add PD disaggregation (1P2D) for Mi355X" - - "Includes with and without speculative decoding" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/348 + - Add MTP (Multi-Token Prediction) support for single-node TRT configs + - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/392 + - config-keys: - dsr1-fp4-mi355x-sglang @@ -234,21 +168,20 @@ - "Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.7" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/395 +- config-keys: + - dsr1-fp8-mi355x-sglang-disagg + description: + - "Add PD disaggregation (1P2D) for Mi355X" + - "Includes with and without speculative decoding" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/409 + - config-keys: - dsr1-fp8-b200-sglang description: - "Adds TP4 configurations to DSR1-FP8 B200 SGLang deployment experiments" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/411 - -- config-keys: - - dsr1-fp4-b200-trt-mtp - - dsr1-fp8-b200-trt-mtp - - dsr1-fp8-h200-trt-mtp - description: - - Add MTP (Multi-Token Prediction) support for single-node TRT configs - - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/392 - + + - config-keys: - dsr1-fp8-mi355x-atom - dsr1-fp4-mi355x-atom @@ -267,6 +200,22 @@ - "Add HIP_VISIBLE_DEVICES env var for Ray compatibility in vLLM 0.14+" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/496 +- config-keys: + - dsr1-fp4-gb200-dynamo-trt + description: + - "Update Dynamo TRT image from 0.5.1-rc0.pre3 to 0.8.1.post2" + - "Update TRT configurations" + - "Refactor configurations to use CONFIG_FILE-based recipes instead of inline parameter settings" + - "Introduce srt-slurm workflow for launching Dynamo jobs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/510 + +- config-keys: + - gptoss-fp4-mi300x-vllm + - gptoss-fp4-mi325x-vllm + description: + - "Fix AITER env vars for vLLM v0.14.0 on AMD MI300X and MI325X" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/535 + - config-keys: - dsr1-fp8-h200-sglang description: @@ -282,6 +231,7 @@ - "Set --attention-backend aiter for AMD aiter attention backend" - "Update chunked-prefill-size and max-prefill-tokens from 196608 to 131072" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/544 + - config-keys: - dsr1-fp8-mi325x-sglang description: @@ -294,11 +244,14 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/545 - config-keys: - - gptoss-fp4-mi300x-vllm - - gptoss-fp4-mi325x-vllm + - dsr1-fp8-h200-dynamo-trt description: - - "Fix AITER env vars for vLLM v0.14.0 on AMD MI300X and MI325X" - pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/535 + - "Add DSR1 FP8 H200 Dynamo TRT-LLM disaggregated multinode configuration" + - "Image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + - "Runner: h200-dgxc with multinode and disagg enabled" + - "Includes MTP and STP configurations for 1k1k and 8k1k sequence lengths" + - "Concurrency levels: 4, 8, 16, 32, 64, 128, 256, 512" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/570 - config-keys: - dsr1-fp8-mi355x-sglang @@ -307,6 +260,24 @@ - "Key fix: Disables mla persistent kernel when not using fp8 kv_cache (https://github.com/sgl-project/sglang/pull/17327)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/572 +- config-keys: + - dsr1-fp8-h200-dynamo-sglang + description: + - "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration" + - "Image: lmsysorg/sglang:v0.5.8-cu130-runtime" + - "Runner: h200-multinode-slurm with multinode and disagg enabled" + - "Recipes sourced from srtslurm repo (recipes/h200/)" + - "1k1k configs: aggregated, low-latency (1P9D), high-throughput TEP (1P6D), DEP (1P6D)" + - "8k1k configs: aggregated, TEP configs (1P7D, 1P6D, 1P3D, 2P3D), DEP (1P1D)" + - "Concurrency levels range from 1 to 2048 depending on configuration" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/582 + +- config-keys: + - dsr1-fp4-b300-dynamo-trt + description: + - "Add DSR1 FP4 B300 Dynamo TRT configurations" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/585 + - config-keys: # NVIDIA single-node - dsr1-fp4-b200-sglang @@ -336,27 +307,17 @@ - gptoss-fp4-mi355x-atom description: - Add official GSM8k eval results to GPT-OSS and DeepSeek R1 scenarios - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/558 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/587 evals-only: true - config-keys: - - dsr1-fp8-h200-sglang - description: - - "Update H200 DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.9" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX - -- config-keys: - - dsr1-fp4-b300-dynamo-trt + - dsr1-fp4-b200-dynamo-trt description: - - "Add DSR1 FP4 B300 Dynamo TRT configurations" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/585 + - "Update DSR1 FP4 B200 Dynamo TRT configurations" + - "Update TRTLLM version to 1.2.0rc6.post2" + - "Transform to use srt-slurm recipes" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/588 -- config-keys: - - dsr1-fp4-mi355x-sglang - description: - - "Update SGLang image from v0.5.7 to v0.5.8 for DeepSeek-R1 FP4 on MI355x" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/595 - - config-keys: - dsr1-fp8-b200-trt description: @@ -368,33 +329,14 @@ - "Update search space: remove EP=TP constraint, add TP=4 configurations, extend concurrency ranges" - "Add TLLM_OVERRIDE_LAYER_NUM=61 to avoid OOM errors" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/594 - -- config-keys: - - dsr1-fp4-b200-dynamo-trt - description: - - "Update DSR1 FP4 B200 Dynamo TRT configurations" - - "Update TRTLLM version to 1.2.0rc6.post2" - - "Transform to use srt-slurm recipes" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/588 -- config-keys: - - dsr1-fp8-h200-dynamo-trt - description: - - "Add DSR1 FP8 H200 Dynamo TRT-LLM disaggregated multinode configuration" - - "Image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" - - "Runner: h200-dgxc with multinode and disagg enabled" - - "Includes MTP and STP configurations for 1k1k and 8k1k sequence lengths" - - 
"Concurrency levels: 4, 8, 16, 32, 64, 128, 256, 512" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/570 - config-keys: - - dsr1-fp4-gb200-dynamo-trt + - dsr1-fp4-mi355x-sglang description: - - "Update Dynamo TRT image from 0.5.1-rc0.pre3 to 0.8.1.post2" - - "Update TRT configurations" - - "Refactor configurations to use CONFIG_FILE-based recipes instead of inline parameter settings" - - "Introduce srt-slurm workflow for launching Dynamo jobs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/510 + - "Update SGLang image from v0.5.7 to v0.5.8 for DeepSeek-R1 FP4 on MI355x" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/595 + - config-keys: - dsr1-fp8-mi355x-sglang @@ -404,25 +346,21 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/613 - config-keys: - - dsr1-fp8-h200-dynamo-sglang + - dsr1-fp8-b200-dynamo-trt description: - - "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration" - - "Image: lmsysorg/sglang:v0.5.8-cu130-runtime" - - "Runner: h200-multinode-slurm with multinode and disagg enabled" - - "Recipes sourced from srtslurm repo (recipes/h200/)" - - "1k1k configs: aggregated, low-latency (1P9D), high-throughput TEP (1P6D), DEP (1P6D)" - - "8k1k configs: aggregated, TEP configs (1P7D, 1P6D, 1P3D, 2P3D), DEP (1P1D)" - - "Concurrency levels range from 1 to 2048 depending on configuration" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/582 + - "Introduce new DSR1 FP8 B200 Dynamo TRT configurations for 8k1k and 1k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/616 - config-keys: - - dsr1-fp4-b200-trt + - dsr1-fp8-gb200-dynamo-trt description: - - "Update TensorRT-LLM container from release:1.1.0rc2.post2 to release:1.2.0rc6.post2" - - "Change default MOE backend from DEEPGEMM to TRTLLM" - - "Add dynamic piecewise CUDA graphs for 1k1k (TEP8 and CONC64)" - - "Update search space: remove EP=TP constraint, add TP=4 configurations, extend concurrency ranges" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/620 + - "Add DeepSeek R1 FP8 GB200 Dynamo TRT-LLM disaggregated multinode configurations" + - "Image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2" + - "1k1k: 14 scenarios (7 MTP, 7 STP) with varying DP attention/TEP modes" + - "1k8k: 10 scenarios (5 MTP, 5 STP) for long output generation" + - "8k1k: 14 scenarios (7 MTP, 7 STP) for long context workloads" + - "Prefill workers: 1-5P, Decode workers: 1-4D, TP/EP: 8/16/32" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/617 - config-keys: - dsr1-fp4-gb300-dynamo-trt @@ -433,6 +371,15 @@ - "Add gb300-nv runner and launch script for srt-slurm integration" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/618 +- config-keys: + - dsr1-fp4-b200-trt + description: + - "Update TensorRT-LLM container from release:1.1.0rc2.post2 to release:1.2.0rc6.post2" + - "Change default MOE backend from DEEPGEMM to TRTLLM" + - "Add dynamic piecewise CUDA graphs for 1k1k (TEP8 and CONC64)" + - "Update search space: remove EP=TP constraint, add TP=4 configurations, extend concurrency ranges" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/620 + - config-keys: - dsr1-fp4-mi355x-sglang-disagg description: @@ -440,21 +387,20 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/622 - config-keys: - - dsr1-fp8-gb200-dynamo-trt + - dsr1-fp8-b200-sglang-mtp description: - - "Add DeepSeek R1 FP8 GB200 Dynamo TRT-LLM disaggregated multinode configurations" - - "Image: 
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2" - - "1k1k: 14 scenarios (7 MTP, 7 STP) with varying DP attention/TEP modes" - - "1k8k: 10 scenarios (5 MTP, 5 STP) for long output generation" - - "8k1k: 14 scenarios (7 MTP, 7 STP) for long context workloads" - - "Prefill workers: 1-5P, Decode workers: 1-4D, TP/EP: 8/16/32" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/617 + - "Add MTP (Multi-Token Prediction) support for DeepSeek R1 FP8 B200 SGLang using EAGLE speculative decoding" + - "Image: lmsysorg/sglang:v0.5.8-cu130-amd64" + - "Add benchmark script dsr1_fp8_b200_mtp.sh with EAGLE speculative decoding (num-steps=2, draft-tokens=3, topk=1)" + - "Update launch_b200-dgxc.sh to support SPEC_SUFFIX for MTP script selection" + - "Configurations: TP=8, EP=1, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/626 - config-keys: - - dsr1-fp8-gb200-dynamo-trt + - dsr1-fp8-gb300-dynamo-trt description: - - "Fix model_prefix argument in yaml configs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/646 + - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 8k1k and 1k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/627 - config-keys: - dsr1-fp8-b200-trt-mtp @@ -466,10 +412,39 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/632 - config-keys: - - dsr1-fp8-gb300-dynamo-trt + - dsr1-fp4-gb200-dynamo-sglang description: - - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 8k1k and 1k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/627 + - "Update SGLang image from v0.5.5.post2 to v0.5.8-cu130" + - "Add FP4 model path separation via SRT_SLURM_MODEL_PREFIX in launch script" + - "Refactor to use CONFIG_FILE-based srt-slurm recipes instead of inline parameters" + - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" + - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/633 + +- config-keys: + - dsr1-fp8-gb200-dynamo-sglang + - dsr1-fp8-gb300-dynamo-sglang + description: + - "Update GB200 and GB300 configs for DSR1 FP8 SGLANG STP mode" + - "Image: lmsysorg/sglang:v0.5.8-cu130" + - "Update prefill/decode worker counts, TP/EP parallelism, and dp-attn settings for 1k1k and 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/635 + +- config-keys: + - dsr1-fp4-gb300-dynamo-sglang + description: + - "Add GB300 FP4 Dynamo SGLang disaggregated multinode configuration" + - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime" + - "Recipes sourced from srt-slurm repo (recipes/gb300-fp4/ folder)" + - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" + - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/636 + +- config-keys: + - dsr1-fp8-b300-dynamo-trt + description: + - "New B300 FP8 Dynamo TRT configurations" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/638 - config-keys: - gptoss-fp4-b200-trt @@ -479,21 +454,12 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/639 - config-keys: - - dsr1-fp8-mi355x-sglang-disagg - description: - - "Add --use-chat-template argument to benchmark_serving script" - - "Without this arg, MTP acceptance rates are artificially high for DeepSeek with MTP" - pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/647 - -- config-keys: - - dsr1-fp8-b200-sglang-mtp + - dsr1-fp8-h200-dynamo-sglang description: - - "Add MTP (Multi-Token Prediction) support for DeepSeek R1 FP8 B200 SGLang using EAGLE speculative decoding" - - "Image: lmsysorg/sglang:v0.5.8-cu130-amd64" - - "Add benchmark script dsr1_fp8_b200_mtp.sh with EAGLE speculative decoding (num-steps=2, draft-tokens=3, topk=1)" - - "Update launch_b200-dgxc.sh to support SPEC_SUFFIX for MTP script selection" - - "Configurations: TP=8, EP=1, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/626 + - "Add MTP (EAGLE speculative decoding) configs alongside STP" + - "Update container to lmsysorg/sglang:v0.5.8.post1-cu130" + - "Remove aggregated configs, keep only disaggregated" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/640 - config-keys: - dsr1-fp4-b200-trt-mtp @@ -504,18 +470,35 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/642 - config-keys: - - dsr1-fp8-b200-dynamo-sglang + - dsr1-fp8-h100-dynamo-sglang description: - - "Add DSR1 FP8 B200 disaggregated SGLang multinode configuration" - - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" - - "9 recipes: 4x 1k1k + 5x 8k1k, low-latency and max-throughput profiles" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/658 + - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang STP disaggregated multinode configurations" + - "Image: lmsysorg/sglang:v0.5.8-cu130" + - "1k1k, 1k8k, 8k1k sequence lengths" + - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/643 - config-keys: - - dsr1-fp8-gb300-dynamo-trt + - dsr1-fp8-h100-dynamo-sglang description: - - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 1k8k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/654 + - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang MTP disaggregated multinode configurations" + - "Image: lmsysorg/sglang:v0.5.8-cu130" + - "1k1k, 1k8k, 8k1k sequence lengths with MTP speculative decoding" + - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/644 + +- config-keys: + - dsr1-fp8-gb200-dynamo-trt + description: + - "Fix model_prefix argument in yaml configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/646 + +- config-keys: + - dsr1-fp8-mi355x-sglang-disagg + description: + - "Add --use-chat-template argument to benchmark_serving script" + - "Without this arg, MTP acceptance rates are artificially high for DeepSeek with MTP" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/649 - config-keys: - dsr1-fp8-h100-dynamo-trt @@ -524,18 +507,18 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/651 - config-keys: - - dsr1-fp8-h200-dynamo-sglang + - dsr1-fp8-gb300-dynamo-trt description: - - "Add MTP (EAGLE speculative decoding) configs alongside STP" - - "Update container to lmsysorg/sglang:v0.5.8.post1-cu130" - - "Remove aggregated configs, keep only disaggregated" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/640 + - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 1k8k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/654 - config-keys: - - dsr1-fp8-b300-dynamo-trt + - dsr1-fp8-b200-dynamo-sglang description: - - "New B300 FP8 Dynamo TRT 
configurations" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/638 + - "Add DSR1 FP8 B200 disaggregated SGLang multinode configuration" + - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" + - "9 recipes: 4x 1k1k + 5x 8k1k, low-latency and max-throughput profiles" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/658 - config-keys: - dsr1-fp8-h100-dynamo-trt @@ -544,25 +527,6 @@ - "fix model_prefix bug from https://github.com/SemiAnalysisAI/InferenceX/pull/651" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/663 -- config-keys: - - dsr1-fp4-gb200-dynamo-sglang - description: - - "Update SGLang image from v0.5.5.post2 to v0.5.8-cu130" - - "Add FP4 model path separation via SRT_SLURM_MODEL_PREFIX in launch script" - - "Refactor to use CONFIG_FILE-based srt-slurm recipes instead of inline parameters" - - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" - - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/633 - -- config-keys: - - dsr1-fp8-gb200-dynamo-sglang - - dsr1-fp8-gb300-dynamo-sglang - description: - - "Update GB200 and GB300 configs for DSR1 FP8 SGLANG STP mode" - - "Image: lmsysorg/sglang:v0.5.8-cu130" - - "Update prefill/decode worker counts, TP/EP parallelism, and dp-attn settings for 1k1k and 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/635 - - config-keys: - dsr1-fp8-b200-dynamo-sglang-mtp description: @@ -571,33 +535,6 @@ - "9 recipes: 4x 1k1k + 5x 8k1k, low-latency and max-throughput with EAGLE speculative decoding" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/667 -- config-keys: - - dsr1-fp8-h100-dynamo-sglang - description: - - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang STP disaggregated multinode configurations" - - "Image: lmsysorg/sglang:v0.5.8-cu130" - - "1k1k, 1k8k, 8k1k sequence lengths" - - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/643 - -- config-keys: - - dsr1-fp8-h100-dynamo-sglang - description: - - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang MTP disaggregated multinode configurations" - - "Image: lmsysorg/sglang:v0.5.8-cu130" - - "1k1k, 1k8k, 8k1k sequence lengths with MTP speculative decoding" - - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/644 - -- config-keys: - - dsr1-fp8-mi355x-atom-mtp - - dsr1-fp4-mi355x-atom-mtp - description: - - "Add DSR1 FP8/FP4 MI355X ATOM with MTP configuration" - - "Image: rocm/atom:rocm7.2.0-ubuntu24.04-pytorch2.9-atom0.1.1" - - "Deepseek R1 with speculative decoding: 1k1k, 1k8k, 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/673 - - config-keys: - dsr1-fp4-b200-dynamo-sglang description: @@ -608,10 +545,13 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/672 - config-keys: - - dsr1-fp8-b200-dynamo-trt + - dsr1-fp8-mi355x-atom-mtp + - dsr1-fp4-mi355x-atom-mtp description: - - "Introduce new DSR1 FP8 B200 Dynamo TRT configurations for 8k1k and 1k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/616 + - "Add DSR1 FP8/FP4 MI355X ATOM with MTP configuration" + - "Image: rocm/atom:rocm7.2.0-ubuntu24.04-pytorch2.9-atom0.1.1" + - "Deepseek R1 with speculative decoding: 1k1k, 1k8k, 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/673 - config-keys: - 
dsr1-fp8-mi355x-sglang-disagg @@ -631,22 +571,17 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/683 - config-keys: - - dsr1-fp4-gb300-dynamo-sglang + - dsr1-fp8-b200-dynamo-trt description: - - "Add GB300 FP4 Dynamo SGLang disaggregated multinode configuration" - - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime" - - "Recipes sourced from srt-slurm repo (recipes/gb300-fp4/ folder)" - - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" - - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/636 + - "Update max_num_tokens and max_batch_size for min-latency decode workers" + - "See srt-slurm recipe changes: https://github.com/ishandhanani/srt-slurm/pull/173" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/686 - config-keys: - - dsr1-fp8-b200-dynamo-sglang-mtp + - dsr1-fp8-mi355x-sglang-disagg description: - - "Patches one missing concurrency point for " - - "DSR1 FP8 B200 disaggregated SGLang MTP multinode configuration. " - - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/691 + - "Add more sweep points for DSR1 FP8 both MTP and non-MTP 1k1k, 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/689 - config-keys: - dsr1-fp8-b300-dynamo-trt @@ -655,23 +590,25 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/690 - config-keys: - - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-b200-dynamo-sglang-mtp description: - - "Add more sweep points for DSR1 FP8 both MTP and non-MTP 1k1k, 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/689 + - "Patches one missing concurrency point for " + - "DSR1 FP8 B200 disaggregated SGLang MTP multinode configuration. 
" + - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/691 - config-keys: - - dsr1-fp8-b200-dynamo-trt + - dsr1-fp4-mi355x-sglang-disagg description: - - "Update max_num_tokens and max_batch_size for min-latency decode workers" - - "See srt-slurm recipe changes: https://github.com/ishandhanani/srt-slurm/pull/173" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/686 + - "Add more sweep points for DSR1 FP4 both MTP and non-MTP 1k1k, 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/692 + - config-keys: - dsr1-fp8-mi325x-sglang description: - "Update MI325X DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.8" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/692 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/695 - config-keys: - dsr1-fp8-mi300x-sglang @@ -679,12 +616,6 @@ - "Update MI300X DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.8" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/696 -- config-keys: - - dsr1-fp4-mi355x-sglang-disagg - description: - - "Add more sweep points for DSR1 FP4 both MTP and non-MTP 1k1k, 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/692 - - config-keys: - dsr1-fp8-b200-dynamo-sglang-mtp description: @@ -739,6 +670,15 @@ - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/734 +- config-keys: + - kimik2.5-int4-b200-vllm + description: + - "Add Kimi-K2.5 INT4 vLLM benchmark for B200" + - "Model: moonshotai/Kimi-K2.5 with --mm-encoder-tp-mode data and --trust-remote-code" + - "Image: vllm/vllm-openai:v0.15.1" + - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/735 + - config-keys: - minimaxm2.5-fp8-mi355x-vllm description: @@ -750,12 +690,13 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/755 - config-keys: - - qwen3.5-fp8-mi355x-sglang + - minimaxm2.5-fp8-b200-vllm description: - - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for MI355X" - - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218" - - "Uses triton attention backend, TP=8, concurrency 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/768 + - "Add MiniMax-M2.5 FP8 vLLM benchmark for B200" + - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" + - "Image: vllm/vllm-openai:v0.17.0" + - "TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/757 - config-keys: - qwen3.5-bf16-b200-sglang @@ -767,12 +708,30 @@ - "Set cuda-graph-max-bs to match concurrency, scheduler-recv-interval based on concurrency" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/758 +- config-keys: + - glm5-fp8-mi355x-sglang + description: + - "Add GLM-5 FP8 SGLang benchmark for MI355X" + - "Model: zai-org/GLM-5-FP8 with NSA tilelang backends" + - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219" + - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/762 + +- config-keys: + - qwen3.5-fp8-mi355x-sglang + description: + - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for MI355X" + - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218" + - "Uses triton attention backend, TP=8, concurrency 4-64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/768 + - config-keys: - 
dsr1-fp8-mi355x-sglang-disagg description: - "Add more configs for MI355X FP8 Disagg" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/770 - + + - config-keys: - gptoss-fp4-mi300x-vllm - gptoss-fp4-mi325x-vllm @@ -781,15 +740,6 @@ - "Gains: ROCm skinny GEMM dispatch fix, MoRI EP all2all backend, KV cache shuffle + paged attention for AITER" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/781 -- config-keys: - - kimik2.5-int4-b200-vllm - description: - - "Add Kimi-K2.5 INT4 vLLM benchmark for B200" - - "Model: moonshotai/Kimi-K2.5 with --mm-encoder-tp-mode data and --trust-remote-code" - - "Image: vllm/vllm-openai:v0.15.1" - - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/735 - - config-keys: - gptoss-fp4-b200-vllm - gptoss-fp4-h100-vllm @@ -800,7 +750,8 @@ - "Gains: CUTLASS MoE optimizations (~8% throughput), FP4 kernel improvements (~4% E2E on B200), torch.compile cold-start fix" - "v0.15.1 includes fix for prefix cache hit rate of 0% on GPT-OSS hybrid attention models" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/789 - + + - config-keys: - dsr1-fp4-mi355x-atom - dsr1-fp4-mi355x-atom-mtp @@ -809,16 +760,16 @@ - "Comment out TP=4 configs, consolidate to TP=8 only" - "Extend concurrency range to conc-end: 256 across all sequence lengths (1k1k, 1k8k, 8k1k)" - "Fix MTP 1k8k conc-start from 256 to 4 to enable full concurrency sweep" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/699 - + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/792 + - config-keys: - - glm5-fp8-mi355x-sglang + - qwen3.5-fp8-b200-sglang description: - - "Add GLM-5 FP8 SGLang benchmark for MI355X" - - "Model: zai-org/GLM-5-FP8 with NSA tilelang backends" - - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219" - - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for B200" + - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" + - "Uses trtllm_mha attention backend and flashinfer_trtllm MOE runner" + - "Enable SGLANG_ENABLE_FLASHINFER_GEMM=true, NCCL_NVLS_ENABLE=1" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/804 - config-keys: - gptoss-fp4-mi300x-vllm @@ -841,12 +792,62 @@ - "Key changes: AITER v0.1.10.post3 with FP8 Prefill/Decode/KV Cache, FP8 prefill attention kernel, MORI EP two-batch overlapping, OOM fix for DeepSeek weight loading" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/816 +- config-keys: + - qwen3.5-fp4-b200-sglang + description: + - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang benchmark config and launch script" + - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" + - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" + - "Configs: 1k1k (TP4 conc 4-128), 8k1k (TP4 conc 4-128)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/820 + +- config-keys: + - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-mi355x-sglang-disagg-mtp + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp + description: + - "Add more sweep configs for MI355X FP8/FP4 Disagg" + - "Add TP/DP/EP size < 8 support " + - "Support DSR1-0528 MTP Disagg" + - "Bump SGL mori image to Feb 27" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823 + +- config-keys: + - kimik2.5-fp4-mi355x-vllm + description: + - "Add Kimi-K2.5 MXFP4 vLLM benchmark for MI355X" + - "Model: amd/Kimi-K2.5-MXFP4 with --mm-encoder-tp-mode data and 
--trust-remote-code" + - "Image: vllm/vllm-openai-rocm:v0.15.1" + - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/825 + +- config-keys: + - minimaxm2.5-fp4-mi355x-vllm + description: + - "Add MiniMax M2.5 MXFP4 vLLM benchmark for MI355X" + - "Model: amd/MiniMax-M2.5-MXFP4 with --trust-remote-code and --block-size=32" + - "Image: vllm/vllm-openai-rocm:v0.19.1" + - "Environment: VLLM_ROCM_USE_AITER=1" + - "Tp=1, TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/827 + - config-keys: - minimaxm2.5-fp8-h200-vllm description: - "Add MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP4)" - "New benchmark script with --trust-remote-code for MiniMaxAI/MiniMax-M2.5" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/831 + +- config-keys: + - minimaxm2.5-fp8-h100-vllm + description: + - "Add MiniMax-M2.5 FP8 vLLM benchmark for H100" + - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" + - "Image: vllm/vllm-openai:v0.16.0" + - "Switch from TP=8/EP=8 to TP=4/EP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k" + - "Script uses conditional --enable-expert-parallel based on EP_SIZE env var" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/832 - config-keys: - minimaxm2.5-fp8-mi325x-vllm @@ -868,22 +869,13 @@ - "TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/837 -- config-keys: - - kimik2.5-fp4-mi355x-vllm - description: - - "Add Kimi-K2.5 MXFP4 vLLM benchmark for MI355X" - - "Model: amd/Kimi-K2.5-MXFP4 with --mm-encoder-tp-mode data and --trust-remote-code" - - "Image: vllm/vllm-openai-rocm:v0.15.1" - - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/825 - - config-keys: - qwen3.5-bf16-mi325x-sglang description: - "Add Qwen3.5-397B-A17B BF16 SGLang benchmark for MI325X" - "Image: lmsysorg/sglang:v0.5.9-rocm720-mi30x" - "Uses triton attention backend, TP=8, concurrency 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/842 - config-keys: - qwen3.5-bf16-mi300x-sglang @@ -894,13 +886,14 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/843 - config-keys: - - qwen3.5-fp8-mi325x-sglang + - kimik2.5-int4-h200-vllm description: - - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark for MI325X" - - "Image: lmsysorg/sglang:v0.5.9-rocm720-mi30x" - - "Following AMD Andy Luo's recipe with triton attention backend" + - "Add Kimi-K2.5 INT4 vLLM benchmark for H200" + - "Model: moonshotai/Kimi-K2.5 with --reasoning-parser kimi_k2 and --trust-remote-code" + - "Image: vllm/vllm-openai:v0.16.0" - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + - "following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/847 - config-keys: - qwen3.5-fp8-mi300x-sglang @@ -912,47 +905,23 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/850 - config-keys: - - kimik2.5-int4-h200-vllm + - qwen3.5-fp8-mi325x-sglang description: - - "Add Kimi-K2.5 INT4 vLLM benchmark for H200" - - "Model: moonshotai/Kimi-K2.5 with --reasoning-parser kimi_k2 and 
--trust-remote-code" - - "Image: vllm/vllm-openai:v0.16.0" + - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark for MI325X" + - "Image: lmsysorg/sglang:v0.5.9-rocm720-mi30x" + - "Following AMD Andy Luo's recipe with triton attention backend" - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - - "following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/839 - -- config-keys: - - dsr1-fp8-mi355x-sglang-disagg - - dsr1-fp8-mi355x-sglang-disagg-mtp - - dsr1-fp4-mi355x-sglang-disagg - - dsr1-fp4-mi355x-sglang-disagg-mtp - description: - - "Add more sweep configs for MI355X FP8/FP4 Disagg" - - "Add TP/DP/EP size < 8 support " - - "Support DSR1-0528 MTP Disagg" - - "Bump SGL mori image to Feb 27" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/852 - config-keys: - - minimaxm2.5-fp8-h100-vllm + - gptoss-fp4-h200-trt description: - - "Add MiniMax-M2.5 FP8 vLLM benchmark for H100" - - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" - - "Image: vllm/vllm-openai:v0.16.0" - - "Switch from TP=8/EP=8 to TP=4/EP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k" - - "Script uses conditional --enable-expert-parallel based on EP_SIZE env var" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/832 + - "Upgrade TensorRT-LLM container from release:gpt-oss-dev to release:v1.3.0rc5" + - "Remove sed hack for TensorRT bug (fixed upstream in v1.3.0rc5)" + - "Remove enable_block_reuse: false from kv_cache_config (default true is now recommended)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/854 - config-keys: - - qwen3.5-fp8-b200-sglang - description: - - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for B200" - - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" - - "Uses trtllm_mha attention backend and flashinfer_trtllm MOE runner" - - "Enable SGLANG_ENABLE_FLASHINFER_GEMM=true, NCCL_NVLS_ENABLE=1" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/804 - -- config-keys: - qwen3.5-fp8-h200-sglang description: - "Add Qwen 3.5 FP8 H200 SGLang configuration" @@ -961,6 +930,13 @@ - "Server: reasoning-parser qwen3, tool-call-parser qwen3_coder, enable-flashinfer-allreduce-fusion, mem-fraction-static 0.8" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/855 +- config-keys: + - kimik2.5-fp4-b200-vllm + description: + - "Add Kimi K2.5 FP4 B200 vLLM benchmark configuration" + - "Image: vllm/vllm-openai:v0.17.0" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/862 + - config-keys: - dsr1-fp8-mi355x-sglang description: @@ -969,20 +945,18 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/865 - config-keys: - - qwen3.5-bf16-b200-sglang - - qwen3.5-bf16-mi300x-sglang - - qwen3.5-bf16-mi325x-sglang - - qwen3.5-bf16-mi355x-sglang - - qwen3.5-fp8-b200-sglang - - qwen3.5-fp8-h200-sglang - - qwen3.5-fp8-mi300x-sglang - - qwen3.5-fp8-mi325x-sglang - - qwen3.5-fp8-mi355x-sglang + - minimaxm2.5-fp8-h200-vllm description: - - "Redo qwen eval" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/892 - evals-only: true - + - "Extend MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP8)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/869 + + +- config-keys: + - dsr1-fp8-h200-sglang + description: + - "Update H200 DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.9" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/887 + - config-keys: - 
gptoss-fp4-mi300x-vllm - gptoss-fp4-mi325x-vllm @@ -994,23 +968,31 @@ - "Add AMDGCN_USE_BUFFER_OPS=0 and VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 env vars" - "Switch to --attention-backend ROCM_AITER_UNIFIED_ATTN and add fuse_rope_kvcache compilation pass" - "Remove deprecated VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION/VLLM_ROCM_USE_AITER_MHA env vars and compilation-config cudagraph_mode" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/867 - + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/889 + - config-keys: - - kimik2.5-fp4-b200-vllm + - qwen3.5-bf16-b200-sglang + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-bf16-mi355x-sglang + - qwen3.5-fp8-b200-sglang + - qwen3.5-fp8-h200-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + - qwen3.5-fp8-mi355x-sglang description: - - "Add Kimi K2.5 FP4 B200 vLLM benchmark configuration" - - "Image: vllm/vllm-openai:v0.17.0" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/862 + - "Redo qwen eval" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/892 + evals-only: true + - config-keys: - - minimaxm2.5-fp8-b200-vllm + - qwen3.5-fp8-b200-sglang-mtp description: - - "Add MiniMax-M2.5 FP8 vLLM benchmark for B200" - - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" - - "Image: vllm/vllm-openai:v0.17.0" - - "TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/757 + - "Add Single Node Agg FP8 MTP config for Qwen3.5 B200 SGLang" + - "EAGLE speculative decoding: num-steps 3, draft-tokens 4, topk 1" + - "New script: benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/898 - config-keys: - dsr1-fp4-mi355x-sglang-disagg @@ -1021,11 +1003,12 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/899 - config-keys: - - minimaxm2.5-fp8-h200-vllm + - kimik2.5-int4-mi325x-vllm description: - - "Extend MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP8)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/869 - + - "Add Kimi K2.5 INT4 single-node MI325X vLLM benchmark (TP8)" + - "Uses vLLM ROCm v0.16.0 image following AMD Andy Luo's recipe" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/901 + - config-keys: - dsr1-fp8-b200-dynamo-sglang - dsr1-fp8-b200-dynamo-sglang-mtp @@ -1036,28 +1019,111 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/907 - config-keys: - - glm5-fp8-h200-sglang - description: - - "Add GLM-5 FP8 SGLang H200 single-node benchmark" - - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper" - - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh" - - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 - -- config-keys: + # NVIDIA single-node + - dsr1-fp4-b200-sglang + - dsr1-fp4-b200-trt + - dsr1-fp4-b200-trt-mtp + - dsr1-fp8-b200-sglang + - dsr1-fp8-b200-sglang-mtp + - dsr1-fp8-b200-trt + - dsr1-fp8-b200-trt-mtp + - dsr1-fp8-h200-sglang + - dsr1-fp8-h200-trt + - dsr1-fp8-h200-trt-mtp - glm5-fp8-b200-sglang - description: - - "Add GLM-5 FP8 SGLang benchmark for B200" - - "Supports TP8 (low latency) and DEP8 (high throughput) modes with NSA attention backend" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/915 - -- config-keys: + - glm5-fp8-h200-sglang + - gptoss-fp4-b200-trt + - gptoss-fp4-b200-vllm + - gptoss-fp4-h100-vllm + - 
gptoss-fp4-h200-trt + - gptoss-fp4-h200-vllm + - kimik2.5-fp4-b200-vllm + - kimik2.5-int4-b200-vllm + - kimik2.5-int4-h200-vllm + - minimaxm2.5-fp8-b200-vllm + - minimaxm2.5-fp8-h100-vllm + - minimaxm2.5-fp8-h200-vllm + - qwen3.5-bf16-b200-sglang + - qwen3.5-fp8-b200-sglang - qwen3.5-fp8-b200-sglang-mtp + - qwen3.5-fp8-h200-sglang + # AMD single-node + - dsr1-fp4-mi355x-atom + - dsr1-fp4-mi355x-atom-mtp + - dsr1-fp4-mi355x-sglang + - dsr1-fp8-mi325x-sglang + - dsr1-fp8-mi300x-sglang + - dsr1-fp8-mi355x-atom + - dsr1-fp8-mi355x-atom-mtp + - dsr1-fp8-mi355x-sglang + - glm5-fp8-mi355x-sglang + - gptoss-fp4-mi300x-vllm + - gptoss-fp4-mi325x-vllm + - gptoss-fp4-mi355x-atom + - gptoss-fp4-mi355x-vllm + - kimik2.5-fp4-mi355x-vllm + - kimik2.5-int4-mi325x-vllm + - kimik2.5-int4-mi355x-vllm + - minimaxm2.5-fp8-mi300x-vllm + - minimaxm2.5-fp8-mi325x-vllm + - minimaxm2.5-fp8-mi355x-vllm + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-bf16-mi355x-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + - qwen3.5-fp8-mi355x-sglang description: - - "Add Single Node Agg FP8 MTP config for Qwen3.5 B200 SGLang" - - "EAGLE speculative decoding: num-steps 3, draft-tokens 4, topk 1" - - "New script: benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/898 + - "Separate evals, change to 8k1k, fail loudly, 5-shot, top of curve & middle of curve" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/911 + evals-only: true + +- config-keys: + - glm5-fp8-h200-sglang + description: + - "Add GLM-5 FP8 SGLang H200 single-node benchmark" + - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper" + - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh" + - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 + +- config-keys: + - glm5-fp8-b200-sglang + description: + - "Add GLM-5 FP8 SGLang benchmark for B200" + - "Supports TP8 (low latency) and DEP8 (high throughput) modes with NSA attention backend" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/915 + + +- config-keys: + - qwen3.5-fp8-b200-sglang + description: + - "Replace FP8 with combination of TP4 and TP8 config" + - "Add --enable-flashinfer-allreduce-fusion to TP8" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/918 + +- config-keys: + - dsr1-fp8-b200-dynamo-trt + - dsr1-fp8-h200-dynamo-trt + - dsr1-fp4-gb200-dynamo-trt + description: + - "Fix metadata inconsistencies in nvidia-master.yaml - TP/EP/DP-attn values now match actual recipe files" + - "B200 FP8 TRT 8K/1K: prefill_ep 8→1 (15 entries), prefill_dp_attn true→false (1 entry)" + - "H200 FP8 TRT 1K/1K: prefill_dp_attn false→true (9 entries)" + - "H200 FP8 TRT 8K/1K: prefill_dp_attn true→false (8 entries)" + - "GB200 FP4 TRT 8K/1K: decode_dp_attn true→false (2 entries)" + - "All fixes are metadata-only; no recipe files were modified" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/919 + +- config-keys: + - kimik2.5-int4-mi325x-vllm + - kimik2.5-int4-mi355x-vllm + - kimik2.5-int4-h200-vllm + - kimik2.5-fp4-mi355x-vllm + - kimik2.5-fp4-b200-vllm + description: + - "Disable prefix caching (--no-enable-prefix-caching) for all Kimi K2.5 benchmarks using random datasets" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/926 - config-keys: - minimaxm2.5-fp8-mi355x-vllm @@ -1092,13 +1158,7 @@ - "Add --exclusive flag to MI355X single-node salloc and multi-node sbatch 
to prevent node sharing during benchmarks" - "Only non-TP8 configs listed; TP8 already uses all GPUs on the node" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/934 - -- config-keys: - - qwen3.5-fp8-b200-sglang - description: - - "Replace FP8 with combination of TP4 and TP8 config" - - "Add --enable-flashinfer-allreduce-fusion to TP8" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/918 + - config-keys: - kimik2.5-int4-b200-vllm @@ -1106,6 +1166,15 @@ - "Enable VLLM_USE_FLASHINFER_MOE_INT4=1 for Kimi K2.5 INT4 B200 benchmark" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/935 +- config-keys: + - kimik2.5-fp4-mi355x-vllm + description: + - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" + - "Enable AITER with INT4 quick reduce; disable AITER RMSNorm for TP < 8 (accuracy)" + - "Add expert parallel, TP4, and TP4/EP4 search spaces" + - "Switch block-size 64 to 1 gpu-memory-utilization 0.95 to 0.90" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/936 + - config-keys: - dsr1-fp4-b200-sglang - dsr1-fp8-b200-sglang @@ -1118,22 +1187,15 @@ - "dsr1-fp8-b200-sglang-mtp: v0.5.8-cu130-amd64 → v0.5.9-cu130" - "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943 - -- config-keys: - - minimaxm2.5-fp8-mi325x-vllm - description: - - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" - - "Replace TP4 with TP8/EP8, add conc range 4-256" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/953 + - config-keys: - - kimik2.5-fp4-mi355x-vllm + - minimaxm2.5-fp8-b200-vllm description: - - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" - - "Enable AITER with INT4 quick reduce; disable AITER RMSNorm for TP < 8 (accuracy)" - - "Add expert parallel, TP4, and TP4/EP4 search spaces" - - "Switch block-size 64 to 1 gpu-memory-utilization 0.95 to 0.90" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/936 + - "Update vLLM image from v0.17.0 to v0.19.0 for MiniMax-M2.5 FP8 B200" + - "Add tp4 ep4 search-space entries (conc 32-256) for all seq-len configs" + - "Remove ISL 1024 / OSL 8192 seq-len config" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947 - config-keys: - kimik2.5-int4-mi355x-vllm @@ -1144,6 +1206,13 @@ - "Add --max-num-seqs 256, remove --disable-log-requests" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/950 +- config-keys: + - minimaxm2.5-fp8-mi325x-vllm + description: + - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" + - "Replace TP4 with TP8/EP8, add conc range 4-256" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/953 + - config-keys: - kimik2.5-int4-mi325x-vllm description: @@ -1153,6 +1222,13 @@ - "Add --max-num-seqs 256, remove --disable-log-requests" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/957 +- config-keys: + - minimaxm2.5-fp8-h100-vllm + - minimaxm2.5-fp8-h200-vllm + description: + - "Update vLLM image from v0.16.0 to v0.18.0 for minimax h100 and h200 configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/958 + - config-keys: - gptoss-fp4-h100-vllm - gptoss-fp4-h200-vllm @@ -1160,16 +1236,6 @@ - "Update vLLM image from v0.15.1 to v0.18.0 for gptoss H100 and H200 configs" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/960 -- config-keys: - - kimik2.5-int4-mi325x-vllm - - kimik2.5-int4-mi355x-vllm - - kimik2.5-int4-h200-vllm - - kimik2.5-fp4-mi355x-vllm - - kimik2.5-fp4-b200-vllm - description: - - "Disable prefix caching (--no-enable-prefix-caching) for all Kimi K2.5 
benchmarks using random datasets" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/926 - - config-keys: - minimaxm2.5-fp8-b200-vllm - minimaxm2.5-fp8-h100-vllm @@ -1181,66 +1247,6 @@ - "Disable prefix caching (--no-enable-prefix-caching) for all MiniMax benchmarks using random datasets" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/966 -- config-keys: - # NVIDIA single-node - - dsr1-fp4-b200-sglang - - dsr1-fp4-b200-trt - - dsr1-fp4-b200-trt-mtp - - dsr1-fp8-b200-sglang - - dsr1-fp8-b200-sglang-mtp - - dsr1-fp8-b200-trt - - dsr1-fp8-b200-trt-mtp - - dsr1-fp8-h200-sglang - - dsr1-fp8-h200-trt - - dsr1-fp8-h200-trt-mtp - - glm5-fp8-b200-sglang - - glm5-fp8-h200-sglang - - gptoss-fp4-b200-trt - - gptoss-fp4-b200-vllm - - gptoss-fp4-h100-vllm - - gptoss-fp4-h200-trt - - gptoss-fp4-h200-vllm - - kimik2.5-fp4-b200-vllm - - kimik2.5-int4-b200-vllm - - kimik2.5-int4-h200-vllm - - minimaxm2.5-fp8-b200-vllm - - minimaxm2.5-fp8-h100-vllm - - minimaxm2.5-fp8-h200-vllm - - qwen3.5-bf16-b200-sglang - - qwen3.5-fp8-b200-sglang - - qwen3.5-fp8-b200-sglang-mtp - - qwen3.5-fp8-h200-sglang - # AMD single-node - - dsr1-fp4-mi355x-atom - - dsr1-fp4-mi355x-atom-mtp - - dsr1-fp4-mi355x-sglang - - dsr1-fp8-mi325x-sglang - - dsr1-fp8-mi300x-sglang - - dsr1-fp8-mi355x-atom - - dsr1-fp8-mi355x-atom-mtp - - dsr1-fp8-mi355x-sglang - - glm5-fp8-mi355x-sglang - - gptoss-fp4-mi300x-vllm - - gptoss-fp4-mi325x-vllm - - gptoss-fp4-mi355x-atom - - gptoss-fp4-mi355x-vllm - - kimik2.5-fp4-mi355x-vllm - - kimik2.5-int4-mi325x-vllm - - kimik2.5-int4-mi355x-vllm - - minimaxm2.5-fp8-mi300x-vllm - - minimaxm2.5-fp8-mi325x-vllm - - minimaxm2.5-fp8-mi355x-vllm - - qwen3.5-bf16-mi300x-sglang - - qwen3.5-bf16-mi325x-sglang - - qwen3.5-bf16-mi355x-sglang - - qwen3.5-fp8-mi300x-sglang - - qwen3.5-fp8-mi325x-sglang - - qwen3.5-fp8-mi355x-sglang - description: - - "Separate evals, change to 8k1k, fail loudly, 5-shot, top of curve & middle of curve" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/911 - evals-only: true - - config-keys: - qwen3.5-bf16-mi300x-sglang - qwen3.5-bf16-mi325x-sglang @@ -1258,6 +1264,13 @@ - "Image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/973 +- config-keys: + - kimik2.5-int4-mi300x-vllm + description: + - "Add Kimi K2.5 INT4 single-node MI300X vLLM benchmark (TP8)" + - "Uses vLLM ROCm v0.18.0 image following AMD Andy Luo's recipe" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/975 + - config-keys: - dsr1-fp8-mi355x-atom-mtp description: @@ -1271,53 +1284,62 @@ description: - "New model support on ATOM framework" - "Kimi-K2.5 FP4, and MiniMax-M2.5 FP8 configs added for MI355X ATOM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/963 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/992 - config-keys: - - minimaxm2.5-fp8-b200-vllm + - minimaxm2.5-fp4-b200-vllm description: - - "Update vLLM image from v0.17.0 to v0.19.0 for MiniMax-M2.5 FP8 B200" - - "Add tp4 ep4 search-space entries (conc 32-256) for all seq-len configs" - - "Remove ISL 1024 / OSL 8192 seq-len config" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947 + - "Optimize MiniMax-M2.5 NVFP4 B200 vLLM search-space" + - "Expand from tp2/tp4 to tp1/tp2/tp4/tp8 with expert parallel and dp-attn variants" + - "Add ep2, ep4, and dp-attn configurations for higher concurrency sweeps" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/996 - config-keys: - - minimaxm2.5-fp8-mi355x-vllm + - 
dsr1-fp4-b200-dynamo-trt + - dsr1-fp8-b200-dynamo-trt + - dsr1-fp4-b200-dynamo-sglang + - dsr1-fp8-b200-dynamo-sglang + - dsr1-fp8-b200-dynamo-sglang-mtp + - dsr1-fp4-b200-dynamo-sglang-mtp + - dsr1-fp4-b300-dynamo-trt + - dsr1-fp8-b300-dynamo-trt + - dsr1-fp4-gb300-dynamo-trt + - dsr1-fp8-gb300-dynamo-trt + - dsr1-fp4-gb300-dynamo-sglang + - dsr1-fp8-gb300-dynamo-sglang + - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-mi355x-sglang-disagg-mtp + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp description: - - "Optimize MiniMax-M2.5 FP8 MI355X vLLM search-space" - - "Add tp2 ep2 search-space entries (conc 2-256) for all seq-len configs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1002 + - "Add multi-node lm-eval accuracy runs" + - "Eval picks the config with highest max eligible concurrency per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) group on 8k1k" + - "Eval concurrency set to the median eligible conc (>= MIN_EVAL_CONC=16) of the selected config to avoid OOM" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1000 + evals-only: true - config-keys: - - minimaxm2.5-fp8-mi355x-vllm + - qwen3.5-fp8-h200-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1001 + +- config-keys: + - minimaxm2.5-fp8-mi355x-vllm description: - "Optimize MiniMax-M2.5 FP8 MI355X vLLM search-space" - "Add tp2 ep2 search-space entries (conc 2-256) for all seq-len configs" - "Upgrade vLLM image to v0.19.0" - "Enable FP8 KV cache + AITER FA for minimaxm2.5-fp8-mi355x-vllm" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1003 - -- config-keys: - - minimaxm2.5-fp4-mi355x-vllm - description: - - "Add MiniMax M2.5 MXFP4 vLLM benchmark for MI355X" - - "Model: amd/MiniMax-M2.5-MXFP4 with --trust-remote-code and --block-size=32" - - "Image: vllm/vllm-openai-rocm:v0.19.1" - - "Environment: VLLM_ROCM_USE_AITER=1" - - "Tp=1, TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/827 -- config-keys: - - qwen3.5-fp8-h200-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1001 - config-keys: - - glm5-fp8-mi355x-atom + - qwen3.5-fp4-mi355x-sglang description: - - "GLM5 FP8 configs added for MI355X ATOM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1009 + - "Qwen3.5 fp4 support on SGL" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1006 + - config-keys: - kimik2.5-fp4-gb200-dynamo-vllm @@ -1333,37 +1355,16 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1008 - config-keys: - - minimaxm2.5-fp8-b200-vllm - description: - - "Update MiniMax-M2.5 FP8 B200 config with new search spaces" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1010 - -- config-keys: - - minimaxm2.5-fp4-b200-vllm - description: - - "Optimize MiniMax-M2.5 NVFP4 B200 vLLM search-space" - - "Expand from tp2/tp4 to tp1/tp2/tp4/tp8 with expert parallel and dp-attn variants" - - "Add ep2, ep4, and dp-attn configurations for higher concurrency sweeps" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/996 - -- config-keys: - - qwen3.5-fp4-b200-sglang + - glm5-fp8-mi355x-atom description: - - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang benchmark config and launch script" - - "Image: 
lmsysorg/sglang:v0.5.9-cu129-amd64" - - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" - - "Configs: 1k1k (TP4 conc 4-128), 8k1k (TP4 conc 4-128)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/820 + - "GLM5 FP8 configs added for MI355X ATOM" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1009 - config-keys: - - qwen3.5-bf16-mi300x-sglang - - qwen3.5-bf16-mi325x-sglang - - qwen3.5-fp8-mi300x-sglang - - qwen3.5-fp8-mi325x-sglang + - minimaxm2.5-fp8-b200-vllm description: - - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI300X and MI325X to achieve better performance" - - "Use lmsysorg/sglang:v0.5.10-rocm720-mi30x" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1063 + - "Update MiniMax-M2.5 FP8 B200 config with new search spaces" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1010 - config-keys: - glm5-fp4-b200-sglang @@ -1374,32 +1375,26 @@ - "Tune mem-fraction-static to 0.9, chunked-prefill-size to 32768, add tokenizer-worker-num 6" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1011 -- config-keys: - - qwen3.5-fp4-mi355x-sglang - description: - - "Qwen3.5 fp4 support on SGL" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1006 - -- config-keys: - - gptoss-fp4-h200-trt - description: - - "Upgrade TensorRT-LLM container from release:gpt-oss-dev to release:v1.3.0rc5" - - "Remove sed hack for TensorRT bug (fixed upstream in v1.3.0rc5)" - - "Remove enable_block_reuse: false from kv_cache_config (default true is now recommended)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/854 - - config-keys: - glm5-fp8-b200-sglang description: - "Bump GLM-5 FP8 B200 SGLang concurrency from 128 to 256" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1012 - + + +- config-keys: + - qwen3.5-fp8-h200-sglang-mtp + description: + - "Enable SGLANG_ENABLE_SPEC_V2=1 for Qwen3.5 FP8 H200 SGLang MTP" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1017 + - config-keys: - qwen3.5-fp4-mi355x-sglang description: - "TP2/TP4 seach space exploration for Qwen3.5 fp4 on SGL" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1022 - + + - config-keys: - glm5-fp8-mi355x-sglang description: @@ -1408,10 +1403,28 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1023 - config-keys: - - qwen3.5-fp8-h200-sglang-mtp + - kimik2.5-fp4-gb200-dynamo-trt description: - - "Enable SGLANG_ENABLE_SPEC_V2=1 for Qwen3.5 FP8 H200 SGLang MTP" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1017 + - "Add Kimi K2.5 NVFP4 GB200 disaggregated TRT-LLM benchmarks via Dynamo (14 STP configs)" + - "New framework: dynamo-trt (Dynamo frontend + TensorRT-LLM backend)" + - "Container: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" + - "Runner script updated to support kimik2.5 model prefix with dynamo-trt framework" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1026 + +- config-keys: + - glm5-fp4-b200-sglang + description: + - "Update SGLang image from nightly-dev-cu13-20260328-a27651d5 to v0.5.10.post1-cu130" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1031 + +- config-keys: + - qwen3.5-fp8-b300-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B-FP8 B300 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "EAGLE speculative decoding with MTP, TP=4, concurrency 4-256 for 1k1k and 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1035 - 
config-keys: - qwen3.5-fp8-mi355x-sglang @@ -1424,18 +1437,33 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1036 - config-keys: - - glm5-fp4-b200-sglang + - qwen3.5-fp8-mi355x-atom + - qwen3.5-fp8-mi355x-atom-mtp description: - - "Update SGLang image from nightly-dev-cu13-20260328-a27651d5 to v0.5.10.post1-cu130" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1031 + - "Add Qwen3.5-397B-A17B FP8 MI355X ATOM benchmark configs with and without MTP" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1040 + - config-keys: - - qwen3.5-fp8-b300-sglang-mtp + - qwen3.5-fp4-mi355x-sglang description: - - "Add Qwen3.5-397B-A17B-FP8 B300 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "EAGLE speculative decoding with MTP, TP=4, concurrency 4-256 for 1k1k and 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1035 + - "Update SGLang image from 'lmsysorg/sglang:v0.5.10-rocm720-mi35x' to 'rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413'" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1041 + +- config-keys: + - glm5.1-fp4-mi355x-atom + description: + - "Add GLM-5.1 MXFP4 single-node MI355X ATOM benchmark" + - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post" + - "TP=2 and TP=4, concurrency 4-256 for 1k1k and 8k1k sequence lengths" + - "Add --max-num-seqs and --gpu-memory-utilization 0.9 to server launch" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1043 + +- config-keys: + - kimik2.5-fp4-b200-vllm + description: + - "Add kv-cache-dtype fp8, max-cudagraph-capture-size 2048, max-num-batched-tokens, and stream-interval 20 to server launch args" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1047 - config-keys: - qwen3.5-fp8-b300-sglang @@ -1469,6 +1497,22 @@ - "At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1 does not have a B300-specific recipe, so this reuses the existing GLM5 FP8 B200 SGLang recipe as-is" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1051 +- config-keys: + - minimaxm2.5-fp8-b300-vllm + description: + - "Add MiniMax-M2.5 FP8 B300 vLLM benchmark" + - "Image: vllm/vllm-openai:v0.19.0-cu130" + - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP8 B200 vLLM recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1054 + +- config-keys: + - minimaxm2.5-fp4-b300-vllm + description: + - "Add MiniMax-M2.5 FP4 (NVFP4) B300 vLLM benchmark" + - "Image: vllm/vllm-openai:v0.19.0-cu130" + - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP4 B200 vLLM recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1055 + - config-keys: - glm5-fp4-b300-sglang description: @@ -1487,59 +1531,33 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1059 - config-keys: - - minimaxm2.5-fp4-b300-vllm - description: - - "Add MiniMax-M2.5 FP4 (NVFP4) B300 vLLM benchmark" - - "Image: vllm/vllm-openai:v0.19.0-cu130" - - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP4 B200 vLLM recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1055 - -- config-keys: - - 
minimaxm2.5-fp8-b300-vllm + - gptoss-fp4-mi300x-vllm description: - - "Add MiniMax-M2.5 FP8 B300 vLLM benchmark" - - "Image: vllm/vllm-openai:v0.19.0-cu130" - - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP8 B200 vLLM recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1054 + - "Expand GPT-OSS 120B FP4 MI300X TP=1 concurrency from 64 to 256 for 1k1k" + - "Higher concurrency improves MoE weight amortization: 8552 total TPS at conc=256 vs 4016 at conc=64 (2.1x)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1061 - config-keys: - - kimik2.5-fp4-b300-vllm + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang description: - - "Add Kimi-K2.5 FP4 (NVFP4) B300 vLLM benchmark" - - "Image: vllm/vllm-openai:v0.19.0-cu130" - - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html does not have a B300-specific recipe, so this reuses the existing Kimi-K2.5 FP4 B200 vLLM recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1056 + - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI300X and MI325X to achieve better performance" + - "Use lmsysorg/sglang:v0.5.10-rocm720-mi30x" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1063 - config-keys: - - gptoss-fp4-mi300x-vllm + - minimaxm2.5-fp8-b200-vllm description: - - "Expand GPT-OSS 120B FP4 MI300X TP=1 concurrency from 64 to 256 for 1k1k" - - "Higher concurrency improves MoE weight amortization: 8552 total TPS at conc=256 vs 4016 at conc=64 (2.1x)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1053 + - "Add VLLM_FLOAT32_MATMUL_PRECISION=high, remove VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1068 - config-keys: - - dsr1-fp4-b200-dynamo-trt - - dsr1-fp8-b200-dynamo-trt - - dsr1-fp4-b200-dynamo-sglang - - dsr1-fp8-b200-dynamo-sglang - - dsr1-fp8-b200-dynamo-sglang-mtp - - dsr1-fp4-b200-dynamo-sglang-mtp - - dsr1-fp4-b300-dynamo-trt - - dsr1-fp8-b300-dynamo-trt - - dsr1-fp4-gb300-dynamo-trt - - dsr1-fp8-gb300-dynamo-trt - - dsr1-fp4-gb300-dynamo-sglang - - dsr1-fp8-gb300-dynamo-sglang - - dsr1-fp8-mi355x-sglang-disagg - - dsr1-fp8-mi355x-sglang-disagg-mtp - - dsr1-fp4-mi355x-sglang-disagg - - dsr1-fp4-mi355x-sglang-disagg-mtp + - minimaxm2.5-fp4-b200-vllm description: - - "Add multi-node lm-eval accuracy runs" - - "Eval picks the config with highest max eligible concurrency per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) group on 8k1k" - - "Eval concurrency set to the median eligible conc (>= MIN_EVAL_CONC=16) of the selected config to avoid OOM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1000 - evals-only: true + - "Add VLLM_FLOAT32_MATMUL_PRECISION=high" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1069 - config-keys: - qwen3.5-fp4-b300-sglang @@ -1550,17 +1568,7 @@ - "Follows the SGLang cookbook recipe at https://cookbook.sglang.io/autoregressive/Qwen/Qwen3.5 as of 2026-04-17" - "Mirrors the B200 FP4 recipe with mem-fraction-static lowered to 0.8 and an extra TP2/EP2 search-space entry" - "Configs: 1k1k and 8k1k, TP4/EP1 conc 4-128 + TP2/EP2 conc 4-128" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - qwen3.5-bf16-b300-sglang - 
description: - - "Add Qwen3.5-397B-A17B BF16 B300 SGLang benchmark" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "Model: Qwen/Qwen3.5-397B-A17B" - - "Mirrors the B200 BF16 recipe with an extra TP4/EP1 search-space entry alongside the existing TP8/EP1 sweep" - - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-64 + TP4/EP1 conc 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1072 - config-keys: - qwen3.5-bf16-b200-sglang-mtp @@ -1570,7 +1578,47 @@ - "Model: Qwen/Qwen3.5-397B-A17B" - "Mirrors the qwen3.5-bf16-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-64 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1074 + +- config-keys: + - qwen3.5-fp4-b200-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6" + - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" + - "Mirrors the qwen3.5-fp4-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k and 8k1k, TP=4/EP=1 conc 4-128 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1075 + +- config-keys: + - qwen3.5-fp8-mi355x-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B FP8 MI355X SGLang MTP benchmark" + - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414" + - "Model: Qwen/Qwen3.5-397B-A17B-FP8" + - "Mirrors the qwen3.5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k (TP8/EP1, TP8/EP8, TP2/EP2) and 8k1k (TP2/EP2, TP4/EP1) with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1076 + +- config-keys: + - qwen3.5-bf16-mi355x-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B BF16 MI355X SGLang MTP benchmark" + - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" + - "Model: Qwen/Qwen3.5-397B-A17B" + - "Mirrors the qwen3.5-bf16-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1077 + +- config-keys: + - qwen3.5-bf16-b300-sglang + description: + - "Add Qwen3.5-397B-A17B BF16 B300 SGLang benchmark" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "Model: Qwen/Qwen3.5-397B-A17B" + - "Mirrors the B200 BF16 recipe with an extra TP4/EP1 search-space entry alongside the existing TP8/EP1 sweep" + - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-64 + TP4/EP1 conc 4-64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1081 - config-keys: - qwen3.5-bf16-b300-sglang-mtp @@ -1580,7 +1628,7 @@ - "Model: Qwen/Qwen3.5-397B-A17B" - "Mirrors the qwen3.5-bf16-b300-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-64 + TP4/EP1 conc 4-64, spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1082 - config-keys: - qwen3.5-fp4-b300-sglang-mtp @@ -1590,7 +1638,7 @@ - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" - "Mirrors the qwen3.5-fp4-b300-sglang non-MTP recipe 
and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - "Configs: 1k1k and 8k1k, TP4/EP1 conc 4-128 + TP2/EP2 conc 4-128, spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1083 - config-keys: - glm5-fp8-b300-sglang-mtp @@ -1600,17 +1648,7 @@ - "Model: zai-org/GLM-5-FP8" - "Mirrors the glm5-fp8-b300-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - qwen3.5-bf16-mi355x-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B BF16 MI355X SGLang MTP benchmark" - - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - - "Model: Qwen/Qwen3.5-397B-A17B" - - "Mirrors the qwen3.5-bf16-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1084 - config-keys: - glm5-fp8-b200-sglang-mtp @@ -1620,27 +1658,7 @@ - "Model: zai-org/GLM-5-FP8" - "Mirrors the glm5-fp8-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - glm5-fp4-b300-sglang-mtp - description: - - "Add GLM-5 NVFP4 B300 SGLang MTP benchmark (draft)" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "Model: nvidia/GLM-5-NVFP4" - - "Follows the glm5-fp8-b300-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - qwen3.5-fp8-mi355x-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B FP8 MI355X SGLang MTP benchmark" - - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414" - - "Model: Qwen/Qwen3.5-397B-A17B-FP8" - - "Mirrors the qwen3.5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k (TP8/EP1, TP8/EP8, TP2/EP2) and 8k1k (TP2/EP2, TP4/EP1) with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1085 - config-keys: - glm5-fp8-mi355x-sglang-mtp @@ -1650,17 +1668,7 @@ - "Model: zai-org/GLM-5-FP8" - "Mirrors the glm5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP=8 conc 4-64 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - qwen3.5-fp4-b200-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6" - - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" - - "Mirrors the qwen3.5-fp4-b200-sglang non-MTP recipe and adds EAGLE speculative decoding 
(num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k and 8k1k, TP=4/EP=1 conc 4-128 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1086 - config-keys: - glm5-fp4-b200-sglang-mtp @@ -1670,7 +1678,17 @@ - "Model: nvidia/GLM-5-NVFP4" - "Follows the glm5-fp8-b200-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1087 + +- config-keys: + - glm5-fp4-b300-sglang-mtp + description: + - "Add GLM-5 NVFP4 B300 SGLang MTP benchmark (draft)" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "Model: nvidia/GLM-5-NVFP4" + - "Follows the glm5-fp8-b300-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" + - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1088 - config-keys: - gptoss-fp4-mi300x-vllm @@ -1679,12 +1697,6 @@ - "low-latency endpoint for users prioritizing interactive single-user use cases (chat, copilot, agentic)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1092 -- config-keys: - - kimik2.5-fp4-b200-vllm - description: - - "Add kv-cache-dtype fp8, max-cudagraph-capture-size 2048, max-num-batched-tokens, and stream-interval 20 to server launch args" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1047 - - config-keys: - dsr1-fp8-h200-dynamo-trt - dsr1-fp8-h200-dynamo-sglang @@ -1693,6 +1705,21 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1094 evals-only: true +- config-keys: + - glm5.1-fp4-mi355x-sglang + description: + - "Add GLM5.1 MXFP4 (FP4) MI355X SGLang Support" + - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1098 + +- config-keys: + - kimik2.5-fp4-b300-vllm + description: + - "Add Kimi-K2.5 FP4 (NVFP4) B300 vLLM benchmark" + - "Image: vllm/vllm-openai:v0.19.0-cu130" + - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html does not have a B300-specific recipe, so this reuses the existing Kimi-K2.5 FP4 B200 vLLM recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1100 + - config-keys: - minimaxm2.5-fp8-b300-vllm description: @@ -1706,16 +1733,11 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1107 - config-keys: - - minimaxm2.5-fp8-b200-vllm - description: - - "Add VLLM_FLOAT32_MATMUL_PRECISION=high, remove VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1068 - -- config-keys: - - minimaxm2.5-fp4-b200-vllm + - dsr1-fp8-h100-dynamo-trt + - dsr1-fp8-h100-dynamo-sglang description: - - "Add VLLM_FLOAT32_MATMUL_PRECISION=high" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1069 + - "Trigger H100 multinode evals after dist-timeout and health-check timeout fixes" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1119 - config-keys: - dsr1-fp8-h100-dynamo-trt @@ -1723,4 +1745,296 @@ description: - "Trigger H100 multinode evals after NVSHEMM fixes" pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1120 - evals-only: true \ No newline at end of file + evals-only: true + +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "Add DeepSeek-V4-Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (1k/1k sweep; 8k/1k currently commented out)" + - "Container: vllm/vllm-openai:deepseekv4-cu130; model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)" + - "Topologies: low-conc 1p1d-dep8-tep8 (4 nodes, mirrored from NVIDIA srt-slurm PR #71 with offload kept and numa-bind dropped); mid 1p1d-dep8-dep16 (6 nodes) and high 3p1d-dep8-dep16 (10 nodes) hand-rolled, structurally derived from the kimi-k2.5 1k/1k pattern" + - "Recipes stored under benchmarks/multi_node/srt-slurm-recipes/ and overlaid onto the upstream srt-slurm checkout at runtime" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129 + + +- config-keys: + - dsv4-fp8-h200-vllm + description: + - "Add DeepSeek-V4-Pro vLLM H200 benchmark per https://vllm.ai/blog/deepseek-v4" + - "Image: vllm/vllm-openai:deepseekv4-cu129" + - "Model: deepseek-ai/DeepSeek-V4-Pro" + - "EP + DP=8, FP8 KV cache, block size 256, max-model-len 800000, prefix caching disabled" + - "H200 has no FP4 path, so --attention_config.use_fp4_indexer_cache is omitted" + - "VLLM_ENGINE_READY_TIMEOUT_S=3600 to accommodate large weight loading" + - "Configs: 1k1k conc 4-64, 8k1k conc 4-64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1130 + +- config-keys: + - dsv4-fp4-b200-sglang + description: + - "Add DeepSeek-V4-Pro single-node B200 SGLang benchmark (TP8, EP8, dp-attention)" + - "Container: lmsysorg/sglang:deepseek-v4-blackwell" + - "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + - "Parallelism and sweep conc ranges match the dsv4-fp4-b200-vllm config" + - "Prefix caching and speculative decoding disabled for baseline numbers" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131 + +- config-keys: + - dsv4-fp8-mi355x-sglang + description: + - "Day 0 DeepSeek-V4-Pro FP8 MI355X SGLang benchmark" + - "Image: rocm/sgl-dev:deepseek-v4-mi35x (from sgl-project/sglang#23608)" + - "Model: sgl-project/DeepSeek-V4-Pro-FP8" + - "https://github.com/sgl-project/sglang/pull/23608#issuecomment-4311952977" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1134 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark (low-latency fallback)" + - "Image: lmsysorg/sglang:deepseek-v4-b300" + - "Model: deepseek-ai/DeepSeek-V4-Pro" + - "Low-latency only (TP=8, EP=1, no DP-attn, no DeepEP) — DeepEP FP8 weight-postprocess path is broken for this checkpoint on B300" + - "Prefix caching disabled, no speculative decoding" + - "Configs: 1k1k conc 4-1024, 8k1k conc 4-512" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1143 + +- config-keys: + - dsv4-fp4-b300-vllm + description: + - "Add DeepSeek-V4-Pro single-node B300 vLLM aggregate benchmark" + - "Image: vllm/vllm-openai:deepseekv4-cu130" + - "Model: deepseek-ai/DeepSeek-V4-Pro" + - "Uses the submitted B300 pareto schedule for both 1k1k and 8k1k, excluding conc 1: TP8 at conc 4/128, TP4 at conc 4/8/16/32/64/128, DP4 at conc 256/512" + - "Launch args match the provided vllm serve command, including FP4 indexer cache, FULL_AND_PIECEWISE cudagraph config, and max-num-batched-tokens 2048" + - "1k1k uses --max-model-len 4096; 8k1k uses the workflow-provided benchmark context length" + pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1144 + +- config-keys: + - dsv4-fp8-mi355x-sglang + description: + - "Bump MI355X SLURM allocation from --time=180 to --time=300 in runners/launch_mi355x-amds.sh" + - "DSv4-Pro on MI355X exceeded the 3h cap (STEP CANCELLED DUE TO TIME LIMIT) due to ~30min MoE JIT compile plus slow torch-fallback kernels (SGLANG_HACK_FLASHMLA_BACKEND=torch et al.) from sgl-project/sglang#23608" + - "300 minutes matches the GH Actions outer timeout-minutes cap in benchmark-tmpl.yml" + - "Retriggering dsv4-fp8-mi355x-sglang" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1148 + +- config-keys: + - dsv4-fp8-mi355x-sglang + description: + - "Drop --mem-fraction-static 0.88 and --max-total-tokens from dsv4_fp8_mi355x.sh" + - "Bump --chunked-prefill-size from 4096 to 8192" + - "Retrigger dsv4-fp8-mi355x-sglang" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1160 + +- config-keys: + - dsv4-fp4-mi355x-atom + description: + - "Add DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 marker (single-sequence, TP=8, conc=1)" + - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base); ROCm/ATOM#650 overlaid at runtime via pip install --no-deps -e . from a pinned PR SHA (cdbff35) inside the benchmark script" + - "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to ROCm/triton@e491726 (RI3.5.x), which has matmul_ogs.py and routing.py (PR #650 imports both — upstream triton-lang/triton refactored matmul_ogs into matmul.py and removed routing) plus CDNA4MXScaleLayout and a target_info.py compatible with the image's bundled triton" + - "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us" + - "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)" + - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1165 + +- config-keys: + - dsv4-fp4-mi355x-atom + description: + - "Add DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 marker (single-sequence, TP=8, conc=1)" + - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base); ROCm/ATOM#650 overlaid at runtime via pip install --no-deps -e . 
from a pinned PR SHA (cdbff35) inside the benchmark script" + - "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to ROCm/triton@e491726 (RI3.5.x), which has matmul_ogs.py and routing.py (PR #650 imports both — upstream triton-lang/triton refactored matmul_ogs into matmul.py and removed routing) plus CDNA4MXScaleLayout and a target_info.py compatible with the image's bundled triton" + - "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us" + - "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)" + - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1170 + +- config-keys: + - dsv4-fp4-b300-sglang-mtp + description: + - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" + - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)" + - "Model: deepseek-ai/DeepSeek-V4-Pro" + - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4" + - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" + - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" + - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166 + +- config-keys: + - dsv4-fp4-b300-vllm + description: + - "Update search space based on B300 pareto sweep results" + - "ISL=1024: TP4 conc 4-128; DP4 (dp-attn) conc 256-4096; DP8 (dp-attn) conc 2048-8192" + - "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Recipe-per-CONC split for DeepSeek-V4-Pro on B300: low-latency (TP=8, EP=1), balanced (TP=4, EP=1) at conc=32, max-throughput (TP=4, EP=4, DP-attn + DeepEP) at conc=512, for both 1k1k and 8k1k" + - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + - "Image pinned to lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3" + - "DP-attention path enables SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 for better SWA eviction behavior" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1185 + +- config-keys: + - dsv4-fp4-b200-sglang + description: + - "Two-recipe dispatch for DeepSeek-V4-Pro on B200, selected by DP_ATTENTION knob: low-latency (TP=8, EP=1, flashinfer_mxfp4) for conc 1-32, DP-attention (TP=8, EP=8, DP-attn + DeepEP + mega_moe) for conc 64-{512,1024}. The DP-attention recipe uses identical flags across balanced and max-throughput CONC ranges; only --max-running-requests scales with CONC." 
+ - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + - "Image pinned to lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b" + - "Adds SGLANG_OPT_* env knobs (SWA_SPLIT_LEAF_ON_INSERT, USE_JIT_NORM, USE_JIT_INDEXER_METADATA, USE_TOPK_V2, USE_CUSTOM_ALL_REDUCE_V2)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1187 + +- config-keys: + - dsv4-fp4-b300-sglang-mtp + description: + - "Pass --dsv4 to run_benchmark_serving so MTP benchmarks use the DSv4 chat template (PR #1153)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1182 + +- config-keys: + - dsv4-fp4-b300-vllm + description: + - Add low-latency configs and remove non-pareto configs + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1193 + +- config-keys: + - dsv4-fp4-b200-vllm + description: + - "Add DeepSeek-V4-Pro single-node B200 vLLM benchmark derived from B200 pareto sweep" + - "ISL=1024: TP8 conc 4-128; DP8 (dp-attn) conc 256-4096" + - "ISL=8192: TP8 conc 4-32; DP8 (dp-attn) conc 64-1024" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1156 + +- config-keys: + - dsv4-fp4-b300-sglang-mtp + description: + - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" + - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)" + - "Model: deepseek-ai/DeepSeek-V4-Pro" + - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4" + - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" + - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" + - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180 + +- config-keys: + - dsv4-fp8-mi355x-vllm + description: + - "Add vLLM DeepSeek-V4-Pro FP8 benchmark for MI355X with AITER-accelerated MLA decode (vllm-project/vllm#40889, stacked on #40871)" + - "Base image rocm/atom:rocm7.2.2 (MI355X ROCm 7.2.2, aiter with MLA decode); vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime via --no-deps overlay" + - "Key flags: --enforce-eager, --moe-backend triton_unfused, --kv-cache-dtype fp8, VLLM_ROCM_USE_AITER=1" + - "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k" + - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.075, tokenizer-workers 8" + - "1k1k conc=512/1024: add mega_moe deepep backend with cuda-graph-max-bs 550, chunked-prefill 16384, max-running-requests 768" + - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 ep=4 entries" + - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179 + +- config-keys: + - dsv4-fp4-mi355x-atom + description: + - "Use ROCm/aiter#2916 mhc_pre device-allocation fix instead of disabling ATOM mhc_pre" + - 
"Patch installed aiter/ops/mhc.py at runtime to allocate mhc_pre intermediates on residual.device, preserving the aiter MHC fast path without rebuilding aiter" + - "Remove the ATOM deepseek_v4.py sed workaround that forced mhc_pre to torch fallback" + - "Keep dsv4-fp4-mi355x-atom at CONC=1 only; run 24953107645 showed high-concurrency DSv4 ATOM OOMs in PR #650 torch sparse-attention fallbacks before upstream AITER sparse-attention support lands" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1202 + +- config-keys: + - dsv4-fp4-b300-vllm-mtp + description: + - "Add preliminary vLLM MTP configs for DeepSeek-V4-Pro on B300" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1210 + +- config-keys: + - dsv4-fp4-b200-vllm + description: + - "Pin image to vllm/vllm-openai:v0.20.0-cu130 (was floating deepseekv4-cu130 tag); DeepGEMM is preinstalled in this image" + - "Use --attention_config.use_fp4_indexer_cache=True and --compilation-config {\"cudagraph_mode\": \"FULL_AND_PIECEWISE\", \"custom_ops\": [\"all\"]} for all configs" + - "Gate --moe-backend deep_gemm_mega_moe and --gpu-memory-utilization 0.85 on DP_ATTENTION=true per the v0.20.0 recipe" + - "Drop --pipeline-parallel-size 1; keep --no-enable-prefix-caching and --max-cudagraph-capture-size 2048" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1204 + +- config-keys: + - minimaxm2.5-fp4-mi355x-atom + description: + - "Add MiniMax-M2.5 MXFP4 MI355X Atom benchmark (rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post)" + - "Single-node sweep: TP1–TP8, 1k/1k and 8k/1k ISL/OSL" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1042 + +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "DSV4-Pro FP4 GB200 dynamo-vLLM disagg against srt-slurm aflowers/vllm-gb200-v0.20.0" + - "Keeps the three validated 8k/1k points: low-latency 1P/1D TP8 conc=1, mid-curve 1P/1D DEP8 conc=256, and max-tpt 3P/1D DEP8 conc=4096" + - "All three recipes run NATS/etcd on a dedicated infra node and use compute-node local NVMe model weights via /mnt/numa1/models/deepseek-v4-pro/" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1163 + +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "Add GB200 Dynamo vLLM MegaMOE max-throughput recipe at conc=4096" + - "Topology matches max-tpt: 3 prefill DEP8 workers and 1 decode DEP8 worker with dedicated NATS/etcd" + - "Uses deep_gemm_mega_moe on prefill/decode, TORCH_SYMMMEM=NVSHMEM, and no offload" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218 + +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "Add GB200 Dynamo vLLM low-middle curve recipe at conc=256/512" + - "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd" + - "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Add conc=8192 recipe for 1k1k: deepep mega_moe backend with cuda-graph-max-bs 1088, max-running-requests 8192, mem-fraction-static 0.80, swa-full-tokens-ratio 0.3, tokenizer-worker-num 16" + - "conc=8192 enables SGLANG_OPT_USE_ONLINE_COMPRESS=1 and --stream-interval 30" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1209 + +- config-keys: + - dsv4-fp4-b300-vllm + description: + - "Change image to vllm/vllm-openai:v0.20.0-cu130" + - "Use Mega MoE for DEP configs" + pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1221
+
+- config-keys:
+  - dsv4-fp4-b200-vllm-mtp
+  description:
+    - "Add preliminary vLLM MTP configs for DeepSeek-V4-Pro on B200"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1230
+
+- config-keys:
+  - dsv4-fp4-gb200-dynamo-vllm
+  description:
+    - "Keep the GB200 Dynamo vLLM MegaMOE max-throughput recipe at 3P/1D DEP8 conc=4096"
+    - "Add GB200 Dynamo vLLM MegaMOE high-throughput recipe at 2P/1D DEP8 conc=4096"
+    - "Add GB200 Dynamo vLLM MegaMOE mid-curve recipe at 1P/1D DEP8 conc=256/512/1024"
+    - "Remove stale offload recipe copies and the old no-MegaMOE mid/max-throughput points from the GB200 Dynamo vLLM matrix"
+    - "Disable FlashInfer autotune on GB200 decode workers for accuracy stability, matching the srt-slurm recipe fix"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1223
+
+- config-keys:
+  - dsv4-fp4-gb300-dynamo-sglang
+  description:
+    - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated SGLang benchmarks via Dynamo (1k/1k sweep; 8k/1k recipes shipped but commented out)"
+    - "Container: lmsysorg/sglang:deepseek-v4-grace-blackwell (linux/arm64); model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)"
+    - "Topologies mirror the dsv4-fp4-gb300-dynamo-vllm sibling: low-conc 1p1d-dep8-tep8 (4 nodes), mid 1p1d-dep8-dep16 (6 nodes), high 3p1d-dep8-dep16 (10 nodes). The conc=4096 overlap between mid and high gives a topology-crossover A/B"
+    - "No upstream GB300 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB300 DSV4 SGLang aggregated). Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157
+
+- config-keys:
+  - glm5-fp8-mi355x-sglang-mtp
+  description:
+    - "Add GLM5 FP8 MTP MI355X SGLang Support"
+    - "Container: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122

From f1cb159fdaae8bce8db981765884b447dfbf8a56 Mon Sep 17 00:00:00 2001
From: ajith-sirra-amd
Date: Thu, 30 Apr 2026 13:49:12 +0530
Subject: [PATCH 10/10] Added Chat Template

Signed-off-by: ajith-sirra-amd
---
 benchmarks/single_node/glm5_fp8_mi355x_mtp.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh b/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh
index 504ba0184..5c28ebeaf 100755
--- a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh
+++ b/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh
@@ -73,7 +73,8 @@ run_benchmark_serving \
     --num-prompts "$((CONC * 10))" \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/
+    --result-dir /workspace/ \
+    --use-chat-template
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then