From 02487091201c0c5fce856c95160c60b5cab01479 Mon Sep 17 00:00:00 2001
From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com>
Date: Mon, 16 Feb 2026 19:01:36 +0000
Subject: [PATCH 1/2] Add Qwen3.5-397B-A17B FP8 B200 SGLang benchmark (no MTP)

Simple benchmark script without speculative decoding or MTP flags:
- No SGLANG_USE_CUDA_IPC_TRANSPORT
- No --reasoning-parser qwen3
- No --speculative-algo NEXTN or related flags
- TP=8, --mem-fraction-static 0.8
- Image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e

Co-authored-by: functionstackx
---
 .github/configs/nvidia-master.yaml | 22 +++++++++++
 benchmarks/qwen3.5_fp8_b200.sh     | 62 ++++++++++++++++++++++++++++++
 perf-changelog.yaml                |  8 ++++
 3 files changed, 92 insertions(+)
 create mode 100755 benchmarks/qwen3.5_fp8_b200.sh

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 570e52d43..48a901353 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1782,6 +1782,28 @@ dsr1-fp8-b200-sglang:
         - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
         - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
 
+qwen3.5-fp8-b200-sglang:
+  image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  runner: b200
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
+    - isl: 1024
+      osl: 8192
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
+
 dsr1-fp8-b200-sglang-mtp:
   image: lmsysorg/sglang:v0.5.8-cu130-amd64
   model: deepseek-ai/DeepSeek-R1-0528
diff --git a/benchmarks/qwen3.5_fp8_b200.sh b/benchmarks/qwen3.5_fp8_b200.sh
new file mode 100755
index 000000000..f6ded15e8
--- /dev/null
+++ b/benchmarks/qwen3.5_fp8_b200.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+nvidia-smi
+
+hf download "$MODEL"
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+MEM_FRAC_STATIC=0.8
+
+ps aux
+
+set -x
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \
+    --model-path=$MODEL \
+    --host=0.0.0.0 \
+    --port=$PORT \
+    --tensor-parallel-size=$TP \
+    --mem-fraction-static $MEM_FRAC_STATIC \
+    > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
+
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    append_lm_eval_summary
+fi
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5fafe2d64..fc247d5e4 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -636,3 +636,11 @@
   description:
     - "Bump MI355X disagg FP8 recipe commit to fix perf regression on 8k1k DEP8"
   pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/701
+
+- config-keys:
+    - qwen3.5-fp8-b200-sglang
+  description:
+    - "Add Qwen3.5-397B-A17B FP8 B200 SGLang benchmark"
+    - "Image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e"
+    - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
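
A note on running the new script by hand: it takes every parameter from
environment variables validated by check_env_vars, so one point of the sweep
can be reproduced directly. A minimal sketch of such an invocation, where the
concrete values are illustrative assumptions rather than anything taken from
this patch:

    MODEL=Qwen/Qwen3.5-397B-A17B TP=8 CONC=4 ISL=1024 OSL=1024 \
    RANDOM_RANGE_RATIO=0.8 RESULT_FILENAME=qwen3.5_tp8_conc4_1k1k \
    bash benchmarks/qwen3.5_fp8_b200.sh   # renamed in PATCH 2/2 below

The server port defaults to 8888 unless PORT is set, and exporting
RUN_EVAL=true additionally triggers the lm-eval pass after the throughput
run.
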
From ce477cfaa4d46252d7f54ee0434f2173224e9d70 Mon Sep 17 00:00:00 2001
From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com>
Date: Tue, 17 Feb 2026 03:26:33 +0000
Subject: [PATCH 2/2] Rename Qwen3.5-397B-A17B benchmark from FP8 to BF16
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The benchmark script runs without --quantization fp8, so the model loads
in BF16. Rename the script, config key, and changelog entry to accurately
reflect the precision.

- Rename benchmarks/qwen3.5_fp8_b200.sh → qwen3.5_bf16_b200.sh
- Update nvidia-master.yaml config key and precision field
- Update perf-changelog.yaml references and PR link

Co-authored-by: functionstackx
---
 .github/configs/nvidia-master.yaml                       | 4 ++--
 benchmarks/{qwen3.5_fp8_b200.sh => qwen3.5_bf16_b200.sh} | 0
 perf-changelog.yaml                                      | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)
 rename benchmarks/{qwen3.5_fp8_b200.sh => qwen3.5_bf16_b200.sh} (100%)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 48a901353..46112ff50 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1782,12 +1782,12 @@ dsr1-fp8-b200-sglang:
         - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
         - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
 
-qwen3.5-fp8-b200-sglang:
+qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
   model: Qwen/Qwen3.5-397B-A17B
   model-prefix: qwen3.5
   runner: b200
-  precision: fp8
+  precision: bf16
   framework: sglang
   multinode: false
   seq-len-configs:
diff --git a/benchmarks/qwen3.5_fp8_b200.sh b/benchmarks/qwen3.5_bf16_b200.sh
similarity index 100%
rename from benchmarks/qwen3.5_fp8_b200.sh
rename to benchmarks/qwen3.5_bf16_b200.sh
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index fc247d5e4..f828d705c 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -638,9 +638,9 @@
   pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/701
 
 - config-keys:
-    - qwen3.5-fp8-b200-sglang
+    - qwen3.5-bf16-b200-sglang
   description:
-    - "Add Qwen3.5-397B-A17B FP8 B200 SGLang benchmark"
+    - "Add Qwen3.5-397B-A17B BF16 B200 SGLang benchmark"
     - "Image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e"
     - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/704
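
The FP8-to-BF16 rename above comes down to a single launch flag: per the
PATCH 2/2 message, the server is started without --quantization fp8, so the
checkpoint loads in its native BF16. For contrast, a sketch of the two launch
variants; the FP8 form is hypothetical and is not what this series ships:

    # BF16: the form benchmarks/qwen3.5_bf16_b200.sh actually uses
    python3 -m sglang.launch_server --model-path Qwen/Qwen3.5-397B-A17B \
        --tensor-parallel-size 8 --mem-fraction-static 0.8

    # FP8 (hypothetical here): would add the quantization flag
    python3 -m sglang.launch_server --model-path Qwen/Qwen3.5-397B-A17B \
        --tensor-parallel-size 8 --mem-fraction-static 0.8 --quantization fp8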