diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 570e52d43..46112ff50 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1782,6 +1782,28 @@ dsr1-fp8-b200-sglang: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 } +qwen3.5-bf16-b200-sglang: + image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + runner: b200 + precision: bf16 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + dsr1-fp8-b200-sglang-mtp: image: lmsysorg/sglang:v0.5.8-cu130-amd64 model: deepseek-ai/DeepSeek-R1-0528 diff --git a/benchmarks/qwen3.5_bf16_b200.sh b/benchmarks/qwen3.5_bf16_b200.sh new file mode 100755 index 000000000..f6ded15e8 --- /dev/null +++ b/benchmarks/qwen3.5_bf16_b200.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +MEM_FRAC_STATIC=0.8 + +ps aux + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ + --model-path=$MODEL \ + --host=0.0.0.0 \ + --port=$PORT \ + --tensor-parallel-size=$TP \ + --mem-fraction-static $MEM_FRAC_STATIC \ + > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+ 
+
# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
  run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
  append_lm_eval_summary
fi
set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5fafe2d64..f828d705c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -636,3 +636,11 @@ description: - "Bump MI355X disagg FP8 recipe commit to fix perf regression on 8k1k DEP8" pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/701 + +- config-keys: + - qwen3.5-bf16-b200-sglang + description: + - "Add Qwen3.5-397B-A17B BF16 B200 SGLang benchmark" + - "Image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e" + - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/704