From d7960fea08ed9bf3fc426bdd2ff7525b11da04cc Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 28 Apr 2026 23:50:59 +0800 Subject: [PATCH 1/6] sglang dpskv4 hopper --- .github/configs/nvidia-master.yaml | 19 +++++ .../single_node/dsv4_fp4_h200_sglang.sh | 73 +++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 benchmarks/single_node/dsv4_fp4_h200_sglang.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9e4177ee8..e3c32ef0d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2510,6 +2510,25 @@ dsv4-fp8-h200-vllm: search-space: - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } +# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP4, TP-only). +dsv4-fp4-h200-sglang: + image: lmsysorg/sglang:deepseek-v4-hopper + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: h200 + precision: fp4 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } + # DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300 # pareto sweep. The single-node schema has no explicit data-parallel-size # field, so dp-attn=true is used as the existing vLLM script switch for DP4 diff --git a/benchmarks/single_node/dsv4_fp4_h200_sglang.sh b/benchmarks/single_node/dsv4_fp4_h200_sglang.sh new file mode 100644 index 000000000..a7e822596 --- /dev/null +++ b/benchmarks/single_node/dsv4_fp4_h200_sglang.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +hf download "$MODEL" + +nvidia-smi + +SERVER_LOG="$PWD/server.log" +PORT=${PORT:-8888} + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" + +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + +start_gpu_monitor --output "$PWD/gpu_metrics.csv" + +set -x +PYTHONNOUSERSITE=1 sglang serve \ + --model-path $MODEL \ + --host 0.0.0.0 \ + --port $PORT \ + --trust-remote-code \ + --tp $TP \ + --moe-runner-backend marlin \ + --chunked-prefill-size 4096 \ + --disable-flashinfer-autotune \ + --mem-fraction-static 0.88 \ + --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ + $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $((CONC * 10)) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir "$PWD/" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +stop_gpu_monitor +set +x From eea0897a2ee28722ffe24397fca98546857d977a Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 29 Apr 2026 00:32:50 +0800 Subject: [PATCH 2/6] h200 runner: support framework-tagged script names --- runners/launch_h200-dgxc-slurm.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index e11ca7b20..df22a3dc4 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -286,13 +286,24 @@ else fi " + # Prefer a framework-tagged script (e.g. dsv4_fp4_h200_sglang.sh) so models + # with multiple inference engines can coexist; fall back to the historical + # name without an engine suffix for scripts that haven't been retagged yet. + SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') + BENCH_BASE="benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h200" + BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" + if [[ ! -f "$BENCH_SCRIPT" ]]; then + LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') + BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh" + fi + srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h200$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt')$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp').sh + bash $BENCH_SCRIPT scancel $JOB_ID From d6039e3e98897fea79ed0fa04bb3a19410e3aa3e Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 29 Apr 2026 00:40:52 +0800 Subject: [PATCH 3/6] h200 runners: fix script path and /workspace mount conflict --- runners/launch_h200-cw.sh | 19 +++++++++++++++---- runners/launch_h200-dgxc-slurm.sh | 13 ++++++++----- runners/launch_h200-nb.sh | 20 ++++++++++++++++---- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 84b40480c..03e758ebc 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -4,8 +4,13 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +BENCH_BASE="benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200" +BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" +if [[ ! -f "$BENCH_SCRIPT" ]]; then + LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') + BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh" +fi PARTITION="h200" SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" @@ -38,13 +43,19 @@ else CONTAINER_IMAGE=$(realpath $SQUASH_FILE) fi +if [[ "$IMAGE" == *deepseek-v4-hopper* ]]; then + CONTAINER_MOUNT_DIR=/ix +else + CONTAINER_MOUNT_DIR=/workspace +fi + srun --jobid=$JOB_ID \ --container-image=$CONTAINER_IMAGE \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --container-mount-home \ ---container-workdir=/workspace/ \ +--container-workdir=$CONTAINER_MOUNT_DIR/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash $BENCH_SCRIPT rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index df22a3dc4..7be6504fd 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -286,9 +286,6 @@ else fi " - # Prefer a framework-tagged script (e.g. dsv4_fp4_h200_sglang.sh) so models - # with multiple inference engines can coexist; fall back to the historical - # name without an engine suffix for scripts that haven't been retagged yet. SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') BENCH_BASE="benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h200" BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" @@ -297,11 +294,17 @@ else BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh" fi + if [[ "$IMAGE" == *deepseek-v4-hopper* ]]; then + CONTAINER_MOUNT_DIR=/ix + else + CONTAINER_MOUNT_DIR=/workspace + fi + srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ - --container-workdir=/workspace/ \ + --container-workdir=$CONTAINER_MOUNT_DIR/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ bash $BENCH_SCRIPT diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 9d157a858..55f1b4071 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -4,19 +4,31 @@ export HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +BENCH_BASE="benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200" +BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" +if [[ ! -f "$BENCH_SCRIPT" ]]; then + LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') + BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh" +fi + +if [[ "$IMAGE" == *deepseek-v4-hopper* ]]; then + CONTAINER_MOUNT_DIR=/ix +else + CONTAINER_MOUNT_DIR=/workspace +fi + PARTITION="main" set -x srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \ --container-image=$IMAGE \ --container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --container-remap-root \ --container-writable \ --container-mount-home \ ---container-workdir=/workspace/ \ +--container-workdir=$CONTAINER_MOUNT_DIR/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash $BENCH_SCRIPT From 1a9d38e0a1390ca0aa1b6841277b690598e1346a Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 29 Apr 2026 00:52:17 +0800 Subject: [PATCH 4/6] pin deepseek-v4-hopper image digest --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e3c32ef0d..be4310b57 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2512,7 +2512,7 @@ dsv4-fp8-h200-vllm: # DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP4, TP-only). dsv4-fp4-h200-sglang: - image: lmsysorg/sglang:deepseek-v4-hopper + image: lmsysorg/sglang:deepseek-v4-hopper@sha256:7f19c6dc092e47a10fac2e41f47eab78970280d06648b8e50d312a82f0ae722f model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 From 0afb1d97b6a0077ead943d665a5b0cc3d219e9da Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 29 Apr 2026 12:18:13 +0800 Subject: [PATCH 5/6] fp4 -> fp8 --- .github/configs/nvidia-master.yaml | 6 +++--- .../{dsv4_fp4_h200_sglang.sh => dsv4_fp8_h200_sglang.sh} | 0 2 files changed, 3 insertions(+), 3 deletions(-) rename benchmarks/single_node/{dsv4_fp4_h200_sglang.sh => dsv4_fp8_h200_sglang.sh} (100%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index be4310b57..66c63383a 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2510,13 +2510,13 @@ dsv4-fp8-h200-vllm: search-space: - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } -# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP4, TP-only). -dsv4-fp4-h200-sglang: +# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only). +dsv4-fp8-h200-sglang: image: lmsysorg/sglang:deepseek-v4-hopper@sha256:7f19c6dc092e47a10fac2e41f47eab78970280d06648b8e50d312a82f0ae722f model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 - precision: fp4 + precision: fp8 framework: sglang multinode: false seq-len-configs: diff --git a/benchmarks/single_node/dsv4_fp4_h200_sglang.sh b/benchmarks/single_node/dsv4_fp8_h200_sglang.sh similarity index 100% rename from benchmarks/single_node/dsv4_fp4_h200_sglang.sh rename to benchmarks/single_node/dsv4_fp8_h200_sglang.sh From d6fd273e130e837641fbaef275542c482a44468e Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 29 Apr 2026 12:19:26 +0800 Subject: [PATCH 6/6] conc: drop 2, add 64 --- .github/configs/nvidia-master.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 66c63383a..86405edd9 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2523,11 +2523,13 @@ dsv4-fp8-h200-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } # DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300 # pareto sweep. The single-node schema has no explicit data-parallel-size