27 changes: 25 additions & 2 deletions .github/configs/amd-master.yaml
@@ -1497,8 +1497,8 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
- "DECODE_MTP_SIZE=1"

dsv4-fp8-mi355x-sglang:
-image: rocm/sgl-dev:deepseek-v4-mi35x
-model: sgl-project/DeepSeek-V4-Pro-FP8
+image: rocm/sgl-dev:v0.5.10.post1-rocm700-mi35x-20260428
+model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: mi355x
precision: fp8
@@ -1514,6 +1514,29 @@ dsv4-fp8-mi355x-sglang:
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }

# FP4-experts variant of dsv4-fp8-mi355x-sglang. Same image and sglang overlay
# (amd/deepseek_v4 branch pinned at runtime by dsv4_fp4_mi355x_sglang.sh);
# differs only in SGLANG_DSV4_FP4_EXPERTS=True / SGLANG_FORCE_TRITON_MOE_FP8=0,
# and uses the bf16 DeepSeek-V4-Pro checkpoint (sglang applies FP4 quantization
# to the experts at runtime).
dsv4-fp4-mi355x-sglang:
image: rocm/sgl-dev:v0.5.10.post1-rocm700-mi35x-20260428
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: mi355x
precision: fp4
framework: sglang
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }

# vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889,
# stacked on #40871). Uses the ATOM MI355X image (ROCm 7.2.2, aiter with
# MLA decode, MI355X GPU detection); vLLM is rebuilt from the PR branch
153 changes: 153 additions & 0 deletions benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh
@@ -0,0 +1,153 @@
#!/usr/bin/env bash

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
MODEL \
TP \
CONC \
ISL \
OSL \
RANDOM_RANGE_RATIO \
RESULT_FILENAME
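
# Example invocation (values are illustrative only; in CI the harness exports
# these from the amd-master.yaml seq-len-configs/search-space entries):
#   MODEL=deepseek-ai/DeepSeek-V4-Pro TP=8 CONC=16 ISL=1024 OSL=1024 \
#   RANDOM_RANGE_RATIO=0.8 RESULT_FILENAME=dsv4_fp4_tp8_c16.json \
#   bash benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh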

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

hf download "$MODEL"

# Overlay sglang from the amd/deepseek_v4 branch on top of whatever the
# rocm/sgl-dev:v0.5.10.post1-rocm700-mi35x-20260428 image ships with. We
# stay on the rocm700 (ROCm 7.0.0a) line because rocm720 hit
# hipErrorInvalidConfiguration on use_symmetric_memory-allocated dp_attention
# buffers (RCCL symmetric-memory bug; the SGLANG_USE_ROCM700A workaround only
# covers the cuda-graph path, not the eager mode we use via --disable-cuda-graph).
# Bump SGL_PR_SHA when the branch advances.
SGL_PR_SHA="18afbf151a2992b06a089191769b299629ed73dd"
SGL_PR_DIR="/tmp/sglang-amd-dsv4"

if [ ! -d "$SGL_PR_DIR/.git" ]; then
git clone --filter=blob:none https://github.com/sgl-project/sglang.git "$SGL_PR_DIR"
fi
(
cd "$SGL_PR_DIR"
git fetch --depth=1 origin "$SGL_PR_SHA" 2>/dev/null \
|| git fetch --depth=1 origin amd/deepseek_v4
git checkout --force "$SGL_PR_SHA"
test "$(git rev-parse HEAD)" = "$SGL_PR_SHA"

# Reinstall just the Python package; the image already has the ROCm
# kernel deps (aiter, triton, tilelang, torch) at versions matched to
# this branch, so --no-deps avoids pip resolving them against PyPI.
pip install --no-build-isolation --no-deps --force-reinstall -e python/
)

python3 -c "import sglang; print(f'sglang {sglang.__version__} from {sglang.__path__[0]}')"

# Transformers in the container doesn't recognize the `deepseek_v4` model_type.
# PR #23608's fallback in hf_transformers_utils.get_config tries to handle this
# by writing a patched config to /tmp, but in practice it does not catch the error
# in this image. Patch the cached config.json directly instead: set model_type
# to `deepseek_v3` so AutoConfig.from_pretrained succeeds, and keep
# architectures=['DeepseekV4ForCausalLM'] so SGLang dispatches to its native
# DSv4 model class (python/sglang/srt/models/deepseek_v4.py).
python3 << PYEOF
import json
from huggingface_hub import hf_hub_download
path = hf_hub_download(repo_id="$MODEL", filename="config.json")
with open(path) as f:
config = json.load(f)
if config.get("model_type") == "deepseek_v4":
config["model_type"] = "deepseek_v3"
with open(path, "w") as f:
json.dump(config, f, indent=2)
print(f"Patched {path}: model_type deepseek_v4 -> deepseek_v3")
else:
print(f"No patch needed: model_type is {config.get('model_type')!r}")
PYEOF
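
# Optional sanity check (our addition here, not part of the branch's
# run_dsv4.sh flow): confirm AutoConfig resolves the patched config and the
# V4 architecture name survives for SGLang's model dispatch. Without errexit
# a failure only logs; it does not stop the run.
python3 << PYEOF
from transformers import AutoConfig
cfg = AutoConfig.from_pretrained("$MODEL", trust_remote_code=True)
print(f"model_type={cfg.model_type} architectures={cfg.architectures}")
assert cfg.model_type == "deepseek_v3"
PYEOF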

# DSv4 FP4-experts path. Mirrors the active path of python/run_dsv4.sh on
# the amd/deepseek_v4 branch at SGL_PR_SHA:
#   SGLANG_DSV4_FP4_EXPERTS=True  -> route experts through the FP4 kernels
#   SGLANG_FORCE_TRITON_MOE_FP8=0 -> dispatch MoE through aiter (gating
#                                    switch added in commit 33de1e64); also
#                                    enables the swiglu_limit clamp in the
#                                    triton MoE fallback path.
export SGLANG_REASONING_EFFORT=max
export SGLANG_OPT_USE_FUSED_COMPRESS=false
export SGLANG_OPT_USE_OLD_COMPRESSOR=true
export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false
export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false
export SGLANG_OPT_USE_FUSED_HASH_TOPK=false
export SGLANG_HACK_FLASHMLA_BACKEND=torch
export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false
export SGLANG_OPT_USE_TILELANG_MHC_PRE=false
export SGLANG_OPT_USE_TILELANG_MHC_POST=false
export SGLANG_ENABLE_THINKING=1
export SGLANG_USE_AITER=1
export SGLANG_USE_ROCM700A=1
export SGLANG_TOPK_TRANSFORM_512_TORCH=1
export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1
export SGLANG_DSV4_FP4_EXPERTS=True
export SGLANG_OPT_DPSK_V4_RADIX=0
export SGLANG_OPT_USE_OVERLAP_STORE_CACHE=false
export SGLANG_OPT_USE_FUSED_STORE_CACHE=false
export SGLANG_FORCE_TRITON_MOE_FP8=0

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
fi
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

python3 -m sglang.launch_server \
--model-path $MODEL \
--host=0.0.0.0 \
--port $PORT \
--tensor-parallel-size $TP \
--dp $TP \
--enable-dp-attention \
--trust-remote-code \
--disable-radix-cache \
--attention-backend compressed \
--max-running-request 256 \
--page-size 256 \
--chunked-prefill-size 8192 \
--disable-shared-experts-fusion \
--disable-cuda-graph \
--tool-call-parser deepseekv4 \
--reasoning-parser deepseek-v4 \
--watchdog-timeout 1800 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend vllm \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts "$((CONC * 10))" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
37 changes: 35 additions & 2 deletions benchmarks/single_node/dsv4_fp8_mi355x.sh
@@ -17,6 +17,34 @@ fi

hf download "$MODEL"

# Overlay sglang from the amd/deepseek_v4 branch on top of whatever the
# rocm/sgl-dev:v0.5.10.post1-rocm700-mi35x-20260428 image ships with. We
# stay on the rocm700 (ROCm 7.0.0a) line because rocm720 hit
# hipErrorInvalidConfiguration on use_symmetric_memory-allocated dp_attention
# buffers (RCCL symmetric-memory bug; the SGLANG_USE_ROCM700A workaround only
# covers the cuda-graph path, not the eager mode we use via --disable-cuda-graph).
# Bump SGL_PR_SHA when the branch advances.
SGL_PR_SHA="18afbf151a2992b06a089191769b299629ed73dd"
SGL_PR_DIR="/tmp/sglang-amd-dsv4"

if [ ! -d "$SGL_PR_DIR/.git" ]; then
git clone --filter=blob:none https://github.com/sgl-project/sglang.git "$SGL_PR_DIR"
fi
(
cd "$SGL_PR_DIR"
git fetch --depth=1 origin "$SGL_PR_SHA" 2>/dev/null \
|| git fetch --depth=1 origin amd/deepseek_v4
git checkout --force "$SGL_PR_SHA"
test "$(git rev-parse HEAD)" = "$SGL_PR_SHA"

# Reinstall just the Python package; the image already has the ROCm
# kernel deps (aiter, triton, tilelang, torch) at versions matched to
# this branch, so --no-deps avoids pip resolving them against PyPI.
pip install --no-build-isolation --no-deps --force-reinstall -e python/
)
Comment on lines +33 to +44
Contributor

🔴 The new sglang overlay block (lines 20-41) does not actually enforce the 'reproducible pin per benchmark run' the comment promises: the fallback git fetch --depth=1 origin amd/deepseek_v4 only retrieves the branch tip, so when SGL_PR_SHA lags the branch (the exact case the 'Bump SGL_PR_SHA when the branch advances' comment anticipates) and the by-SHA fetch fails, the subsequent git checkout --force "$SGL_PR_SHA" cannot resolve the SHA. Combined with the missing set -eo pipefail (the sister script dsv4_fp8_mi355x_vllm.sh:2 has it), the failed checkout, the test $(git rev-parse HEAD) = $SGL_PR_SHA pin guard at line 35, and any other failure are all swallowed — pip install -e python/ runs unconditionally against whatever was previously checked out, and benchmark numbers get reported against the wrong sglang. Fix: add set -eo pipefail near the top, and either drop the --depth=1 fallback or fetch amd/deepseek_v4 with deeper history (e.g. --depth=50) so older pinned SHAs remain reachable.

Extended reasoning...

Bug

The overlay block added at lines 20-41 promises a "reproducible pin per benchmark run" (see comment lines 20-23), but the control flow does not enforce it. Two compounding issues silently install the wrong sglang on a fetch/pin mismatch:

1. Shallow-fetch fallback cannot resolve a non-tip pin (lines 32-33). The clone at line 26 uses --filter=blob:none and clones only the default branch, which is not amd/deepseek_v4. The subshell then runs:

git fetch --depth=1 origin "$SGL_PR_SHA" 2>/dev/null \
    || git fetch --depth=1 origin amd/deepseek_v4

The fallback retrieves a single commit — the current tip of amd/deepseek_v4. The very next line, git checkout --force "$SGL_PR_SHA", then fails with reference is not a tree whenever SGL_PR_SHA is not that tip. The script's own comment "Bump SGL_PR_SHA when the branch advances" explicitly anticipates that the pinned SHA lags the branch tip, so this is not a hypothetical case. The fetch-by-SHA can fail for ordinary reasons (transient GitHub error, unauthenticated rate limit, the SHA being GC'd after a force-push), and the fallback then cannot produce the requested object.

2. No set -eo pipefail (line 1 is the shebang, line 3 sources benchmark_lib.sh with no shell-options line in between). The sister script benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh:2 sets set -eo pipefail, and the sourced benchmark_lib.sh does not enable errexit either. Without it, every failure inside the subshell on lines 30-41 is swallowed:

  • the failed git checkout --force "$SGL_PR_SHA" does not abort the subshell;
  • test "$(git rev-parse HEAD)" = "$SGL_PR_SHA" at line 35 — the explicit pin guard — only sets $?, it does not exit;
  • pip install --no-build-isolation --no-deps --force-reinstall -e python/ at line 40 then runs unconditionally, against whatever the working tree happens to contain, and its exit code becomes the subshell's, masking every earlier failure;
  • the parent script also lacks errexit, so even a non-zero subshell exit would be ignored.

The python3 -c "import sglang; print(...)" line at line 43 only prints the version; it does not assert anything, and sglang.launch_server then runs against the wrong sglang.

Step-by-step proof

Assume SGL_PR_SHA = 18afbf15… and the branch amd/deepseek_v4 has since advanced to a different SHA Y. The fetch-by-SHA call fails (transient 503 from github.com):

  1. Line 25-27: clone succeeds, default branch only (no amd/deepseek_v4, no 18afbf15…).
  2. Line 32: git fetch --depth=1 origin 18afbf15… → fails (stderr suppressed via 2>/dev/null), exit 128.
  3. Line 33: git fetch --depth=1 origin amd/deepseek_v4 → succeeds, downloads only commit Y.
  4. Line 34: git checkout --force 18afbf15… → fails: fatal: reference is not a tree: 18afbf15…. Without set -e, subshell continues; HEAD is still whatever the initial clone left (default branch tip).
  5. Line 35: test "$(git rev-parse HEAD)" = "18afbf15…"$? set non-zero. Subshell continues.
  6. Line 40: pip install … -e python/ → runs against the default-branch tree (not the pin), succeeds, exit 0.
  7. Subshell exits 0 (pip's status). Parent continues.
  8. Line 43: import sglang; print(...) → prints the wrong version.
  9. sglang.launch_server runs with the wrong sglang and benchmark numbers are reported under the assumed pin.

A second realistic trigger: the by-SHA fetch succeeds on first run, then on a later invocation in the same /tmp/sglang-amd-dsv4 checkout the network is flaky. The if [ ! -d "$SGL_PR_DIR/.git" ] guard at line 25 means the clone is reused; the fallback then leaves the local repo with only commit Y; the same silent cascade follows.

Fix

Minimum: add set -eo pipefail near the top of the script (matching the vllm sister script). That alone causes the failed checkout / mismatched-HEAD test to abort the subshell and the script.

Additionally, make the fallback actually able to produce non-tip SHAs — drop --depth=1 from the branch fallback, or use a deeper fetch (e.g. --depth=50) so older pinned SHAs remain reachable.
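
A minimal sketch of the overlay block with both suggestions applied (errexit at the top, deeper fallback fetch); --depth=50 is the reviewer's example rather than a tuned value:

#!/usr/bin/env bash
set -eo pipefail  # a failed checkout or pin-guard mismatch now aborts the run

SGL_PR_SHA="18afbf151a2992b06a089191769b299629ed73dd"
SGL_PR_DIR="/tmp/sglang-amd-dsv4"

if [ ! -d "$SGL_PR_DIR/.git" ]; then
    git clone --filter=blob:none https://github.com/sgl-project/sglang.git "$SGL_PR_DIR"
fi
(
    cd "$SGL_PR_DIR"
    # Fetch the exact pin first; on failure, fetch the branch with enough
    # history that an older pinned SHA stays reachable from the tip.
    git fetch --depth=1 origin "$SGL_PR_SHA" 2>/dev/null \
        || git fetch --depth=50 origin amd/deepseek_v4
    git checkout --force "$SGL_PR_SHA"
    # Under errexit this pin guard aborts the subshell (and the script)
    # instead of merely setting $? before an unconditional pip install.
    test "$(git rev-parse HEAD)" = "$SGL_PR_SHA"
    pip install --no-build-isolation --no-deps --force-reinstall -e python/
)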


python3 -c "import sglang; print(f'sglang {sglang.__version__} from {sglang.__path__[0]}')"

# Transformers in the container doesn't recognize the `deepseek_v4` model_type.
# PR #23608's fallback in hf_transformers_utils.get_config tries to handle this
# by writing a patched config to /tmp, but in practice it does not catch the error
@@ -39,7 +67,12 @@ else:
print(f"No patch needed: model_type is {config.get('model_type')!r}")
PYEOF

-# DSv4-specific SGLang env vars (from sgl-project/sglang#23608)
+# DSv4-specific SGLang env vars. Mirrors python/run_dsv4.sh on the
+# amd/deepseek_v4 branch (commented FP8 path) at SGL_PR_SHA. The branch's
+# FP4 Models integration commit (33de1e64) flipped SGLANG_FORCE_TRITON_MOE_FP8
+# from 1 to 0; with it set to 0, FP8 MoE dispatches through aiter (shuffled
+# weights + aiter fused_moe) instead of the triton MoE fallback.
+export SGLANG_REASONING_EFFORT=max
export SGLANG_OPT_USE_FUSED_COMPRESS=false
export SGLANG_OPT_USE_OLD_COMPRESSOR=true
export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false
@@ -58,7 +91,7 @@ export SGLANG_DSV4_FP4_EXPERTS=false
export SGLANG_OPT_DPSK_V4_RADIX=0
export SGLANG_OPT_USE_OVERLAP_STORE_CACHE=false
export SGLANG_OPT_USE_FUSED_STORE_CACHE=false
-export SGLANG_FORCE_TRITON_MOE_FP8=1
+export SGLANG_FORCE_TRITON_MOE_FP8=0

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
12 changes: 12 additions & 0 deletions perf-changelog.yaml
@@ -2006,3 +2006,15 @@
- "Change image to vllm/vllm-openai:v0.20.0-cu130"
- "Use Mega MoE for DEP configs"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1221

- config-keys:
- dsv4-fp8-mi355x-sglang
- dsv4-fp4-mi355x-sglang
description:
- "Pin sglang at runtime to amd/deepseek_v4 branch SHA 18afbf15 via clone+reinstall in the benchmark scripts"
- "Bump dsv4-fp8-mi355x-sglang image to rocm/sgl-dev:v0.5.10.post1-rocm700-mi35x-20260428"
- "Switch dsv4-fp8-mi355x-sglang model from sgl-project/DeepSeek-V4-Pro-FP8 to deepseek-ai/DeepSeek-V4-Pro"
- "Set SGLANG_FORCE_TRITON_MOE_FP8=0 in dsv4_fp8_mi355x.sh so FP8 MoE dispatches through aiter (matches branch run_dsv4.sh after FP4 Models commit 33de1e64)"
- "Add SGLANG_REASONING_EFFORT=max to both scripts"
- "Add dsv4-fp4-mi355x-sglang variant: SGLANG_DSV4_FP4_EXPERTS=True, SGLANG_FORCE_TRITON_MOE_FP8=0, model deepseek-ai/DeepSeek-V4-Pro"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1231