Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
d8a8e77
Add DeepSeek-V4-Flash FP8 MI355X SGLang benchmark
Oseltamivir Apr 24, 2026
61b1029
Update perf-changelog pr-link to #1134
Oseltamivir Apr 24, 2026
a3bdccd
Install transformers with deepseek_v4 support in dsv4 benchmark
Oseltamivir Apr 24, 2026
13d5c72
Also upgrade huggingface_hub to fix rope_theta type validation
Oseltamivir Apr 24, 2026
ecd0dd5
Patch rope_theta int→float in downloaded config.json
Oseltamivir Apr 24, 2026
504606c
Fix huggingface_hub typer clash and config.json corruption
Oseltamivir Apr 24, 2026
4f10713
Use huggingface_hub scan_cache_dir to find config.json for patching
Oseltamivir Apr 24, 2026
f540916
Patch DeepseekV4Config source to accept int for rope_theta
Oseltamivir Apr 24, 2026
698ae6b
Use --no-deps for transformers install; patch rope_theta before download
Oseltamivir Apr 24, 2026
4ea3e18
import error
Oseltamivir Apr 24, 2026
7d2d911
Fix Union import in rope_theta patch and allow full transformers deps
Oseltamivir Apr 24, 2026
8fab7b6
Remove corrupted cached config.json before model download
Oseltamivir Apr 24, 2026
f40bfb1
Patch all float fields in DeepseekV4Config, not just rope_theta
Oseltamivir Apr 24, 2026
64e2c55
Use the correct SGLang image for DSv4; drop transformers patching
Oseltamivir Apr 24, 2026
2f301be
Patch config.json model_type so AutoConfig can load DSv4
Oseltamivir Apr 24, 2026
0df7b1e
Bump watchdog timeout to 1800s for DSv4 MoE JIT compile
Oseltamivir Apr 24, 2026
3c5c86b
Use newest SGLang DSv4 image from AMD team
Oseltamivir Apr 24, 2026
98191d6
Switch DSv4 benchmark from Flash to Pro
Oseltamivir Apr 24, 2026
fe6e71d
Switch dsv4pro to TP=8
Oseltamivir Apr 24, 2026
fe7f870
Revert accidental TP changes outside dsv4pro
Oseltamivir Apr 24, 2026
0139a4f
Pin max-total-tokens and raise mem-fraction-static
Oseltamivir Apr 24, 2026
48ff44e
Merge branch 'main' into dsv4-fp8-mi355x-sglang
Oseltamivir Apr 24, 2026
d39d6ee
Size max-total-tokens for concurrent requests
Oseltamivir Apr 24, 2026
8c32d73
Rename dsv4pro to dsv4; bump mem-fraction-static to 0.95
Oseltamivir Apr 24, 2026
8e6f48f
oom
Oseltamivir Apr 24, 2026
78f0306
Merge branch 'main' into dsv4-fp8-mi355x-sglang
Oseltamivir Apr 24, 2026
6bd5383
Update benchmark description for DeepSeek-V4-Pro
Oseltamivir Apr 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1472,3 +1472,20 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=1"

# Day-0 DeepSeek-V4-Pro FP8 single-node SGLang benchmark on MI355X.
# Image is the AMD team's DSv4 bring-up build (see sgl-project/sglang#23608).
dsv4-fp8-mi355x-sglang:
  image: rocm/sgl-dev:deepseek-v4-mi35x
  model: sgl-project/DeepSeek-V4-Pro-FP8
  model-prefix: dsv4
  runner: mi355x
  precision: fp8
  framework: sglang
  multinode: false
  # Two sequence-length shapes, both sweeping concurrency 4..64 at TP=8.
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 64 }
    - isl: 8192
      osl: 1024
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 64 }
1 change: 1 addition & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ When working with benchmark configurations, use these valid values:

**Models (model-prefix)**:
- `dsr1` - DeepSeek-R1-0528
- `dsv4` - DeepSeek-V4-Pro
- `gptoss` - GPT-OSS-120B

**Precisions**:
Expand Down
120 changes: 120 additions & 0 deletions benchmarks/single_node/dsv4_fp8_mi355x.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/usr/bin/env bash

# DeepSeek-V4-Pro FP8 benchmark on MI355X with SGLang.
# Shared helpers (check_env_vars, start_gpu_monitor, wait_for_server_ready,
# run_benchmark_serving, run_eval, ...) come from benchmark_lib.sh.
source "$(dirname "$0")/../benchmark_lib.sh"

# Fail fast if any required benchmark parameter is missing from the env.
check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME

# ${VAR:-} guards keep this safe outside SLURM even if the lib enables `set -u`.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

# Pre-fetch model weights/config into the HF cache before patching config.json.
hf download "$MODEL"

# Transformers in the container doesn't recognize the `deepseek_v4` model_type.
# PR #23608's fallback in hf_transformers_utils.get_config tries to handle this
# by writing a patched config to /tmp, but in practice isn't catching the error
# in this image. Patch the cached config.json directly instead: set model_type
# to `deepseek_v3` so AutoConfig.from_pretrained succeeds, and keep
# architectures=['DeepseekV4ForCausalLM'] so SGLang dispatches to its native
# DSv4 model class (python/sglang/srt/models/deepseek_v4.py).
#
# MODEL is passed via the environment with a quoted heredoc delimiter, rather
# than interpolating the shell variable into the Python source, so quoting and
# injection are a non-issue.
MODEL="$MODEL" python3 <<'PYEOF'
import json
import os

from huggingface_hub import hf_hub_download

# Resolves to the already-downloaded file in the local HF cache.
path = hf_hub_download(repo_id=os.environ["MODEL"], filename="config.json")
with open(path) as f:
    config = json.load(f)
if config.get("model_type") == "deepseek_v4":
    config["model_type"] = "deepseek_v3"
    with open(path, "w") as f:
        json.dump(config, f, indent=2)
    print(f"Patched {path}: model_type deepseek_v4 -> deepseek_v3")
else:
    print(f"No patch needed: model_type is {config.get('model_type')!r}")
PYEOF

# DSv4-specific SGLang env vars (from sgl-project/sglang#23608)
# NOTE(review): these toggles come from the AMD bring-up PR; their individual
# semantics are defined by the custom image, not documented here — keep in
# sync with upstream if the image is rebuilt.
export SGLANG_OPT_USE_FUSED_COMPRESS=false
export SGLANG_OPT_USE_OLD_COMPRESSOR=true
export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false
export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false
export SGLANG_OPT_USE_FUSED_HASH_TOPK=false
export SGLANG_HACK_FLASHMLA_BACKEND=torch
export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false
export SGLANG_OPT_USE_TILELANG_MHC_PRE=false
export SGLANG_OPT_USE_TILELANG_MHC_POST=false
export SGLANG_ENABLE_THINKING=1
export SGLANG_USE_AITER=1
export SGLANG_USE_ROCM700A=1
export SGLANG_TOPK_TRANSFORM_512_TORCH=1
export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1
export SGLANG_DSV4_FP4_EXPERTS=false
export SGLANG_OPT_DPSK_V4_RADIX=0
export SGLANG_OPT_USE_OVERLAP_STORE_CACHE=false
export SGLANG_OPT_USE_FUSED_STORE_CACHE=false
export SGLANG_FORCE_TRITON_MOE_FP8=1

# Server log path and port; PORT can be overridden by the caller.
SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

# Optional eval-only mode: cap the server context length for lm-eval.
EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
  setup_eval_context
  EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

# Launch the SGLang server in the background. Use the canonical long flag
# names --dp-size and --max-running-requests (matching every other benchmark
# script in this repo): the abbreviated forms --dp / --max-running-request
# only resolve via argparse prefix matching and would break as "ambiguous
# option" if a future SGLang release adds another flag sharing the prefix.
python3 -m sglang.launch_server \
  --model-path "$MODEL" \
  --host=0.0.0.0 \
  --port "$PORT" \
  --tensor-parallel-size "$TP" \
  --dp-size "$TP" \
  --enable-dp-attention \
  --trust-remote-code \
  --disable-radix-cache \
  --attention-backend compressed \
  --max-running-requests 256 \
Comment on lines +79 to +84
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Two SGLang flags in the new script use non-canonical abbreviated forms: --dp $TP on line 57 (should be --dp-size) and --max-running-request 256 on line 62 (should be --max-running-requests, plural). Every other SGLang benchmark script in this repo uses the canonical --dp-size/--data-parallel-size and --max-running-requests (e.g. dsr1_fp8_b200.sh, glm5_fp8_b200.sh, qwen3.5_fp8_b200.sh, multi_node/amd_utils/server.sh). These work today only because argparse's default allow_abbrev=True resolves them as unambiguous prefix matches — they would silently break on a future SGLang release that adds another --dp-* flag (e.g. --dp-lb-port) or on a build that sets allow_abbrev=False, either via an "ambiguous option" error at server startup or by not applying the intended 256 running-request cap / DP=TP setting.

Extended reasoning...

The bug

At benchmarks/single_node/dsv4_fp8_mi355x.sh:57 and :62, two SGLang launch_server flags are written in non-canonical abbreviated form:

--tensor-parallel-size $TP \
--dp $TP \                    # line 57 — should be --dp-size $TP
...
--max-running-request 256 \    # line 62 — should be --max-running-requests 256

SGLang's ServerArgs registers exactly one option for each: --dp-size (aliased as --data-parallel-size) and --max-running-requests (plural). Neither --dp nor --max-running-request is a registered alias. Every other benchmark script in this repo uses the canonical names — multi_node/amd_utils/server.sh uses --dp-size, 25+ single-node scripts use --data-parallel-size, and 30+ scripts use --max-running-requests (plural). Within this very file, line 56 already uses the full --tensor-parallel-size form, so --dp on line 57 is inconsistent even locally.

Why it happens to work today

Python's argparse defaults to allow_abbrev=True. As long as SGLang registers only one option starting with each prefix, argparse resolves --dp → --dp-size and --max-running-request → --max-running-requests as unambiguous prefix matches, and the values are applied. So on the current custom image rocm/sgl-dev:deepseek-v4-mi35x this most likely runs successfully.

Why it is still worth fixing

The other verifier correctly pointed out that the breakage scenarios are hypothetical. That objection is fair as stated — argparse would raise ambiguous option rather than silently pick the wrong flag, and today's behavior is correct. But the script is a new, one-line-each fix and sits on top of a custom fork build from sgl-project/sglang#23608, not a stable release. Two concrete fragilities remain:

  1. Upstream SGLang could add a sibling flag at any time. If a future version introduces any --dp-* option (e.g. --dp-lb-port, --dp-balance-mode) or --max-running-request-* option, argparse will raise ambiguous option: --dp could match --dp-size, --dp-lb-port at server startup, and the benchmark will silently fail to launch. Since this script's image is a PR-branch build that will be rebased onto upstream, this is a real risk window, not purely hypothetical.
  2. A build with allow_abbrev=False would reject both flags. The linked PR #23608 could set this on its parser; if it ever does, the server fails with unrecognized arguments: --dp --max-running-request 256.

Step-by-step proof the canonical forms are required

  1. Grep upstream python/sglang/srt/server_args.py: only --dp-size / --data-parallel-size and --max-running-requests are registered. No --dp alias. No --max-running-request alias.
  2. Grep this repo for --dp (bare): only hit is dsv4_fp8_mi355x.sh:57. All 25+ other scripts use --dp-size or --data-parallel-size.
  3. Grep this repo for --max-running-request (singular): only hit is dsv4_fp8_mi355x.sh:62. All 30+ other scripts use --max-running-requests (plural).
  4. Concrete failure example: if upstream SGLang merges a --dp-lb-port flag next release and this image is rebuilt, running this script produces:
    sglang.launch_server: error: ambiguous option: --dp could match --dp-size, --dp-lb-port
    
    and the benchmark aborts before wait_for_server_ready completes.

Fix

Two trivial renames on lines 57 and 62:

-    --dp $TP \
+    --dp-size $TP \
...
-    --max-running-request 256 \
+    --max-running-requests 256 \

This brings the script in line with every other benchmark in benchmarks/ and removes both fragilities.

--page-size 256 \
  --chunked-prefill-size 4096 \
  --disable-shared-experts-fusion \
  --disable-cuda-graph \
  --tool-call-parser deepseekv4 \
  --reasoning-parser deepseek-v4 \
  --mem-fraction-static 0.88 \
  --max-total-tokens $((CONC * (ISL + OSL) + 200)) \
  --watchdog-timeout 1800 $EVAL_CONTEXT_ARGS > "$SERVER_LOG" 2>&1 &
# NOTE: $EVAL_CONTEXT_ARGS above is intentionally unquoted — it is either
# empty or a flag plus value that must word-split into separate arguments.

# PID of the backgrounded server, used for readiness checks and teardown.
SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Throughput run: 10 prompts per concurrency slot, at most $CONC in flight.
run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1766,3 +1766,12 @@
- "Prefix caching disabled, no speculative decoding"
- "Configs: 1k1k conc 4-1024, 8k1k conc 4-512"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1143

- config-keys:
- dsv4-fp8-mi355x-sglang
description:
- "Day 0 DeepSeek-V4-Pro FP8 MI355X SGLang benchmark"
- "Image: rocm/sgl-dev:deepseek-v4-mi35x (from sgl-project/sglang#23608)"
- "Model: sgl-project/DeepSeek-V4-Pro-FP8"
- "https://github.com/sgl-project/sglang/pull/23608#issuecomment-4311952977"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1134
Loading