Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1536,6 +1536,27 @@ dsv4-fp8-mi355x-sglang:
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }

# FP4-experts variant of dsv4-fp8-mi355x-sglang. Same image and sglang overlay,
# but uses the canonical DeepSeek-V4-Pro checkpoint and enables the FP4 expert
# loader path in benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh.
dsv4-fp4-mi355x-sglang:
# Image tag must stay in sync with the legacy-squash purge in
# runners/launch_mi355x-amds.sh, which matches on this exact string.
image: rocm/sgl-dev:rocm720-deepseek-v4-mi35x
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: mi355x
precision: fp4
framework: sglang
multinode: false
# Two ISL/OSL shapes; both sweep concurrency 4..64 at TP=8 (full node).
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }

# vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889,
# stacked on #40871). Uses the ATOM MI355X image (ROCm 7.2.2, aiter with
# MLA decode, MI355X GPU detection); vLLM is rebuilt from the PR branch
Expand Down
150 changes: 150 additions & 0 deletions benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#!/usr/bin/env bash
# DSv4 FP4-experts benchmark on MI355X with SGLang.
# Driven by the dsv4-fp4-mi355x-sglang entry in .github/configs/amd-master.yaml;
# required env vars are validated by check_env_vars below.

# Abort on command failure and mid-pipeline failure. `-u` is deliberately
# omitted: optional vars (EVAL_ONLY, RUN_EVAL, PORT, SLURM_*) may be unset.
set -eo pipefail

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME

# SLURM_JOB_ID is only present under a Slurm allocation; default both vars to
# empty so the script also runs outside Slurm.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

# Pre-fetch the model weights so server startup does not pay download latency.
hf download "$MODEL"

# Overlay sglang from the amd/deepseek_v4 branch on top of whatever the
# rocm/sgl-dev:rocm720-deepseek-v4-mi35x image ships with. The image's sglang
# is moving fast and we want a reproducible pin per benchmark run. Bump
# SGL_PR_SHA when the branch advances.
SGL_PR_SHA="18afbf151a2992b06a089191769b299629ed73dd"
SGL_PR_DIR="/tmp/sglang-amd-dsv4"

if [ ! -d "$SGL_PR_DIR/.git" ]; then
    git clone --filter=blob:none https://github.com/sgl-project/sglang.git "$SGL_PR_DIR"
fi
(
    cd "$SGL_PR_DIR" || exit 1
    # Fetching a bare SHA needs uploadpack.allowReachableSHA1InWant on the
    # server; fall back to fetching the branch tip, which must contain the SHA.
    git fetch --depth=1 origin "$SGL_PR_SHA" 2>/dev/null \
        || git fetch --depth=1 origin amd/deepseek_v4
    git checkout --force "$SGL_PR_SHA" || exit 1
    # Hard-fail if HEAD is not the pinned commit. The previous bare `test`
    # was a no-op whenever `set -e` was not active in this subshell.
    if [ "$(git rev-parse HEAD)" != "$SGL_PR_SHA" ]; then
        echo "ERROR: sglang checkout is not pinned to $SGL_PR_SHA" >&2
        exit 1
    fi

    # Reinstall just the Python package; the image already has the ROCm
    # kernel deps (aiter, triton, tilelang, torch) at versions matched to
    # this branch, so --no-deps avoids pip resolving them against PyPI.
    pip install --no-build-isolation --no-deps --force-reinstall -e python/
) || { echo "ERROR: sglang overlay install failed" >&2; exit 1; }

# Sanity print: confirm which sglang is importable after the overlay.
python3 -c "import sglang; print(f'sglang {sglang.__version__} from {sglang.__path__[0]}')"

# Transformers in the container doesn't recognize the `deepseek_v4` model_type.
# PR #23608's fallback in hf_transformers_utils.get_config tries to handle this
# by writing a patched config to /tmp, but in practice isn't catching the error
# in this image. Patch the cached config.json directly instead: set model_type
# to `deepseek_v3` so AutoConfig.from_pretrained succeeds, and keep
# architectures=['DeepseekV4ForCausalLM'] so SGLang dispatches to its native
# DSv4 model class (python/sglang/srt/models/deepseek_v4.py).
#
# MODEL is passed via the environment and the heredoc delimiter is quoted so
# the repo id is never interpolated into Python source (robust to any
# shell-special characters in the value).
MODEL="$MODEL" python3 << 'PYEOF'
import json
import os

from huggingface_hub import hf_hub_download

path = hf_hub_download(repo_id=os.environ["MODEL"], filename="config.json")
with open(path) as f:
    config = json.load(f)
if config.get("model_type") == "deepseek_v4":
    config["model_type"] = "deepseek_v3"
    with open(path, "w") as f:
        json.dump(config, f, indent=2)
    print(f"Patched {path}: model_type deepseek_v4 -> deepseek_v3")
else:
    print(f"No patch needed: model_type is {config.get('model_type')!r}")
PYEOF

# DSv4 FP4-experts path. Mirrors the active path of python/run_dsv4.sh on
# the amd/deepseek_v4 branch at SGL_PR_SHA:
#   SGLANG_DSV4_FP4_EXPERTS=True  -> route experts through the FP4 kernels
#   SGLANG_FORCE_TRITON_MOE_FP8=0 -> dispatch MoE through aiter (gating
#                                    switch added in commit 33de1e64);
#                                    also enables swiglu_limit clamp in the
#                                    triton MoE fallback path.
# NOTE(review): the semantics of the individual SGLANG_OPT_* knobs below are
# defined by the pinned sglang branch and are not verified here; the set,
# values, and mixed true/false vs 1/0 spellings intentionally mirror upstream
# run_dsv4.sh exactly. Keep them in sync when bumping SGL_PR_SHA.
export SGLANG_REASONING_EFFORT=max
export SGLANG_OPT_USE_FUSED_COMPRESS=false
export SGLANG_OPT_USE_OLD_COMPRESSOR=true
export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false
export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false
export SGLANG_OPT_USE_FUSED_HASH_TOPK=false
export SGLANG_HACK_FLASHMLA_BACKEND=torch
export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false
export SGLANG_OPT_USE_TILELANG_MHC_PRE=false
export SGLANG_OPT_USE_TILELANG_MHC_POST=false
export SGLANG_ENABLE_THINKING=1
# ROCm/aiter dispatch switches for MI355X.
export SGLANG_USE_AITER=1
export SGLANG_USE_ROCM700A=1
export SGLANG_TOPK_TRANSFORM_512_TORCH=1
export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1
# The two flags this benchmark variant exists to exercise (see header above).
export SGLANG_DSV4_FP4_EXPERTS=True
export SGLANG_OPT_DPSK_V4_RADIX=0
export SGLANG_OPT_USE_OVERLAP_STORE_CACHE=false
export SGLANG_OPT_USE_FUSED_STORE_CACHE=false
export SGLANG_FORCE_TRITON_MOE_FP8=0

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

# Extra server args, built as an array so that an empty value expands to
# nothing and a flag+value pair stays two separate words (the previous
# string form relied on unquoted word-splitting).
EVAL_CONTEXT_ARGS=()
# Default EVAL_ONLY to empty: it is optional and may be unset.
if [ "${EVAL_ONLY:-}" = "true" ]; then
    setup_eval_context
    EVAL_CONTEXT_ARGS=(--context-length "$EVAL_MAX_MODEL_LEN")
fi
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

# Launch the SGLang server in the background; stdout/stderr go to SERVER_LOG
# for wait_for_server_ready and post-mortem debugging.
python3 -m sglang.launch_server \
    --model-path "$MODEL" \
    --host=0.0.0.0 \
    --port "$PORT" \
    --tensor-parallel-size "$TP" \
    --dp "$TP" \
    --enable-dp-attention \
    --trust-remote-code \
    --disable-radix-cache \
    --attention-backend compressed \
    --max-running-request 256 \
    --page-size 256 \
    --chunked-prefill-size 8192 \
    --disable-shared-experts-fusion \
    --disable-cuda-graph \
    --tool-call-parser deepseekv4 \
    --reasoning-parser deepseek-v4 \
    --watchdog-timeout 1800 "${EVAL_CONTEXT_ARGS[@]}" > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Block until the server answers (or dies — the helper also watches the PID).
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Throughput sweep at the requested concurrency; 10 prompts per concurrent
# slot keeps steady-state long enough for stable numbers.
# NOTE(review): --backend vllm selects the harness's vLLM-compatible client
# against the SGLang server — confirm this matches run_benchmark_serving's
# expectations for SGLang endpoints.
run_benchmark_serving \
    --model "$MODEL" \
    --port "$PORT" \
    --backend vllm \
    --input-len "$ISL" \
    --output-len "$OSL" \
    --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts "$((CONC * 10))" \
    --max-concurrency "$CONC" \
    --result-filename "$RESULT_FILENAME" \
    --result-dir /workspace/

# After throughput, run evaluation only if RUN_EVAL is true. Default to
# empty so the comparison is safe when the var is unset.
if [ "${RUN_EVAL:-}" = "true" ]; then
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
# Disable command tracing (benchmark_lib.sh enables `set -x` upstream;
# no matching `set -x` exists in this file).
set +x
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2056,3 +2056,12 @@
- "Add --gpu-memory-utilization 0.9 to server launch"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1126

- config-keys:
- dsv4-fp4-mi355x-sglang
description:
- "Add MI355X DSv4 FP4 SGLang benchmark using rocm/sgl-dev:rocm720-deepseek-v4-mi35x"
- "Use deepseek-ai/DeepSeek-V4-Pro with SGLANG_DSV4_FP4_EXPERTS=True"
- "Reinstall SGLang from amd/deepseek_v4 SHA 18afbf15 at runtime"
- "Set SGLANG_FORCE_TRITON_MOE_FP8=0 and SGLANG_REASONING_EFFORT=max to match the FP4 Models update"
- "Delete legacy /var/lib/squash/rocm_sgl-dev_deepseek-v4-mi35x.sqsh and refresh the new rocm720 squash import on first launch"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1244
8 changes: 8 additions & 0 deletions runners/launch_mi355x-amds.sh
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,10 @@ else
PARTITION="compute"
SQUASH_FILE="/var/lib/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
LOCK_FILE="${SQUASH_FILE}.lock"
LEGACY_SQUASH_FILE=""
if [[ "$IMAGE" == "rocm/sgl-dev:rocm720-deepseek-v4-mi35x" ]]; then
LEGACY_SQUASH_FILE="/var/lib/squash/rocm_sgl-dev_deepseek-v4-mi35x.sqsh"
fi

set -x
salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=500 --no-shell --job-name="$RUNNER_NAME"
Expand All @@ -195,6 +199,10 @@ else
srun --jobid=$JOB_ID bash -c "
exec 9>\"$LOCK_FILE\"
flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
if [[ -n \"$LEGACY_SQUASH_FILE\" && -e \"$LEGACY_SQUASH_FILE\" ]]; then
echo 'Removing legacy squash file and refreshing import: $LEGACY_SQUASH_FILE'
rm -f \"$LEGACY_SQUASH_FILE\" \"$SQUASH_FILE\"
fi
if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
echo 'Squash file already exists and is valid, skipping import'
else
Expand Down
Loading