Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ glm5-fp8-mi355x-sglang:
- { tp: 8, conc-start: 4, conc-end: 64 }

glm5-fp8-mi355x-sglang-mtp:
image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413
image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415
model: zai-org/GLM-5-FP8
model-prefix: glm5
runner: mi355x
Expand All @@ -375,11 +375,13 @@ glm5-fp8-mi355x-sglang-mtp:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
- { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
- { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
- { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
- { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }

glm5-fp8-mi355x-atom:
image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
Expand Down
12 changes: 6 additions & 6 deletions benchmarks/single_node/glm5_fp8_mi355x_mtp.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env bash
set -x

source "$(dirname "$0")/../benchmark_lib.sh"

Expand All @@ -15,11 +16,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

# GLM-5 requires transformers with glm_moe_dsa model type support.
# However, the Image rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219 doesn't provide this support.
python3 -m pip install -U --no-cache-dir \
"git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2"

hf download "$MODEL"

# ROCm / SGLang performance tuning for MI355X
Expand All @@ -30,6 +26,7 @@ export SGLANG_ENABLE_SPEC_V2=1

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
CONTEXT_LENGTH=$((ISL + OSL + 32))

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
Expand All @@ -45,9 +42,11 @@ python3 -m sglang.launch_server \
--port $PORT \
--tensor-parallel-size $TP \
--trust-remote-code \
--cuda-graph-max-bs $CONC \
--context-length $CONTEXT_LENGTH \
--mem-fraction-static 0.85 \
--tool-call-parser glm47 \
--reasoning-parser glm45 \
--mem-fraction-static 0.85 \
--model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \
--nsa-prefill-backend tilelang \
--nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS \
Expand All @@ -56,6 +55,7 @@ python3 -m sglang.launch_server \
--speculative-num-steps 3 \
--speculative-eagle-topk 1 \
--speculative-num-draft-tokens 4 \
--tokenizer-worker-num $((TP*2)) \
--disable-radix-cache> $SERVER_LOG 2>&1 &

SERVER_PID=$!
Expand Down
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2041,6 +2041,13 @@
- "No upstream GB300 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB300 DSV4 SGLang aggregated). Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157

- config-keys:
- glm5-fp8-mi355x-sglang-mtp
description:
- "Add GLM5 FP8 MTP MI355X SGLang Support"
- "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122

- config-keys:
- dsv4-fp4-gb300-dynamo-vllm
description:
Expand Down
Loading