Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 13 additions & 20 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1832,40 +1832,33 @@ dsr1-fp8-b300-sglang:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }

# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# lists B200 (not B300) as the Blackwell target. This config reuses the
# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300
# until a B300-specific recipe ships. Prefix caching is disabled.
# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
# NOTE: Low-latency fallback (TP=8, EP=1, no DP-attn, no DeepEP) while
# the DeepEP FP8 weight-postprocess path is broken for DeepSeek-V4-Pro
# on B300. Re-introduce balanced/max-throughput rows once fixed upstream.
dsv4-fp4-b300-sglang:
image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
image: lmsysorg/sglang:deepseek-v4-b300
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
precision: fp4
framework: sglang
multinode: false
# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# are selected inside benchmarks/single_node/dsv4_fp4_b300.sh by CONC:
# low-latency (CONC <= 32): TP-only
# balanced (32 < CONC <= 128): + DP-attn
# max-throughput (CONC > 128): + DP-attn
# Split so result filenames (ep=, dpa=) accurately reflect the recipe.
# ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
# while low-latency leaves ep_size at the default of 1.
# TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP)
# while the DeepEP FP8 weight-postprocess path is broken for this
# checkpoint on B300 (RuntimeError: Recipe must be a list/tuple of 3
# integers. raised from sglang.srt.layers.quantization.fp8
# .process_weights_after_loading_block_quant). Full concurrency sweep
# retained; revert to the recipe-per-CONC split on chore/dsv4-sgl-b300
# once sglang can load the checkpoint under --moe-a2a-backend deepep.
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
- { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
- { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }

# DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
Expand Down
86 changes: 23 additions & 63 deletions benchmarks/single_node/dsv4_fp4_b300_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ source "$(dirname "$0")/../benchmark_lib.sh"
check_env_vars \
MODEL \
TP \
DP_ATTENTION \
CONC \
ISL \
OSL \
Expand All @@ -24,13 +23,12 @@ fi

nvidia-smi

# Common SGLANG env vars (apply to every config).
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
export SGLANG_OPT_USE_JIT_NORM=1
export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
export SGLANG_OPT_USE_TOPK_V2=1
export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1

# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its
# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which masks half
# of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to all ranks.
unset CUDA_VISIBLE_DEVICES

# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
Expand All @@ -42,7 +40,7 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}

echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
Expand All @@ -52,57 +50,21 @@ fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was
# tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default.
if [[ "$ISL" == "1024" ]]; then
SWA_FULL_TOKENS_RATIO=0.5
else
SWA_FULL_TOKENS_RATIO=0.1
fi

# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm
# script's pattern). DP-attention runs the empirically-tuned high-concurrency
# recipe (flashinfer_mxfp4 runner + halved prefill chunks + prefill-delayer);
# single-instance uses flashinfer_mxfp4 with the cookbook defaults.
DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

# Default; the DP-attn branch below overrides to 0.94.
MEM_FRACTION_STATIC=0.90

if [ "${DP_ATTENTION}" = "true" ]; then
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
export SGLANG_OPT_USE_FAST_MASK_EP=1
export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-runner-backend flashinfer_mxfp4
--disable-flashinfer-autotune
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 16384
--enable-prefill-delayer
)
MEM_FRACTION_STATIC=0.94
else
PARALLEL_ARGS=(
--moe-runner-backend flashinfer_mxfp4
--chunked-prefill-size 8192
--disable-flashinfer-autotune
)
fi

# Print all SGLANG_* env vars to both the CI step log and server.log so the
# launch config is auditable from the result artifact alone.
{
echo "=== SGLANG_* env vars at launch ==="
env | grep -E '^SGLANG_' | sort
echo "==================================="
} | tee "$SERVER_LOG"
# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the
# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300
# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from
# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant).
# Restore the CONC-based low-latency / balanced / max-throughput dispatch
# on chore/dsv4-sgl-b300 once sglang can load the checkpoint under
# --moe-a2a-backend deepep.
RECIPE=low-latency
RECIPE_FLAGS=(
--moe-runner-backend flashinfer_mxfp4
--chunked-prefill-size 4096
--disable-flashinfer-autotune
--mem-fraction-static 0.82
)
echo "Recipe: $RECIPE (CONC=$CONC)"

set -x
PYTHONNOUSERSITE=1 sglang serve \
Expand All @@ -111,10 +73,8 @@ PYTHONNOUSERSITE=1 sglang serve \
--port $PORT \
--trust-remote-code \
--tp $TP \
--max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
--mem-fraction-static "$MEM_FRACTION_STATIC" \
--swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
"${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
--disable-radix-cache \
"${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
34 changes: 0 additions & 34 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1779,13 +1779,6 @@
- "Prefix caching and speculative decoding disabled for baseline numbers"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
- "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158

- config-keys:
- dsv4-fp8-mi355x-sglang
description:
Comment on lines 1779 to 1784
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 This PR deletes six prior dsv4-fp4-b300-sglang changelog entries from perf-changelog.yaml, but utils/process_changelog.py:get_added_lines explicitly raises ValueError on any non-whitespace deletion in this file. The run-sweep workflow invokes that script on every PR with a sweep label and on every push to main, so the very first deletion (- config-keys:) will abort the workflow before any benchmark runs — defeating the PR's stated test plan. Fix by dropping the wipe and only appending the new entry.

Extended reasoning...

What the bug is

utils/process_changelog.py enforces an append-only invariant on perf-changelog.yaml. The relevant code at utils/process_changelog.py:24-37 (get_added_lines) iterates over the unified diff for the changelog file and explicitly rejects any non-whitespace deletion:

if line.startswith("-") and not line.startswith("---"):
    deleted_content = line[1:]
    if deleted_content.strip():
        raise ValueError(
            f"Deletions are not allowed in {filepath}. "
            f"Only additions to the changelog are permitted. "
            f"Found deleted line: {deleted_content}"
        )

This PR's diff does the opposite: it removes six prior dsv4-fp4-b300-sglang entries (PRs #1143, #1132, #1158, #1173, #1174, #1178) spanning lines 1779-1793 and 1838-1855 in the diff. There are roughly 30 non-whitespace deleted lines (- config-keys:, - dsv4-fp4-b300-sglang, the description bullets, and pr-link: ...), all of which strip to non-empty content.

Why this is a hard CI blocker

The validation is wired into .github/workflows/run-sweep.yml:91-101 (the setup job), which calls process_changelog.py whenever the workflow runs. The workflow runs in two contexts that matter here:

  1. On PR with the sweep-enabled / full-sweep-enabled label — required to actually exercise the sweep this PR is asking for.
  2. On push to main after merge — runs unconditionally unless the commit message contains [skip-sweep].

In both contexts, the deletion check fires before any pydantic ChangelogEntry validation. There is no override flag.

Proof: walking through the diff

Take the first non-whitespace deletion in the diff, around line 1782:

-- config-keys:
-    - dsv4-fp4-b300-sglang
-  description:
-    - "Restore the recipe-per-CONC split ... from #1143; ..."

When get_added_lines processes this:

  • Line -- config-keys: → starts with -, doesn't start with ---; deleted_content = "- config-keys:", .strip() is non-empty → ValueError raised on the very first iteration.

The setup job exits non-zero, no benchmark or sweep job runs, and the PR's test plan ("Sweep run on B300 completes end-to-end") cannot be satisfied. Post-merge, the same failure occurs on the push-to-main run, so the new entry never triggers a sweep — directly defeating the PR's purpose.

How this contradicts existing safeguards

The PR description literally says "Wipes all six prior dsv4-fp4-b300-sglang changelog entries (from #1143, #1132, #1158, #1173, #1174, #1178) and adds a single fresh entry to start clean." This is in direct conflict with the append-only invariant the code enforces. This bug is independent of any pr-link issue — even if the new entry were perfect, the deletions would still trip get_added_lines first.

How to fix

Two options:

  1. Drop the wipe — leave the prior six entries in place and only append the new "revert to #1143 baseline" entry (#1143 is "[NVIDIA] chore: B300 single node DeepSeek v4 SGLang LOW LATENCY ONLY"). The reader can already infer the chronological revert from the PR-link history.
  2. Relax the validator — if intentional rewrites are desired, add an opt-in escape (e.g., a [changelog-rewrite] commit-message marker, or a squash: block in the YAML), but this is a larger policy change and should not be smuggled in here.

Option (1) is the obviously safe path.

Expand Down Expand Up @@ -1856,26 +1849,6 @@
- "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1170

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
- "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "Floor --max-running-requests at 8 in dsv4_fp4_b300_sglang.sh so low-CONC sweeps don't drop below the queue depth needed for stable benchmarking (CONC * 3 / 2 still applies above CONC=5)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1173

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "better performance for dp-attention"
- "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1174

- config-keys:
- dsv4-fp4-b300-sglang-mtp
description:
Expand All @@ -1888,13 +1861,6 @@
- "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "better performance for dp-attention"
- "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1178

- config-keys:
- dsv4-fp4-b300-vllm
description:
Expand Down
Loading