diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh
index 58740432c..88b5f9580 100644
--- a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh
@@ -205,13 +205,15 @@ set -x
 BLOCK_SIZE=${BLOCK_SIZE:-16}
 
 # --enforce-eager is required: ROCm/ATOM#650 (PR1 skeleton) has no CUDAGraph
-# support yet (deferred to a follow-up PR). max-num-seqs uses the ATOM
-# default (512) — matches every other ATOM benchmark script in the repo.
-# The PR1 kv_cache[:1,...] hardcode in deepseek_v4.py means any forward
-# with batch>1 silently corrupts non-slot-0 lanes; this risk activates
-# whenever the scheduler assembles batch>1, regardless of the explicit
-# max-num-seqs value, so pinning it to 4 (the PR's offline repro value)
-# offered no protective benefit. eval (gsm8k) at conc>1 is the canary.
+# support yet (deferred to a follow-up PR). max-num-seqs is sized to the
+# client concurrency with a floor at 4 — the ATOM default (512) makes the
+# KV/GDN-mamba allocator overshoot the GPU budget ("GDN mamba tensor
+# exceeds available KV budget"), and using 1 hangs warmup at 0% GPU. 4
+# is the minimum we've seen complete warmup successfully (also the PR's
+# offline repro value). The PR1 kv_cache[:1,...] hardcode in
+# deepseek_v4.py means any forward with batch>1 silently corrupts
+# non-slot-0 lanes; eval (gsm8k) at conc>1 is the canary.
+MAX_NUM_SEQS=$(( CONC < 4 ? 4 : CONC ))
 python3 -m atom.entrypoints.openai_server \
     --model $MODEL \
     --server-port $PORT \
@@ -219,6 +221,7 @@ python3 -m atom.entrypoints.openai_server \
     --kv_cache_dtype fp8 $CALCULATED_MAX_MODEL_LEN $EP \
     --block-size $BLOCK_SIZE \
     --enforce-eager \
+    --max-num-seqs $MAX_NUM_SEQS \
     --trust-remote-code > $SERVER_LOG 2>&1 &
 SERVER_PID=$!
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 256c08d7b..6dcbe41cb 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1844,3 +1844,14 @@
   - "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)"
   - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1165
+
+- config-keys:
+  - dsv4-fp4-mi355x-atom
+  description:
+  - "Add DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 marker (single-sequence, TP=8, conc=1)"
+  - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base); ROCm/ATOM#650 overlaid at runtime via pip install --no-deps -e . from a pinned PR SHA (cdbff35) inside the benchmark script"
+  - "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to ROCm/triton@e491726 (RI3.5.x), which has matmul_ogs.py and routing.py (PR #650 imports both — upstream triton-lang/triton refactored matmul_ogs into matmul.py and removed routing) plus CDNA4MXScaleLayout and a target_info.py compatible with the image's bundled triton"
+  - "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us"
+  - "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)"
+  - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1170
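
Note (not part of the patch): a minimal sketch of the runtime overlay the changelog entry describes, for anyone reproducing it by hand. The /tmp clone paths and the pull/650/head fetch ref are assumptions, not the benchmark script's actual layout; the pinned SHAs (cdbff35, e491726), the --no-deps editable install, and the triton_kernels path come from the entry itself.

# Sketch only — paths and the PR fetch ref are illustrative.
# 1) Overlay ROCm/ATOM#650 onto the release image's installed atom,
#    pinned to the PR SHA; --no-deps keeps the image's torch/triton pins.
git clone https://github.com/ROCm/ATOM.git /tmp/atom-pr650
cd /tmp/atom-pr650
git fetch origin pull/650/head
git checkout cdbff35
pip install --no-deps -e .

# 2) Provide the triton_kernels package the release image cleans up:
#    fall back to ROCm/triton@e491726 (RI3.5.x), which still ships the
#    matmul_ogs.py and routing.py that PR #650 imports.
git clone https://github.com/ROCm/triton.git /tmp/triton-ri35x
cd /tmp/triton-ri35x
git checkout e491726
pip install --no-deps -e python/triton_kernels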