Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,48 @@ qwen3.5-fp8-mi355x-sglang-mtp:
- { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp }

# Qwen3.5-397B-A17B FP8 benchmark on a single MI355X node using the ATOM
# framework (no speculative decoding — see the -mtp variant below for that).
# Each seq-len-config pairs an input length (isl) with an output length (osl);
# each search-space row is one parallelism layout (tp = tensor parallel,
# ep = expert parallel) swept over a concurrency range conc-start..conc-end.
# NOTE(review): exact sweep step between conc-start and conc-end is decided
# by the pipeline runner, not this file — confirm there.
qwen3.5-fp8-mi355x-atom:
image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
model: Qwen/Qwen3.5-397B-A17B-FP8
model-prefix: qwen3.5
runner: mi355x
precision: fp8
framework: atom
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 2, ep: 1, conc-start: 4, conc-end: 256 }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
- { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
- isl: 8192
osl: 1024
search-space:
- { tp: 2, ep: 1, conc-start: 4, conc-end: 256 }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
- { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }

# Same Qwen3.5 FP8 / MI355X / ATOM benchmark as above, but with MTP
# (multi-token prediction) speculative decoding enabled per search-space row
# via spec-decoding: mtp. Only tp 4 and tp 8 layouts are swept here.
qwen3.5-fp8-mi355x-atom-mtp:
image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
model: Qwen/Qwen3.5-397B-A17B-FP8
model-prefix: qwen3.5
runner: mi355x
precision: fp8
framework: atom
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
- { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
- { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }

qwen3.5-fp4-mi355x-sglang:
image: rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413
model: amd/Qwen3.5-397B-A17B-MXFP4
Expand Down
82 changes: 82 additions & 0 deletions benchmarks/single_node/qwen3.5_fp8_mi355x_atom.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env bash
#
# Benchmark Qwen3.5 FP8 with the ATOM framework on a single node:
# launch an OpenAI-compatible ATOM server, run the serving benchmark
# against it, and optionally run lm-eval afterwards.
#
# Required env vars (validated by check_env_vars): MODEL, TP, CONC, ISL,
# OSL, RANDOM_RANGE_RATIO, RESULT_FILENAME, EP_SIZE, DP_ATTENTION.
# Optional: PORT (default 8888), RUN_EVAL ("true" enables lm-eval).

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  EP_SIZE \
  DP_ATTENTION

# Identify the node when running under SLURM (default keeps this safe if
# the var is unset).
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

export OMP_NUM_THREADS=1

# Collect optional server flags in an array so empty option sets vanish
# cleanly and no unquoted word-splitting is needed (ShellCheck SC2086).
EXTRA_SERVER_ARGS=()

# For the 1024/1024 case rely on the model's default context length;
# otherwise cap it at 10240 tokens (covers ISL 8192 + OSL 1024).
if [[ "$ISL" != "1024" || "$OSL" != "1024" ]]; then
  EXTRA_SERVER_ARGS+=(--max-model-len 10240)
fi

# Expert parallelism only applies when more than one expert-parallel rank
# is requested.
if [ "$EP_SIZE" -gt 1 ]; then
  EXTRA_SERVER_ARGS+=(--enable-expert-parallel)
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

# Fraction of GPU memory the server may reserve for weights + KV cache.
MEM_FRAC_STATIC=0.9

set -x

python3 -m atom.entrypoints.openai_server \
  --model "$MODEL" \
  --server-port "$PORT" \
  -tp "$TP" \
  --kv_cache_dtype fp8 \
  "${EXTRA_SERVER_ARGS[@]}" \
  --gpu-memory-utilization "$MEM_FRAC_STATIC" \
  --trust-remote-code \
  > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Block until the server answers on $PORT (or its process dies).
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

export PYTHONDONTWRITEBYTECODE=1
# 10 prompts per concurrency slot keeps total work proportional to CONC.
run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/ \
  --trust-remote-code

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
# NOTE(review): the server ($SERVER_PID) is not killed here; this assumes
# container/job teardown stops it — confirm for local runs.
84 changes: 84 additions & 0 deletions benchmarks/single_node/qwen3.5_fp8_mi355x_atom_mtp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env bash
#
# Benchmark Qwen3.5 FP8 with the ATOM framework on a single node, with MTP
# (multi-token prediction) speculative decoding enabled on the server.
#
# Required env vars (validated by check_env_vars): MODEL, TP, CONC, ISL,
# OSL, RANDOM_RANGE_RATIO, RESULT_FILENAME, EP_SIZE, DP_ATTENTION.
# Optional: PORT (default 8888), RUN_EVAL ("true" enables lm-eval).

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  EP_SIZE \
  DP_ATTENTION

# Identify the node when running under SLURM (default keeps this safe if
# the var is unset).
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

export OMP_NUM_THREADS=1

# Collect optional server flags in an array so empty option sets vanish
# cleanly and no unquoted word-splitting is needed (ShellCheck SC2086).
EXTRA_SERVER_ARGS=()

# For the 1024/1024 case rely on the model's default context length;
# otherwise cap it at 10240 tokens (covers ISL 8192 + OSL 1024).
if [[ "$ISL" != "1024" || "$OSL" != "1024" ]]; then
  EXTRA_SERVER_ARGS+=(--max-model-len 10240)
fi

# Expert parallelism only applies when more than one expert-parallel rank
# is requested.
if [ "$EP_SIZE" -gt 1 ]; then
  EXTRA_SERVER_ARGS+=(--enable-expert-parallel)
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

# Fraction of GPU memory the server may reserve for weights + KV cache.
MEM_FRAC_STATIC=0.9

set -x

# MTP speculative decoding: propose 3 speculative tokens per step.
python3 -m atom.entrypoints.openai_server \
  --model "$MODEL" \
  --server-port "$PORT" \
  -tp "$TP" \
  --kv_cache_dtype fp8 \
  "${EXTRA_SERVER_ARGS[@]}" \
  --gpu-memory-utilization "$MEM_FRAC_STATIC" \
  --method mtp \
  --num-speculative-tokens 3 \
  --trust-remote-code \
  > "$SERVER_LOG" 2>&1 &

Comment on lines +45 to +56
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 The bf16 atom script (qwen3.5_bf16_mi355x_atom.sh) is a byte-for-byte copy of the fp8 script and incorrectly applies --kv_cache_dtype fp8 (line 49) to a native BF16 model, producing non-representative BF16 benchmark results that are actually fp8-KV-quantized runs. Additionally, the bf16 atom script has no corresponding entry in .github/configs/amd-master.yaml (only fp8 and fp4 atom YAML entries were added), so the benchmark pipeline cannot invoke it at all — fix both issues before merging.

Extended reasoning...

Issue 1: --kv_cache_dtype fp8 in the bf16 atom script

The two scripts qwen3.5_bf16_mi355x_atom.sh and qwen3.5_fp8_mi355x_atom.sh are byte-for-byte identical (verified by diff in the PR). Both pass --kv_cache_dtype fp8 on line 49 of the server launch command. For an fp8-weight model (Qwen/Qwen3.5-397B-A17B-FP8), applying fp8 KV cache is appropriate and expected. For a native BF16 model (Qwen/Qwen3.5-397B-A17B), however, this forces KV cache compression that would not normally be applied in a "plain BF16" scenario, altering both memory utilization and potentially output quality.

Addressing the refutation (intentional pattern)

One verifier argued this is intentional because dsr1_fp8_mi355x_atom.sh and dsr1_fp4_mi355x_atom.sh are identical and both use --kv_cache_dtype fp8, with precision differentiation done via the YAML model field. This is partially correct for the DSR1 case — but crucially, there is a qwen3.5-bf16-mi355x-sglang config using model Qwen/Qwen3.5-397B-A17B, and the corresponding sglang script (qwen3.5_bf16_mi355x.sh) does NOT use --kv_cache_dtype fp8. Every other BF16 benchmark script in this codebase for this model follows the same pattern of omitting KV quantization. If the intent for atom-framework BF16 benchmarking is to also compress the KV cache, that should be an explicit and documented decision — not an accidental copy-paste from the fp8 variant.

Issue 2: Missing YAML config entry for qwen3.5-bf16-mi355x-atom

The PR adds qwen3.5-fp8-mi355x-atom and qwen3.5-fp4-mi355x-atom entries to .github/configs/amd-master.yaml, but no qwen3.5-bf16-mi355x-atom entry. The benchmark pipeline discovers which benchmarks to run from this YAML; without an entry, qwen3.5_bf16_mi355x_atom.sh is an orphaned script that can never be triggered. This is confirmed by grep returning no results for 'qwen3.5-bf16-mi355x-atom' anywhere in the YAML. The PR title explicitly says 'fp8/bf16 on mi355x', so bf16 atom was clearly intended to be integrated.

Concrete proof of the dual problem

Step 1: The YAML entries qwen3.5-fp8-mi355x-atom and qwen3.5-fp4-mi355x-atom both reference their respective model checkpoints (FP8 and MXFP4 variants) and have corresponding benchmark scripts. Step 2: A matching qwen3.5-bf16-mi355x-atom YAML entry with model: Qwen/Qwen3.5-397B-A17B is absent. Step 3: Even if a bf16 atom YAML entry were added now and pointed the pipeline at qwen3.5_bf16_mi355x_atom.sh, the script would still launch the server with --kv_cache_dtype fp8, meaning the resulting numbers would reflect BF16 model weights + FP8 KV cache, not a clean BF16 baseline. Step 4: Compare to qwen3.5-bf16-mi355x-sglang: that config uses Qwen/Qwen3.5-397B-A17B with no --kv_cache_dtype flag — the expected BF16 baseline behavior.

How to fix

  1. In qwen3.5_bf16_mi355x_atom.sh, remove the --kv_cache_dtype fp8 flag (or make it conditional) to match the behavior of other BF16 benchmark scripts. 2. Add a qwen3.5-bf16-mi355x-atom entry to .github/configs/amd-master.yaml with model: Qwen/Qwen3.5-397B-A17B, precision: bf16, framework: atom, and appropriate search-space parameters (similar to the fp8/fp4 atom entries but with the correct BF16 model checkpoint).

# PID of the backgrounded ATOM server launched just above.
SERVER_PID=$!

# Wait for server to be ready
# (blocks until $PORT answers, or bails out if the server process dies).
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

export PYTHONDONTWRITEBYTECODE=1
# Throughput run: 10 prompts per concurrency slot keeps total work
# proportional to CONC; results land in /workspace/$RESULT_FILENAME.
run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend vllm \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts "$((CONC * 10))" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/ \
--trust-remote-code

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
# NOTE(review): $SERVER_PID is never killed here; presumably container/job
# teardown stops the server — confirm for local runs.
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1719,6 +1719,13 @@
evals-only: true

- config-keys:
- qwen3.5-fp8-mi355x-atom
- qwen3.5-fp8-mi355x-atom-mtp
description:
- "Add Qwen3.5-397B-A17B FP8 MI355X ATOM benchmark configs with and without MTP"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1040

- config-keys:
- glm5.1-fp4-mi355x-atom
description:
- "Add GLM-5.1 MXFP4 single-node MI355X ATOM benchmark"
Expand Down
Loading