25 changes: 25 additions & 0 deletions .github/configs/amd-master.yaml
@@ -533,6 +533,31 @@ minimaxm2.5-fp8-mi355x-vllm:
        - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 }
        - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 }

minimaxm2.5-fp8-mi355x-vllm-eagle3:
  image: vllm/vllm-openai-rocm:nightly-4eafc729285e459a5fc96efd6f7b313b155cad48
  model: MiniMaxAI/MiniMax-M2.5
  model-prefix: minimaxm2.5
  runner: mi355x
  precision: fp8
  framework: vllm
  multinode: false
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        - { tp: 4, ep: 4, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }
        - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }
    - isl: 1024
      osl: 8192
      search-space:
        - { tp: 4, ep: 4, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }
        - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }
    - isl: 8192
      osl: 1024
      search-space:
        - { tp: 4, ep: 4, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }
        - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: eagle3 }

minimaxm2.5-fp8-mi355x-atom:
  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
  model: MiniMaxAI/MiniMax-M2.5
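A rough sketch of how one point from the new `minimaxm2.5-fp8-mi355x-vllm-eagle3` sweep could map onto the environment variables that the benchmark script added below checks via `check_env_vars`. The variable names come from that script; `MAX_MODEL_LEN`, `RANDOM_RANGE_RATIO`, and the result filename are illustrative placeholders, and the real wiring lives in the workflow matrix logic, which is not part of this diff:

```python
# Hypothetical expansion of one search-space point into the env vars the
# benchmark script validates with check_env_vars. Placeholder values are
# marked; they are not taken from the config above.
point = {"tp": 4, "ep": 4, "conc": 64, "isl": 1024, "osl": 1024, "spec": "eagle3"}

env = {
    "MODEL": "MiniMaxAI/MiniMax-M2.5",
    "TP": str(point["tp"]),
    "EP_SIZE": str(point["ep"]),
    "CONC": str(point["conc"]),
    "ISL": str(point["isl"]),
    "OSL": str(point["osl"]),
    "MAX_MODEL_LEN": "10240",        # placeholder: must cover ISL + OSL
    "RANDOM_RANGE_RATIO": "0.8",     # placeholder
    "RESULT_FILENAME": "minimaxm2.5_eagle3_tp4_conc64.json",  # placeholder
}
```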
2 changes: 1 addition & 1 deletion .github/workflows/README.md
@@ -235,7 +235,7 @@ The corresponding `SingleNodeMatrixEntry` enforces these same fields with approp

2. **`extra='forbid'`**: Unknown fields are rejected, preventing typos or deprecated fields from slipping through.

3. **Strict typing**: Fields like `spec-decoding` use `Literal["mtp", "draft_model", "none"]` to restrict values to known options.
3. **Strict typing**: Fields like `spec-decoding` use `Literal` types to restrict values to known options; see `utils/matrix_logic/validation.py` for the current set.

4. **Concurrency validation**: The system ensures either `conc-list` OR `conc-start`/`conc-end` is provided, but not both.

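For illustration, a minimal self-contained sketch of the pattern points 2 and 3 describe (simplified field names, not the repository's actual `SingleNodeMatrixEntry`):

```python
from typing import Literal

from pydantic import BaseModel, ConfigDict, ValidationError


class Entry(BaseModel):
    # extra='forbid': unknown or misspelled fields are rejected outright.
    model_config = ConfigDict(extra="forbid")
    # Literal restricts spec_decoding to the known options.
    spec_decoding: Literal["mtp", "draft_model", "eagle3", "none"] = "none"


Entry(spec_decoding="eagle3")       # accepted
try:
    Entry(spec_decoding="eagle2")   # rejected by the Literal type
except ValidationError as err:
    print(err)
```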
99 changes: 99 additions & 0 deletions benchmarks/single_node/minimaxm2.5_fp8_mi355x_eagle3.sh
@@ -0,0 +1,99 @@
#!/usr/bin/env bash

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
    MODEL \
    TP \
    EP_SIZE \
    CONC \
    ISL \
    OSL \
    MAX_MODEL_LEN \
    RANDOM_RANGE_RATIO \
    RESULT_FILENAME

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

hf download "$MODEL"

SPEC_DRAFT_MODEL=${SPEC_DRAFT_MODEL:-thoughtworks/MiniMax-M2.5-Eagle3}
SPEC_NUM_TOKENS=${SPEC_NUM_TOKENS:-3}
SPEC_DRAFT_TP=${SPEC_DRAFT_TP:-1}

hf download "$SPEC_DRAFT_MODEL"

# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+
if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi

if [ "$EP_SIZE" -gt 1 ]; then
EP=" --enable-expert-parallel"
else
EP=" "
fi

# Eagle3 speculative config. Double-quoted JSON string with escaped inner quotes
# so the draft model and token count expand before being passed as one argument.
SPEC_CONFIG="{\"model\":\"${SPEC_DRAFT_MODEL}\",\"method\":\"eagle3\",\"num_speculative_tokens\":${SPEC_NUM_TOKENS},\"draft_tensor_parallel_size\":${SPEC_DRAFT_TP}}"

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x
vllm serve $MODEL --port $PORT \
    --tensor-parallel-size=$TP \
    $EP \
    --gpu-memory-utilization 0.95 \
    --max-model-len $MAX_MODEL_LEN \
    --kv-cache-dtype fp8 \
    --block-size=32 \
    --no-enable-prefix-caching \
    --attention-backend "ROCM_AITER_FA" \
    --reasoning-parser minimax_m2 \
    --trust-remote-code \
    --speculative-config "$SPEC_CONFIG" > $SERVER_LOG 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# EAGLE-style draft models are trained on chat-formatted prompts, so the benchmark uses the chat template.
run_benchmark_serving \
    --model "$MODEL" \
    --port "$PORT" \
    --backend vllm \
    --input-len "$ISL" \
    --output-len "$OSL" \
    --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts "$((CONC * 10))" \
    --max-concurrency "$CONC" \
    --result-filename "$RESULT_FILENAME" \
    --result-dir /workspace/ \
    --use-chat-template \
    --trust-remote-code

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
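For reference, with the defaults above the expanded `--speculative-config` argument is a single JSON object; a quick Python check of its shape:

```python
import json

# The string SPEC_CONFIG expands to with the script's default values.
spec_config = (
    '{"model":"thoughtworks/MiniMax-M2.5-Eagle3",'
    '"method":"eagle3",'
    '"num_speculative_tokens":3,'
    '"draft_tensor_parallel_size":1}'
)

parsed = json.loads(spec_config)
assert parsed["method"] == "eagle3"
assert parsed["num_speculative_tokens"] == 3
assert parsed["draft_tensor_parallel_size"] == 1
```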
11 changes: 11 additions & 0 deletions perf-changelog.yaml
@@ -2031,3 +2031,14 @@
- "Topologies mirror the dsv4-fp4-gb300-dynamo-vllm sibling: low-conc 1p1d-dep8-tep8 (4 nodes), mid 1p1d-dep8-dep16 (6 nodes), high 3p1d-dep8-dep16 (10 nodes). 4096 overlap between mid and high gives a topology-crossover A/B"
- "No upstream GB300 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB300 DSV4 SGLang aggregated). Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157

- config-keys:
    - minimaxm2.5-fp8-mi355x-vllm-eagle3
  description:
    - "Add MiniMax-M2.5 FP8 vLLM Eagle3 speculative decoding benchmark for MI355X"
    - "Image: vllm/vllm-openai-rocm:nightly-4eafc729285e459a5fc96efd6f7b313b155cad48"
    - "Model: MiniMaxAI/MiniMax-M2.5"
    - "Draft model: thoughtworks/MiniMax-M2.5-Eagle3"
    - "Recipe uses TP/EP={4/4, 8/8}, num_speculative_tokens=3, draft_tensor_parallel_size=1"
    - "Sweep covers ISL/OSL 1k/1k, 1k/8k, and 8k/1k with concurrency 4-64"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1234
6 changes: 5 additions & 1 deletion runners/launch_mi355x-amds.sh
@@ -179,7 +179,11 @@ else
export PORT_OFFSET=${RUNNER_NAME: -1}
export PORT=$(( 8888 + ${PORT_OFFSET} ))
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "atom" ]] && printf '_atom' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
case "$SPEC_DECODING" in
mtp) SPEC_SUFFIX='_mtp' ;;
eagle3) SPEC_SUFFIX='_eagle3' ;;
*) SPEC_SUFFIX='' ;;
esac

PARTITION="compute"
SQUASH_FILE="/var/lib/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
2 changes: 1 addition & 1 deletion utils/matrix_logic/test_validation.py
@@ -275,7 +275,7 @@ def test_conc_as_list(self, valid_single_node_matrix_entry):

    def test_spec_decoding_values(self, valid_single_node_matrix_entry):
        """Spec decoding should accept valid literal values."""
        for value in ["mtp", "draft_model", "none"]:
        for value in ["mtp", "draft_model", "eagle3", "none"]:
            valid_single_node_matrix_entry["spec-decoding"] = value
            entry = SingleNodeMatrixEntry(**valid_single_node_matrix_entry)
            assert entry.spec_decoding == value
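
    # Illustrative companion check (not part of this diff): values outside the
    # Literal set, such as "eagle2", should be rejected. Assumes pytest and
    # pydantic's ValidationError are already imported in this module.
    def test_spec_decoding_rejects_unknown_value(self, valid_single_node_matrix_entry):
        """Spec decoding should reject values outside the allowed literals."""
        valid_single_node_matrix_entry["spec-decoding"] = "eagle2"
        with pytest.raises(ValidationError):
            SingleNodeMatrixEntry(**valid_single_node_matrix_entry)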
8 changes: 4 additions & 4 deletions utils/matrix_logic/validation.py
@@ -77,7 +77,7 @@ class SingleNodeMatrixEntry(BaseModel):
    model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value)
    precision: str
    framework: str
    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
    spec_decoding: Literal["mtp", "draft_model", "eagle3", "none"] = Field(
        alias=Fields.SPEC_DECODING.value
    )
    runner: str
@@ -116,7 +116,7 @@ class MultiNodeMatrixEntry(BaseModel):
    model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value)
    precision: str
    framework: str
    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
    spec_decoding: Literal["mtp", "draft_model", "eagle3", "none"] = Field(
        alias=Fields.SPEC_DECODING.value
    )
    runner: str
@@ -204,7 +204,7 @@ class SingleNodeSearchSpaceEntry(BaseModel):

    tp: int
    ep: Optional[int] = None
    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
    spec_decoding: Literal["mtp", "draft_model", "eagle3", "none"] = Field(
        default="none", alias=Fields.SPEC_DECODING.value)
    dp_attn: Optional[bool] = Field(
        default=None, alias=Fields.DP_ATTN.value)
@@ -224,7 +224,7 @@ class MultiNodeSearchSpaceEntry(BaseModel):
"""Multinode search space configuration."""
model_config = ConfigDict(extra='forbid', populate_by_name=True)

spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
spec_decoding: Literal["mtp", "draft_model", "eagle3", "none"] = Field(
default="none", alias=Fields.SPEC_DECODING.value)
prefill: WorkerConfig
decode: WorkerConfig