Skip to content
Merged
24 changes: 24 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,30 @@ minimaxm2.5-fp8-mi355x-atom:
- { tp: 4, conc-start: 4, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 32, conc-end: 256 }

# MiniMax-M2.5 in MXFP4 precision on MI355X using the Atom framework.
# Single-node sweep mirroring the fp8 sibling config above; each search-space
# entry doubles concurrency from conc-start up to conc-end per TP size.
minimaxm2.5-fp4-mi355x-atom:
  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
  model: amd/MiniMax-M2.5-MXFP4
  model-prefix: minimaxm2.5
  runner: mi355x
  precision: fp4
  framework: atom
  multinode: false
  seq-len-configs:
    # 1k input / 1k output
    - isl: 1024
      osl: 1024
      search-space:
        - { tp: 1, conc-start: 4, conc-end: 1024 }
        - { tp: 2, conc-start: 4, conc-end: 1024 }
        - { tp: 4, conc-start: 4, conc-end: 128 }
        - { tp: 8, conc-start: 4, conc-end: 16 }
    # 8k input / 1k output
    - isl: 8192
      osl: 1024
      search-space:
        - { tp: 1, conc-start: 4, conc-end: 1024 }
        - { tp: 2, conc-start: 4, conc-end: 1024 }
        - { tp: 4, conc-start: 4, conc-end: 128 }
        - { tp: 8, conc-start: 4, conc-end: 16 }

minimaxm2.5-fp4-mi355x-vllm:
image: vllm/vllm-openai-rocm:v0.19.1
model: amd/MiniMax-M2.5-MXFP4
Expand Down
80 changes: 80 additions & 0 deletions benchmarks/single_node/minimaxm2.5_fp4_mi355x_atom.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/env bash
# Single-node benchmark driver: launches the Atom OpenAI-compatible server for
# $MODEL, runs the serving benchmark against it, and optionally runs lm-eval.
# All tunables arrive via environment variables (validated below); helper
# functions (check_env_vars, start_gpu_monitor, wait_for_server_ready,
# run_benchmark_serving, run_eval, append_lm_eval_summary, stop_gpu_monitor)
# come from benchmark_lib.sh.

source "$(dirname "$0")/../benchmark_lib.sh"

# Fail fast if any required environment variable is unset (helper from
# benchmark_lib.sh — presumably exits nonzero with a message; confirm there).
check_env_vars \
MODEL \
TP \
CONC \
ISL \
OSL \
RANDOM_RANGE_RATIO \
RESULT_FILENAME \
EP_SIZE \
DP_ATTENTION

# Log placement info when running under Slurm.
if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

SERVER_LOG=/workspace/server.log
# Server port; defaults to 8888 unless the caller overrides PORT.
PORT=${PORT:-8888}

# NOTE(review): pins OpenMP to one thread per process — presumably to avoid
# CPU oversubscription alongside the GPU server; confirm intent.
export OMP_NUM_THREADS=1

# Calculate max-model-len based on ISL and OSL
# For the 1k/1k case the flag is omitted (server default); for larger inputs
# (8k/1k in this sweep) cap context at 10240 (>= 8192 + 1024).
# The surrounding spaces matter: the variable is expanded UNQUOTED below so
# it word-splits into separate CLI arguments.
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
CALCULATED_MAX_MODEL_LEN=""
else
CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 "
fi

# Enable expert parallelism only when more than one expert-parallel rank is
# requested. Like CALCULATED_MAX_MODEL_LEN, $EP is expanded unquoted on purpose.
if [ "$EP_SIZE" -gt 1 ]; then
EP=" --enable-expert-parallel"
else
EP=" "
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x

# Launch the Atom OpenAI-compatible server in the background; all output goes
# to $SERVER_LOG. $CALCULATED_MAX_MODEL_LEN and $EP are intentionally unquoted
# so their embedded flags split into separate arguments.
python3 -m atom.entrypoints.openai_server \
--model $MODEL \
--server-port $PORT \
-tp $TP \
--kv_cache_dtype fp8 $CALCULATED_MAX_MODEL_LEN $EP \
--trust-remote-code \
> $SERVER_LOG 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

export PYTHONDONTWRITEBYTECODE=1
# Serving benchmark: 10 prompts per unit of concurrency, capped at $CONC
# in-flight requests; results written under /workspace/.
run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend vllm \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts "$((CONC * 10))" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/ \
--trust-remote-code

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
# NOTE(review): $SERVER_PID is never killed in this script — presumably the
# harness or container teardown reaps the server; confirm against
# benchmark_lib.sh / the job wrapper.
set +x
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1962,6 +1962,13 @@
- "Drop --pipeline-parallel-size 1; keep --no-enable-prefix-caching and --max-cudagraph-capture-size 2048"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1204

# Changelog entry for the new MiniMax-M2.5 MXFP4 MI355X Atom config.
- config-keys:
    - minimaxm2.5-fp4-mi355x-atom
  description:
    - "Add MiniMax-M2.5 MXFP4 MI355X Atom benchmark (rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post)"
    - "Single-node sweep: TP1–TP8, 1k/1k and 8k/1k ISL/OSL"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1042

- config-keys:
- dsv4-fp4-gb200-dynamo-vllm
description:
Expand Down
Loading