From 609e7a91b01f427c3be562af52e3f9942b7a93db Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 23 Apr 2026 17:03:46 +0530 Subject: [PATCH 01/10] AMD GLM5.1 FP8 MTP Support on MI355X Signed-off-by: ajith-sirra-amd --- .github/configs/amd-master.yaml | 20 +++++ .../single_node/glm5.1_fp8_mi355x_mtp.sh | 88 +++++++++++++++++++ perf-changelog.yaml | 7 ++ 3 files changed, 115 insertions(+) create mode 100644 benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9e1f9834e..554819b68 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -357,6 +357,26 @@ glm5.1-fp4-mi355x-sglang: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 16 } +glm5.1-fp8-mi355x-sglang-mtp: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 + model: zai-org/GLM-5-FP8 + model-prefix: glm5.1 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + kimik2.5-int4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 model: moonshotai/Kimi-K2.5 diff --git a/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh b/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh new file mode 100644 index 000000000..17e289114 --- /dev/null +++ b/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +set -x + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +hf download "$MODEL" + +# ROCm / SGLang performance tuning for MI355X +export SGLANG_ROCM_FUSED_DECODE_MLA=0 +export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export SAFETENSORS_FAST_GPU=1 +export SGLANG_ENABLE_SPEC_V2=1 + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} +CONTEXT_LENGTH=$((ISL + OSL + 32)) + +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +pip install -U transformers + +python3 -m sglang.launch_server \ + --model-path $MODEL \ + --host=0.0.0.0 \ + --port $PORT \ + --tensor-parallel-size $TP \ + --trust-remote-code \ + --cuda-graph-max-bs $CONC \ + --context-length $CONTEXT_LENGTH \ + --mem-fraction-static 0.85 \ + --tool-call-parser glm47 \ + --reasoning-parser glm45 \ + --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ + --nsa-prefill-backend tilelang \ + --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS \ + --kv-cache-dtype fp8_e4m3 \ + --speculative-algorithm EAGLE \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 1 \ + --speculative-num-draft-tokens 4 \ + --tokenizer-worker-num $((TP*2)) \ + --disable-radix-cache> $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+ +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e27a2511a..575886049 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,10 @@ +- config-keys: + - glm5.1-fp8-mi355x-sglang-mtp + description: + - "Add GLM5.1 FP8 MTP MI355X SGLang Support" + - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" + pr-link: TO BE UPDATE + - config-keys: - dsr1-fp8-h100-dynamo-trt - dsr1-fp8-h100-dynamo-sglang From b9e979a1eb7eca3b449fce3843ca493bfcf522da Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 23 Apr 2026 17:38:34 +0530 Subject: [PATCH 02/10] AMD GLM5.1 FP8 MTP Support on MI355X Signed-off-by: ajith-sirra-amd --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 575886049..18904c51f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3,7 +3,7 @@ description: - "Add GLM5.1 FP8 MTP MI355X SGLang Support" - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - pr-link: TO BE UPDATE + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122 - config-keys: - dsr1-fp8-h100-dynamo-trt From 5a9c06202e3c63a82819a0cf5ad81a9165d015d1 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 23 Apr 2026 17:50:12 +0530 Subject: [PATCH 03/10] AMD GLM5.1 FP8 MTP Support on MI355X Signed-off-by: ajith-sirra-amd --- benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh b/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh index 17e289114..504ba0184 100644 --- a/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh @@ -36,8 +36,6 @@ fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor -pip install -U transformers - python3 -m sglang.launch_server \ --model-path $MODEL \ --host=0.0.0.0 \ From 89764f7f17e213075ec3cae373c392f4ddf679c3 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Fri, 24 Apr 2026 13:25:42 +0530 Subject: [PATCH 04/10] AMD GLM5.1 FP8 MTP Support on MI355X - Merging with GLM5 Signed-off-by: ajith-sirra-amd --- .github/configs/amd-master.yaml | 69 ++++++++++++--- .../single_node/glm5.1_fp8_mi355x_mtp.sh | 86 ------------------- benchmarks/single_node/glm5_fp8_mi355x_mtp.sh | 15 ++-- perf-changelog.yaml | 2 +- 4 files changed, 63 insertions(+), 109 deletions(-) delete mode 100644 benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 554819b68..78b412281 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -245,6 +245,48 @@ qwen3.5-fp8-mi355x-sglang-mtp: - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp } - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp } 
+qwen3.5-fp8-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + +qwen3.5-fp8-mi355x-atom-mtp: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + qwen3.5-fp4-mi355x-sglang: image: rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413 model: amd/Qwen3.5-397B-A17B-MXFP4 @@ -302,8 +344,8 @@ glm5-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } glm5-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413 - model: zai-org/GLM-5-FP8 + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 + model: zai-org/GLM-5.1-FP8 model-prefix: glm5 runner: mi355x precision: fp8 @@ -313,11 +355,13 @@ glm5-fp8-mi355x-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } glm5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post @@ -357,25 +401,23 @@ glm5.1-fp4-mi355x-sglang: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 16 } -glm5.1-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 - model: zai-org/GLM-5-FP8 +glm5.1-fp4-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: amd/GLM-5.1-MXFP4 model-prefix: glm5.1 runner: mi355x - precision: fp8 - framework: sglang + precision: fp4 + framework: atom multinode: false seq-len-configs: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 256 } kimik2.5-int4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 @@ -1431,4 +1473,3 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: additional-settings: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - diff --git a/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh 
b/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh deleted file mode 100644 index 504ba0184..000000000 --- a/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env bash -set -x - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -hf download "$MODEL" - -# ROCm / SGLang performance tuning for MI355X -export SGLANG_ROCM_FUSED_DECODE_MLA=0 -export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 -export SAFETENSORS_FAST_GPU=1 -export SGLANG_ENABLE_SPEC_V2=1 - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} -CONTEXT_LENGTH=$((ISL + OSL + 32)) - -EVAL_CONTEXT_ARGS="" -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -fi -# Start GPU monitoring (power, temperature, clocks every second) -start_gpu_monitor - -python3 -m sglang.launch_server \ - --model-path $MODEL \ - --host=0.0.0.0 \ - --port $PORT \ - --tensor-parallel-size $TP \ - --trust-remote-code \ - --cuda-graph-max-bs $CONC \ - --context-length $CONTEXT_LENGTH \ - --mem-fraction-static 0.85 \ - --tool-call-parser glm47 \ - --reasoning-parser glm45 \ - --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ - --nsa-prefill-backend tilelang \ - --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS \ - --kv-cache-dtype fp8_e4m3 \ - --speculative-algorithm EAGLE \ - --speculative-num-steps 3 \ - --speculative-eagle-topk 1 \ - --speculative-num-draft-tokens 4 \ - --tokenizer-worker-num $((TP*2)) \ - --disable-radix-cache> $SERVER_LOG 2>&1 & - -SERVER_PID=$! - -# Wait for server to be ready -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -run_benchmark_serving \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ - -# After throughput, run evaluation only if RUN_EVAL is true -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -# Stop GPU monitoring -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh b/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh index f4b899011..504ba0184 100755 --- a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +set -x source "$(dirname "$0")/../benchmark_lib.sh" @@ -15,11 +16,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -# GLM-5 requires transformers with glm_moe_dsa model type support. -# However, the Image rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219 doesn't provide this support. 
-python3 -m pip install -U --no-cache-dir \ - "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" - hf download "$MODEL" # ROCm / SGLang performance tuning for MI355X @@ -30,6 +26,7 @@ export SGLANG_ENABLE_SPEC_V2=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +CONTEXT_LENGTH=$((ISL + OSL + 32)) EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -45,9 +42,11 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ + --cuda-graph-max-bs $CONC \ + --context-length $CONTEXT_LENGTH \ + --mem-fraction-static 0.85 \ --tool-call-parser glm47 \ --reasoning-parser glm45 \ - --mem-fraction-static 0.85 \ --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ --nsa-prefill-backend tilelang \ --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS \ @@ -56,6 +55,7 @@ python3 -m sglang.launch_server \ --speculative-num-steps 3 \ --speculative-eagle-topk 1 \ --speculative-num-draft-tokens 4 \ + --tokenizer-worker-num $((TP*2)) \ --disable-radix-cache> $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -73,8 +73,7 @@ run_benchmark_serving \ --num-prompts "$((CONC * 10))" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --use-chat-template + --result-dir /workspace/ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 18904c51f..78601b1fd 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,5 +1,5 @@ - config-keys: - - glm5.1-fp8-mi355x-sglang-mtp + - glm5-fp8-mi355x-sglang-mtp description: - "Add GLM5.1 FP8 MTP MI355X SGLang Support" - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" From 80a61dc5d94006420de3ce5d9755273eec4ca131 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Fri, 24 Apr 2026 13:36:01 +0530 Subject: [PATCH 05/10] AMD GLM5.1 FP8 MTP Support on MI355X - Merging with GLM5 Signed-off-by: ajith-sirra-amd --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 78b412281..9593a3147 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -355,12 +355,12 @@ glm5-fp8-mi355x-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } glm5-fp8-mi355x-atom: From 52172fdb8690daf3f4dec5e799db3e0857c536a2 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Fri, 24 Apr 2026 15:03:46 +0530 Subject: [PATCH 06/10] AMD GLM5.1 FP8 MTP Support on MI355X - Merging with GLM5 Signed-off-by: ajith-sirra-amd --- .github/configs/amd-master.yaml | 2 +- perf-changelog.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9593a3147..4c8e8d715 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -345,7 +345,7 @@ glm5-fp8-mi355x-sglang: glm5-fp8-mi355x-sglang-mtp: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 - model: zai-org/GLM-5.1-FP8 + 
model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi355x precision: fp8 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e5a17957f..1bea1067b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,7 +1,7 @@ - config-keys: - glm5-fp8-mi355x-sglang-mtp description: - - "Add GLM5.1 FP8 MTP MI355X SGLang Support" + - "Add GLM5 FP8 MTP MI355X SGLang Support" - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122 From 05ffd31a2bfb4abd04b77dfccd4b09660f9c51ad Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 30 Apr 2026 12:40:44 +0530 Subject: [PATCH 07/10] AMD GLM5 FP8 MTP Support on MI355X - Perf Change Log Signed-off-by: ajith-sirra-amd --- perf-changelog.yaml | 1633 +++++++++++++++++-------------------------- 1 file changed, 624 insertions(+), 1009 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 84a598aed..29c72bfe0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,10 +1,3 @@ -- config-keys: - - glm5-fp8-mi355x-sglang-mtp - description: - - "Add GLM5 FP8 MTP MI355X SGLang Support" - - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122 - - config-keys: - dsr1-fp8-h100-dynamo-trt - dsr1-fp8-h100-dynamo-sglang @@ -133,12 +126,6 @@ - "Extend concurrency to 128 for gptoss b200 TRT configurations" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/233 -- config-keys: - - gptoss-fp4-b200-trt - description: - - "Add benchmark script for GPTOSS FP4 B200 TRT-LLM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/256 - - config-keys: - "*gb200-dynamo-sglang" description: @@ -162,22 +149,18 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/273 - config-keys: - - dsr1-fp4-b200-sglang - - dsr1-fp8-b200-sglang - - dsr1-fp8-h200-sglang + - gptoss-fp4-b200-trt description: - - "Update NVIDIA DeepSeek sglang Docker image from v0.5.5 to v0.5.6" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/276 - + - "Add benchmark script for GPTOSS FP4 B200 TRT-LLM" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/256 - config-keys: - - gptoss-fp4-b200-vllm - - gptoss-fp4-h100-vllm - - gptoss-fp4-h200-vllm + - dsr1-fp4-gb200-dynamo-trt + - dsr1-fp4-gb200-dynamo-sglang + - dsr1-fp8-gb200-dynamo-sglang description: - - "Update vLLM image from v0.11.2 to v0.13.0" - - "Add VLLM_MXFP4_USE_MARLIN=1 to H100 and H200 benchmark scripts" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/327 + - "Add more configurations for GB200 SGLang DSR1" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/335 - config-keys: - dsr1-fp4-mi355x-sglang @@ -185,22 +168,6 @@ - "Update MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.6.post1" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/330 -- config-keys: - - dsr1-fp8-mi300x-sglang - - dsr1-fp8-mi325x-sglang - - dsr1-fp8-mi355x-sglang - description: - - Use upstream SGLang images on mi300, mi325 and mi355 for dsr1fp8 - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/332 - -- config-keys: - - dsr1-fp4-gb200-dynamo-trt - - dsr1-fp4-gb200-dynamo-sglang - - dsr1-fp8-gb200-dynamo-sglang - description: - - "Add more configurations for GB200 SGLang DSR1" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/335 - - config-keys: - dsr1-fp4-gb200-dynamo-sglang - dsr1-fp8-gb200-dynamo-sglang @@ -213,7 +180,31 @@ description: - "Updating MI355x Deepseek-R1 FP4 SGLang Image 
to upstream v0.5.6.post2" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/369 + +- config-keys: + - dsr1-fp4-b200-sglang + - dsr1-fp8-b200-sglang + - dsr1-fp8-h200-sglang + description: + - "Update NVIDIA DeepSeek sglang Docker image from v0.5.5 to v0.5.6" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/276 + +- config-keys: + - gptoss-fp4-b200-vllm + - gptoss-fp4-h100-vllm + - gptoss-fp4-h200-vllm + description: + - "Update vLLM image from v0.11.2 to v0.13.0" + - "Add VLLM_MXFP4_USE_MARLIN=1 to H100 and H200 benchmark scripts" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/327 +- config-keys: + - dsr1-fp8-mi300x-sglang + - dsr1-fp8-mi325x-sglang + - dsr1-fp8-mi355x-sglang + description: + - Use upstream SGLang images on mi300, mi325 and mi355 for dsr1fp8 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/332 - config-keys: - gptoss-fp4-gb200-dynamo-trt @@ -224,14 +215,11 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/387 - config-keys: - - dsr1-fp4-b200-trt-mtp - - dsr1-fp8-b200-trt-mtp - - dsr1-fp8-h200-trt-mtp + - dsr1-fp8-mi355x-sglang-disagg description: - - Add MTP (Multi-Token Prediction) support for single-node TRT configs - - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/392 - + - "Add PD disaggregation (1P2D) for Mi355X" + - "Includes with and without speculative decoding" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/348 - config-keys: - dsr1-fp4-mi355x-sglang @@ -239,20 +227,21 @@ - "Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.7" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/395 -- config-keys: - - dsr1-fp8-mi355x-sglang-disagg - description: - - "Add PD disaggregation (1P2D) for Mi355X" - - "Includes with and without speculative decoding" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/409 - - config-keys: - dsr1-fp8-b200-sglang description: - "Adds TP4 configurations to DSR1-FP8 B200 SGLang deployment experiments" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/411 - - + +- config-keys: + - dsr1-fp4-b200-trt-mtp + - dsr1-fp8-b200-trt-mtp + - dsr1-fp8-h200-trt-mtp + description: + - Add MTP (Multi-Token Prediction) support for single-node TRT configs + - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/392 + - config-keys: - dsr1-fp8-mi355x-atom - dsr1-fp4-mi355x-atom @@ -271,22 +260,6 @@ - "Add HIP_VISIBLE_DEVICES env var for Ray compatibility in vLLM 0.14+" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/496 -- config-keys: - - dsr1-fp4-gb200-dynamo-trt - description: - - "Update Dynamo TRT image from 0.5.1-rc0.pre3 to 0.8.1.post2" - - "Update TRT configurations" - - "Refactor configurations to use CONFIG_FILE-based recipes instead of inline parameter settings" - - "Introduce srt-slurm workflow for launching Dynamo jobs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/510 - -- config-keys: - - gptoss-fp4-mi300x-vllm - - gptoss-fp4-mi325x-vllm - description: - - "Fix AITER env vars for vLLM v0.14.0 on AMD MI300X and MI325X" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/535 - - config-keys: - dsr1-fp8-h200-sglang description: @@ -302,7 +275,6 @@ - "Set --attention-backend aiter for AMD aiter attention backend" - "Update chunked-prefill-size and max-prefill-tokens from 196608 to 131072" 
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/544 - - config-keys: - dsr1-fp8-mi325x-sglang description: @@ -315,14 +287,11 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/545 - config-keys: - - dsr1-fp8-h200-dynamo-trt + - gptoss-fp4-mi300x-vllm + - gptoss-fp4-mi325x-vllm description: - - "Add DSR1 FP8 H200 Dynamo TRT-LLM disaggregated multinode configuration" - - "Image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" - - "Runner: h200-dgxc with multinode and disagg enabled" - - "Includes MTP and STP configurations for 1k1k and 8k1k sequence lengths" - - "Concurrency levels: 4, 8, 16, 32, 64, 128, 256, 512" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/570 + - "Fix AITER env vars for vLLM v0.14.0 on AMD MI300X and MI325X" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/535 - config-keys: - dsr1-fp8-mi355x-sglang @@ -331,24 +300,6 @@ - "Key fix: Disables mla persistent kernel when not using fp8 kv_cache (https://github.com/sgl-project/sglang/pull/17327)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/572 -- config-keys: - - dsr1-fp8-h200-dynamo-sglang - description: - - "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration" - - "Image: lmsysorg/sglang:v0.5.8-cu130-runtime" - - "Runner: h200-multinode-slurm with multinode and disagg enabled" - - "Recipes sourced from srtslurm repo (recipes/h200/)" - - "1k1k configs: aggregated, low-latency (1P9D), high-throughput TEP (1P6D), DEP (1P6D)" - - "8k1k configs: aggregated, TEP configs (1P7D, 1P6D, 1P3D, 2P3D), DEP (1P1D)" - - "Concurrency levels range from 1 to 2048 depending on configuration" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/582 - -- config-keys: - - dsr1-fp4-b300-dynamo-trt - description: - - "Add DSR1 FP4 B300 Dynamo TRT configurations" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/585 - - config-keys: # NVIDIA single-node - dsr1-fp4-b200-sglang @@ -378,17 +329,27 @@ - gptoss-fp4-mi355x-atom description: - Add official GSM8k eval results to GPT-OSS and DeepSeek R1 scenarios - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/587 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/558 evals-only: true - config-keys: - - dsr1-fp4-b200-dynamo-trt + - dsr1-fp8-h200-sglang description: - - "Update DSR1 FP4 B200 Dynamo TRT configurations" - - "Update TRTLLM version to 1.2.0rc6.post2" - - "Transform to use srt-slurm recipes" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/588 + - "Update H200 DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.9" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + +- config-keys: + - dsr1-fp4-b300-dynamo-trt + description: + - "Add DSR1 FP4 B300 Dynamo TRT configurations" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/585 +- config-keys: + - dsr1-fp4-mi355x-sglang + description: + - "Update SGLang image from v0.5.7 to v0.5.8 for DeepSeek-R1 FP4 on MI355x" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/595 + - config-keys: - dsr1-fp8-b200-trt description: @@ -400,14 +361,33 @@ - "Update search space: remove EP=TP constraint, add TP=4 configurations, extend concurrency ranges" - "Add TLLM_OVERRIDE_LAYER_NUM=61 to avoid OOM errors" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/594 - + +- config-keys: + - dsr1-fp4-b200-dynamo-trt + description: + - "Update DSR1 FP4 B200 Dynamo TRT configurations" + - "Update TRTLLM version to 1.2.0rc6.post2" + - "Transform to use srt-slurm recipes" + pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/588 - config-keys: - - dsr1-fp4-mi355x-sglang + - dsr1-fp8-h200-dynamo-trt description: - - "Update SGLang image from v0.5.7 to v0.5.8 for DeepSeek-R1 FP4 on MI355x" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/595 + - "Add DSR1 FP8 H200 Dynamo TRT-LLM disaggregated multinode configuration" + - "Image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + - "Runner: h200-dgxc with multinode and disagg enabled" + - "Includes MTP and STP configurations for 1k1k and 8k1k sequence lengths" + - "Concurrency levels: 4, 8, 16, 32, 64, 128, 256, 512" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/570 +- config-keys: + - dsr1-fp4-gb200-dynamo-trt + description: + - "Update Dynamo TRT image from 0.5.1-rc0.pre3 to 0.8.1.post2" + - "Update TRT configurations" + - "Refactor configurations to use CONFIG_FILE-based recipes instead of inline parameter settings" + - "Introduce srt-slurm workflow for launching Dynamo jobs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/510 - config-keys: - dsr1-fp8-mi355x-sglang @@ -417,21 +397,25 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/613 - config-keys: - - dsr1-fp8-b200-dynamo-trt + - dsr1-fp8-h200-dynamo-sglang description: - - "Introduce new DSR1 FP8 B200 Dynamo TRT configurations for 8k1k and 1k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/616 + - "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration" + - "Image: lmsysorg/sglang:v0.5.8-cu130-runtime" + - "Runner: h200-multinode-slurm with multinode and disagg enabled" + - "Recipes sourced from srtslurm repo (recipes/h200/)" + - "1k1k configs: aggregated, low-latency (1P9D), high-throughput TEP (1P6D), DEP (1P6D)" + - "8k1k configs: aggregated, TEP configs (1P7D, 1P6D, 1P3D, 2P3D), DEP (1P1D)" + - "Concurrency levels range from 1 to 2048 depending on configuration" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/582 - config-keys: - - dsr1-fp8-gb200-dynamo-trt + - dsr1-fp4-b200-trt description: - - "Add DeepSeek R1 FP8 GB200 Dynamo TRT-LLM disaggregated multinode configurations" - - "Image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2" - - "1k1k: 14 scenarios (7 MTP, 7 STP) with varying DP attention/TEP modes" - - "1k8k: 10 scenarios (5 MTP, 5 STP) for long output generation" - - "8k1k: 14 scenarios (7 MTP, 7 STP) for long context workloads" - - "Prefill workers: 1-5P, Decode workers: 1-4D, TP/EP: 8/16/32" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/617 + - "Update TensorRT-LLM container from release:1.1.0rc2.post2 to release:1.2.0rc6.post2" + - "Change default MOE backend from DEEPGEMM to TRTLLM" + - "Add dynamic piecewise CUDA graphs for 1k1k (TEP8 and CONC64)" + - "Update search space: remove EP=TP constraint, add TP=4 configurations, extend concurrency ranges" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/620 - config-keys: - dsr1-fp4-gb300-dynamo-trt @@ -442,15 +426,6 @@ - "Add gb300-nv runner and launch script for srt-slurm integration" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/618 -- config-keys: - - dsr1-fp4-b200-trt - description: - - "Update TensorRT-LLM container from release:1.1.0rc2.post2 to release:1.2.0rc6.post2" - - "Change default MOE backend from DEEPGEMM to TRTLLM" - - "Add dynamic piecewise CUDA graphs for 1k1k (TEP8 and CONC64)" - - "Update search space: remove EP=TP constraint, add TP=4 configurations, extend concurrency ranges" - pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/620 - - config-keys: - dsr1-fp4-mi355x-sglang-disagg description: @@ -458,20 +433,21 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/622 - config-keys: - - dsr1-fp8-b200-sglang-mtp + - dsr1-fp8-gb200-dynamo-trt description: - - "Add MTP (Multi-Token Prediction) support for DeepSeek R1 FP8 B200 SGLang using EAGLE speculative decoding" - - "Image: lmsysorg/sglang:v0.5.8-cu130-amd64" - - "Add benchmark script dsr1_fp8_b200_mtp.sh with EAGLE speculative decoding (num-steps=2, draft-tokens=3, topk=1)" - - "Update launch_b200-dgxc.sh to support SPEC_SUFFIX for MTP script selection" - - "Configurations: TP=8, EP=1, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/626 + - "Add DeepSeek R1 FP8 GB200 Dynamo TRT-LLM disaggregated multinode configurations" + - "Image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2" + - "1k1k: 14 scenarios (7 MTP, 7 STP) with varying DP attention/TEP modes" + - "1k8k: 10 scenarios (5 MTP, 5 STP) for long output generation" + - "8k1k: 14 scenarios (7 MTP, 7 STP) for long context workloads" + - "Prefill workers: 1-5P, Decode workers: 1-4D, TP/EP: 8/16/32" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/617 - config-keys: - - dsr1-fp8-gb300-dynamo-trt + - dsr1-fp8-gb200-dynamo-trt description: - - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 8k1k and 1k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/627 + - "Fix model_prefix argument in yaml configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/646 - config-keys: - dsr1-fp8-b200-trt-mtp @@ -483,39 +459,10 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/632 - config-keys: - - dsr1-fp4-gb200-dynamo-sglang + - dsr1-fp8-gb300-dynamo-trt description: - - "Update SGLang image from v0.5.5.post2 to v0.5.8-cu130" - - "Add FP4 model path separation via SRT_SLURM_MODEL_PREFIX in launch script" - - "Refactor to use CONFIG_FILE-based srt-slurm recipes instead of inline parameters" - - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" - - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/633 - -- config-keys: - - dsr1-fp8-gb200-dynamo-sglang - - dsr1-fp8-gb300-dynamo-sglang - description: - - "Update GB200 and GB300 configs for DSR1 FP8 SGLANG STP mode" - - "Image: lmsysorg/sglang:v0.5.8-cu130" - - "Update prefill/decode worker counts, TP/EP parallelism, and dp-attn settings for 1k1k and 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/635 - -- config-keys: - - dsr1-fp4-gb300-dynamo-sglang - description: - - "Add GB300 FP4 Dynamo SGLang disaggregated multinode configuration" - - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime" - - "Recipes sourced from srt-slurm repo (recipes/gb300-fp4/ folder)" - - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" - - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/636 - -- config-keys: - - dsr1-fp8-b300-dynamo-trt - description: - - "New B300 FP8 Dynamo TRT configurations" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/638 + - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 8k1k and 1k1k" + pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/627 - config-keys: - gptoss-fp4-b200-trt @@ -525,12 +472,21 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/639 - config-keys: - - dsr1-fp8-h200-dynamo-sglang + - dsr1-fp8-mi355x-sglang-disagg description: - - "Add MTP (EAGLE speculative decoding) configs alongside STP" - - "Update container to lmsysorg/sglang:v0.5.8.post1-cu130" - - "Remove aggregated configs, keep only disaggregated" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/640 + - "Add --use-chat-template argument to benchmark_serving script" + - "Without this arg, MTP acceptance rates are artificially high for DeepSeek with MTP" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/647 + +- config-keys: + - dsr1-fp8-b200-sglang-mtp + description: + - "Add MTP (Multi-Token Prediction) support for DeepSeek R1 FP8 B200 SGLang using EAGLE speculative decoding" + - "Image: lmsysorg/sglang:v0.5.8-cu130-amd64" + - "Add benchmark script dsr1_fp8_b200_mtp.sh with EAGLE speculative decoding (num-steps=2, draft-tokens=3, topk=1)" + - "Update launch_b200-dgxc.sh to support SPEC_SUFFIX for MTP script selection" + - "Configurations: TP=8, EP=1, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/626 - config-keys: - dsr1-fp4-b200-trt-mtp @@ -541,35 +497,18 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/642 - config-keys: - - dsr1-fp8-h100-dynamo-sglang - description: - - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang STP disaggregated multinode configurations" - - "Image: lmsysorg/sglang:v0.5.8-cu130" - - "1k1k, 1k8k, 8k1k sequence lengths" - - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/643 - -- config-keys: - - dsr1-fp8-h100-dynamo-sglang - description: - - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang MTP disaggregated multinode configurations" - - "Image: lmsysorg/sglang:v0.5.8-cu130" - - "1k1k, 1k8k, 8k1k sequence lengths with MTP speculative decoding" - - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/644 - -- config-keys: - - dsr1-fp8-gb200-dynamo-trt + - dsr1-fp8-b200-dynamo-sglang description: - - "Fix model_prefix argument in yaml configs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/646 + - "Add DSR1 FP8 B200 disaggregated SGLang multinode configuration" + - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" + - "9 recipes: 4x 1k1k + 5x 8k1k, low-latency and max-throughput profiles" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/658 - config-keys: - - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-gb300-dynamo-trt description: - - "Add --use-chat-template argument to benchmark_serving script" - - "Without this arg, MTP acceptance rates are artificially high for DeepSeek with MTP" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/649 + - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 1k8k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/654 - config-keys: - dsr1-fp8-h100-dynamo-trt @@ -578,18 +517,18 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/651 - config-keys: - - dsr1-fp8-gb300-dynamo-trt + - dsr1-fp8-h200-dynamo-sglang description: - - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 1k8k" - pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/654 + - "Add MTP (EAGLE speculative decoding) configs alongside STP" + - "Update container to lmsysorg/sglang:v0.5.8.post1-cu130" + - "Remove aggregated configs, keep only disaggregated" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/640 - config-keys: - - dsr1-fp8-b200-dynamo-sglang + - dsr1-fp8-b300-dynamo-trt description: - - "Add DSR1 FP8 B200 disaggregated SGLang multinode configuration" - - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" - - "9 recipes: 4x 1k1k + 5x 8k1k, low-latency and max-throughput profiles" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/658 + - "New B300 FP8 Dynamo TRT configurations" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/638 - config-keys: - dsr1-fp8-h100-dynamo-trt @@ -598,6 +537,25 @@ - "fix model_prefix bug from https://github.com/SemiAnalysisAI/InferenceX/pull/651" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/663 +- config-keys: + - dsr1-fp4-gb200-dynamo-sglang + description: + - "Update SGLang image from v0.5.5.post2 to v0.5.8-cu130" + - "Add FP4 model path separation via SRT_SLURM_MODEL_PREFIX in launch script" + - "Refactor to use CONFIG_FILE-based srt-slurm recipes instead of inline parameters" + - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" + - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/633 + +- config-keys: + - dsr1-fp8-gb200-dynamo-sglang + - dsr1-fp8-gb300-dynamo-sglang + description: + - "Update GB200 and GB300 configs for DSR1 FP8 SGLANG STP mode" + - "Image: lmsysorg/sglang:v0.5.8-cu130" + - "Update prefill/decode worker counts, TP/EP parallelism, and dp-attn settings for 1k1k and 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/635 + - config-keys: - dsr1-fp8-b200-dynamo-sglang-mtp description: @@ -607,13 +565,22 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/667 - config-keys: - - dsr1-fp4-b200-dynamo-sglang + - dsr1-fp8-h100-dynamo-sglang description: - - "Add DSR1 FP4 B200 Dynamo SGLang STP mode" - - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime" - - "1k1k configs: low-latency DEP (1P5D, 1P6D), max-throughput DEP (1P1D, 1P2D)" - - "8k1k configs: low-latency DEP/TEP (1P1D, 1P5D, 2P5D), TEP (1P1D), max-throughput DEP (7P2D)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/672 + - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang STP disaggregated multinode configurations" + - "Image: lmsysorg/sglang:v0.5.8-cu130" + - "1k1k, 1k8k, 8k1k sequence lengths" + - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/643 + +- config-keys: + - dsr1-fp8-h100-dynamo-sglang + description: + - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang MTP disaggregated multinode configurations" + - "Image: lmsysorg/sglang:v0.5.8-cu130" + - "1k1k, 1k8k, 8k1k sequence lengths with MTP speculative decoding" + - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/644 - config-keys: - dsr1-fp8-mi355x-atom-mtp @@ -624,6 +591,21 @@ - "Deepseek R1 with speculative decoding: 1k1k, 1k8k, 8k1k" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/673 +- config-keys: + - dsr1-fp4-b200-dynamo-sglang + description: + - "Add DSR1 FP4 B200 Dynamo SGLang STP mode" + - "Image: 
lmsysorg/sglang:v0.5.8.post1-cu130-runtime" + - "1k1k configs: low-latency DEP (1P5D, 1P6D), max-throughput DEP (1P1D, 1P2D)" + - "8k1k configs: low-latency DEP/TEP (1P1D, 1P5D, 2P5D), TEP (1P1D), max-throughput DEP (7P2D)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/672 + +- config-keys: + - dsr1-fp8-b200-dynamo-trt + description: + - "Introduce new DSR1 FP8 B200 Dynamo TRT configurations for 8k1k and 1k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/616 + - config-keys: - dsr1-fp8-mi355x-sglang-disagg - dsr1-fp4-mi355x-sglang-disagg @@ -642,17 +624,22 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/683 - config-keys: - - dsr1-fp8-b200-dynamo-trt + - dsr1-fp4-gb300-dynamo-sglang description: - - "Update max_num_tokens and max_batch_size for min-latency decode workers" - - "See srt-slurm recipe changes: https://github.com/ishandhanani/srt-slurm/pull/173" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/686 + - "Add GB300 FP4 Dynamo SGLang disaggregated multinode configuration" + - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime" + - "Recipes sourced from srt-slurm repo (recipes/gb300-fp4/ folder)" + - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" + - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/636 - config-keys: - - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-b200-dynamo-sglang-mtp description: - - "Add more sweep points for DSR1 FP8 both MTP and non-MTP 1k1k, 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/689 + - "Patches one missing concurrency point for " + - "DSR1 FP8 B200 disaggregated SGLang MTP multinode configuration. " + - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/691 - config-keys: - dsr1-fp8-b300-dynamo-trt @@ -661,25 +648,23 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/690 - config-keys: - - dsr1-fp8-b200-dynamo-sglang-mtp + - dsr1-fp8-mi355x-sglang-disagg description: - - "Patches one missing concurrency point for " - - "DSR1 FP8 B200 disaggregated SGLang MTP multinode configuration. 
" - - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/691 + - "Add more sweep points for DSR1 FP8 both MTP and non-MTP 1k1k, 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/689 - config-keys: - - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp8-b200-dynamo-trt description: - - "Add more sweep points for DSR1 FP4 both MTP and non-MTP 1k1k, 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/692 - + - "Update max_num_tokens and max_batch_size for min-latency decode workers" + - "See srt-slurm recipe changes: https://github.com/ishandhanani/srt-slurm/pull/173" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/686 - config-keys: - dsr1-fp8-mi325x-sglang description: - "Update MI325X DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.8" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/695 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/692 - config-keys: - dsr1-fp8-mi300x-sglang @@ -687,6 +672,12 @@ - "Update MI300X DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.8" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/696 +- config-keys: + - dsr1-fp4-mi355x-sglang-disagg + description: + - "Add more sweep points for DSR1 FP4 both MTP and non-MTP 1k1k, 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/692 + - config-keys: - dsr1-fp8-b200-dynamo-sglang-mtp description: @@ -741,15 +732,6 @@ - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/734 -- config-keys: - - kimik2.5-int4-b200-vllm - description: - - "Add Kimi-K2.5 INT4 vLLM benchmark for B200" - - "Model: moonshotai/Kimi-K2.5 with --mm-encoder-tp-mode data and --trust-remote-code" - - "Image: vllm/vllm-openai:v0.15.1" - - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/735 - - config-keys: - minimaxm2.5-fp8-mi355x-vllm description: @@ -761,13 +743,12 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/755 - config-keys: - - minimaxm2.5-fp8-b200-vllm + - qwen3.5-fp8-mi355x-sglang description: - - "Add MiniMax-M2.5 FP8 vLLM benchmark for B200" - - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" - - "Image: vllm/vllm-openai:v0.17.0" - - "TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/757 + - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for MI355X" + - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218" + - "Uses triton attention backend, TP=8, concurrency 4-64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/768 - config-keys: - qwen3.5-bf16-b200-sglang @@ -779,30 +760,12 @@ - "Set cuda-graph-max-bs to match concurrency, scheduler-recv-interval based on concurrency" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/758 -- config-keys: - - glm5-fp8-mi355x-sglang - description: - - "Add GLM-5 FP8 SGLang benchmark for MI355X" - - "Model: zai-org/GLM-5-FP8 with NSA tilelang backends" - - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219" - - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/762 - -- config-keys: - - qwen3.5-fp8-mi355x-sglang - description: - - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for MI355X" - - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218" - - "Uses triton attention 
backend, TP=8, concurrency 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/768 - - config-keys: - dsr1-fp8-mi355x-sglang-disagg description: - "Add more configs for MI355X FP8 Disagg" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/770 - - + - config-keys: - gptoss-fp4-mi300x-vllm - gptoss-fp4-mi325x-vllm @@ -811,6 +774,15 @@ - "Gains: ROCm skinny GEMM dispatch fix, MoRI EP all2all backend, KV cache shuffle + paged attention for AITER" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/781 +- config-keys: + - kimik2.5-int4-b200-vllm + description: + - "Add Kimi-K2.5 INT4 vLLM benchmark for B200" + - "Model: moonshotai/Kimi-K2.5 with --mm-encoder-tp-mode data and --trust-remote-code" + - "Image: vllm/vllm-openai:v0.15.1" + - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/735 + - config-keys: - gptoss-fp4-b200-vllm - gptoss-fp4-h100-vllm @@ -821,8 +793,7 @@ - "Gains: CUTLASS MoE optimizations (~8% throughput), FP4 kernel improvements (~4% E2E on B200), torch.compile cold-start fix" - "v0.15.1 includes fix for prefix cache hit rate of 0% on GPT-OSS hybrid attention models" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/789 - - + - config-keys: - dsr1-fp4-mi355x-atom - dsr1-fp4-mi355x-atom-mtp @@ -831,16 +802,16 @@ - "Comment out TP=4 configs, consolidate to TP=8 only" - "Extend concurrency range to conc-end: 256 across all sequence lengths (1k1k, 1k8k, 8k1k)" - "Fix MTP 1k8k conc-start from 256 to 4 to enable full concurrency sweep" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/792 - + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/699 + - config-keys: - - qwen3.5-fp8-b200-sglang + - glm5-fp8-mi355x-sglang description: - - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for B200" - - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" - - "Uses trtllm_mha attention backend and flashinfer_trtllm MOE runner" - - "Enable SGLANG_ENABLE_FLASHINFER_GEMM=true, NCCL_NVLS_ENABLE=1" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/804 + - "Add GLM-5 FP8 SGLang benchmark for MI355X" + - "Model: zai-org/GLM-5-FP8 with NSA tilelang backends" + - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219" + - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX - config-keys: - gptoss-fp4-mi300x-vllm @@ -864,61 +835,11 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/816 - config-keys: - - qwen3.5-fp4-b200-sglang - description: - - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang benchmark config and launch script" - - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" - - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" - - "Configs: 1k1k (TP4 conc 4-128), 8k1k (TP4 conc 4-128)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/820 - -- config-keys: - - dsr1-fp8-mi355x-sglang-disagg - - dsr1-fp8-mi355x-sglang-disagg-mtp - - dsr1-fp4-mi355x-sglang-disagg - - dsr1-fp4-mi355x-sglang-disagg-mtp - description: - - "Add more sweep configs for MI355X FP8/FP4 Disagg" - - "Add TP/DP/EP size < 8 support " - - "Support DSR1-0528 MTP Disagg" - - "Bump SGL mori image to Feb 27" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823 - -- config-keys: - - kimik2.5-fp4-mi355x-vllm - description: - - "Add Kimi-K2.5 MXFP4 vLLM benchmark for MI355X" - - "Model: amd/Kimi-K2.5-MXFP4 with --mm-encoder-tp-mode data and --trust-remote-code" - - "Image: 
vllm/vllm-openai-rocm:v0.15.1" - - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/825 - -- config-keys: - - minimaxm2.5-fp4-mi355x-vllm - description: - - "Add MiniMax M2.5 MXFP4 vLLM benchmark for MI355X" - - "Model: amd/MiniMax-M2.5-MXFP4 with --trust-remote-code and --block-size=32" - - "Image: vllm/vllm-openai-rocm:v0.19.1" - - "Environment: VLLM_ROCM_USE_AITER=1" - - "Tp=1, TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/827 - -- config-keys: - - minimaxm2.5-fp8-h200-vllm + - minimaxm2.5-fp8-h200-vllm description: - "Add MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP4)" - "New benchmark script with --trust-remote-code for MiniMaxAI/MiniMax-M2.5" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/831 - -- config-keys: - - minimaxm2.5-fp8-h100-vllm - description: - - "Add MiniMax-M2.5 FP8 vLLM benchmark for H100" - - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" - - "Image: vllm/vllm-openai:v0.16.0" - - "Switch from TP=8/EP=8 to TP=4/EP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k" - - "Script uses conditional --enable-expert-parallel based on EP_SIZE env var" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/832 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX - config-keys: - minimaxm2.5-fp8-mi325x-vllm @@ -940,13 +861,22 @@ - "TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/837 +- config-keys: + - kimik2.5-fp4-mi355x-vllm + description: + - "Add Kimi-K2.5 MXFP4 vLLM benchmark for MI355X" + - "Model: amd/Kimi-K2.5-MXFP4 with --mm-encoder-tp-mode data and --trust-remote-code" + - "Image: vllm/vllm-openai-rocm:v0.15.1" + - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/825 + - config-keys: - qwen3.5-bf16-mi325x-sglang description: - "Add Qwen3.5-397B-A17B BF16 SGLang benchmark for MI325X" - "Image: lmsysorg/sglang:v0.5.9-rocm720-mi30x" - "Uses triton attention backend, TP=8, concurrency 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/842 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX - config-keys: - qwen3.5-bf16-mi300x-sglang @@ -957,14 +887,13 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/843 - config-keys: - - kimik2.5-int4-h200-vllm + - qwen3.5-fp8-mi325x-sglang description: - - "Add Kimi-K2.5 INT4 vLLM benchmark for H200" - - "Model: moonshotai/Kimi-K2.5 with --reasoning-parser kimi_k2 and --trust-remote-code" - - "Image: vllm/vllm-openai:v0.16.0" + - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark for MI325X" + - "Image: lmsysorg/sglang:v0.5.9-rocm720-mi30x" + - "Following AMD Andy Luo's recipe with triton attention backend" - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - - "following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/847 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX - config-keys: - qwen3.5-fp8-mi300x-sglang @@ -976,23 +905,47 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/850 - config-keys: - - qwen3.5-fp8-mi325x-sglang + - kimik2.5-int4-h200-vllm description: - - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark for MI325X" - - "Image: lmsysorg/sglang:v0.5.9-rocm720-mi30x" - - 
"Following AMD Andy Luo's recipe with triton attention backend" + - "Add Kimi-K2.5 INT4 vLLM benchmark for H200" + - "Model: moonshotai/Kimi-K2.5 with --reasoning-parser kimi_k2 and --trust-remote-code" + - "Image: vllm/vllm-openai:v0.16.0" - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/852 + - "following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/839 + +- config-keys: + - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-mi355x-sglang-disagg-mtp + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp + description: + - "Add more sweep configs for MI355X FP8/FP4 Disagg" + - "Add TP/DP/EP size < 8 support " + - "Support DSR1-0528 MTP Disagg" + - "Bump SGL mori image to Feb 27" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823 - config-keys: - - gptoss-fp4-h200-trt + - minimaxm2.5-fp8-h100-vllm description: - - "Upgrade TensorRT-LLM container from release:gpt-oss-dev to release:v1.3.0rc5" - - "Remove sed hack for TensorRT bug (fixed upstream in v1.3.0rc5)" - - "Remove enable_block_reuse: false from kv_cache_config (default true is now recommended)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/854 + - "Add MiniMax-M2.5 FP8 vLLM benchmark for H100" + - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" + - "Image: vllm/vllm-openai:v0.16.0" + - "Switch from TP=8/EP=8 to TP=4/EP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k" + - "Script uses conditional --enable-expert-parallel based on EP_SIZE env var" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/832 - config-keys: + - qwen3.5-fp8-b200-sglang + description: + - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for B200" + - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" + - "Uses trtllm_mha attention backend and flashinfer_trtllm MOE runner" + - "Enable SGLANG_ENABLE_FLASHINFER_GEMM=true, NCCL_NVLS_ENABLE=1" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/804 + +- config-keys: - qwen3.5-fp8-h200-sglang description: - "Add Qwen 3.5 FP8 H200 SGLang configuration" @@ -1001,13 +954,6 @@ - "Server: reasoning-parser qwen3, tool-call-parser qwen3_coder, enable-flashinfer-allreduce-fusion, mem-fraction-static 0.8" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/855 -- config-keys: - - kimik2.5-fp4-b200-vllm - description: - - "Add Kimi K2.5 FP4 B200 vLLM benchmark configuration" - - "Image: vllm/vllm-openai:v0.17.0" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/862 - - config-keys: - dsr1-fp8-mi355x-sglang description: @@ -1016,18 +962,20 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/865 - config-keys: - - minimaxm2.5-fp8-h200-vllm - description: - - "Extend MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP8)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/869 - - -- config-keys: - - dsr1-fp8-h200-sglang + - qwen3.5-bf16-b200-sglang + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-bf16-mi355x-sglang + - qwen3.5-fp8-b200-sglang + - qwen3.5-fp8-h200-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + - qwen3.5-fp8-mi355x-sglang description: - - "Update H200 DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.9" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/887 - + - "Redo qwen eval" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/892 + evals-only: true + - 
config-keys: - gptoss-fp4-mi300x-vllm - gptoss-fp4-mi325x-vllm @@ -1039,31 +987,23 @@ - "Add AMDGCN_USE_BUFFER_OPS=0 and VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 env vars" - "Switch to --attention-backend ROCM_AITER_UNIFIED_ATTN and add fuse_rope_kvcache compilation pass" - "Remove deprecated VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION/VLLM_ROCM_USE_AITER_MHA env vars and compilation-config cudagraph_mode" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/889 - + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/867 + - config-keys: - - qwen3.5-bf16-b200-sglang - - qwen3.5-bf16-mi300x-sglang - - qwen3.5-bf16-mi325x-sglang - - qwen3.5-bf16-mi355x-sglang - - qwen3.5-fp8-b200-sglang - - qwen3.5-fp8-h200-sglang - - qwen3.5-fp8-mi300x-sglang - - qwen3.5-fp8-mi325x-sglang - - qwen3.5-fp8-mi355x-sglang + - kimik2.5-fp4-b200-vllm description: - - "Redo qwen eval" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/892 - evals-only: true - + - "Add Kimi K2.5 FP4 B200 vLLM benchmark configuration" + - "Image: vllm/vllm-openai:v0.17.0" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/862 - config-keys: - - qwen3.5-fp8-b200-sglang-mtp + - minimaxm2.5-fp8-b200-vllm description: - - "Add Single Node Agg FP8 MTP config for Qwen3.5 B200 SGLang" - - "EAGLE speculative decoding: num-steps 3, draft-tokens 4, topk 1" - - "New script: benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/898 + - "Add MiniMax-M2.5 FP8 vLLM benchmark for B200" + - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" + - "Image: vllm/vllm-openai:v0.17.0" + - "TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/757 - config-keys: - dsr1-fp4-mi355x-sglang-disagg @@ -1074,12 +1014,11 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/899 - config-keys: - - kimik2.5-int4-mi325x-vllm + - minimaxm2.5-fp8-h200-vllm description: - - "Add Kimi K2.5 INT4 single-node MI325X vLLM benchmark (TP8)" - - "Uses vLLM ROCm v0.16.0 image following AMD Andy Luo's recipe" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/901 - + - "Extend MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP8)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/869 + - config-keys: - dsr1-fp8-b200-dynamo-sglang - dsr1-fp8-b200-dynamo-sglang-mtp @@ -1089,66 +1028,6 @@ - "14 variants: STP/MTP x low-latency/max-throughput with updated concurrencies and scale points" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/907 -- config-keys: - # NVIDIA single-node - - dsr1-fp4-b200-sglang - - dsr1-fp4-b200-trt - - dsr1-fp4-b200-trt-mtp - - dsr1-fp8-b200-sglang - - dsr1-fp8-b200-sglang-mtp - - dsr1-fp8-b200-trt - - dsr1-fp8-b200-trt-mtp - - dsr1-fp8-h200-sglang - - dsr1-fp8-h200-trt - - dsr1-fp8-h200-trt-mtp - - glm5-fp8-b200-sglang - - glm5-fp8-h200-sglang - - gptoss-fp4-b200-trt - - gptoss-fp4-b200-vllm - - gptoss-fp4-h100-vllm - - gptoss-fp4-h200-trt - - gptoss-fp4-h200-vllm - - kimik2.5-fp4-b200-vllm - - kimik2.5-int4-b200-vllm - - kimik2.5-int4-h200-vllm - - minimaxm2.5-fp8-b200-vllm - - minimaxm2.5-fp8-h100-vllm - - minimaxm2.5-fp8-h200-vllm - - qwen3.5-bf16-b200-sglang - - qwen3.5-fp8-b200-sglang - - qwen3.5-fp8-b200-sglang-mtp - - qwen3.5-fp8-h200-sglang - # AMD single-node - - dsr1-fp4-mi355x-atom - - dsr1-fp4-mi355x-atom-mtp - - dsr1-fp4-mi355x-sglang - - dsr1-fp8-mi325x-sglang - - dsr1-fp8-mi300x-sglang - - dsr1-fp8-mi355x-atom - - 
dsr1-fp8-mi355x-atom-mtp - - dsr1-fp8-mi355x-sglang - - glm5-fp8-mi355x-sglang - - gptoss-fp4-mi300x-vllm - - gptoss-fp4-mi325x-vllm - - gptoss-fp4-mi355x-atom - - gptoss-fp4-mi355x-vllm - - kimik2.5-fp4-mi355x-vllm - - kimik2.5-int4-mi325x-vllm - - kimik2.5-int4-mi355x-vllm - - minimaxm2.5-fp8-mi300x-vllm - - minimaxm2.5-fp8-mi325x-vllm - - minimaxm2.5-fp8-mi355x-vllm - - qwen3.5-bf16-mi300x-sglang - - qwen3.5-bf16-mi325x-sglang - - qwen3.5-bf16-mi355x-sglang - - qwen3.5-fp8-mi300x-sglang - - qwen3.5-fp8-mi325x-sglang - - qwen3.5-fp8-mi355x-sglang - description: - - "Separate evals, change to 8k1k, fail loudly, 5-shot, top of curve & middle of curve" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/911 - evals-only: true - - config-keys: - glm5-fp8-h200-sglang description: @@ -1164,40 +1043,17 @@ - "Add GLM-5 FP8 SGLang benchmark for B200" - "Supports TP8 (low latency) and DEP8 (high throughput) modes with NSA attention backend" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/915 - - + - config-keys: - - qwen3.5-fp8-b200-sglang + - qwen3.5-fp8-b200-sglang-mtp description: - - "Replace FP8 with combination of TP4 and TP8 config" - - "Add --enable-flashinfer-allreduce-fusion to TP8" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/918 + - "Add Single Node Agg FP8 MTP config for Qwen3.5 B200 SGLang" + - "EAGLE speculative decoding: num-steps 3, draft-tokens 4, topk 1" + - "New script: benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/898 - config-keys: - - dsr1-fp8-b200-dynamo-trt - - dsr1-fp8-h200-dynamo-trt - - dsr1-fp4-gb200-dynamo-trt - description: - - "Fix metadata inconsistencies in nvidia-master.yaml - TP/EP/DP-attn values now match actual recipe files" - - "B200 FP8 TRT 8K/1K: prefill_ep 8→1 (15 entries), prefill_dp_attn true→false (1 entry)" - - "H200 FP8 TRT 1K/1K: prefill_dp_attn false→true (9 entries)" - - "H200 FP8 TRT 8K/1K: prefill_dp_attn true→false (8 entries)" - - "GB200 FP4 TRT 8K/1K: decode_dp_attn true→false (2 entries)" - - "All fixes are metadata-only; no recipe files were modified" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/919 - -- config-keys: - - kimik2.5-int4-mi325x-vllm - - kimik2.5-int4-mi355x-vllm - - kimik2.5-int4-h200-vllm - - kimik2.5-fp4-mi355x-vllm - - kimik2.5-fp4-b200-vllm - description: - - "Disable prefix caching (--no-enable-prefix-caching) for all Kimi K2.5 benchmarks using random datasets" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/926 - -- config-keys: - - minimaxm2.5-fp8-mi355x-vllm + - minimaxm2.5-fp8-mi355x-vllm description: - "ADD minimax TP=8 with EP, in config of 1k1k, 1k8k, and 8k1k sequence lengths" - "Config concurrency: 32-256" @@ -1229,7 +1085,13 @@ - "Add --exclusive flag to MI355X single-node salloc and multi-node sbatch to prevent node sharing during benchmarks" - "Only non-TP8 configs listed; TP8 already uses all GPUs on the node" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/934 - + +- config-keys: + - qwen3.5-fp8-b200-sglang + description: + - "Replace FP8 with combination of TP4 and TP8 config" + - "Add --enable-flashinfer-allreduce-fusion to TP8" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/918 - config-keys: - kimik2.5-int4-b200-vllm @@ -1237,15 +1099,6 @@ - "Enable VLLM_USE_FLASHINFER_MOE_INT4=1 for Kimi K2.5 INT4 B200 benchmark" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/935 -- config-keys: - - kimik2.5-fp4-mi355x-vllm - description: - - "Upgrade vLLM 
ROCm image from v0.16.0 to v0.18.0" - - "Enable AITER with INT4 quick reduce; disable AITER RMSNorm for TP < 8 (accuracy)" - - "Add expert parallel, TP4, and TP4/EP4 search spaces" - - "Switch block-size 64 to 1 gpu-memory-utilization 0.95 to 0.90" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/936 - - config-keys: - dsr1-fp4-b200-sglang - dsr1-fp8-b200-sglang @@ -1258,15 +1111,22 @@ - "dsr1-fp8-b200-sglang-mtp: v0.5.8-cu130-amd64 → v0.5.9-cu130" - "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943 - + +- config-keys: + - minimaxm2.5-fp8-mi325x-vllm + description: + - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" + - "Replace TP4 with TP8/EP8, add conc range 4-256" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/953 - config-keys: - - minimaxm2.5-fp8-b200-vllm + - kimik2.5-fp4-mi355x-vllm description: - - "Update vLLM image from v0.17.0 to v0.19.0 for MiniMax-M2.5 FP8 B200" - - "Add tp4 ep4 search-space entries (conc 32-256) for all seq-len configs" - - "Remove ISL 1024 / OSL 8192 seq-len config" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947 + - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" + - "Enable AITER with INT4 quick reduce; disable AITER RMSNorm for TP < 8 (accuracy)" + - "Add expert parallel, TP4, and TP4/EP4 search spaces" + - "Switch block-size 64 to 1 gpu-memory-utilization 0.95 to 0.90" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/936 - config-keys: - kimik2.5-int4-mi355x-vllm @@ -1277,13 +1137,6 @@ - "Add --max-num-seqs 256, remove --disable-log-requests" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/950 -- config-keys: - - minimaxm2.5-fp8-mi325x-vllm - description: - - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" - - "Replace TP4 with TP8/EP8, add conc range 4-256" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/953 - - config-keys: - kimik2.5-int4-mi325x-vllm description: @@ -1293,13 +1146,6 @@ - "Add --max-num-seqs 256, remove --disable-log-requests" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/957 -- config-keys: - - minimaxm2.5-fp8-h100-vllm - - minimaxm2.5-fp8-h200-vllm - description: - - "Update vLLM image from v0.16.0 to v0.18.0 for minimax h100 and h200 configs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/958 - - config-keys: - gptoss-fp4-h100-vllm - gptoss-fp4-h200-vllm @@ -1307,6 +1153,16 @@ - "Update vLLM image from v0.15.1 to v0.18.0 for gptoss H100 and H200 configs" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/960 +- config-keys: + - kimik2.5-int4-mi325x-vllm + - kimik2.5-int4-mi355x-vllm + - kimik2.5-int4-h200-vllm + - kimik2.5-fp4-mi355x-vllm + - kimik2.5-fp4-b200-vllm + description: + - "Disable prefix caching (--no-enable-prefix-caching) for all Kimi K2.5 benchmarks using random datasets" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/926 + - config-keys: - minimaxm2.5-fp8-b200-vllm - minimaxm2.5-fp8-h100-vllm @@ -1318,6 +1174,66 @@ - "Disable prefix caching (--no-enable-prefix-caching) for all MiniMax benchmarks using random datasets" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/966 +- config-keys: + # NVIDIA single-node + - dsr1-fp4-b200-sglang + - dsr1-fp4-b200-trt + - dsr1-fp4-b200-trt-mtp + - dsr1-fp8-b200-sglang + - dsr1-fp8-b200-sglang-mtp + - dsr1-fp8-b200-trt + - dsr1-fp8-b200-trt-mtp + - dsr1-fp8-h200-sglang + - dsr1-fp8-h200-trt + - dsr1-fp8-h200-trt-mtp + - glm5-fp8-b200-sglang + - glm5-fp8-h200-sglang + - 
gptoss-fp4-b200-trt + - gptoss-fp4-b200-vllm + - gptoss-fp4-h100-vllm + - gptoss-fp4-h200-trt + - gptoss-fp4-h200-vllm + - kimik2.5-fp4-b200-vllm + - kimik2.5-int4-b200-vllm + - kimik2.5-int4-h200-vllm + - minimaxm2.5-fp8-b200-vllm + - minimaxm2.5-fp8-h100-vllm + - minimaxm2.5-fp8-h200-vllm + - qwen3.5-bf16-b200-sglang + - qwen3.5-fp8-b200-sglang + - qwen3.5-fp8-b200-sglang-mtp + - qwen3.5-fp8-h200-sglang + # AMD single-node + - dsr1-fp4-mi355x-atom + - dsr1-fp4-mi355x-atom-mtp + - dsr1-fp4-mi355x-sglang + - dsr1-fp8-mi325x-sglang + - dsr1-fp8-mi300x-sglang + - dsr1-fp8-mi355x-atom + - dsr1-fp8-mi355x-atom-mtp + - dsr1-fp8-mi355x-sglang + - glm5-fp8-mi355x-sglang + - gptoss-fp4-mi300x-vllm + - gptoss-fp4-mi325x-vllm + - gptoss-fp4-mi355x-atom + - gptoss-fp4-mi355x-vllm + - kimik2.5-fp4-mi355x-vllm + - kimik2.5-int4-mi325x-vllm + - kimik2.5-int4-mi355x-vllm + - minimaxm2.5-fp8-mi300x-vllm + - minimaxm2.5-fp8-mi325x-vllm + - minimaxm2.5-fp8-mi355x-vllm + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-bf16-mi355x-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + - qwen3.5-fp8-mi355x-sglang + description: + - "Separate evals, change to 8k1k, fail loudly, 5-shot, top of curve & middle of curve" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/911 + evals-only: true + - config-keys: - qwen3.5-bf16-mi300x-sglang - qwen3.5-bf16-mi325x-sglang @@ -1335,13 +1251,6 @@ - "Image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/973 -- config-keys: - - kimik2.5-int4-mi300x-vllm - description: - - "Add Kimi K2.5 INT4 single-node MI300X vLLM benchmark (TP8)" - - "Uses vLLM ROCm v0.18.0 image following AMD Andy Luo's recipe" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/975 - - config-keys: - dsr1-fp8-mi355x-atom-mtp description: @@ -1355,45 +1264,22 @@ description: - "New model support on ATOM framework" - "Kimi-K2.5 FP4, and MiniMax-M2.5 FP8 configs added for MI355X ATOM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/992 - -- config-keys: - - minimaxm2.5-fp4-b200-vllm - description: - - "Optimize MiniMax-M2.5 NVFP4 B200 vLLM search-space" - - "Expand from tp2/tp4 to tp1/tp2/tp4/tp8 with expert parallel and dp-attn variants" - - "Add ep2, ep4, and dp-attn configurations for higher concurrency sweeps" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/996 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/963 - config-keys: - - dsr1-fp4-b200-dynamo-trt - - dsr1-fp8-b200-dynamo-trt - - dsr1-fp4-b200-dynamo-sglang - - dsr1-fp8-b200-dynamo-sglang - - dsr1-fp8-b200-dynamo-sglang-mtp - - dsr1-fp4-b200-dynamo-sglang-mtp - - dsr1-fp4-b300-dynamo-trt - - dsr1-fp8-b300-dynamo-trt - - dsr1-fp4-gb300-dynamo-trt - - dsr1-fp8-gb300-dynamo-trt - - dsr1-fp4-gb300-dynamo-sglang - - dsr1-fp8-gb300-dynamo-sglang - - dsr1-fp8-mi355x-sglang-disagg - - dsr1-fp8-mi355x-sglang-disagg-mtp - - dsr1-fp4-mi355x-sglang-disagg - - dsr1-fp4-mi355x-sglang-disagg-mtp + - minimaxm2.5-fp8-b200-vllm description: - - "Add multi-node lm-eval accuracy runs" - - "Eval picks the config with highest max eligible concurrency per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) group on 8k1k" - - "Eval concurrency set to the median eligible conc (>= MIN_EVAL_CONC=16) of the selected config to avoid OOM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1000 - evals-only: true + - "Update vLLM image from v0.17.0 to v0.19.0 for 
MiniMax-M2.5 FP8 B200" + - "Add tp4 ep4 search-space entries (conc 32-256) for all seq-len configs" + - "Remove ISL 1024 / OSL 8192 seq-len config" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947 - config-keys: - - qwen3.5-fp8-h200-sglang-mtp + - minimaxm2.5-fp8-mi355x-vllm description: - - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1001 + - "Optimize MiniMax-M2.5 FP8 MI355X vLLM search-space" + - "Add tp2 ep2 search-space entries (conc 2-256) for all seq-len configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1002 - config-keys: - minimaxm2.5-fp8-mi355x-vllm @@ -1403,14 +1289,28 @@ - "Upgrade vLLM image to v0.19.0" - "Enable FP8 KV cache + AITER FA for minimaxm2.5-fp8-mi355x-vllm" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1003 - + +- config-keys: + - minimaxm2.5-fp4-mi355x-vllm + description: + - "Add MiniMax M2.5 MXFP4 vLLM benchmark for MI355X" + - "Model: amd/MiniMax-M2.5-MXFP4 with --trust-remote-code and --block-size=32" + - "Image: vllm/vllm-openai-rocm:v0.19.1" + - "Environment: VLLM_ROCM_USE_AITER=1" + - "Tp=1, TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/827 - config-keys: - - qwen3.5-fp4-mi355x-sglang + - qwen3.5-fp8-h200-sglang-mtp description: - - "Qwen3.5 fp4 support on SGL" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1006 + - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1001 +- config-keys: + - glm5-fp8-mi355x-atom + description: + - "GLM5 FP8 configs added for MI355X ATOM" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1009 - config-keys: - kimik2.5-fp4-gb200-dynamo-vllm @@ -1425,18 +1325,39 @@ - "Runner script updated to clone NVIDIA/srt-slurm and map vLLM container image" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1008 -- config-keys: - - glm5-fp8-mi355x-atom - description: - - "GLM5 FP8 configs added for MI355X ATOM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1009 - - config-keys: - minimaxm2.5-fp8-b200-vllm description: - "Update MiniMax-M2.5 FP8 B200 config with new search spaces" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1010 +- config-keys: + - minimaxm2.5-fp4-b200-vllm + description: + - "Optimize MiniMax-M2.5 NVFP4 B200 vLLM search-space" + - "Expand from tp2/tp4 to tp1/tp2/tp4/tp8 with expert parallel and dp-attn variants" + - "Add ep2, ep4, and dp-attn configurations for higher concurrency sweeps" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/996 + +- config-keys: + - qwen3.5-fp4-b200-sglang + description: + - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang benchmark config and launch script" + - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" + - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" + - "Configs: 1k1k (TP4 conc 4-128), 8k1k (TP4 conc 4-128)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/820 + +- config-keys: + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + description: + - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI300X and MI325X to achieve better performance" + - "Use lmsysorg/sglang:v0.5.10-rocm720-mi30x" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1063 + - config-keys: - glm5-fp4-b200-sglang description: @@ -1447,25 +1368,31 @@ 
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1011 - config-keys: - - glm5-fp8-b200-sglang + - qwen3.5-fp4-mi355x-sglang description: - - "Bump GLM-5 FP8 B200 SGLang concurrency from 128 to 256" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1012 - - + - "Qwen3.5 fp4 support on SGL" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1006 + - config-keys: - - qwen3.5-fp8-h200-sglang-mtp + - gptoss-fp4-h200-trt description: - - "Enable SGLANG_ENABLE_SPEC_V2=1 for Qwen3.5 FP8 H200 SGLang MTP" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1017 + - "Upgrade TensorRT-LLM container from release:gpt-oss-dev to release:v1.3.0rc5" + - "Remove sed hack for TensorRT bug (fixed upstream in v1.3.0rc5)" + - "Remove enable_block_reuse: false from kv_cache_config (default true is now recommended)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/854 - config-keys: - - qwen3.5-fp4-mi355x-sglang + - glm5-fp8-b200-sglang + description: + - "Bump GLM-5 FP8 B200 SGLang concurrency from 128 to 256" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1012 + +- config-keys: + - qwen3.5-fp4-mi355x-sglang description: - "TP2/TP4 seach space exploration for Qwen3.5 fp4 on SGL" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1022 - - + - config-keys: - glm5-fp8-mi355x-sglang description: @@ -1474,28 +1401,10 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1023 - config-keys: - - kimik2.5-fp4-gb200-dynamo-trt - description: - - "Add Kimi K2.5 NVFP4 GB200 disaggregated TRT-LLM benchmarks via Dynamo (14 STP configs)" - - "New framework: dynamo-trt (Dynamo frontend + TensorRT-LLM backend)" - - "Container: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" - - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" - - "Runner script updated to support kimik2.5 model prefix with dynamo-trt framework" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1026 - -- config-keys: - - glm5-fp4-b200-sglang - description: - - "Update SGLang image from nightly-dev-cu13-20260328-a27651d5 to v0.5.10.post1-cu130" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1031 - -- config-keys: - - qwen3.5-fp8-b300-sglang-mtp + - qwen3.5-fp8-h200-sglang-mtp description: - - "Add Qwen3.5-397B-A17B-FP8 B300 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "EAGLE speculative decoding with MTP, TP=4, concurrency 4-256 for 1k1k and 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1035 + - "Enable SGLANG_ENABLE_SPEC_V2=1 for Qwen3.5 FP8 H200 SGLang MTP" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1017 - config-keys: - qwen3.5-fp8-mi355x-sglang @@ -1508,33 +1417,18 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1036 - config-keys: - - qwen3.5-fp8-mi355x-atom - - qwen3.5-fp8-mi355x-atom-mtp - description: - - "Add Qwen3.5-397B-A17B FP8 MI355X ATOM benchmark configs with and without MTP" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1040 - - -- config-keys: - - qwen3.5-fp4-mi355x-sglang - description: - - "Update SGLang image from 'lmsysorg/sglang:v0.5.10-rocm720-mi35x' to 'rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413'" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1041 - -- config-keys: - - glm5.1-fp4-mi355x-atom + - glm5-fp4-b200-sglang description: - - "Add GLM-5.1 MXFP4 single-node MI355X ATOM benchmark" - - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post" - - "TP=2 and 
TP=4, concurrency 4-256 for 1k1k and 8k1k sequence lengths" - - "Add --max-num-seqs and --gpu-memory-utilization 0.9 to server launch" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1043 + - "Update SGLang image from nightly-dev-cu13-20260328-a27651d5 to v0.5.10.post1-cu130" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1031 - config-keys: - - kimik2.5-fp4-b200-vllm + - qwen3.5-fp8-b300-sglang-mtp description: - - "Add kv-cache-dtype fp8, max-cudagraph-capture-size 2048, max-num-batched-tokens, and stream-interval 20 to server launch args" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1047 + - "Add Qwen3.5-397B-A17B-FP8 B300 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "EAGLE speculative decoding with MTP, TP=4, concurrency 4-256 for 1k1k and 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1035 - config-keys: - qwen3.5-fp8-b300-sglang @@ -1568,22 +1462,6 @@ - "At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1 does not have a B300-specific recipe, so this reuses the existing GLM5 FP8 B200 SGLang recipe as-is" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1051 -- config-keys: - - minimaxm2.5-fp8-b300-vllm - description: - - "Add MiniMax-M2.5 FP8 B300 vLLM benchmark" - - "Image: vllm/vllm-openai:v0.19.0-cu130" - - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP8 B200 vLLM recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1054 - -- config-keys: - - minimaxm2.5-fp4-b300-vllm - description: - - "Add MiniMax-M2.5 FP4 (NVFP4) B300 vLLM benchmark" - - "Image: vllm/vllm-openai:v0.19.0-cu130" - - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP4 B200 vLLM recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1055 - - config-keys: - glm5-fp4-b300-sglang description: @@ -1602,33 +1480,59 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1059 - config-keys: - - gptoss-fp4-mi300x-vllm + - minimaxm2.5-fp4-b300-vllm description: - - "Expand GPT-OSS 120B FP4 MI300X TP=1 concurrency from 64 to 256 for 1k1k" - - "Higher concurrency improves MoE weight amortization: 8552 total TPS at conc=256 vs 4016 at conc=64 (2.1x)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1061 + - "Add MiniMax-M2.5 FP4 (NVFP4) B300 vLLM benchmark" + - "Image: vllm/vllm-openai:v0.19.0-cu130" + - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP4 B200 vLLM recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1055 - config-keys: - - qwen3.5-bf16-mi300x-sglang - - qwen3.5-bf16-mi325x-sglang - - qwen3.5-fp8-mi300x-sglang - - qwen3.5-fp8-mi325x-sglang + - minimaxm2.5-fp8-b300-vllm description: - - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI300X and MI325X to achieve better performance" - - "Use lmsysorg/sglang:v0.5.10-rocm720-mi30x" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1063 + - "Add MiniMax-M2.5 FP8 B300 vLLM benchmark" + - "Image: vllm/vllm-openai:v0.19.0-cu130" + - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not 
have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP8 B200 vLLM recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1054 - config-keys: - - minimaxm2.5-fp8-b200-vllm + - kimik2.5-fp4-b300-vllm description: - - "Add VLLM_FLOAT32_MATMUL_PRECISION=high, remove VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1068 + - "Add Kimi-K2.5 FP4 (NVFP4) B300 vLLM benchmark" + - "Image: vllm/vllm-openai:v0.19.0-cu130" + - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html does not have a B300-specific recipe, so this reuses the existing Kimi-K2.5 FP4 B200 vLLM recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1056 - config-keys: - - minimaxm2.5-fp4-b200-vllm + - gptoss-fp4-mi300x-vllm description: - - "Add VLLM_FLOAT32_MATMUL_PRECISION=high" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1069 + - "Expand GPT-OSS 120B FP4 MI300X TP=1 concurrency from 64 to 256 for 1k1k" + - "Higher concurrency improves MoE weight amortization: 8552 total TPS at conc=256 vs 4016 at conc=64 (2.1x)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1053 + +- config-keys: + - dsr1-fp4-b200-dynamo-trt + - dsr1-fp8-b200-dynamo-trt + - dsr1-fp4-b200-dynamo-sglang + - dsr1-fp8-b200-dynamo-sglang + - dsr1-fp8-b200-dynamo-sglang-mtp + - dsr1-fp4-b200-dynamo-sglang-mtp + - dsr1-fp4-b300-dynamo-trt + - dsr1-fp8-b300-dynamo-trt + - dsr1-fp4-gb300-dynamo-trt + - dsr1-fp8-gb300-dynamo-trt + - dsr1-fp4-gb300-dynamo-sglang + - dsr1-fp8-gb300-dynamo-sglang + - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-mi355x-sglang-disagg-mtp + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp + description: + - "Add multi-node lm-eval accuracy runs" + - "Eval picks the config with highest max eligible concurrency per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) group on 8k1k" + - "Eval concurrency set to the median eligible conc (>= MIN_EVAL_CONC=16) of the selected config to avoid OOM" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1000 + evals-only: true - config-keys: - qwen3.5-fp4-b300-sglang @@ -1639,47 +1543,7 @@ - "Follows the SGLang cookbook recipe at https://cookbook.sglang.io/autoregressive/Qwen/Qwen3.5 as of 2026-04-17" - "Mirrors the B200 FP4 recipe with mem-fraction-static lowered to 0.8 and an extra TP2/EP2 search-space entry" - "Configs: 1k1k and 8k1k, TP4/EP1 conc 4-128 + TP2/EP2 conc 4-128" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1072 - -- config-keys: - - qwen3.5-bf16-b200-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B BF16 B200 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e" - - "Model: Qwen/Qwen3.5-397B-A17B" - - "Mirrors the qwen3.5-bf16-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-64 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1074 - -- config-keys: - - qwen3.5-fp4-b200-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6" - - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" - - "Mirrors the qwen3.5-fp4-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k and 8k1k, TP=4/EP=1 conc 4-128 with 
spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1075 - -- config-keys: - - qwen3.5-fp8-mi355x-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B FP8 MI355X SGLang MTP benchmark" - - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414" - - "Model: Qwen/Qwen3.5-397B-A17B-FP8" - - "Mirrors the qwen3.5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k (TP8/EP1, TP8/EP8, TP2/EP2) and 8k1k (TP2/EP2, TP4/EP1) with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1076 - -- config-keys: - - qwen3.5-bf16-mi355x-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B BF16 MI355X SGLang MTP benchmark" - - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - - "Model: Qwen/Qwen3.5-397B-A17B" - - "Mirrors the qwen3.5-bf16-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1077 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - qwen3.5-bf16-b300-sglang @@ -1689,7 +1553,17 @@ - "Model: Qwen/Qwen3.5-397B-A17B" - "Mirrors the B200 BF16 recipe with an extra TP4/EP1 search-space entry alongside the existing TP8/EP1 sweep" - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-64 + TP4/EP1 conc 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1081 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - qwen3.5-bf16-b200-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B BF16 B200 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e" + - "Model: Qwen/Qwen3.5-397B-A17B" + - "Mirrors the qwen3.5-bf16-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-64 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - qwen3.5-bf16-b300-sglang-mtp @@ -1699,7 +1573,7 @@ - "Model: Qwen/Qwen3.5-397B-A17B" - "Mirrors the qwen3.5-bf16-b300-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-64 + TP4/EP1 conc 4-64, spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1082 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - qwen3.5-fp4-b300-sglang-mtp @@ -1709,7 +1583,7 @@ - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" - "Mirrors the qwen3.5-fp4-b300-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - "Configs: 1k1k and 8k1k, TP4/EP1 conc 4-128 + TP2/EP2 conc 4-128, spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1083 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - glm5-fp8-b300-sglang-mtp @@ -1719,7 +1593,17 @@ - "Model: zai-org/GLM-5-FP8" - "Mirrors the glm5-fp8-b300-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1084 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - 
qwen3.5-bf16-mi355x-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B BF16 MI355X SGLang MTP benchmark" + - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" + - "Model: Qwen/Qwen3.5-397B-A17B" + - "Mirrors the qwen3.5-bf16-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - glm5-fp8-b200-sglang-mtp @@ -1729,7 +1613,27 @@ - "Model: zai-org/GLM-5-FP8" - "Mirrors the glm5-fp8-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1085 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - glm5-fp4-b300-sglang-mtp + description: + - "Add GLM-5 NVFP4 B300 SGLang MTP benchmark (draft)" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "Model: nvidia/GLM-5-NVFP4" + - "Follows the glm5-fp8-b300-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" + - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - qwen3.5-fp8-mi355x-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B FP8 MI355X SGLang MTP benchmark" + - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414" + - "Model: Qwen/Qwen3.5-397B-A17B-FP8" + - "Mirrors the qwen3.5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k (TP8/EP1, TP8/EP8, TP2/EP2) and 8k1k (TP2/EP2, TP4/EP1) with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - glm5-fp8-mi355x-sglang-mtp @@ -1739,27 +1643,27 @@ - "Model: zai-org/GLM-5-FP8" - "Mirrors the glm5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP=8 conc 4-64 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1086 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - - glm5-fp4-b200-sglang-mtp + - qwen3.5-fp4-b200-sglang-mtp description: - - "Add GLM-5 NVFP4 B200 SGLang MTP benchmark (draft)" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "Model: nvidia/GLM-5-NVFP4" - - "Follows the glm5-fp8-b200-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1087 + - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6" + - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" + - "Mirrors the qwen3.5-fp4-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k and 8k1k, TP=4/EP=1 conc 4-128 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - - glm5-fp4-b300-sglang-mtp + - 
glm5-fp4-b200-sglang-mtp description: - - "Add GLM-5 NVFP4 B300 SGLang MTP benchmark (draft)" + - "Add GLM-5 NVFP4 B200 SGLang MTP benchmark (draft)" - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - "Model: nvidia/GLM-5-NVFP4" - - "Follows the glm5-fp8-b300-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" + - "Follows the glm5-fp8-b200-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1088 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - config-keys: - gptoss-fp4-mi300x-vllm @@ -1768,6 +1672,12 @@ - "low-latency endpoint for users prioritizing interactive single-user use cases (chat, copilot, agentic)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1092 +- config-keys: + - kimik2.5-fp4-b200-vllm + description: + - "Add kv-cache-dtype fp8, max-cudagraph-capture-size 2048, max-num-batched-tokens, and stream-interval 20 to server launch args" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1047 + - config-keys: - dsr1-fp8-h200-dynamo-trt - dsr1-fp8-h200-dynamo-sglang @@ -1776,21 +1686,6 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1094 evals-only: true -- config-keys: - - glm5.1-fp4-mi355x-sglang - description: - - "Add GLM5.1 MXFP4 (FP4) MI355X SGLang Support" - - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1098 - -- config-keys: - - kimik2.5-fp4-b300-vllm - description: - - "Add Kimi-K2.5 FP4 (NVFP4) B300 vLLM benchmark" - - "Image: vllm/vllm-openai:v0.19.0-cu130" - - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html does not have a B300-specific recipe, so this reuses the existing Kimi-K2.5 FP4 B200 vLLM recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1100 - - config-keys: - minimaxm2.5-fp8-b300-vllm description: @@ -1804,11 +1699,16 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1107 - config-keys: - - dsr1-fp8-h100-dynamo-trt - - dsr1-fp8-h100-dynamo-sglang + - minimaxm2.5-fp8-b200-vllm description: - - "Trigger H100 multinode evals after dist-timeout and health-check timeout fixes" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1119 + - "Add VLLM_FLOAT32_MATMUL_PRECISION=high, remove VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1068 + +- config-keys: + - minimaxm2.5-fp4-b200-vllm + description: + - "Add VLLM_FLOAT32_MATMUL_PRECISION=high" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1069 - config-keys: - dsr1-fp8-h100-dynamo-trt @@ -1816,289 +1716,4 @@ description: - "Trigger H100 multinode evals after NVSHEMM fixes" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1120 - evals-only: true - -- config-keys: - - dsv4-fp4-gb200-dynamo-vllm - description: - - "Add DeepSeek-V4-Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (1k/1k sweep; 8k/1k currently commented out)" - - "Container: vllm/vllm-openai:deepseekv4-cu130; model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)" - - "Topologies: low-conc 1p1d-dep8-tep8 (4 nodes, mirrored from NVIDIA srt-slurm PR #71 with offload kept and numa-bind 
dropped); mid 1p1d-dep8-dep16 (6 nodes) and high 3p1d-dep8-dep16 (10 nodes) hand-rolled, structurally derived from the kimi-k2.5 1k/1k pattern" - - "Recipes stored under benchmarks/multi_node/srt-slurm-recipes/ and overlaid onto the upstream srt-slurm checkout at runtime" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129 - - -- config-keys: - - dsv4-fp8-h200-vllm - description: - - "Add DeepSeek-V4-Pro vLLM H200 benchmark per https://vllm.ai/blog/deepseek-v4" - - "Image: vllm/vllm-openai:deepseekv4-cu129" - - "Model: deepseek-ai/DeepSeek-V4-Pro" - - "EP + DP=8, FP8 KV cache, block size 256, max-model-len 800000, prefix caching disabled" - - "H200 has no FP4 path, so --attention_config.use_fp4_indexer_cache is omitted" - - "VLLM_ENGINE_READY_TIMEOUT_S=3600 to accommodate large weight loading" - - "Configs: 1k1k conc 4-64, 8k1k conc 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1130 - -- config-keys: - - dsv4-fp4-b200-sglang - description: - - "Add DeepSeek-V4-Pro single-node B200 SGLang benchmark (TP8, EP8, dp-attention)" - - "Container: lmsysorg/sglang:deepseek-v4-blackwell" - - "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - - "Parallelism and sweep conc ranges match the dsv4-fp4-b200-vllm config" - - "Prefix caching and speculative decoding disabled for baseline numbers" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131 - -- config-keys: - - dsv4-fp8-mi355x-sglang - description: - - "Day 0 DeepSeek-V4-Pro FP8 MI355X SGLang benchmark" - - "Image: rocm/sgl-dev:deepseek-v4-mi35x (from sgl-project/sglang#23608)" - - "Model: sgl-project/DeepSeek-V4-Pro-FP8" - - "https://github.com/sgl-project/sglang/pull/23608#issuecomment-4311952977" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1134 - -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark (low-latency fallback)" - - "Image: lmsysorg/sglang:deepseek-v4-b300" - - "Model: deepseek-ai/DeepSeek-V4-Pro" - - "Low-latency only (TP=8, EP=1, no DP-attn, no DeepEP) — DeepEP FP8 weight-postprocess path is broken for this checkpoint on B300" - - "Prefix caching disabled, no speculative decoding" - - "Configs: 1k1k conc 4-1024, 8k1k conc 4-512" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1143 - -- config-keys: - - dsv4-fp4-b300-vllm - description: - - "Add DeepSeek-V4-Pro single-node B300 vLLM aggregate benchmark" - - "Image: vllm/vllm-openai:deepseekv4-cu130" - - "Model: deepseek-ai/DeepSeek-V4-Pro" - - "Uses the submitted B300 pareto schedule for both 1k1k and 8k1k, excluding conc 1: TP8 at conc 4/128, TP4 at conc 4/8/16/32/64/128, DP4 at conc 256/512" - - "Launch args match the provided vllm serve command, including FP4 indexer cache, FULL_AND_PIECEWISE cudagraph config, and max-num-batched-tokens 2048" - - "1k1k uses --max-model-len 4096; 8k1k uses the workflow-provided benchmark context length" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1144 - -- config-keys: - - dsv4-fp8-mi355x-sglang - description: - - "Bump MI355X SLURM allocation from --time=180 to --time=300 in runners/launch_mi355x-amds.sh" - - "DSv4-Pro on MI355X exceeded the 3h cap (STEP CANCELLED DUE TO TIME LIMIT) due to ~30min MoE JIT compile plus slow torch-fallback kernels (SGLANG_HACK_FLASHMLA_BACKEND=torch et al.) 
from sgl-project/sglang#23608" - - "300 minutes matches the GH Actions outer timeout-minutes cap in benchmark-tmpl.yml" - - "Retriggering dsv4-fp8-mi355x-sglang" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1148 - -- config-keys: - - dsv4-fp8-mi355x-sglang - description: - - "Drop --mem-fraction-static 0.88 and --max-total-tokens from dsv4_fp8_mi355x.sh" - - "Bump --chunked-prefill-size from 4096 to 8192" - - "Retrigger dsv4-fp8-mi355x-sglang" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1160 - -- config-keys: - - dsv4-fp4-mi355x-atom - description: - - "Add DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 marker (single-sequence, TP=8, conc=1)" - - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base); ROCm/ATOM#650 overlaid at runtime via pip install --no-deps -e . from a pinned PR SHA (cdbff35) inside the benchmark script" - - "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to ROCm/triton@e491726 (RI3.5.x), which has matmul_ogs.py and routing.py (PR #650 imports both — upstream triton-lang/triton refactored matmul_ogs into matmul.py and removed routing) plus CDNA4MXScaleLayout and a target_info.py compatible with the image's bundled triton" - - "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us" - - "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)" - - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1165 - -- config-keys: - - dsv4-fp4-mi355x-atom - description: - - "Add DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 marker (single-sequence, TP=8, conc=1)" - - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base); ROCm/ATOM#650 overlaid at runtime via pip install --no-deps -e . 
from a pinned PR SHA (cdbff35) inside the benchmark script" - - "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to ROCm/triton@e491726 (RI3.5.x), which has matmul_ogs.py and routing.py (PR #650 imports both — upstream triton-lang/triton refactored matmul_ogs into matmul.py and removed routing) plus CDNA4MXScaleLayout and a target_info.py compatible with the image's bundled triton" - - "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us" - - "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)" - - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1170 - -- config-keys: - - dsv4-fp4-b300-sglang-mtp - description: - - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" - - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)" - - "Model: deepseek-ai/DeepSeek-V4-Pro" - - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4" - - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" - - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" - - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166 - -- config-keys: - - dsv4-fp4-b300-vllm - description: - - "Update search space based on B300 pareto sweep results" - - "ISL=1024: TP4 conc 4-128; DP4 (dp-attn) conc 256-4096; DP8 (dp-attn) conc 2048-8192" - - "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155 - -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "Recipe-per-CONC split for DeepSeek-V4-Pro on B300: low-latency (TP=8, EP=1), balanced (TP=4, EP=1) at conc=32, max-throughput (TP=4, EP=4, DP-attn + DeepEP) at conc=512, for both 1k1k and 8k1k" - - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - - "Image pinned to lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3" - - "DP-attention path enables SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 for better SWA eviction behavior" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1185 - -- config-keys: - - dsv4-fp4-b200-sglang - description: - - "Two-recipe dispatch for DeepSeek-V4-Pro on B200, selected by DP_ATTENTION knob: low-latency (TP=8, EP=1, flashinfer_mxfp4) for conc 1-32, DP-attention (TP=8, EP=8, DP-attn + DeepEP + mega_moe) for conc 64-{512,1024}. The DP-attention recipe uses identical flags across balanced and max-throughput CONC ranges; only --max-running-requests scales with CONC." 
- - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - - "Image pinned to lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b" - - "Adds SGLANG_OPT_* env knobs (SWA_SPLIT_LEAF_ON_INSERT, USE_JIT_NORM, USE_JIT_INDEXER_METADATA, USE_TOPK_V2, USE_CUSTOM_ALL_REDUCE_V2)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1187 - -- config-keys: - - dsv4-fp4-b300-sglang-mtp - description: - - "Pass --dsv4 to run_benchmark_serving so MTP benchmarks use the DSv4 chat template (PR #1153)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1182 - -- config-keys: - - dsv4-fp4-b300-vllm - description: - - Add low-latency configs and remove non-pareto configs - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1193 - -- config-keys: - - dsv4-fp4-b200-vllm - description: - - "Add DeepSeek-V4-Pro single-node B200 vLLM benchmark derived from B200 pareto sweep" - - "ISL=1024: TP8 conc 4-128; DP8 (dp-attn) conc 256-4096" - - "ISL=8192: TP8 conc 4-32; DP8 (dp-attn) conc 64-1024" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1156 - -- config-keys: - - dsv4-fp4-b300-sglang-mtp - description: - - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" - - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)" - - "Model: deepseek-ai/DeepSeek-V4-Pro" - - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4" - - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" - - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" - - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180 - -- config-keys: - - dsv4-fp8-mi355x-vllm - description: - - "Add vLLM DeepSeek-V4-Pro FP8 benchmark for MI355X with AITER-accelerated MLA decode (vllm-project/vllm#40889, stacked on #40871)" - - "Base image rocm/atom:rocm7.2.2 (MI355X ROCm 7.2.2, aiter with MLA decode); vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime via --no-deps overlay" - - "Key flags: --enforce-eager, --moe-backend triton_unfused, --kv-cache-dtype fp8, VLLM_ROCM_USE_AITER=1" - - "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k" - - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188 - -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.075, tokenizer-workers 8" - - "1k1k conc=512/1024: add mega_moe deepep backend with cuda-graph-max-bs 550, chunked-prefill 16384, max-running-requests 768" - - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 ep=4 entries" - - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179 - -- config-keys: - - dsv4-fp4-mi355x-atom - description: - - "Use ROCm/aiter#2916 mhc_pre device-allocation fix instead of disabling ATOM mhc_pre" - - 
"Patch installed aiter/ops/mhc.py at runtime to allocate mhc_pre intermediates on residual.device, preserving the aiter MHC fast path without rebuilding aiter" - - "Remove the ATOM deepseek_v4.py sed workaround that forced mhc_pre to torch fallback" - - "Keep dsv4-fp4-mi355x-atom at CONC=1 only; run 24953107645 showed high-concurrency DSv4 ATOM OOMs in PR #650 torch sparse-attention fallbacks before upstream AITER sparse-attention support lands" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1202 - -- config-keys: - - dsv4-fp4-b300-vllm-mtp - description: - - "Add preliminary vLLM MTP configs for DeepSeek-V4-Pro on B300" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1210 - -- config-keys: - - dsv4-fp4-b200-vllm - description: - - "Pin image to vllm/vllm-openai:v0.20.0-cu130 (was floating deepseekv4-cu130 tag); DeepGEMM is preinstalled in this image" - - "Use --attention_config.use_fp4_indexer_cache=True and --compilation-config {\"cudagraph_mode\": \"FULL_AND_PIECEWISE\", \"custom_ops\": [\"all\"]} for all configs" - - "Gate --moe-backend deep_gemm_mega_moe and --gpu-memory-utilization 0.85 on DP_ATTENTION=true per the v0.20.0 recipe" - - "Drop --pipeline-parallel-size 1; keep --no-enable-prefix-caching and --max-cudagraph-capture-size 2048" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1204 - -- config-keys: - - minimaxm2.5-fp4-mi355x-atom - description: - - "Add MiniMax-M2.5 MXFP4 MI355X Atom benchmark (rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post)" - - "Single-node sweep: TP1–TP8, 1k/1k and 8k/1k ISL/OSL" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1042 - -- config-keys: - - dsv4-fp4-gb200-dynamo-vllm - description: - - "DSV4-Pro FP4 GB200 dynamo-vLLM disagg against srt-slurm aflowers/vllm-gb200-v0.20.0" - - "Keeps the three validated 8k/1k points: low-latency 1P/1D TP8 conc=1, mid-curve 1P/1D DEP8 conc=256, and max-tpt 3P/1D DEP8 conc=4096" - - "All three recipes run NATS/etcd on a dedicated infra node and use compute-node local NVMe model weights via /mnt/numa1/models/deepseek-v4-pro/" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1163 - -- config-keys: - - dsv4-fp4-gb200-dynamo-vllm - description: - - "Add GB200 Dynamo vLLM MegaMOE max-throughput recipe at conc=4096" - - "Topology matches max-tpt: 3 prefill DEP8 workers and 1 decode DEP8 worker with dedicated NATS/etcd" - - "Uses deep_gemm_mega_moe on prefill/decode, TORCH_SYMMMEM=NVSHMEM, and no offload" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218 - -- config-keys: - - dsv4-fp4-gb200-dynamo-vllm - description: - - "Add GB200 Dynamo vLLM low-middle curve recipe at conc=256/512" - - "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd" - - "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218 - -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "Add conc=8192 recipe for 1k1k: deepep mega_moe backend with cuda-graph-max-bs 1088, max-running-requests 8192, mem-fraction-static 0.80, swa-full-tokens-ratio 0.3, tokenizer-worker-num 16" - - "conc=8192 enables SGLANG_OPT_USE_ONLINE_COMPRESS=1 and --stream-interval 30" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1209 - -- config-keys: - - dsv4-fp4-b300-vllm - description: - - "Change image to vllm/vllm-openai:v0.20.0-cu130" - - "Use Mega MoE for DEP configs" - pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1221 - -- config-keys: - - dsv4-fp4-b200-vllm-mtp - description: - - "Add preliminary vLLM MTP configs for DeepSeek-V4-Pro on B200" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1230 - -- config-keys: - - dsv4-fp4-gb200-dynamo-vllm - description: - - "Keep the GB200 Dynamo vLLM MegaMOE max-throughput recipe at 3P/1D DEP8 conc=4096" - - "Add GB200 Dynamo vLLM MegaMOE high-throughput recipe at 2P/1D DEP8 conc=4096" - - "Add GB200 Dynamo vLLM MegaMOE mid-curve recipe at 1P/1D DEP8 conc=256/512/1024" - - "Remove stale offload recipe copies and the old no-MegaMOE mid/max-throughput points from the GB200 Dynamo vLLM matrix" - - "Disable FlashInfer autotune on GB200 decode workers for accuracy stability, matching the srt-slurm recipe fix" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1223 - -- config-keys: - - dsv4-fp4-gb300-dynamo-sglang - description: - - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated SGLang benchmarks via Dynamo (1k/1k sweep; 8k/1k recipes shipped but commented out)" - - "Container: lmsysorg/sglang:deepseek-v4-grace-blackwell (linux/arm64); model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)" - - "Topologies mirror the dsv4-fp4-gb300-dynamo-vllm sibling: low-conc 1p1d-dep8-tep8 (4 nodes), mid 1p1d-dep8-dep16 (6 nodes), high 3p1d-dep8-dep16 (10 nodes). 4096 overlap between mid and high gives a topology-crossover A/B" - - "No upstream GB300 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB300 DSV4 SGLang aggregated). Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. 
Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157 + evals-only: true \ No newline at end of file From ff116c2102ca09df55e78145c6f20bae49358c48 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 30 Apr 2026 12:41:57 +0530 Subject: [PATCH 08/10] AMD GLM5 FP8 MTP Support on MI355X - Perf Change Log Signed-off-by: ajith-sirra-amd --- perf-changelog.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 29c72bfe0..3d69bb58d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,10 @@ +- config-keys: + - glm5-fp8-mi355x-sglang-mtp + description: + - "Add GLM5 FP8 MTP MI355X SGLang Support" + - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122 + - config-keys: - dsr1-fp8-h100-dynamo-trt - dsr1-fp8-h100-dynamo-sglang From cad10bc1ccdeb2a9e4c6206182b0a4f45e3cf5f6 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 30 Apr 2026 12:52:44 +0530 Subject: [PATCH 09/10] Recover Perf Change Log Signed-off-by: ajith-sirra-amd --- perf-changelog.yaml | 1726 +++++++++++++++++++++++++------------------ 1 file changed, 1020 insertions(+), 706 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3d69bb58d..4e5926a56 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,74 +1,3 @@ -- config-keys: - - glm5-fp8-mi355x-sglang-mtp - description: - - "Add GLM5 FP8 MTP MI355X SGLang Support" - - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122 - -- config-keys: - - dsr1-fp8-h100-dynamo-trt - - dsr1-fp8-h100-dynamo-sglang - description: - - "Trigger H100 multinode evals after dist-timeout and health-check timeout fixes" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD - -- config-keys: - - glm5.1-fp4-mi355x-sglang - description: - - "Add GLM5.1 MXFP4 (FP4) MI355X SGLang Support" - - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1098 - -- config-keys: - - kimik2.5-fp4-gb200-dynamo-trt - description: - - "Add Kimi K2.5 NVFP4 GB200 disaggregated TRT-LLM benchmarks via Dynamo (14 STP configs)" - - "New framework: dynamo-trt (Dynamo frontend + TensorRT-LLM backend)" - - "Container: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" - - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" - - "Runner script updated to support kimik2.5 model prefix with dynamo-trt framework" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1026 - -- config-keys: - - qwen3.5-fp4-mi355x-sglang - description: - - "Update SGLang image from 'lmsysorg/sglang:v0.5.10-rocm720-mi35x' to 'rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413'" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1041 - -- config-keys: - - kimik2.5-int4-mi300x-vllm - description: - - "Add Kimi K2.5 INT4 single-node MI300X vLLM benchmark (TP8)" - - "Uses vLLM ROCm v0.18.0 image following AMD Andy Luo's recipe" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX - -- config-keys: - - minimaxm2.5-fp8-h100-vllm - - minimaxm2.5-fp8-h200-vllm - description: - - "Update vLLM image from v0.16.0 to v0.18.0 for minimax h100 and h200 configs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX 
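For orientation, the single-node vLLM entries above (the Kimi K2.5 INT4 MI300X benchmark and the MiniMax image bumps) all describe the same pattern: an OpenAI-compatible vLLM server launched at a fixed tensor-parallel size and then swept over concurrencies by a client. A minimal sketch of such a launch is below; the flags, model id, and TP size are taken from the entries' own descriptions, while the port is an illustrative assumption and this is not the repository's checked-in benchmark script.

    # Hypothetical single-node vLLM launch (TP8) in the style of the Kimi K2.5 entries.
    # Flag names mirror those quoted in the changelog; the port is a placeholder.
    vllm serve moonshotai/Kimi-K2.5 \
      --tensor-parallel-size 8 \
      --trust-remote-code \
      --reasoning-parser kimi_k2 \
      --no-enable-prefix-caching \
      --port 8888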
- -- config-keys: - - dsr1-fp8-b200-dynamo-trt - - dsr1-fp8-h200-dynamo-trt - - dsr1-fp4-gb200-dynamo-trt - description: - - "Fix metadata inconsistencies in nvidia-master.yaml - TP/EP/DP-attn values now match actual recipe files" - - "B200 FP8 TRT 8K/1K: prefill_ep 8→1 (15 entries), prefill_dp_attn true→false (1 entry)" - - "H200 FP8 TRT 1K/1K: prefill_dp_attn false→true (9 entries)" - - "H200 FP8 TRT 8K/1K: prefill_dp_attn true→false (8 entries)" - - "GB200 FP4 TRT 8K/1K: decode_dp_attn true→false (2 entries)" - - "All fixes are metadata-only; no recipe files were modified" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/919 - -- config-keys: - - kimik2.5-int4-mi325x-vllm - description: - - "Add Kimi K2.5 INT4 single-node MI325X vLLM benchmark (TP8)" - - "Uses vLLM ROCm v0.16.0 image following AMD Andy Luo's recipe" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/857 - - config-keys: - 70b-fp8-*-vllm description: @@ -133,6 +62,12 @@ - "Extend concurrency to 128 for gptoss b200 TRT configurations" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/233 +- config-keys: + - gptoss-fp4-b200-trt + description: + - "Add benchmark script for GPTOSS FP4 B200 TRT-LLM" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/256 + - config-keys: - "*gb200-dynamo-sglang" description: @@ -156,18 +91,22 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/273 - config-keys: - - gptoss-fp4-b200-trt + - dsr1-fp4-b200-sglang + - dsr1-fp8-b200-sglang + - dsr1-fp8-h200-sglang description: - - "Add benchmark script for GPTOSS FP4 B200 TRT-LLM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/256 + - "Update NVIDIA DeepSeek sglang Docker image from v0.5.5 to v0.5.6" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/276 + - config-keys: - - dsr1-fp4-gb200-dynamo-trt - - dsr1-fp4-gb200-dynamo-sglang - - dsr1-fp8-gb200-dynamo-sglang + - gptoss-fp4-b200-vllm + - gptoss-fp4-h100-vllm + - gptoss-fp4-h200-vllm description: - - "Add more configurations for GB200 SGLang DSR1" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/335 + - "Update vLLM image from v0.11.2 to v0.13.0" + - "Add VLLM_MXFP4_USE_MARLIN=1 to H100 and H200 benchmark scripts" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/327 - config-keys: - dsr1-fp4-mi355x-sglang @@ -175,6 +114,22 @@ - "Update MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.6.post1" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/330 +- config-keys: + - dsr1-fp8-mi300x-sglang + - dsr1-fp8-mi325x-sglang + - dsr1-fp8-mi355x-sglang + description: + - Use upstream SGLang images on mi300, mi325 and mi355 for dsr1fp8 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/332 + +- config-keys: + - dsr1-fp4-gb200-dynamo-trt + - dsr1-fp4-gb200-dynamo-sglang + - dsr1-fp8-gb200-dynamo-sglang + description: + - "Add more configurations for GB200 SGLang DSR1" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/335 + - config-keys: - dsr1-fp4-gb200-dynamo-sglang - dsr1-fp8-gb200-dynamo-sglang @@ -187,31 +142,7 @@ description: - "Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.6.post2" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/369 - -- config-keys: - - dsr1-fp4-b200-sglang - - dsr1-fp8-b200-sglang - - dsr1-fp8-h200-sglang - description: - - "Update NVIDIA DeepSeek sglang Docker image from v0.5.5 to v0.5.6" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/276 - -- config-keys: - - gptoss-fp4-b200-vllm - - gptoss-fp4-h100-vllm - - 
gptoss-fp4-h200-vllm - description: - - "Update vLLM image from v0.11.2 to v0.13.0" - - "Add VLLM_MXFP4_USE_MARLIN=1 to H100 and H200 benchmark scripts" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/327 -- config-keys: - - dsr1-fp8-mi300x-sglang - - dsr1-fp8-mi325x-sglang - - dsr1-fp8-mi355x-sglang - description: - - Use upstream SGLang images on mi300, mi325 and mi355 for dsr1fp8 - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/332 - config-keys: - gptoss-fp4-gb200-dynamo-trt @@ -222,11 +153,14 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/387 - config-keys: - - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp4-b200-trt-mtp + - dsr1-fp8-b200-trt-mtp + - dsr1-fp8-h200-trt-mtp description: - - "Add PD disaggregation (1P2D) for Mi355X" - - "Includes with and without speculative decoding" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/348 + - Add MTP (Multi-Token Prediction) support for single-node TRT configs + - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/392 + - config-keys: - dsr1-fp4-mi355x-sglang @@ -234,21 +168,20 @@ - "Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.7" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/395 +- config-keys: + - dsr1-fp8-mi355x-sglang-disagg + description: + - "Add PD disaggregation (1P2D) for Mi355X" + - "Includes with and without speculative decoding" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/409 + - config-keys: - dsr1-fp8-b200-sglang description: - "Adds TP4 configurations to DSR1-FP8 B200 SGLang deployment experiments" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/411 - -- config-keys: - - dsr1-fp4-b200-trt-mtp - - dsr1-fp8-b200-trt-mtp - - dsr1-fp8-h200-trt-mtp - description: - - Add MTP (Multi-Token Prediction) support for single-node TRT configs - - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/392 - + + - config-keys: - dsr1-fp8-mi355x-atom - dsr1-fp4-mi355x-atom @@ -267,6 +200,22 @@ - "Add HIP_VISIBLE_DEVICES env var for Ray compatibility in vLLM 0.14+" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/496 +- config-keys: + - dsr1-fp4-gb200-dynamo-trt + description: + - "Update Dynamo TRT image from 0.5.1-rc0.pre3 to 0.8.1.post2" + - "Update TRT configurations" + - "Refactor configurations to use CONFIG_FILE-based recipes instead of inline parameter settings" + - "Introduce srt-slurm workflow for launching Dynamo jobs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/510 + +- config-keys: + - gptoss-fp4-mi300x-vllm + - gptoss-fp4-mi325x-vllm + description: + - "Fix AITER env vars for vLLM v0.14.0 on AMD MI300X and MI325X" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/535 + - config-keys: - dsr1-fp8-h200-sglang description: @@ -282,6 +231,7 @@ - "Set --attention-backend aiter for AMD aiter attention backend" - "Update chunked-prefill-size and max-prefill-tokens from 196608 to 131072" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/544 + - config-keys: - dsr1-fp8-mi325x-sglang description: @@ -294,11 +244,14 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/545 - config-keys: - - gptoss-fp4-mi300x-vllm - - gptoss-fp4-mi325x-vllm + - dsr1-fp8-h200-dynamo-trt description: - - "Fix AITER env vars for vLLM v0.14.0 on AMD MI300X and MI325X" - pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/535 + - "Add DSR1 FP8 H200 Dynamo TRT-LLM disaggregated multinode configuration" + - "Image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + - "Runner: h200-dgxc with multinode and disagg enabled" + - "Includes MTP and STP configurations for 1k1k and 8k1k sequence lengths" + - "Concurrency levels: 4, 8, 16, 32, 64, 128, 256, 512" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/570 - config-keys: - dsr1-fp8-mi355x-sglang @@ -307,6 +260,24 @@ - "Key fix: Disables mla persistent kernel when not using fp8 kv_cache (https://github.com/sgl-project/sglang/pull/17327)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/572 +- config-keys: + - dsr1-fp8-h200-dynamo-sglang + description: + - "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration" + - "Image: lmsysorg/sglang:v0.5.8-cu130-runtime" + - "Runner: h200-multinode-slurm with multinode and disagg enabled" + - "Recipes sourced from srtslurm repo (recipes/h200/)" + - "1k1k configs: aggregated, low-latency (1P9D), high-throughput TEP (1P6D), DEP (1P6D)" + - "8k1k configs: aggregated, TEP configs (1P7D, 1P6D, 1P3D, 2P3D), DEP (1P1D)" + - "Concurrency levels range from 1 to 2048 depending on configuration" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/582 + +- config-keys: + - dsr1-fp4-b300-dynamo-trt + description: + - "Add DSR1 FP4 B300 Dynamo TRT configurations" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/585 + - config-keys: # NVIDIA single-node - dsr1-fp4-b200-sglang @@ -336,27 +307,17 @@ - gptoss-fp4-mi355x-atom description: - Add official GSM8k eval results to GPT-OSS and DeepSeek R1 scenarios - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/558 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/587 evals-only: true - config-keys: - - dsr1-fp8-h200-sglang - description: - - "Update H200 DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.9" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX - -- config-keys: - - dsr1-fp4-b300-dynamo-trt + - dsr1-fp4-b200-dynamo-trt description: - - "Add DSR1 FP4 B300 Dynamo TRT configurations" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/585 + - "Update DSR1 FP4 B200 Dynamo TRT configurations" + - "Update TRTLLM version to 1.2.0rc6.post2" + - "Transform to use srt-slurm recipes" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/588 -- config-keys: - - dsr1-fp4-mi355x-sglang - description: - - "Update SGLang image from v0.5.7 to v0.5.8 for DeepSeek-R1 FP4 on MI355x" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/595 - - config-keys: - dsr1-fp8-b200-trt description: @@ -368,33 +329,14 @@ - "Update search space: remove EP=TP constraint, add TP=4 configurations, extend concurrency ranges" - "Add TLLM_OVERRIDE_LAYER_NUM=61 to avoid OOM errors" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/594 - -- config-keys: - - dsr1-fp4-b200-dynamo-trt - description: - - "Update DSR1 FP4 B200 Dynamo TRT configurations" - - "Update TRTLLM version to 1.2.0rc6.post2" - - "Transform to use srt-slurm recipes" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/588 -- config-keys: - - dsr1-fp8-h200-dynamo-trt - description: - - "Add DSR1 FP8 H200 Dynamo TRT-LLM disaggregated multinode configuration" - - "Image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" - - "Runner: h200-dgxc with multinode and disagg enabled" - - "Includes MTP and STP configurations for 1k1k and 8k1k sequence lengths" - - 
"Concurrency levels: 4, 8, 16, 32, 64, 128, 256, 512" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/570 - config-keys: - - dsr1-fp4-gb200-dynamo-trt + - dsr1-fp4-mi355x-sglang description: - - "Update Dynamo TRT image from 0.5.1-rc0.pre3 to 0.8.1.post2" - - "Update TRT configurations" - - "Refactor configurations to use CONFIG_FILE-based recipes instead of inline parameter settings" - - "Introduce srt-slurm workflow for launching Dynamo jobs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/510 + - "Update SGLang image from v0.5.7 to v0.5.8 for DeepSeek-R1 FP4 on MI355x" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/595 + - config-keys: - dsr1-fp8-mi355x-sglang @@ -404,25 +346,21 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/613 - config-keys: - - dsr1-fp8-h200-dynamo-sglang + - dsr1-fp8-b200-dynamo-trt description: - - "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration" - - "Image: lmsysorg/sglang:v0.5.8-cu130-runtime" - - "Runner: h200-multinode-slurm with multinode and disagg enabled" - - "Recipes sourced from srtslurm repo (recipes/h200/)" - - "1k1k configs: aggregated, low-latency (1P9D), high-throughput TEP (1P6D), DEP (1P6D)" - - "8k1k configs: aggregated, TEP configs (1P7D, 1P6D, 1P3D, 2P3D), DEP (1P1D)" - - "Concurrency levels range from 1 to 2048 depending on configuration" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/582 + - "Introduce new DSR1 FP8 B200 Dynamo TRT configurations for 8k1k and 1k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/616 - config-keys: - - dsr1-fp4-b200-trt + - dsr1-fp8-gb200-dynamo-trt description: - - "Update TensorRT-LLM container from release:1.1.0rc2.post2 to release:1.2.0rc6.post2" - - "Change default MOE backend from DEEPGEMM to TRTLLM" - - "Add dynamic piecewise CUDA graphs for 1k1k (TEP8 and CONC64)" - - "Update search space: remove EP=TP constraint, add TP=4 configurations, extend concurrency ranges" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/620 + - "Add DeepSeek R1 FP8 GB200 Dynamo TRT-LLM disaggregated multinode configurations" + - "Image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2" + - "1k1k: 14 scenarios (7 MTP, 7 STP) with varying DP attention/TEP modes" + - "1k8k: 10 scenarios (5 MTP, 5 STP) for long output generation" + - "8k1k: 14 scenarios (7 MTP, 7 STP) for long context workloads" + - "Prefill workers: 1-5P, Decode workers: 1-4D, TP/EP: 8/16/32" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/617 - config-keys: - dsr1-fp4-gb300-dynamo-trt @@ -433,6 +371,15 @@ - "Add gb300-nv runner and launch script for srt-slurm integration" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/618 +- config-keys: + - dsr1-fp4-b200-trt + description: + - "Update TensorRT-LLM container from release:1.1.0rc2.post2 to release:1.2.0rc6.post2" + - "Change default MOE backend from DEEPGEMM to TRTLLM" + - "Add dynamic piecewise CUDA graphs for 1k1k (TEP8 and CONC64)" + - "Update search space: remove EP=TP constraint, add TP=4 configurations, extend concurrency ranges" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/620 + - config-keys: - dsr1-fp4-mi355x-sglang-disagg description: @@ -440,21 +387,20 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/622 - config-keys: - - dsr1-fp8-gb200-dynamo-trt + - dsr1-fp8-b200-sglang-mtp description: - - "Add DeepSeek R1 FP8 GB200 Dynamo TRT-LLM disaggregated multinode configurations" - - "Image: 
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2" - - "1k1k: 14 scenarios (7 MTP, 7 STP) with varying DP attention/TEP modes" - - "1k8k: 10 scenarios (5 MTP, 5 STP) for long output generation" - - "8k1k: 14 scenarios (7 MTP, 7 STP) for long context workloads" - - "Prefill workers: 1-5P, Decode workers: 1-4D, TP/EP: 8/16/32" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/617 + - "Add MTP (Multi-Token Prediction) support for DeepSeek R1 FP8 B200 SGLang using EAGLE speculative decoding" + - "Image: lmsysorg/sglang:v0.5.8-cu130-amd64" + - "Add benchmark script dsr1_fp8_b200_mtp.sh with EAGLE speculative decoding (num-steps=2, draft-tokens=3, topk=1)" + - "Update launch_b200-dgxc.sh to support SPEC_SUFFIX for MTP script selection" + - "Configurations: TP=8, EP=1, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/626 - config-keys: - - dsr1-fp8-gb200-dynamo-trt + - dsr1-fp8-gb300-dynamo-trt description: - - "Fix model_prefix argument in yaml configs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/646 + - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 8k1k and 1k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/627 - config-keys: - dsr1-fp8-b200-trt-mtp @@ -466,10 +412,39 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/632 - config-keys: - - dsr1-fp8-gb300-dynamo-trt + - dsr1-fp4-gb200-dynamo-sglang description: - - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 8k1k and 1k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/627 + - "Update SGLang image from v0.5.5.post2 to v0.5.8-cu130" + - "Add FP4 model path separation via SRT_SLURM_MODEL_PREFIX in launch script" + - "Refactor to use CONFIG_FILE-based srt-slurm recipes instead of inline parameters" + - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" + - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/633 + +- config-keys: + - dsr1-fp8-gb200-dynamo-sglang + - dsr1-fp8-gb300-dynamo-sglang + description: + - "Update GB200 and GB300 configs for DSR1 FP8 SGLANG STP mode" + - "Image: lmsysorg/sglang:v0.5.8-cu130" + - "Update prefill/decode worker counts, TP/EP parallelism, and dp-attn settings for 1k1k and 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/635 + +- config-keys: + - dsr1-fp4-gb300-dynamo-sglang + description: + - "Add GB300 FP4 Dynamo SGLang disaggregated multinode configuration" + - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime" + - "Recipes sourced from srt-slurm repo (recipes/gb300-fp4/ folder)" + - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" + - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/636 + +- config-keys: + - dsr1-fp8-b300-dynamo-trt + description: + - "New B300 FP8 Dynamo TRT configurations" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/638 - config-keys: - gptoss-fp4-b200-trt @@ -479,21 +454,12 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/639 - config-keys: - - dsr1-fp8-mi355x-sglang-disagg - description: - - "Add --use-chat-template argument to benchmark_serving script" - - "Without this arg, MTP acceptance rates are artificially high for DeepSeek with MTP" - pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/647 - -- config-keys: - - dsr1-fp8-b200-sglang-mtp + - dsr1-fp8-h200-dynamo-sglang description: - - "Add MTP (Multi-Token Prediction) support for DeepSeek R1 FP8 B200 SGLang using EAGLE speculative decoding" - - "Image: lmsysorg/sglang:v0.5.8-cu130-amd64" - - "Add benchmark script dsr1_fp8_b200_mtp.sh with EAGLE speculative decoding (num-steps=2, draft-tokens=3, topk=1)" - - "Update launch_b200-dgxc.sh to support SPEC_SUFFIX for MTP script selection" - - "Configurations: TP=8, EP=1, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/626 + - "Add MTP (EAGLE speculative decoding) configs alongside STP" + - "Update container to lmsysorg/sglang:v0.5.8.post1-cu130" + - "Remove aggregated configs, keep only disaggregated" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/640 - config-keys: - dsr1-fp4-b200-trt-mtp @@ -504,18 +470,35 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/642 - config-keys: - - dsr1-fp8-b200-dynamo-sglang + - dsr1-fp8-h100-dynamo-sglang description: - - "Add DSR1 FP8 B200 disaggregated SGLang multinode configuration" - - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" - - "9 recipes: 4x 1k1k + 5x 8k1k, low-latency and max-throughput profiles" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/658 + - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang STP disaggregated multinode configurations" + - "Image: lmsysorg/sglang:v0.5.8-cu130" + - "1k1k, 1k8k, 8k1k sequence lengths" + - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/643 - config-keys: - - dsr1-fp8-gb300-dynamo-trt + - dsr1-fp8-h100-dynamo-sglang description: - - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 1k8k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/654 + - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang MTP disaggregated multinode configurations" + - "Image: lmsysorg/sglang:v0.5.8-cu130" + - "1k1k, 1k8k, 8k1k sequence lengths with MTP speculative decoding" + - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/644 + +- config-keys: + - dsr1-fp8-gb200-dynamo-trt + description: + - "Fix model_prefix argument in yaml configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/646 + +- config-keys: + - dsr1-fp8-mi355x-sglang-disagg + description: + - "Add --use-chat-template argument to benchmark_serving script" + - "Without this arg, MTP acceptance rates are artificially high for DeepSeek with MTP" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/649 - config-keys: - dsr1-fp8-h100-dynamo-trt @@ -524,18 +507,18 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/651 - config-keys: - - dsr1-fp8-h200-dynamo-sglang + - dsr1-fp8-gb300-dynamo-trt description: - - "Add MTP (EAGLE speculative decoding) configs alongside STP" - - "Update container to lmsysorg/sglang:v0.5.8.post1-cu130" - - "Remove aggregated configs, keep only disaggregated" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/640 + - "Add DeepSeek R1 FP8 GB300 Dynamo TRT-LLM disaggregated multinode configurations for 1k8k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/654 - config-keys: - - dsr1-fp8-b300-dynamo-trt + - dsr1-fp8-b200-dynamo-sglang description: - - "New B300 FP8 Dynamo TRT 
configurations" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/638 + - "Add DSR1 FP8 B200 disaggregated SGLang multinode configuration" + - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" + - "9 recipes: 4x 1k1k + 5x 8k1k, low-latency and max-throughput profiles" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/658 - config-keys: - dsr1-fp8-h100-dynamo-trt @@ -544,25 +527,6 @@ - "fix model_prefix bug from https://github.com/SemiAnalysisAI/InferenceX/pull/651" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/663 -- config-keys: - - dsr1-fp4-gb200-dynamo-sglang - description: - - "Update SGLang image from v0.5.5.post2 to v0.5.8-cu130" - - "Add FP4 model path separation via SRT_SLURM_MODEL_PREFIX in launch script" - - "Refactor to use CONFIG_FILE-based srt-slurm recipes instead of inline parameters" - - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" - - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/633 - -- config-keys: - - dsr1-fp8-gb200-dynamo-sglang - - dsr1-fp8-gb300-dynamo-sglang - description: - - "Update GB200 and GB300 configs for DSR1 FP8 SGLANG STP mode" - - "Image: lmsysorg/sglang:v0.5.8-cu130" - - "Update prefill/decode worker counts, TP/EP parallelism, and dp-attn settings for 1k1k and 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/635 - - config-keys: - dsr1-fp8-b200-dynamo-sglang-mtp description: @@ -571,33 +535,6 @@ - "9 recipes: 4x 1k1k + 5x 8k1k, low-latency and max-throughput with EAGLE speculative decoding" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/667 -- config-keys: - - dsr1-fp8-h100-dynamo-sglang - description: - - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang STP disaggregated multinode configurations" - - "Image: lmsysorg/sglang:v0.5.8-cu130" - - "1k1k, 1k8k, 8k1k sequence lengths" - - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/643 - -- config-keys: - - dsr1-fp8-h100-dynamo-sglang - description: - - "Add DeepSeek-R1 FP8 H100 Dynamo SGLang MTP disaggregated multinode configurations" - - "Image: lmsysorg/sglang:v0.5.8-cu130" - - "1k1k, 1k8k, 8k1k sequence lengths with MTP speculative decoding" - - "Two modes per seq-len: Max throughput TEP (1P2D) and Max throughput DEP (1P1D with dp-attention)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/644 - -- config-keys: - - dsr1-fp8-mi355x-atom-mtp - - dsr1-fp4-mi355x-atom-mtp - description: - - "Add DSR1 FP8/FP4 MI355X ATOM with MTP configuration" - - "Image: rocm/atom:rocm7.2.0-ubuntu24.04-pytorch2.9-atom0.1.1" - - "Deepseek R1 with speculative decoding: 1k1k, 1k8k, 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/673 - - config-keys: - dsr1-fp4-b200-dynamo-sglang description: @@ -608,10 +545,13 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/672 - config-keys: - - dsr1-fp8-b200-dynamo-trt + - dsr1-fp8-mi355x-atom-mtp + - dsr1-fp4-mi355x-atom-mtp description: - - "Introduce new DSR1 FP8 B200 Dynamo TRT configurations for 8k1k and 1k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/616 + - "Add DSR1 FP8/FP4 MI355X ATOM with MTP configuration" + - "Image: rocm/atom:rocm7.2.0-ubuntu24.04-pytorch2.9-atom0.1.1" + - "Deepseek R1 with speculative decoding: 1k1k, 1k8k, 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/673 - config-keys: - 
dsr1-fp8-mi355x-sglang-disagg @@ -631,22 +571,17 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/683 - config-keys: - - dsr1-fp4-gb300-dynamo-sglang + - dsr1-fp8-b200-dynamo-trt description: - - "Add GB300 FP4 Dynamo SGLang disaggregated multinode configuration" - - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime" - - "Recipes sourced from srt-slurm repo (recipes/gb300-fp4/ folder)" - - "Add 1k1k configurations: low-latency (1P2D), mid-curve (4P8D), max-tpt (4P12D)" - - "Add 8k1k configurations: low-latency (1P4D), mid-curve (6P12D), max-tpt (10P8D)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/636 + - "Update max_num_tokens and max_batch_size for min-latency decode workers" + - "See srt-slurm recipe changes: https://github.com/ishandhanani/srt-slurm/pull/173" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/686 - config-keys: - - dsr1-fp8-b200-dynamo-sglang-mtp + - dsr1-fp8-mi355x-sglang-disagg description: - - "Patches one missing concurrency point for " - - "DSR1 FP8 B200 disaggregated SGLang MTP multinode configuration. " - - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/691 + - "Add more sweep points for DSR1 FP8 both MTP and non-MTP 1k1k, 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/689 - config-keys: - dsr1-fp8-b300-dynamo-trt @@ -655,23 +590,25 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/690 - config-keys: - - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-b200-dynamo-sglang-mtp description: - - "Add more sweep points for DSR1 FP8 both MTP and non-MTP 1k1k, 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/689 + - "Patches one missing concurrency point for " + - "DSR1 FP8 B200 disaggregated SGLang MTP multinode configuration. 
" + - "Image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/691 - config-keys: - - dsr1-fp8-b200-dynamo-trt + - dsr1-fp4-mi355x-sglang-disagg description: - - "Update max_num_tokens and max_batch_size for min-latency decode workers" - - "See srt-slurm recipe changes: https://github.com/ishandhanani/srt-slurm/pull/173" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/686 + - "Add more sweep points for DSR1 FP4 both MTP and non-MTP 1k1k, 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/692 + - config-keys: - dsr1-fp8-mi325x-sglang description: - "Update MI325X DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.8" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/692 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/695 - config-keys: - dsr1-fp8-mi300x-sglang @@ -679,12 +616,6 @@ - "Update MI300X DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.8" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/696 -- config-keys: - - dsr1-fp4-mi355x-sglang-disagg - description: - - "Add more sweep points for DSR1 FP4 both MTP and non-MTP 1k1k, 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/692 - - config-keys: - dsr1-fp8-b200-dynamo-sglang-mtp description: @@ -739,6 +670,15 @@ - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/734 +- config-keys: + - kimik2.5-int4-b200-vllm + description: + - "Add Kimi-K2.5 INT4 vLLM benchmark for B200" + - "Model: moonshotai/Kimi-K2.5 with --mm-encoder-tp-mode data and --trust-remote-code" + - "Image: vllm/vllm-openai:v0.15.1" + - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/735 + - config-keys: - minimaxm2.5-fp8-mi355x-vllm description: @@ -750,12 +690,13 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/755 - config-keys: - - qwen3.5-fp8-mi355x-sglang + - minimaxm2.5-fp8-b200-vllm description: - - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for MI355X" - - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218" - - "Uses triton attention backend, TP=8, concurrency 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/768 + - "Add MiniMax-M2.5 FP8 vLLM benchmark for B200" + - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" + - "Image: vllm/vllm-openai:v0.17.0" + - "TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/757 - config-keys: - qwen3.5-bf16-b200-sglang @@ -767,12 +708,30 @@ - "Set cuda-graph-max-bs to match concurrency, scheduler-recv-interval based on concurrency" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/758 +- config-keys: + - glm5-fp8-mi355x-sglang + description: + - "Add GLM-5 FP8 SGLang benchmark for MI355X" + - "Model: zai-org/GLM-5-FP8 with NSA tilelang backends" + - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219" + - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/762 + +- config-keys: + - qwen3.5-fp8-mi355x-sglang + description: + - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for MI355X" + - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218" + - "Uses triton attention backend, TP=8, concurrency 4-64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/768 + - config-keys: - 
dsr1-fp8-mi355x-sglang-disagg description: - "Add more configs for MI355X FP8 Disagg" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/770 - + + - config-keys: - gptoss-fp4-mi300x-vllm - gptoss-fp4-mi325x-vllm @@ -781,15 +740,6 @@ - "Gains: ROCm skinny GEMM dispatch fix, MoRI EP all2all backend, KV cache shuffle + paged attention for AITER" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/781 -- config-keys: - - kimik2.5-int4-b200-vllm - description: - - "Add Kimi-K2.5 INT4 vLLM benchmark for B200" - - "Model: moonshotai/Kimi-K2.5 with --mm-encoder-tp-mode data and --trust-remote-code" - - "Image: vllm/vllm-openai:v0.15.1" - - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/735 - - config-keys: - gptoss-fp4-b200-vllm - gptoss-fp4-h100-vllm @@ -800,7 +750,8 @@ - "Gains: CUTLASS MoE optimizations (~8% throughput), FP4 kernel improvements (~4% E2E on B200), torch.compile cold-start fix" - "v0.15.1 includes fix for prefix cache hit rate of 0% on GPT-OSS hybrid attention models" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/789 - + + - config-keys: - dsr1-fp4-mi355x-atom - dsr1-fp4-mi355x-atom-mtp @@ -809,16 +760,16 @@ - "Comment out TP=4 configs, consolidate to TP=8 only" - "Extend concurrency range to conc-end: 256 across all sequence lengths (1k1k, 1k8k, 8k1k)" - "Fix MTP 1k8k conc-start from 256 to 4 to enable full concurrency sweep" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/699 - + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/792 + - config-keys: - - glm5-fp8-mi355x-sglang + - qwen3.5-fp8-b200-sglang description: - - "Add GLM-5 FP8 SGLang benchmark for MI355X" - - "Model: zai-org/GLM-5-FP8 with NSA tilelang backends" - - "Image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219" - - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for B200" + - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" + - "Uses trtllm_mha attention backend and flashinfer_trtllm MOE runner" + - "Enable SGLANG_ENABLE_FLASHINFER_GEMM=true, NCCL_NVLS_ENABLE=1" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/804 - config-keys: - gptoss-fp4-mi300x-vllm @@ -841,12 +792,62 @@ - "Key changes: AITER v0.1.10.post3 with FP8 Prefill/Decode/KV Cache, FP8 prefill attention kernel, MORI EP two-batch overlapping, OOM fix for DeepSeek weight loading" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/816 +- config-keys: + - qwen3.5-fp4-b200-sglang + description: + - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang benchmark config and launch script" + - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" + - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" + - "Configs: 1k1k (TP4 conc 4-128), 8k1k (TP4 conc 4-128)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/820 + +- config-keys: + - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-mi355x-sglang-disagg-mtp + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp + description: + - "Add more sweep configs for MI355X FP8/FP4 Disagg" + - "Add TP/DP/EP size < 8 support " + - "Support DSR1-0528 MTP Disagg" + - "Bump SGL mori image to Feb 27" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823 + +- config-keys: + - kimik2.5-fp4-mi355x-vllm + description: + - "Add Kimi-K2.5 MXFP4 vLLM benchmark for MI355X" + - "Model: amd/Kimi-K2.5-MXFP4 with --mm-encoder-tp-mode data and 
--trust-remote-code" + - "Image: vllm/vllm-openai-rocm:v0.15.1" + - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/825 + +- config-keys: + - minimaxm2.5-fp4-mi355x-vllm + description: + - "Add MiniMax M2.5 MXFP4 vLLM benchmark for MI355X" + - "Model: amd/MiniMax-M2.5-MXFP4 with --trust-remote-code and --block-size=32" + - "Image: vllm/vllm-openai-rocm:v0.19.1" + - "Environment: VLLM_ROCM_USE_AITER=1" + - "Tp=1, TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/827 + - config-keys: - minimaxm2.5-fp8-h200-vllm description: - "Add MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP4)" - "New benchmark script with --trust-remote-code for MiniMaxAI/MiniMax-M2.5" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/831 + +- config-keys: + - minimaxm2.5-fp8-h100-vllm + description: + - "Add MiniMax-M2.5 FP8 vLLM benchmark for H100" + - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" + - "Image: vllm/vllm-openai:v0.16.0" + - "Switch from TP=8/EP=8 to TP=4/EP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k" + - "Script uses conditional --enable-expert-parallel based on EP_SIZE env var" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/832 - config-keys: - minimaxm2.5-fp8-mi325x-vllm @@ -868,22 +869,13 @@ - "TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/837 -- config-keys: - - kimik2.5-fp4-mi355x-vllm - description: - - "Add Kimi-K2.5 MXFP4 vLLM benchmark for MI355X" - - "Model: amd/Kimi-K2.5-MXFP4 with --mm-encoder-tp-mode data and --trust-remote-code" - - "Image: vllm/vllm-openai-rocm:v0.15.1" - - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/825 - - config-keys: - qwen3.5-bf16-mi325x-sglang description: - "Add Qwen3.5-397B-A17B BF16 SGLang benchmark for MI325X" - "Image: lmsysorg/sglang:v0.5.9-rocm720-mi30x" - "Uses triton attention backend, TP=8, concurrency 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/842 - config-keys: - qwen3.5-bf16-mi300x-sglang @@ -894,13 +886,14 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/843 - config-keys: - - qwen3.5-fp8-mi325x-sglang + - kimik2.5-int4-h200-vllm description: - - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark for MI325X" - - "Image: lmsysorg/sglang:v0.5.9-rocm720-mi30x" - - "Following AMD Andy Luo's recipe with triton attention backend" + - "Add Kimi-K2.5 INT4 vLLM benchmark for H200" + - "Model: moonshotai/Kimi-K2.5 with --reasoning-parser kimi_k2 and --trust-remote-code" + - "Image: vllm/vllm-openai:v0.16.0" - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + - "following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/847 - config-keys: - qwen3.5-fp8-mi300x-sglang @@ -912,47 +905,23 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/850 - config-keys: - - kimik2.5-int4-h200-vllm + - qwen3.5-fp8-mi325x-sglang description: - - "Add Kimi-K2.5 INT4 vLLM benchmark for H200" - - "Model: moonshotai/Kimi-K2.5 with --reasoning-parser kimi_k2 and 
--trust-remote-code" - - "Image: vllm/vllm-openai:v0.16.0" + - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark for MI325X" + - "Image: lmsysorg/sglang:v0.5.9-rocm720-mi30x" + - "Following AMD Andy Luo's recipe with triton attention backend" - "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - - "following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/839 - -- config-keys: - - dsr1-fp8-mi355x-sglang-disagg - - dsr1-fp8-mi355x-sglang-disagg-mtp - - dsr1-fp4-mi355x-sglang-disagg - - dsr1-fp4-mi355x-sglang-disagg-mtp - description: - - "Add more sweep configs for MI355X FP8/FP4 Disagg" - - "Add TP/DP/EP size < 8 support " - - "Support DSR1-0528 MTP Disagg" - - "Bump SGL mori image to Feb 27" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/852 - config-keys: - - minimaxm2.5-fp8-h100-vllm + - gptoss-fp4-h200-trt description: - - "Add MiniMax-M2.5 FP8 vLLM benchmark for H100" - - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" - - "Image: vllm/vllm-openai:v0.16.0" - - "Switch from TP=8/EP=8 to TP=4/EP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k" - - "Script uses conditional --enable-expert-parallel based on EP_SIZE env var" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/832 + - "Upgrade TensorRT-LLM container from release:gpt-oss-dev to release:v1.3.0rc5" + - "Remove sed hack for TensorRT bug (fixed upstream in v1.3.0rc5)" + - "Remove enable_block_reuse: false from kv_cache_config (default true is now recommended)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/854 - config-keys: - - qwen3.5-fp8-b200-sglang - description: - - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for B200" - - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" - - "Uses trtllm_mha attention backend and flashinfer_trtllm MOE runner" - - "Enable SGLANG_ENABLE_FLASHINFER_GEMM=true, NCCL_NVLS_ENABLE=1" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/804 - -- config-keys: - qwen3.5-fp8-h200-sglang description: - "Add Qwen 3.5 FP8 H200 SGLang configuration" @@ -961,6 +930,13 @@ - "Server: reasoning-parser qwen3, tool-call-parser qwen3_coder, enable-flashinfer-allreduce-fusion, mem-fraction-static 0.8" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/855 +- config-keys: + - kimik2.5-fp4-b200-vllm + description: + - "Add Kimi K2.5 FP4 B200 vLLM benchmark configuration" + - "Image: vllm/vllm-openai:v0.17.0" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/862 + - config-keys: - dsr1-fp8-mi355x-sglang description: @@ -969,20 +945,18 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/865 - config-keys: - - qwen3.5-bf16-b200-sglang - - qwen3.5-bf16-mi300x-sglang - - qwen3.5-bf16-mi325x-sglang - - qwen3.5-bf16-mi355x-sglang - - qwen3.5-fp8-b200-sglang - - qwen3.5-fp8-h200-sglang - - qwen3.5-fp8-mi300x-sglang - - qwen3.5-fp8-mi325x-sglang - - qwen3.5-fp8-mi355x-sglang + - minimaxm2.5-fp8-h200-vllm description: - - "Redo qwen eval" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/892 - evals-only: true - + - "Extend MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP8)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/869 + + +- config-keys: + - dsr1-fp8-h200-sglang + description: + - "Update H200 DeepSeek R1 FP8 SGLang image from v0.5.7 to v0.5.9" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/887 + - config-keys: - 
gptoss-fp4-mi300x-vllm - gptoss-fp4-mi325x-vllm @@ -994,23 +968,31 @@ - "Add AMDGCN_USE_BUFFER_OPS=0 and VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 env vars" - "Switch to --attention-backend ROCM_AITER_UNIFIED_ATTN and add fuse_rope_kvcache compilation pass" - "Remove deprecated VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION/VLLM_ROCM_USE_AITER_MHA env vars and compilation-config cudagraph_mode" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/867 - + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/889 + - config-keys: - - kimik2.5-fp4-b200-vllm + - qwen3.5-bf16-b200-sglang + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-bf16-mi355x-sglang + - qwen3.5-fp8-b200-sglang + - qwen3.5-fp8-h200-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + - qwen3.5-fp8-mi355x-sglang description: - - "Add Kimi K2.5 FP4 B200 vLLM benchmark configuration" - - "Image: vllm/vllm-openai:v0.17.0" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/862 + - "Redo qwen eval" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/892 + evals-only: true + - config-keys: - - minimaxm2.5-fp8-b200-vllm + - qwen3.5-fp8-b200-sglang-mtp description: - - "Add MiniMax-M2.5 FP8 vLLM benchmark for B200" - - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" - - "Image: vllm/vllm-openai:v0.17.0" - - "TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/757 + - "Add Single Node Agg FP8 MTP config for Qwen3.5 B200 SGLang" + - "EAGLE speculative decoding: num-steps 3, draft-tokens 4, topk 1" + - "New script: benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/898 - config-keys: - dsr1-fp4-mi355x-sglang-disagg @@ -1021,11 +1003,12 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/899 - config-keys: - - minimaxm2.5-fp8-h200-vllm + - kimik2.5-int4-mi325x-vllm description: - - "Extend MiniMax M2.5 FP8 single-node config for H200 with vLLM v0.16.0 (TP8)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/869 - + - "Add Kimi K2.5 INT4 single-node MI325X vLLM benchmark (TP8)" + - "Uses vLLM ROCm v0.16.0 image following AMD Andy Luo's recipe" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/901 + - config-keys: - dsr1-fp8-b200-dynamo-sglang - dsr1-fp8-b200-dynamo-sglang-mtp @@ -1036,28 +1019,111 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/907 - config-keys: - - glm5-fp8-h200-sglang - description: - - "Add GLM-5 FP8 SGLang H200 single-node benchmark" - - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper" - - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh" - - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 - -- config-keys: + # NVIDIA single-node + - dsr1-fp4-b200-sglang + - dsr1-fp4-b200-trt + - dsr1-fp4-b200-trt-mtp + - dsr1-fp8-b200-sglang + - dsr1-fp8-b200-sglang-mtp + - dsr1-fp8-b200-trt + - dsr1-fp8-b200-trt-mtp + - dsr1-fp8-h200-sglang + - dsr1-fp8-h200-trt + - dsr1-fp8-h200-trt-mtp - glm5-fp8-b200-sglang - description: - - "Add GLM-5 FP8 SGLang benchmark for B200" - - "Supports TP8 (low latency) and DEP8 (high throughput) modes with NSA attention backend" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/915 - -- config-keys: + - glm5-fp8-h200-sglang + - gptoss-fp4-b200-trt + - gptoss-fp4-b200-vllm + - gptoss-fp4-h100-vllm + - 
gptoss-fp4-h200-trt + - gptoss-fp4-h200-vllm + - kimik2.5-fp4-b200-vllm + - kimik2.5-int4-b200-vllm + - kimik2.5-int4-h200-vllm + - minimaxm2.5-fp8-b200-vllm + - minimaxm2.5-fp8-h100-vllm + - minimaxm2.5-fp8-h200-vllm + - qwen3.5-bf16-b200-sglang + - qwen3.5-fp8-b200-sglang - qwen3.5-fp8-b200-sglang-mtp + - qwen3.5-fp8-h200-sglang + # AMD single-node + - dsr1-fp4-mi355x-atom + - dsr1-fp4-mi355x-atom-mtp + - dsr1-fp4-mi355x-sglang + - dsr1-fp8-mi325x-sglang + - dsr1-fp8-mi300x-sglang + - dsr1-fp8-mi355x-atom + - dsr1-fp8-mi355x-atom-mtp + - dsr1-fp8-mi355x-sglang + - glm5-fp8-mi355x-sglang + - gptoss-fp4-mi300x-vllm + - gptoss-fp4-mi325x-vllm + - gptoss-fp4-mi355x-atom + - gptoss-fp4-mi355x-vllm + - kimik2.5-fp4-mi355x-vllm + - kimik2.5-int4-mi325x-vllm + - kimik2.5-int4-mi355x-vllm + - minimaxm2.5-fp8-mi300x-vllm + - minimaxm2.5-fp8-mi325x-vllm + - minimaxm2.5-fp8-mi355x-vllm + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-bf16-mi355x-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + - qwen3.5-fp8-mi355x-sglang description: - - "Add Single Node Agg FP8 MTP config for Qwen3.5 B200 SGLang" - - "EAGLE speculative decoding: num-steps 3, draft-tokens 4, topk 1" - - "New script: benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/898 + - "Separate evals, change to 8k1k, fail loudly, 5-shot, top of curve & middle of curve" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/911 + evals-only: true + +- config-keys: + - glm5-fp8-h200-sglang + description: + - "Add GLM-5 FP8 SGLang H200 single-node benchmark" + - "Model: zai-org/GLM-5-FP8, image: lmsysorg/sglang:glm5-hopper" + - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh" + - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914 + +- config-keys: + - glm5-fp8-b200-sglang + description: + - "Add GLM-5 FP8 SGLang benchmark for B200" + - "Supports TP8 (low latency) and DEP8 (high throughput) modes with NSA attention backend" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/915 + + +- config-keys: + - qwen3.5-fp8-b200-sglang + description: + - "Replace FP8 with combination of TP4 and TP8 config" + - "Add --enable-flashinfer-allreduce-fusion to TP8" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/918 + +- config-keys: + - dsr1-fp8-b200-dynamo-trt + - dsr1-fp8-h200-dynamo-trt + - dsr1-fp4-gb200-dynamo-trt + description: + - "Fix metadata inconsistencies in nvidia-master.yaml - TP/EP/DP-attn values now match actual recipe files" + - "B200 FP8 TRT 8K/1K: prefill_ep 8→1 (15 entries), prefill_dp_attn true→false (1 entry)" + - "H200 FP8 TRT 1K/1K: prefill_dp_attn false→true (9 entries)" + - "H200 FP8 TRT 8K/1K: prefill_dp_attn true→false (8 entries)" + - "GB200 FP4 TRT 8K/1K: decode_dp_attn true→false (2 entries)" + - "All fixes are metadata-only; no recipe files were modified" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/919 + +- config-keys: + - kimik2.5-int4-mi325x-vllm + - kimik2.5-int4-mi355x-vllm + - kimik2.5-int4-h200-vllm + - kimik2.5-fp4-mi355x-vllm + - kimik2.5-fp4-b200-vllm + description: + - "Disable prefix caching (--no-enable-prefix-caching) for all Kimi K2.5 benchmarks using random datasets" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/926 - config-keys: - minimaxm2.5-fp8-mi355x-vllm @@ -1092,13 +1158,7 @@ - "Add --exclusive flag to MI355X single-node salloc and multi-node sbatch 
to prevent node sharing during benchmarks" - "Only non-TP8 configs listed; TP8 already uses all GPUs on the node" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/934 - -- config-keys: - - qwen3.5-fp8-b200-sglang - description: - - "Replace FP8 with combination of TP4 and TP8 config" - - "Add --enable-flashinfer-allreduce-fusion to TP8" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/918 + - config-keys: - kimik2.5-int4-b200-vllm @@ -1106,6 +1166,15 @@ - "Enable VLLM_USE_FLASHINFER_MOE_INT4=1 for Kimi K2.5 INT4 B200 benchmark" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/935 +- config-keys: + - kimik2.5-fp4-mi355x-vllm + description: + - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" + - "Enable AITER with INT4 quick reduce; disable AITER RMSNorm for TP < 8 (accuracy)" + - "Add expert parallel, TP4, and TP4/EP4 search spaces" + - "Switch block-size 64 to 1 gpu-memory-utilization 0.95 to 0.90" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/936 + - config-keys: - dsr1-fp4-b200-sglang - dsr1-fp8-b200-sglang @@ -1118,22 +1187,15 @@ - "dsr1-fp8-b200-sglang-mtp: v0.5.8-cu130-amd64 → v0.5.9-cu130" - "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943 - -- config-keys: - - minimaxm2.5-fp8-mi325x-vllm - description: - - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" - - "Replace TP4 with TP8/EP8, add conc range 4-256" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/953 + - config-keys: - - kimik2.5-fp4-mi355x-vllm + - minimaxm2.5-fp8-b200-vllm description: - - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" - - "Enable AITER with INT4 quick reduce; disable AITER RMSNorm for TP < 8 (accuracy)" - - "Add expert parallel, TP4, and TP4/EP4 search spaces" - - "Switch block-size 64 to 1 gpu-memory-utilization 0.95 to 0.90" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/936 + - "Update vLLM image from v0.17.0 to v0.19.0 for MiniMax-M2.5 FP8 B200" + - "Add tp4 ep4 search-space entries (conc 32-256) for all seq-len configs" + - "Remove ISL 1024 / OSL 8192 seq-len config" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947 - config-keys: - kimik2.5-int4-mi355x-vllm @@ -1144,6 +1206,13 @@ - "Add --max-num-seqs 256, remove --disable-log-requests" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/950 +- config-keys: + - minimaxm2.5-fp8-mi325x-vllm + description: + - "Upgrade vLLM ROCm image from v0.16.0 to v0.18.0" + - "Replace TP4 with TP8/EP8, add conc range 4-256" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/953 + - config-keys: - kimik2.5-int4-mi325x-vllm description: @@ -1153,6 +1222,13 @@ - "Add --max-num-seqs 256, remove --disable-log-requests" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/957 +- config-keys: + - minimaxm2.5-fp8-h100-vllm + - minimaxm2.5-fp8-h200-vllm + description: + - "Update vLLM image from v0.16.0 to v0.18.0 for minimax h100 and h200 configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/958 + - config-keys: - gptoss-fp4-h100-vllm - gptoss-fp4-h200-vllm @@ -1160,16 +1236,6 @@ - "Update vLLM image from v0.15.1 to v0.18.0 for gptoss H100 and H200 configs" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/960 -- config-keys: - - kimik2.5-int4-mi325x-vllm - - kimik2.5-int4-mi355x-vllm - - kimik2.5-int4-h200-vllm - - kimik2.5-fp4-mi355x-vllm - - kimik2.5-fp4-b200-vllm - description: - - "Disable prefix caching (--no-enable-prefix-caching) for all Kimi K2.5 
benchmarks using random datasets" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/926 - - config-keys: - minimaxm2.5-fp8-b200-vllm - minimaxm2.5-fp8-h100-vllm @@ -1181,66 +1247,6 @@ - "Disable prefix caching (--no-enable-prefix-caching) for all MiniMax benchmarks using random datasets" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/966 -- config-keys: - # NVIDIA single-node - - dsr1-fp4-b200-sglang - - dsr1-fp4-b200-trt - - dsr1-fp4-b200-trt-mtp - - dsr1-fp8-b200-sglang - - dsr1-fp8-b200-sglang-mtp - - dsr1-fp8-b200-trt - - dsr1-fp8-b200-trt-mtp - - dsr1-fp8-h200-sglang - - dsr1-fp8-h200-trt - - dsr1-fp8-h200-trt-mtp - - glm5-fp8-b200-sglang - - glm5-fp8-h200-sglang - - gptoss-fp4-b200-trt - - gptoss-fp4-b200-vllm - - gptoss-fp4-h100-vllm - - gptoss-fp4-h200-trt - - gptoss-fp4-h200-vllm - - kimik2.5-fp4-b200-vllm - - kimik2.5-int4-b200-vllm - - kimik2.5-int4-h200-vllm - - minimaxm2.5-fp8-b200-vllm - - minimaxm2.5-fp8-h100-vllm - - minimaxm2.5-fp8-h200-vllm - - qwen3.5-bf16-b200-sglang - - qwen3.5-fp8-b200-sglang - - qwen3.5-fp8-b200-sglang-mtp - - qwen3.5-fp8-h200-sglang - # AMD single-node - - dsr1-fp4-mi355x-atom - - dsr1-fp4-mi355x-atom-mtp - - dsr1-fp4-mi355x-sglang - - dsr1-fp8-mi325x-sglang - - dsr1-fp8-mi300x-sglang - - dsr1-fp8-mi355x-atom - - dsr1-fp8-mi355x-atom-mtp - - dsr1-fp8-mi355x-sglang - - glm5-fp8-mi355x-sglang - - gptoss-fp4-mi300x-vllm - - gptoss-fp4-mi325x-vllm - - gptoss-fp4-mi355x-atom - - gptoss-fp4-mi355x-vllm - - kimik2.5-fp4-mi355x-vllm - - kimik2.5-int4-mi325x-vllm - - kimik2.5-int4-mi355x-vllm - - minimaxm2.5-fp8-mi300x-vllm - - minimaxm2.5-fp8-mi325x-vllm - - minimaxm2.5-fp8-mi355x-vllm - - qwen3.5-bf16-mi300x-sglang - - qwen3.5-bf16-mi325x-sglang - - qwen3.5-bf16-mi355x-sglang - - qwen3.5-fp8-mi300x-sglang - - qwen3.5-fp8-mi325x-sglang - - qwen3.5-fp8-mi355x-sglang - description: - - "Separate evals, change to 8k1k, fail loudly, 5-shot, top of curve & middle of curve" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/911 - evals-only: true - - config-keys: - qwen3.5-bf16-mi300x-sglang - qwen3.5-bf16-mi325x-sglang @@ -1258,6 +1264,13 @@ - "Image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/973 +- config-keys: + - kimik2.5-int4-mi300x-vllm + description: + - "Add Kimi K2.5 INT4 single-node MI300X vLLM benchmark (TP8)" + - "Uses vLLM ROCm v0.18.0 image following AMD Andy Luo's recipe" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/975 + - config-keys: - dsr1-fp8-mi355x-atom-mtp description: @@ -1271,53 +1284,62 @@ description: - "New model support on ATOM framework" - "Kimi-K2.5 FP4, and MiniMax-M2.5 FP8 configs added for MI355X ATOM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/963 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/992 - config-keys: - - minimaxm2.5-fp8-b200-vllm + - minimaxm2.5-fp4-b200-vllm description: - - "Update vLLM image from v0.17.0 to v0.19.0 for MiniMax-M2.5 FP8 B200" - - "Add tp4 ep4 search-space entries (conc 32-256) for all seq-len configs" - - "Remove ISL 1024 / OSL 8192 seq-len config" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947 + - "Optimize MiniMax-M2.5 NVFP4 B200 vLLM search-space" + - "Expand from tp2/tp4 to tp1/tp2/tp4/tp8 with expert parallel and dp-attn variants" + - "Add ep2, ep4, and dp-attn configurations for higher concurrency sweeps" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/996 - config-keys: - - minimaxm2.5-fp8-mi355x-vllm + - 
dsr1-fp4-b200-dynamo-trt + - dsr1-fp8-b200-dynamo-trt + - dsr1-fp4-b200-dynamo-sglang + - dsr1-fp8-b200-dynamo-sglang + - dsr1-fp8-b200-dynamo-sglang-mtp + - dsr1-fp4-b200-dynamo-sglang-mtp + - dsr1-fp4-b300-dynamo-trt + - dsr1-fp8-b300-dynamo-trt + - dsr1-fp4-gb300-dynamo-trt + - dsr1-fp8-gb300-dynamo-trt + - dsr1-fp4-gb300-dynamo-sglang + - dsr1-fp8-gb300-dynamo-sglang + - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-mi355x-sglang-disagg-mtp + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp description: - - "Optimize MiniMax-M2.5 FP8 MI355X vLLM search-space" - - "Add tp2 ep2 search-space entries (conc 2-256) for all seq-len configs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1002 + - "Add multi-node lm-eval accuracy runs" + - "Eval picks the config with highest max eligible concurrency per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) group on 8k1k" + - "Eval concurrency set to the median eligible conc (>= MIN_EVAL_CONC=16) of the selected config to avoid OOM" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1000 + evals-only: true - config-keys: - - minimaxm2.5-fp8-mi355x-vllm + - qwen3.5-fp8-h200-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1001 + +- config-keys: + - minimaxm2.5-fp8-mi355x-vllm description: - "Optimize MiniMax-M2.5 FP8 MI355X vLLM search-space" - "Add tp2 ep2 search-space entries (conc 2-256) for all seq-len configs" - "Upgrade vLLM image to v0.19.0" - "Enable FP8 KV cache + AITER FA for minimaxm2.5-fp8-mi355x-vllm" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1003 - -- config-keys: - - minimaxm2.5-fp4-mi355x-vllm - description: - - "Add MiniMax M2.5 MXFP4 vLLM benchmark for MI355X" - - "Model: amd/MiniMax-M2.5-MXFP4 with --trust-remote-code and --block-size=32" - - "Image: vllm/vllm-openai-rocm:v0.19.1" - - "Environment: VLLM_ROCM_USE_AITER=1" - - "Tp=1, TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/827 -- config-keys: - - qwen3.5-fp8-h200-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1001 - config-keys: - - glm5-fp8-mi355x-atom + - qwen3.5-fp4-mi355x-sglang description: - - "GLM5 FP8 configs added for MI355X ATOM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1009 + - "Qwen3.5 fp4 support on SGL" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1006 + - config-keys: - kimik2.5-fp4-gb200-dynamo-vllm @@ -1333,37 +1355,16 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1008 - config-keys: - - minimaxm2.5-fp8-b200-vllm - description: - - "Update MiniMax-M2.5 FP8 B200 config with new search spaces" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1010 - -- config-keys: - - minimaxm2.5-fp4-b200-vllm - description: - - "Optimize MiniMax-M2.5 NVFP4 B200 vLLM search-space" - - "Expand from tp2/tp4 to tp1/tp2/tp4/tp8 with expert parallel and dp-attn variants" - - "Add ep2, ep4, and dp-attn configurations for higher concurrency sweeps" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/996 - -- config-keys: - - qwen3.5-fp4-b200-sglang + - glm5-fp8-mi355x-atom description: - - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang benchmark config and launch script" - - "Image: 
lmsysorg/sglang:v0.5.9-cu129-amd64" - - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" - - "Configs: 1k1k (TP4 conc 4-128), 8k1k (TP4 conc 4-128)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/820 + - "GLM5 FP8 configs added for MI355X ATOM" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1009 - config-keys: - - qwen3.5-bf16-mi300x-sglang - - qwen3.5-bf16-mi325x-sglang - - qwen3.5-fp8-mi300x-sglang - - qwen3.5-fp8-mi325x-sglang + - minimaxm2.5-fp8-b200-vllm description: - - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI300X and MI325X to achieve better performance" - - "Use lmsysorg/sglang:v0.5.10-rocm720-mi30x" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1063 + - "Update MiniMax-M2.5 FP8 B200 config with new search spaces" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1010 - config-keys: - glm5-fp4-b200-sglang @@ -1374,32 +1375,26 @@ - "Tune mem-fraction-static to 0.9, chunked-prefill-size to 32768, add tokenizer-worker-num 6" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1011 -- config-keys: - - qwen3.5-fp4-mi355x-sglang - description: - - "Qwen3.5 fp4 support on SGL" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1006 - -- config-keys: - - gptoss-fp4-h200-trt - description: - - "Upgrade TensorRT-LLM container from release:gpt-oss-dev to release:v1.3.0rc5" - - "Remove sed hack for TensorRT bug (fixed upstream in v1.3.0rc5)" - - "Remove enable_block_reuse: false from kv_cache_config (default true is now recommended)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/854 - - config-keys: - glm5-fp8-b200-sglang description: - "Bump GLM-5 FP8 B200 SGLang concurrency from 128 to 256" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1012 - + + +- config-keys: + - qwen3.5-fp8-h200-sglang-mtp + description: + - "Enable SGLANG_ENABLE_SPEC_V2=1 for Qwen3.5 FP8 H200 SGLang MTP" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1017 + - config-keys: - qwen3.5-fp4-mi355x-sglang description: - "TP2/TP4 seach space exploration for Qwen3.5 fp4 on SGL" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1022 - + + - config-keys: - glm5-fp8-mi355x-sglang description: @@ -1408,10 +1403,28 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1023 - config-keys: - - qwen3.5-fp8-h200-sglang-mtp + - kimik2.5-fp4-gb200-dynamo-trt description: - - "Enable SGLANG_ENABLE_SPEC_V2=1 for Qwen3.5 FP8 H200 SGLang MTP" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1017 + - "Add Kimi K2.5 NVFP4 GB200 disaggregated TRT-LLM benchmarks via Dynamo (14 STP configs)" + - "New framework: dynamo-trt (Dynamo frontend + TensorRT-LLM backend)" + - "Container: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" + - "Runner script updated to support kimik2.5 model prefix with dynamo-trt framework" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1026 + +- config-keys: + - glm5-fp4-b200-sglang + description: + - "Update SGLang image from nightly-dev-cu13-20260328-a27651d5 to v0.5.10.post1-cu130" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1031 + +- config-keys: + - qwen3.5-fp8-b300-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B-FP8 B300 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "EAGLE speculative decoding with MTP, TP=4, concurrency 4-256 for 1k1k and 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1035 - 
config-keys: - qwen3.5-fp8-mi355x-sglang @@ -1424,18 +1437,33 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1036 - config-keys: - - glm5-fp4-b200-sglang + - qwen3.5-fp8-mi355x-atom + - qwen3.5-fp8-mi355x-atom-mtp description: - - "Update SGLang image from nightly-dev-cu13-20260328-a27651d5 to v0.5.10.post1-cu130" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1031 + - "Add Qwen3.5-397B-A17B FP8 MI355X ATOM benchmark configs with and without MTP" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1040 + - config-keys: - - qwen3.5-fp8-b300-sglang-mtp + - qwen3.5-fp4-mi355x-sglang description: - - "Add Qwen3.5-397B-A17B-FP8 B300 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "EAGLE speculative decoding with MTP, TP=4, concurrency 4-256 for 1k1k and 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1035 + - "Update SGLang image from 'lmsysorg/sglang:v0.5.10-rocm720-mi35x' to 'rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413'" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1041 + +- config-keys: + - glm5.1-fp4-mi355x-atom + description: + - "Add GLM-5.1 MXFP4 single-node MI355X ATOM benchmark" + - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post" + - "TP=2 and TP=4, concurrency 4-256 for 1k1k and 8k1k sequence lengths" + - "Add --max-num-seqs and --gpu-memory-utilization 0.9 to server launch" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1043 + +- config-keys: + - kimik2.5-fp4-b200-vllm + description: + - "Add kv-cache-dtype fp8, max-cudagraph-capture-size 2048, max-num-batched-tokens, and stream-interval 20 to server launch args" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1047 - config-keys: - qwen3.5-fp8-b300-sglang @@ -1469,6 +1497,22 @@ - "At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1 does not have a B300-specific recipe, so this reuses the existing GLM5 FP8 B200 SGLang recipe as-is" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1051 +- config-keys: + - minimaxm2.5-fp8-b300-vllm + description: + - "Add MiniMax-M2.5 FP8 B300 vLLM benchmark" + - "Image: vllm/vllm-openai:v0.19.0-cu130" + - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP8 B200 vLLM recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1054 + +- config-keys: + - minimaxm2.5-fp4-b300-vllm + description: + - "Add MiniMax-M2.5 FP4 (NVFP4) B300 vLLM benchmark" + - "Image: vllm/vllm-openai:v0.19.0-cu130" + - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP4 B200 vLLM recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1055 + - config-keys: - glm5-fp4-b300-sglang description: @@ -1487,59 +1531,33 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1059 - config-keys: - - minimaxm2.5-fp4-b300-vllm - description: - - "Add MiniMax-M2.5 FP4 (NVFP4) B300 vLLM benchmark" - - "Image: vllm/vllm-openai:v0.19.0-cu130" - - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP4 B200 vLLM recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1055 - -- config-keys: - - 
minimaxm2.5-fp8-b300-vllm + - gptoss-fp4-mi300x-vllm description: - - "Add MiniMax-M2.5 FP8 B300 vLLM benchmark" - - "Image: vllm/vllm-openai:v0.19.0-cu130" - - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP8 B200 vLLM recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1054 + - "Expand GPT-OSS 120B FP4 MI300X TP=1 concurrency from 64 to 256 for 1k1k" + - "Higher concurrency improves MoE weight amortization: 8552 total TPS at conc=256 vs 4016 at conc=64 (2.1x)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1061 - config-keys: - - kimik2.5-fp4-b300-vllm + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang description: - - "Add Kimi-K2.5 FP4 (NVFP4) B300 vLLM benchmark" - - "Image: vllm/vllm-openai:v0.19.0-cu130" - - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html does not have a B300-specific recipe, so this reuses the existing Kimi-K2.5 FP4 B200 vLLM recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1056 + - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI300X and MI325X to achieve better performance" + - "Use lmsysorg/sglang:v0.5.10-rocm720-mi30x" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1063 - config-keys: - - gptoss-fp4-mi300x-vllm + - minimaxm2.5-fp8-b200-vllm description: - - "Expand GPT-OSS 120B FP4 MI300X TP=1 concurrency from 64 to 256 for 1k1k" - - "Higher concurrency improves MoE weight amortization: 8552 total TPS at conc=256 vs 4016 at conc=64 (2.1x)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1053 + - "Add VLLM_FLOAT32_MATMUL_PRECISION=high, remove VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1068 - config-keys: - - dsr1-fp4-b200-dynamo-trt - - dsr1-fp8-b200-dynamo-trt - - dsr1-fp4-b200-dynamo-sglang - - dsr1-fp8-b200-dynamo-sglang - - dsr1-fp8-b200-dynamo-sglang-mtp - - dsr1-fp4-b200-dynamo-sglang-mtp - - dsr1-fp4-b300-dynamo-trt - - dsr1-fp8-b300-dynamo-trt - - dsr1-fp4-gb300-dynamo-trt - - dsr1-fp8-gb300-dynamo-trt - - dsr1-fp4-gb300-dynamo-sglang - - dsr1-fp8-gb300-dynamo-sglang - - dsr1-fp8-mi355x-sglang-disagg - - dsr1-fp8-mi355x-sglang-disagg-mtp - - dsr1-fp4-mi355x-sglang-disagg - - dsr1-fp4-mi355x-sglang-disagg-mtp + - minimaxm2.5-fp4-b200-vllm description: - - "Add multi-node lm-eval accuracy runs" - - "Eval picks the config with highest max eligible concurrency per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) group on 8k1k" - - "Eval concurrency set to the median eligible conc (>= MIN_EVAL_CONC=16) of the selected config to avoid OOM" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1000 - evals-only: true + - "Add VLLM_FLOAT32_MATMUL_PRECISION=high" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1069 - config-keys: - qwen3.5-fp4-b300-sglang @@ -1550,17 +1568,7 @@ - "Follows the SGLang cookbook recipe at https://cookbook.sglang.io/autoregressive/Qwen/Qwen3.5 as of 2026-04-17" - "Mirrors the B200 FP4 recipe with mem-fraction-static lowered to 0.8 and an extra TP2/EP2 search-space entry" - "Configs: 1k1k and 8k1k, TP4/EP1 conc 4-128 + TP2/EP2 conc 4-128" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - qwen3.5-bf16-b300-sglang - 
description: - - "Add Qwen3.5-397B-A17B BF16 B300 SGLang benchmark" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "Model: Qwen/Qwen3.5-397B-A17B" - - "Mirrors the B200 BF16 recipe with an extra TP4/EP1 search-space entry alongside the existing TP8/EP1 sweep" - - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-64 + TP4/EP1 conc 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1072 - config-keys: - qwen3.5-bf16-b200-sglang-mtp @@ -1570,7 +1578,47 @@ - "Model: Qwen/Qwen3.5-397B-A17B" - "Mirrors the qwen3.5-bf16-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-64 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1074 + +- config-keys: + - qwen3.5-fp4-b200-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6" + - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" + - "Mirrors the qwen3.5-fp4-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k and 8k1k, TP=4/EP=1 conc 4-128 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1075 + +- config-keys: + - qwen3.5-fp8-mi355x-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B FP8 MI355X SGLang MTP benchmark" + - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414" + - "Model: Qwen/Qwen3.5-397B-A17B-FP8" + - "Mirrors the qwen3.5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k (TP8/EP1, TP8/EP8, TP2/EP2) and 8k1k (TP2/EP2, TP4/EP1) with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1076 + +- config-keys: + - qwen3.5-bf16-mi355x-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B BF16 MI355X SGLang MTP benchmark" + - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" + - "Model: Qwen/Qwen3.5-397B-A17B" + - "Mirrors the qwen3.5-bf16-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1077 + +- config-keys: + - qwen3.5-bf16-b300-sglang + description: + - "Add Qwen3.5-397B-A17B BF16 B300 SGLang benchmark" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "Model: Qwen/Qwen3.5-397B-A17B" + - "Mirrors the B200 BF16 recipe with an extra TP4/EP1 search-space entry alongside the existing TP8/EP1 sweep" + - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-64 + TP4/EP1 conc 4-64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1081 - config-keys: - qwen3.5-bf16-b300-sglang-mtp @@ -1580,7 +1628,7 @@ - "Model: Qwen/Qwen3.5-397B-A17B" - "Mirrors the qwen3.5-bf16-b300-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-64 + TP4/EP1 conc 4-64, spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1082 - config-keys: - qwen3.5-fp4-b300-sglang-mtp @@ -1590,7 +1638,7 @@ - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" - "Mirrors the qwen3.5-fp4-b300-sglang non-MTP recipe 
and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - "Configs: 1k1k and 8k1k, TP4/EP1 conc 4-128 + TP2/EP2 conc 4-128, spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1083 - config-keys: - glm5-fp8-b300-sglang-mtp @@ -1600,17 +1648,7 @@ - "Model: zai-org/GLM-5-FP8" - "Mirrors the glm5-fp8-b300-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - qwen3.5-bf16-mi355x-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B BF16 MI355X SGLang MTP benchmark" - - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - - "Model: Qwen/Qwen3.5-397B-A17B" - - "Mirrors the qwen3.5-bf16-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1084 - config-keys: - glm5-fp8-b200-sglang-mtp @@ -1620,27 +1658,7 @@ - "Model: zai-org/GLM-5-FP8" - "Mirrors the glm5-fp8-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - glm5-fp4-b300-sglang-mtp - description: - - "Add GLM-5 NVFP4 B300 SGLang MTP benchmark (draft)" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "Model: nvidia/GLM-5-NVFP4" - - "Follows the glm5-fp8-b300-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - qwen3.5-fp8-mi355x-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B FP8 MI355X SGLang MTP benchmark" - - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414" - - "Model: Qwen/Qwen3.5-397B-A17B-FP8" - - "Mirrors the qwen3.5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k (TP8/EP1, TP8/EP8, TP2/EP2) and 8k1k (TP2/EP2, TP4/EP1) with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1085 - config-keys: - glm5-fp8-mi355x-sglang-mtp @@ -1650,17 +1668,7 @@ - "Model: zai-org/GLM-5-FP8" - "Mirrors the glm5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP=8 conc 4-64 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - qwen3.5-fp4-b200-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6" - - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" - - "Mirrors the qwen3.5-fp4-b200-sglang non-MTP recipe and adds EAGLE speculative decoding 
(num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k and 8k1k, TP=4/EP=1 conc 4-128 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1086 - config-keys: - glm5-fp4-b200-sglang-mtp @@ -1670,7 +1678,17 @@ - "Model: nvidia/GLM-5-NVFP4" - "Follows the glm5-fp8-b200-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1087 + +- config-keys: + - glm5-fp4-b300-sglang-mtp + description: + - "Add GLM-5 NVFP4 B300 SGLang MTP benchmark (draft)" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "Model: nvidia/GLM-5-NVFP4" + - "Follows the glm5-fp8-b300-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" + - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1088 - config-keys: - gptoss-fp4-mi300x-vllm @@ -1679,12 +1697,6 @@ - "low-latency endpoint for users prioritizing interactive single-user use cases (chat, copilot, agentic)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1092 -- config-keys: - - kimik2.5-fp4-b200-vllm - description: - - "Add kv-cache-dtype fp8, max-cudagraph-capture-size 2048, max-num-batched-tokens, and stream-interval 20 to server launch args" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1047 - - config-keys: - dsr1-fp8-h200-dynamo-trt - dsr1-fp8-h200-dynamo-sglang @@ -1693,6 +1705,21 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1094 evals-only: true +- config-keys: + - glm5.1-fp4-mi355x-sglang + description: + - "Add GLM5.1 MXFP4 (FP4) MI355X SGLang Support" + - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1098 + +- config-keys: + - kimik2.5-fp4-b300-vllm + description: + - "Add Kimi-K2.5 FP4 (NVFP4) B300 vLLM benchmark" + - "Image: vllm/vllm-openai:v0.19.0-cu130" + - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html does not have a B300-specific recipe, so this reuses the existing Kimi-K2.5 FP4 B200 vLLM recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1100 + - config-keys: - minimaxm2.5-fp8-b300-vllm description: @@ -1706,16 +1733,11 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1107 - config-keys: - - minimaxm2.5-fp8-b200-vllm - description: - - "Add VLLM_FLOAT32_MATMUL_PRECISION=high, remove VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1068 - -- config-keys: - - minimaxm2.5-fp4-b200-vllm + - dsr1-fp8-h100-dynamo-trt + - dsr1-fp8-h100-dynamo-sglang description: - - "Add VLLM_FLOAT32_MATMUL_PRECISION=high" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1069 + - "Trigger H100 multinode evals after dist-timeout and health-check timeout fixes" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1119 - config-keys: - dsr1-fp8-h100-dynamo-trt @@ -1723,4 +1745,296 @@ description: - "Trigger H100 multinode evals after NVSHEMM fixes" pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1120 - evals-only: true \ No newline at end of file + evals-only: true + +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "Add DeepSeek-V4-Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (1k/1k sweep; 8k/1k currently commented out)" + - "Container: vllm/vllm-openai:deepseekv4-cu130; model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)" + - "Topologies: low-conc 1p1d-dep8-tep8 (4 nodes, mirrored from NVIDIA srt-slurm PR #71 with offload kept and numa-bind dropped); mid 1p1d-dep8-dep16 (6 nodes) and high 3p1d-dep8-dep16 (10 nodes) hand-rolled, structurally derived from the kimi-k2.5 1k/1k pattern" + - "Recipes stored under benchmarks/multi_node/srt-slurm-recipes/ and overlaid onto the upstream srt-slurm checkout at runtime" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129 + + +- config-keys: + - dsv4-fp8-h200-vllm + description: + - "Add DeepSeek-V4-Pro vLLM H200 benchmark per https://vllm.ai/blog/deepseek-v4" + - "Image: vllm/vllm-openai:deepseekv4-cu129" + - "Model: deepseek-ai/DeepSeek-V4-Pro" + - "EP + DP=8, FP8 KV cache, block size 256, max-model-len 800000, prefix caching disabled" + - "H200 has no FP4 path, so --attention_config.use_fp4_indexer_cache is omitted" + - "VLLM_ENGINE_READY_TIMEOUT_S=3600 to accommodate large weight loading" + - "Configs: 1k1k conc 4-64, 8k1k conc 4-64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1130 + +- config-keys: + - dsv4-fp4-b200-sglang + description: + - "Add DeepSeek-V4-Pro single-node B200 SGLang benchmark (TP8, EP8, dp-attention)" + - "Container: lmsysorg/sglang:deepseek-v4-blackwell" + - "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + - "Parallelism and sweep conc ranges match the dsv4-fp4-b200-vllm config" + - "Prefix caching and speculative decoding disabled for baseline numbers" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131 + +- config-keys: + - dsv4-fp8-mi355x-sglang + description: + - "Day 0 DeepSeek-V4-Pro FP8 MI355X SGLang benchmark" + - "Image: rocm/sgl-dev:deepseek-v4-mi35x (from sgl-project/sglang#23608)" + - "Model: sgl-project/DeepSeek-V4-Pro-FP8" + - "https://github.com/sgl-project/sglang/pull/23608#issuecomment-4311952977" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1134 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark (low-latency fallback)" + - "Image: lmsysorg/sglang:deepseek-v4-b300" + - "Model: deepseek-ai/DeepSeek-V4-Pro" + - "Low-latency only (TP=8, EP=1, no DP-attn, no DeepEP) — DeepEP FP8 weight-postprocess path is broken for this checkpoint on B300" + - "Prefix caching disabled, no speculative decoding" + - "Configs: 1k1k conc 4-1024, 8k1k conc 4-512" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1143 + +- config-keys: + - dsv4-fp4-b300-vllm + description: + - "Add DeepSeek-V4-Pro single-node B300 vLLM aggregate benchmark" + - "Image: vllm/vllm-openai:deepseekv4-cu130" + - "Model: deepseek-ai/DeepSeek-V4-Pro" + - "Uses the submitted B300 pareto schedule for both 1k1k and 8k1k, excluding conc 1: TP8 at conc 4/128, TP4 at conc 4/8/16/32/64/128, DP4 at conc 256/512" + - "Launch args match the provided vllm serve command, including FP4 indexer cache, FULL_AND_PIECEWISE cudagraph config, and max-num-batched-tokens 2048" + - "1k1k uses --max-model-len 4096; 8k1k uses the workflow-provided benchmark context length" + pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1144 + +- config-keys: + - dsv4-fp8-mi355x-sglang + description: + - "Bump MI355X SLURM allocation from --time=180 to --time=300 in runners/launch_mi355x-amds.sh" + - "DSv4-Pro on MI355X exceeded the 3h cap (STEP CANCELLED DUE TO TIME LIMIT) due to ~30min MoE JIT compile plus slow torch-fallback kernels (SGLANG_HACK_FLASHMLA_BACKEND=torch et al.) from sgl-project/sglang#23608" + - "300 minutes matches the GH Actions outer timeout-minutes cap in benchmark-tmpl.yml" + - "Retriggering dsv4-fp8-mi355x-sglang" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1148 + +- config-keys: + - dsv4-fp8-mi355x-sglang + description: + - "Drop --mem-fraction-static 0.88 and --max-total-tokens from dsv4_fp8_mi355x.sh" + - "Bump --chunked-prefill-size from 4096 to 8192" + - "Retrigger dsv4-fp8-mi355x-sglang" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1160 + +- config-keys: + - dsv4-fp4-mi355x-atom + description: + - "Add DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 marker (single-sequence, TP=8, conc=1)" + - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base); ROCm/ATOM#650 overlaid at runtime via pip install --no-deps -e . from a pinned PR SHA (cdbff35) inside the benchmark script" + - "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to ROCm/triton@e491726 (RI3.5.x), which has matmul_ogs.py and routing.py (PR #650 imports both — upstream triton-lang/triton refactored matmul_ogs into matmul.py and removed routing) plus CDNA4MXScaleLayout and a target_info.py compatible with the image's bundled triton" + - "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us" + - "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)" + - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1165 + +- config-keys: + - dsv4-fp4-mi355x-atom + description: + - "Add DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 marker (single-sequence, TP=8, conc=1)" + - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base); ROCm/ATOM#650 overlaid at runtime via pip install --no-deps -e . 
from a pinned PR SHA (cdbff35) inside the benchmark script" + - "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to ROCm/triton@e491726 (RI3.5.x), which has matmul_ogs.py and routing.py (PR #650 imports both — upstream triton-lang/triton refactored matmul_ogs into matmul.py and removed routing) plus CDNA4MXScaleLayout and a target_info.py compatible with the image's bundled triton" + - "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us" + - "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)" + - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1170 + +- config-keys: + - dsv4-fp4-b300-sglang-mtp + description: + - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" + - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)" + - "Model: deepseek-ai/DeepSeek-V4-Pro" + - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4" + - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" + - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" + - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166 + +- config-keys: + - dsv4-fp4-b300-vllm + description: + - "Update search space based on B300 pareto sweep results" + - "ISL=1024: TP4 conc 4-128; DP4 (dp-attn) conc 256-4096; DP8 (dp-attn) conc 2048-8192" + - "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Recipe-per-CONC split for DeepSeek-V4-Pro on B300: low-latency (TP=8, EP=1), balanced (TP=4, EP=1) at conc=32, max-throughput (TP=4, EP=4, DP-attn + DeepEP) at conc=512, for both 1k1k and 8k1k" + - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + - "Image pinned to lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3" + - "DP-attention path enables SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 for better SWA eviction behavior" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1185 + +- config-keys: + - dsv4-fp4-b200-sglang + description: + - "Two-recipe dispatch for DeepSeek-V4-Pro on B200, selected by DP_ATTENTION knob: low-latency (TP=8, EP=1, flashinfer_mxfp4) for conc 1-32, DP-attention (TP=8, EP=8, DP-attn + DeepEP + mega_moe) for conc 64-{512,1024}. The DP-attention recipe uses identical flags across balanced and max-throughput CONC ranges; only --max-running-requests scales with CONC." 
+ - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + - "Image pinned to lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b" + - "Adds SGLANG_OPT_* env knobs (SWA_SPLIT_LEAF_ON_INSERT, USE_JIT_NORM, USE_JIT_INDEXER_METADATA, USE_TOPK_V2, USE_CUSTOM_ALL_REDUCE_V2)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1187 + +- config-keys: + - dsv4-fp4-b300-sglang-mtp + description: + - "Pass --dsv4 to run_benchmark_serving so MTP benchmarks use the DSv4 chat template (PR #1153)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1182 + +- config-keys: + - dsv4-fp4-b300-vllm + description: + - Add low-latency configs and remove non-pareto configs + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1193 + +- config-keys: + - dsv4-fp4-b200-vllm + description: + - "Add DeepSeek-V4-Pro single-node B200 vLLM benchmark derived from B200 pareto sweep" + - "ISL=1024: TP8 conc 4-128; DP8 (dp-attn) conc 256-4096" + - "ISL=8192: TP8 conc 4-32; DP8 (dp-attn) conc 64-1024" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1156 + +- config-keys: + - dsv4-fp4-b300-sglang-mtp + description: + - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" + - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)" + - "Model: deepseek-ai/DeepSeek-V4-Pro" + - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4" + - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" + - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" + - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180 + +- config-keys: + - dsv4-fp8-mi355x-vllm + description: + - "Add vLLM DeepSeek-V4-Pro FP8 benchmark for MI355X with AITER-accelerated MLA decode (vllm-project/vllm#40889, stacked on #40871)" + - "Base image rocm/atom:rocm7.2.2 (MI355X ROCm 7.2.2, aiter with MLA decode); vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime via --no-deps overlay" + - "Key flags: --enforce-eager, --moe-backend triton_unfused, --kv-cache-dtype fp8, VLLM_ROCM_USE_AITER=1" + - "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k" + - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.075, tokenizer-workers 8" + - "1k1k conc=512/1024: add mega_moe deepep backend with cuda-graph-max-bs 550, chunked-prefill 16384, max-running-requests 768" + - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 ep=4 entries" + - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179 + +- config-keys: + - dsv4-fp4-mi355x-atom + description: + - "Use ROCm/aiter#2916 mhc_pre device-allocation fix instead of disabling ATOM mhc_pre" + - 
"Patch installed aiter/ops/mhc.py at runtime to allocate mhc_pre intermediates on residual.device, preserving the aiter MHC fast path without rebuilding aiter" + - "Remove the ATOM deepseek_v4.py sed workaround that forced mhc_pre to torch fallback" + - "Keep dsv4-fp4-mi355x-atom at CONC=1 only; run 24953107645 showed high-concurrency DSv4 ATOM OOMs in PR #650 torch sparse-attention fallbacks before upstream AITER sparse-attention support lands" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1202 + +- config-keys: + - dsv4-fp4-b300-vllm-mtp + description: + - "Add preliminary vLLM MTP configs for DeepSeek-V4-Pro on B300" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1210 + +- config-keys: + - dsv4-fp4-b200-vllm + description: + - "Pin image to vllm/vllm-openai:v0.20.0-cu130 (was floating deepseekv4-cu130 tag); DeepGEMM is preinstalled in this image" + - "Use --attention_config.use_fp4_indexer_cache=True and --compilation-config {\"cudagraph_mode\": \"FULL_AND_PIECEWISE\", \"custom_ops\": [\"all\"]} for all configs" + - "Gate --moe-backend deep_gemm_mega_moe and --gpu-memory-utilization 0.85 on DP_ATTENTION=true per the v0.20.0 recipe" + - "Drop --pipeline-parallel-size 1; keep --no-enable-prefix-caching and --max-cudagraph-capture-size 2048" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1204 + +- config-keys: + - minimaxm2.5-fp4-mi355x-atom + description: + - "Add MiniMax-M2.5 MXFP4 MI355X Atom benchmark (rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post)" + - "Single-node sweep: TP1–TP8, 1k/1k and 8k/1k ISL/OSL" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1042 + +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "DSV4-Pro FP4 GB200 dynamo-vLLM disagg against srt-slurm aflowers/vllm-gb200-v0.20.0" + - "Keeps the three validated 8k/1k points: low-latency 1P/1D TP8 conc=1, mid-curve 1P/1D DEP8 conc=256, and max-tpt 3P/1D DEP8 conc=4096" + - "All three recipes run NATS/etcd on a dedicated infra node and use compute-node local NVMe model weights via /mnt/numa1/models/deepseek-v4-pro/" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1163 + +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "Add GB200 Dynamo vLLM MegaMOE max-throughput recipe at conc=4096" + - "Topology matches max-tpt: 3 prefill DEP8 workers and 1 decode DEP8 worker with dedicated NATS/etcd" + - "Uses deep_gemm_mega_moe on prefill/decode, TORCH_SYMMMEM=NVSHMEM, and no offload" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218 + +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "Add GB200 Dynamo vLLM low-middle curve recipe at conc=256/512" + - "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd" + - "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Add conc=8192 recipe for 1k1k: deepep mega_moe backend with cuda-graph-max-bs 1088, max-running-requests 8192, mem-fraction-static 0.80, swa-full-tokens-ratio 0.3, tokenizer-worker-num 16" + - "conc=8192 enables SGLANG_OPT_USE_ONLINE_COMPRESS=1 and --stream-interval 30" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1209 + +- config-keys: + - dsv4-fp4-b300-vllm + description: + - "Change image to vllm/vllm-openai:v0.20.0-cu130" + - "Use Mega MoE for DEP configs" + pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1221
+
+- config-keys:
+  - dsv4-fp4-b200-vllm-mtp
+  description:
+    - "Add preliminary vLLM MTP configs for DeepSeek-V4-Pro on B200"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1230
+
+- config-keys:
+  - dsv4-fp4-gb200-dynamo-vllm
+  description:
+    - "Keep the GB200 Dynamo vLLM MegaMOE max-throughput recipe at 3P/1D DEP8 conc=4096"
+    - "Add GB200 Dynamo vLLM MegaMOE high-throughput recipe at 2P/1D DEP8 conc=4096"
+    - "Add GB200 Dynamo vLLM MegaMOE mid-curve recipe at 1P/1D DEP8 conc=256/512/1024"
+    - "Remove stale offload recipe copies and the old no-MegaMOE mid/max-throughput points from the GB200 Dynamo vLLM matrix"
+    - "Disable FlashInfer autotune on GB200 decode workers for accuracy stability, matching the srt-slurm recipe fix"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1223
+
+- config-keys:
+  - dsv4-fp4-gb300-dynamo-sglang
+  description:
+    - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated SGLang benchmarks via Dynamo (1k/1k sweep; 8k/1k recipes shipped but commented out)"
+    - "Container: lmsysorg/sglang:deepseek-v4-grace-blackwell (linux/arm64); model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)"
+    - "Topologies mirror the dsv4-fp4-gb300-dynamo-vllm sibling: low-conc 1p1d-dep8-tep8 (4 nodes), mid 1p1d-dep8-dep16 (6 nodes), high 3p1d-dep8-dep16 (10 nodes). The conc=4096 overlap between mid and high gives a topology-crossover A/B"
+    - "No upstream GB300 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB300 DSV4 SGLang aggregated). Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157
+
+- config-keys:
+  - glm5-fp8-mi355x-sglang-mtp
+  description:
+    - "Add GLM5 FP8 MTP MI355X SGLang Support"
+    - "Container: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122

From f1cb159fdaae8bce8db981765884b447dfbf8a56 Mon Sep 17 00:00:00 2001
From: ajith-sirra-amd
Date: Thu, 30 Apr 2026 13:49:12 +0530
Subject: [PATCH 10/10] Added Chat Template

Signed-off-by: ajith-sirra-amd
---
 benchmarks/single_node/glm5_fp8_mi355x_mtp.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh b/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh
index 504ba0184..5c28ebeaf 100755
--- a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh
+++ b/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh
@@ -73,7 +73,8 @@ run_benchmark_serving \
     --num-prompts "$((CONC * 10))" \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/
+    --result-dir /workspace/ \
+    --use-chat-template
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then