From ad9ac4841b17b20e1bb4d3cac3fc8b17165f6537 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 29 Apr 2026 17:34:36 -0700 Subject: [PATCH 01/14] Add DSv4 TRT B200/300 test --- .github/configs/nvidia-master.yaml | 38 +++++++ benchmarks/single_node/dsv4_fp4_b200_trt.sh | 116 ++++++++++++++++++++ benchmarks/single_node/dsv4_fp4_b300_trt.sh | 7 ++ perf-changelog.yaml | 9 ++ 4 files changed, 170 insertions(+) create mode 100644 benchmarks/single_node/dsv4_fp4_b200_trt.sh create mode 100644 benchmarks/single_node/dsv4_fp4_b300_trt.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a841cb704..7680ba232 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1727,6 +1727,26 @@ dsv4-fp4-b200-vllm: - { tp: 8, conc-start: 1, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } +# DeepSeek-V4-Pro TRTLLM bring-up. Keep this TP-only and below the eval +# threshold until TRTLLM has a DSv4 chat-template/parser path wired in this repo. +dsv4-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b200-dsv4 + precision: fp4 + framework: trt + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4 # B200 SGLang recipe as-is until B300-specific tuning is available. @@ -2540,6 +2560,24 @@ dsv4-fp4-b300-vllm: - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } +dsv4-fp4-b300-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: trt + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + dsv4-fp4-b300-vllm-mtp: image: vllm/vllm-openai:v0.20.0-cu130 model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh new file mode 100644 index 000000000..115fe71f0 --- /dev/null +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash + +# DeepSeek-V4-Pro single-node TRTLLM bring-up recipe. This intentionally starts +# with low-concurrency TP-only STP points; DSv4 has no Jinja chat template and +# TRTLLM does not currently have a DSv4-specific chat parser wired here. + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + MAX_MODEL_LEN \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + DP_ATTENTION \ + EP_SIZE + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" + +if [[ "$MODEL" != /* ]]; then + hf download "$MODEL" +fi + +nvidia-smi + +SERVER_LOG="$PWD/server.log" +PORT=${PORT:-8888} +EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml" + +MOE_BACKEND="TRTLLM" +MAX_BATCH_SIZE=$(( CONC > 8 ? 
CONC : 8 )) +CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" +KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.80}" + +if [[ "$DP_ATTENTION" == "true" ]]; then + echo "DSv4 TRTLLM bring-up only supports TP-only search-space entries for now." >&2 + exit 1 +fi + +cat > "$EXTRA_CONFIG_FILE" << EOF +cuda_graph_config: + enable_padding: true + max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION + enable_block_reuse: false +stream_interval: 10 +num_postprocess_workers: 4 +moe_config: + backend: $MOE_BACKEND +EOF + +echo "Generated config file contents:" +cat "$EXTRA_CONFIG_FILE" + +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) +MAX_NUM_TOKENS=$(( ISL + OSL + 256 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + +start_gpu_monitor --output "$PWD/gpu_metrics.csv" + +set -x +mpirun -n 1 --oversubscribe --allow-run-as-root \ + trtllm-serve "$MODEL" --port="$PORT" \ + --trust_remote_code \ + --backend=pytorch \ + --max_batch_size="$MAX_BATCH_SIZE" \ + --max_seq_len="$MAX_MODEL_LEN" \ + --max_num_tokens="$MAX_NUM_TOKENS" \ + --tp_size="$TP" \ + --ep_size="$EP_SIZE" \ + --extra_llm_api_options="$EXTRA_CONFIG_FILE" \ + > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$(( CONC * 10 ))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir "$PWD/" \ + --dsv4 \ + --trust-remote-code \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh new file mode 100644 index 000000000..9ced0f972 --- /dev/null +++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +# B300 uses the same low-concurrency TRTLLM bring-up recipe as B200. The B300 +# runner may rewrite MODEL to the pre-staged /data/models/dsv4-pro path before +# this script is invoked. 
+ +bash "$(dirname "$0")/dsv4_fp4_b200_trt.sh" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 422d5347f..502d93eda 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2006,3 +2006,12 @@ - "Change image to vllm/vllm-openai:v0.20.0-cu130" - "Use Mega MoE for DEP configs" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1221 + +- config-keys: + - dsv4-fp4-b200-trt + - dsv4-fp4-b300-trt + description: + - "Add DeepSeek-V4-Pro FP4 TRTLLM single-node bring-up configs on B200 and B300" + - "Use TensorRT-LLM release 1.2.0rc6.post2 with TP8, EP1, STP-only conc 1-8 for 1k1k and 8k1k" + - "Benchmark prompts use the repository DSv4 encoder via --dsv4; evals are intentionally avoided by keeping conc below the eval threshold until TRTLLM DSv4 chat handling is wired" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX From 488ab3d3a97cbc92ccf672fad02a3971151e1c1c Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 29 Apr 2026 19:12:49 -0700 Subject: [PATCH 02/14] fix: use TensorRT-LLM DeepSeek-V4 branch image --- .github/configs/nvidia-master.yaml | 17 ++-- benchmarks/benchmark_lib.sh | 12 ++- benchmarks/single_node/dsv4_fp4_b200_trt.sh | 43 +++++---- perf-changelog.yaml | 4 +- utils/bench_serving/backend_request_func.py | 3 +- utils/build_trtllm_deepseek_v4_image.sh | 99 +++++++++++++++++++++ 6 files changed, 148 insertions(+), 30 deletions(-) create mode 100755 utils/build_trtllm_deepseek_v4_image.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c03b54f05..e6a70bbd5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1727,10 +1727,11 @@ dsv4-fp4-b200-vllm: - { tp: 8, conc-start: 1, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } -# DeepSeek-V4-Pro TRTLLM bring-up. Keep this TP-only and below the eval -# threshold until TRTLLM has a DSv4 chat-template/parser path wired in this repo. +# DeepSeek-V4-Pro TRTLLM bring-up. Requires a TensorRT-LLM image built from +# NVIDIA/TensorRT-LLM@feat/deepseek_v4; public release images do not include +# the DSv4 model, sparse attention, tokenizer, and cache-manager code. dsv4-fp4-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -1741,11 +1742,11 @@ dsv4-fp4-b200-trt: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. 
@@ -2586,7 +2587,7 @@ dsv4-fp4-b300-vllm: - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } dsv4-fp4-b300-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -2597,11 +2598,11 @@ dsv4-fp4-b300-trt: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } dsv4-fp4-b300-vllm-mtp: image: vllm/vllm-openai:v0.20.0-cu130 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 268745735..d5323c599 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -165,11 +165,12 @@ wait_for_server_ready() { } # Run benchmark serving with standardized parameters -# All parameters are required except --use-chat-template, --dsv4, and --trust-remote-code +# All parameters are required except --endpoint, --use-chat-template, --dsv4, and --trust-remote-code # Parameters: # --model: Model name # --port: Server port # --backend: Backend type - e.g., 'vllm' or 'openai' +# --endpoint: Optional API endpoint override # --input-len: Random input sequence length # --output-len: Random output sequence length # --random-range-ratio: Random range ratio @@ -194,6 +195,7 @@ run_benchmark_serving() { local model="" local port="" local backend="" + local endpoint="" local input_len="" local output_len="" local random_range_ratio="" @@ -221,6 +223,10 @@ run_benchmark_serving() { backend="$2" shift 2 ;; + --endpoint) + endpoint="$2" + shift 2 + ;; --input-len) input_len="$2" shift 2 @@ -356,6 +362,10 @@ run_benchmark_serving() { --result-dir "$result_dir" --result-filename "$result_filename.json" ) + + if [[ -n "$endpoint" ]]; then + benchmark_cmd+=(--endpoint "$endpoint") + fi # Add --use-chat-template if requested if [[ "$use_chat_template" == true ]]; then diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh index 115fe71f0..d33ce6fde 100644 --- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -1,8 +1,7 @@ #!/usr/bin/env bash -# DeepSeek-V4-Pro single-node TRTLLM bring-up recipe. This intentionally starts -# with low-concurrency TP-only STP points; DSv4 has no Jinja chat template and -# TRTLLM does not currently have a DSv4-specific chat parser wired here. +# DeepSeek-V4-Pro single-node TRTLLM bring-up recipe for NVIDIA/TensorRT-LLM +# feat/deepseek_v4. The public release images do not contain this model path. source "$(dirname "$0")/../benchmark_lib.sh" @@ -35,22 +34,27 @@ PORT=${PORT:-8888} EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml" MOE_BACKEND="TRTLLM" -MAX_BATCH_SIZE=$(( CONC > 8 ? CONC : 8 )) +MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 )) CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" -KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.80}" +KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}" +ATTENTION_DP_CONFIG="" if [[ "$DP_ATTENTION" == "true" ]]; then - echo "DSv4 TRTLLM bring-up only supports TP-only search-space entries for now." 
>&2 - exit 1 + ATTENTION_DP_CONFIG=" +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60" fi cat > "$EXTRA_CONFIG_FILE" << EOF cuda_graph_config: enable_padding: true max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE -enable_attention_dp: false +enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG print_iter_log: true kv_cache_config: + tokens_per_block: 128 dtype: fp8 free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION enable_block_reuse: false @@ -77,15 +81,18 @@ start_gpu_monitor --output "$PWD/gpu_metrics.csv" set -x mpirun -n 1 --oversubscribe --allow-run-as-root \ - trtllm-serve "$MODEL" --port="$PORT" \ + trtllm-serve "$MODEL" \ + --host 0.0.0.0 \ + --port "$PORT" \ --trust_remote_code \ - --backend=pytorch \ - --max_batch_size="$MAX_BATCH_SIZE" \ - --max_seq_len="$MAX_MODEL_LEN" \ - --max_num_tokens="$MAX_NUM_TOKENS" \ - --tp_size="$TP" \ - --ep_size="$EP_SIZE" \ - --extra_llm_api_options="$EXTRA_CONFIG_FILE" \ + --backend pytorch \ + --max_batch_size "$MAX_BATCH_SIZE" \ + --max_seq_len "$MAX_MODEL_LEN" \ + --max_num_tokens "$MAX_NUM_TOKENS" \ + --tp_size "$TP" \ + --ep_size "$EP_SIZE" \ + --custom_tokenizer deepseek_v4 \ + --config "$EXTRA_CONFIG_FILE" \ > "$SERVER_LOG" 2>&1 & SERVER_PID=$! @@ -95,7 +102,8 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ - --backend openai \ + --backend openai-chat \ + --endpoint /v1/chat/completions \ --input-len "$ISL" \ --output-len "$OSL" \ --random-range-ratio "$RANDOM_RANGE_RATIO" \ @@ -103,7 +111,6 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir "$PWD/" \ - --dsv4 \ --trust-remote-code \ --server-pid "$SERVER_PID" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 0ce2f2760..d1280468e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2028,6 +2028,6 @@ - dsv4-fp4-b300-trt description: - "Add DeepSeek-V4-Pro FP4 TRTLLM single-node bring-up configs on B200 and B300" - - "Use TensorRT-LLM release 1.2.0rc6.post2 with TP8, EP1, STP-only conc 1-8 for 1k1k and 8k1k" - - "Benchmark prompts use the repository DSv4 encoder via --dsv4; evals are intentionally avoided by keeping conc below the eval threshold until TRTLLM DSv4 chat handling is wired" + - "Use a TensorRT-LLM image built from NVIDIA/TensorRT-LLM@feat/deepseek_v4 (f1c5fe1) with TP8, EP8, STP-only conc 1-8 for 1k1k and 8k1k" + - "Serve DeepSeek-V4-Pro through TRTLLM's deepseek_v4 tokenizer and /v1/chat/completions; include utils/build_trtllm_deepseek_v4_image.sh for building the required branch image" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1233 diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index af030720e..7f4a93284 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -341,8 +341,9 @@ async def async_request_openai_chat_completions( async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: - content = [{"type": "text", "text": request_func_input.prompt}] + content = request_func_input.prompt if request_func_input.multi_modal_content: + content = [{"type": "text", "text": request_func_input.prompt}] content.append(request_func_input.multi_modal_content) payload = { "model": request_func_input.model_name \ diff --git a/utils/build_trtllm_deepseek_v4_image.sh b/utils/build_trtllm_deepseek_v4_image.sh new file mode 100755 index 
000000000..883f310c0 --- /dev/null +++ b/utils/build_trtllm_deepseek_v4_image.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash + +set -euo pipefail + +TRTLLM_REPO="${TRTLLM_REPO:-https://github.com/NVIDIA/TensorRT-LLM.git}" +TRTLLM_REF="${TRTLLM_REF:-feat/deepseek_v4}" +TRTLLM_COMMIT="${TRTLLM_COMMIT:-f1c5fe143febb70cd74f0fb4ccca1516206268d7}" +IMAGE_WITH_TAG="${IMAGE_WITH_TAG:-ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1}" +CUDA_ARCHS="${CUDA_ARCHS:-100-real;103-real}" +PUSH="${PUSH:-0}" +KEEP_SRC="${KEEP_SRC:-0}" + +require_cmd() { + if ! command -v "$1" >/dev/null 2>&1; then + echo "Missing required command: $1" >&2 + exit 1 + fi +} + +to_enroot_image() { + local image="$1" + local registry="${image%%/*}" + local rest="${image#*/}" + + if [[ "$image" == "$rest" ]]; then + printf '%s\n' "$image" + elif [[ "$registry" == *.* || "$registry" == *:* || "$registry" == "localhost" ]]; then + printf '%s#%s\n' "$registry" "$rest" + else + printf '%s\n' "$image" + fi +} + +require_cmd docker +require_cmd git +require_cmd make + +if ! docker buildx version >/dev/null 2>&1; then + echo "docker buildx is required to build TensorRT-LLM release images." >&2 + exit 1 +fi + +if ! git lfs version >/dev/null 2>&1; then + echo "git-lfs is required. Install it, then rerun this script." >&2 + exit 1 +fi + +WORKDIR="" +if [[ -n "${TRTLLM_SRC_DIR:-}" ]]; then + SRC_DIR="$TRTLLM_SRC_DIR" +else + WORKDIR="$(mktemp -d "${TMPDIR:-/tmp}/trtllm-dsv4-build.XXXXXX")" + SRC_DIR="$WORKDIR/TensorRT-LLM" +fi + +cleanup() { + if [[ -n "$WORKDIR" && "$KEEP_SRC" != "1" ]]; then + rm -rf "$WORKDIR" + elif [[ -n "$WORKDIR" ]]; then + echo "Keeping TensorRT-LLM checkout at $SRC_DIR" + fi +} +trap cleanup EXIT + +if [[ ! -d "$SRC_DIR/.git" ]]; then + git clone --recurse-submodules --branch "$TRTLLM_REF" "$TRTLLM_REPO" "$SRC_DIR" +fi + +cd "$SRC_DIR" +git fetch origin "$TRTLLM_REF" +git checkout -B "$TRTLLM_REF" "origin/$TRTLLM_REF" 2>/dev/null || git checkout "$TRTLLM_REF" +if [[ -n "$TRTLLM_COMMIT" ]]; then + git checkout "$TRTLLM_COMMIT" +fi +git submodule update --init --recursive +git lfs install --local +git lfs pull + +ACTUAL_COMMIT="$(git rev-parse HEAD)" + +echo "Building TensorRT-LLM DeepSeek-V4 image" +echo " source: $TRTLLM_REPO" +echo " ref: $TRTLLM_REF" +echo " commit: $ACTUAL_COMMIT" +echo " image: $IMAGE_WITH_TAG" +echo " archs: $CUDA_ARCHS" + +make -C docker release_build \ + IMAGE_WITH_TAG="$IMAGE_WITH_TAG" \ + CUDA_ARCHS="$CUDA_ARCHS" \ + GIT_COMMIT="$ACTUAL_COMMIT" + +if [[ "$PUSH" == "1" ]]; then + docker push "$IMAGE_WITH_TAG" +fi + +echo +echo "Docker image: $IMAGE_WITH_TAG" +echo "InferenceX/enroot image string: $(to_enroot_image "$IMAGE_WITH_TAG")" From e079fb7495add73def7130a6a977a20685a98805 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 29 Apr 2026 19:26:58 -0700 Subject: [PATCH 03/14] fix: point DeepSeek V4 image to correct org --- .github/configs/nvidia-master.yaml | 4 ++-- utils/build_trtllm_deepseek_v4_image.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e6a70bbd5..e1b3fe5e6 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1731,7 +1731,7 @@ dsv4-fp4-b200-vllm: # NVIDIA/TensorRT-LLM@feat/deepseek_v4; public release images do not include # the DSv4 model, sparse attention, tokenizer, and cache-manager code. 
dsv4-fp4-b200-trt: - image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 + image: ghcr.io#semianalysiswork/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -2587,7 +2587,7 @@ dsv4-fp4-b300-vllm: - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } dsv4-fp4-b300-trt: - image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 + image: ghcr.io#semianalysiswork/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/utils/build_trtllm_deepseek_v4_image.sh b/utils/build_trtllm_deepseek_v4_image.sh index 883f310c0..dfff4b80e 100755 --- a/utils/build_trtllm_deepseek_v4_image.sh +++ b/utils/build_trtllm_deepseek_v4_image.sh @@ -5,7 +5,7 @@ set -euo pipefail TRTLLM_REPO="${TRTLLM_REPO:-https://github.com/NVIDIA/TensorRT-LLM.git}" TRTLLM_REF="${TRTLLM_REF:-feat/deepseek_v4}" TRTLLM_COMMIT="${TRTLLM_COMMIT:-f1c5fe143febb70cd74f0fb4ccca1516206268d7}" -IMAGE_WITH_TAG="${IMAGE_WITH_TAG:-ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1}" +IMAGE_WITH_TAG="${IMAGE_WITH_TAG:-ghcr.io/semianalysiswork/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1}" CUDA_ARCHS="${CUDA_ARCHS:-100-real;103-real}" PUSH="${PUSH:-0}" KEEP_SRC="${KEEP_SRC:-0}" From 6a949a6eee3919b61020b1224c531d3311ec1f3e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 29 Apr 2026 21:33:45 -0700 Subject: [PATCH 04/14] Use runtime TensorRT-LLM DSv4 bootstrap --- .github/configs/nvidia-master.yaml | 10 +- benchmarks/single_node/dsv4_fp4_b200_trt.sh | 6 +- .../single_node/trtllm_dsv4_bootstrap.sh | 113 ++++++++++++++++++ perf-changelog.yaml | 4 +- utils/build_trtllm_deepseek_v4_image.sh | 99 --------------- 5 files changed, 125 insertions(+), 107 deletions(-) create mode 100644 benchmarks/single_node/trtllm_dsv4_bootstrap.sh delete mode 100755 utils/build_trtllm_deepseek_v4_image.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e1b3fe5e6..d66ddaede 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1727,11 +1727,11 @@ dsv4-fp4-b200-vllm: - { tp: 8, conc-start: 1, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } -# DeepSeek-V4-Pro TRTLLM bring-up. Requires a TensorRT-LLM image built from -# NVIDIA/TensorRT-LLM@feat/deepseek_v4; public release images do not include -# the DSv4 model, sparse attention, tokenizer, and cache-manager code. +# DeepSeek-V4-Pro TRTLLM bring-up. Public release/devel tags do not include +# the DSv4 branch yet, so the benchmark script bootstraps +# NVIDIA/TensorRT-LLM@feat/deepseek_v4 inside the official devel image. 
dsv4-fp4-b200-trt: - image: ghcr.io#semianalysiswork/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 + image: nvcr.io#nvidia/tensorrt-llm/devel:1.3.0rc13 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -2587,7 +2587,7 @@ dsv4-fp4-b300-vllm: - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } dsv4-fp4-b300-trt: - image: ghcr.io#semianalysiswork/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 + image: nvcr.io#nvidia/tensorrt-llm/devel:1.3.0rc13 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh index d33ce6fde..dcdafa479 100644 --- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -1,9 +1,11 @@ #!/usr/bin/env bash # DeepSeek-V4-Pro single-node TRTLLM bring-up recipe for NVIDIA/TensorRT-LLM -# feat/deepseek_v4. The public release images do not contain this model path. +# feat/deepseek_v4. The public release/devel images do not contain this model +# path yet, so the script builds and installs the pinned branch under /tmp. source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/trtllm_dsv4_bootstrap.sh" check_env_vars \ MODEL \ @@ -23,6 +25,8 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" +bootstrap_trtllm_dsv4 || exit 1 + if [[ "$MODEL" != /* ]]; then hf download "$MODEL" fi diff --git a/benchmarks/single_node/trtllm_dsv4_bootstrap.sh b/benchmarks/single_node/trtllm_dsv4_bootstrap.sh new file mode 100644 index 000000000..0074e08ed --- /dev/null +++ b/benchmarks/single_node/trtllm_dsv4_bootstrap.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash + +# Build and install the TensorRT-LLM DeepSeek-V4 feature branch at runtime. +# This avoids relying on a custom prebuilt image while still picking up the +# branch's required C++/CUDA kernels and Python model/tokenizer code. 
+ +trtllm_dsv4_supported() { + python3 - <<'PY' +import importlib +import sys + +try: + import tensorrt_llm # noqa: F401 + import torch + + importlib.import_module("tensorrt_llm._torch.models.modeling_deepseekv4") + importlib.import_module( + "tensorrt_llm._torch.attention_backend.sparse.deepseek_v4.deepseek_v4" + ) + getattr(torch.ops.trtllm, "compressor_prefill_reduction") + getattr(torch.ops.trtllm, "compressor_paged_kv_compress") + getattr(torch.ops.trtllm, "compressor_postprocess_scatter") +except Exception as exc: + print(f"TensorRT-LLM DeepSeek-V4 support check failed: {exc}", file=sys.stderr) + raise SystemExit(1) +PY +} + +bootstrap_trtllm_dsv4() { + if [[ "${TRTLLM_DSV4_BOOTSTRAP:-auto}" == "0" ]]; then + echo "TRTLLM_DSV4_BOOTSTRAP=0; skipping TensorRT-LLM DeepSeek-V4 bootstrap" + return 0 + fi + + if [[ "${TRTLLM_DSV4_BOOTSTRAP:-auto}" != "force" ]] && trtllm_dsv4_supported; then + echo "TensorRT-LLM DeepSeek-V4 support already available" + return 0 + fi + + local repo="${TRTLLM_DSV4_REPO:-https://github.com/NVIDIA/TensorRT-LLM.git}" + local branch="${TRTLLM_DSV4_BRANCH:-feat/deepseek_v4}" + local ref="${TRTLLM_DSV4_REF:-f1c5fe143febb70cd74f0fb4ccca1516206268d7}" + local src="${TRTLLM_DSV4_SRC:-/tmp/trtllm-dsv4-src}" + local build_dir="${TRTLLM_DSV4_BUILD_DIR:-/tmp/trtllm-dsv4-build}" + local dist_dir="${TRTLLM_DSV4_DIST_DIR:-/tmp/trtllm-dsv4-wheel}" + local archs="${TRTLLM_DSV4_CUDA_ARCHITECTURES:-100-real;103-real}" + local lock_file="${TRTLLM_DSV4_LOCK_FILE:-/tmp/trtllm-dsv4-bootstrap.lock}" + + echo "Bootstrapping TensorRT-LLM DeepSeek-V4 support" + echo " repo: $repo" + echo " branch: $branch" + echo " ref: $ref" + echo " archs: $archs" + + if ! command -v git >/dev/null 2>&1; then + if command -v apt-get >/dev/null 2>&1; then + apt-get update + apt-get install -y git + else + echo "git is required to bootstrap TensorRT-LLM DeepSeek-V4 support" >&2 + return 1 + fi + fi + + ( + set -euo pipefail + flock 9 + + if [[ "${TRTLLM_DSV4_BOOTSTRAP:-auto}" != "force" ]] && trtllm_dsv4_supported; then + echo "TensorRT-LLM DeepSeek-V4 support became available while waiting for bootstrap lock" + exit 0 + fi + + if [[ ! 
-d "$src/.git" ]]; then + rm -rf "$src" + git clone \ + --filter=blob:none \ + --single-branch \ + --branch "$branch" \ + "$repo" "$src" + fi + + cd "$src" + git fetch origin "$branch" --depth 1 + git fetch origin "$ref" --depth 1 || true + git checkout "$ref" + git submodule update --init --recursive --depth 1 + + if command -v git-lfs >/dev/null 2>&1; then + git lfs install --local + git lfs pull + else + echo "git-lfs not found; continuing without LFS pull" + fi + + rm -rf "$dist_dir" + mkdir -p "$dist_dir" + + python3 scripts/build_wheel.py \ + --cuda_architectures "$archs" \ + --build_dir "$build_dir" \ + --dist_dir "$dist_dir" \ + --clean \ + --skip-stubs \ + ${TRTLLM_DSV4_BUILD_ARGS:-} + + local wheel + wheel="$(ls -t "$dist_dir"/tensorrt_llm*.whl | head -1)" + python3 -m pip install --force-reinstall --no-deps "$wheel" + ) 9>"$lock_file" + + trtllm_dsv4_supported +} diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d1280468e..3b3bb5d2f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2028,6 +2028,6 @@ - dsv4-fp4-b300-trt description: - "Add DeepSeek-V4-Pro FP4 TRTLLM single-node bring-up configs on B200 and B300" - - "Use a TensorRT-LLM image built from NVIDIA/TensorRT-LLM@feat/deepseek_v4 (f1c5fe1) with TP8, EP8, STP-only conc 1-8 for 1k1k and 8k1k" - - "Serve DeepSeek-V4-Pro through TRTLLM's deepseek_v4 tokenizer and /v1/chat/completions; include utils/build_trtllm_deepseek_v4_image.sh for building the required branch image" + - "Use nvcr.io#nvidia/tensorrt-llm/devel:1.3.0rc13 and build/install NVIDIA/TensorRT-LLM@feat/deepseek_v4 (f1c5fe1) at runtime under /tmp" + - "Serve DeepSeek-V4-Pro through TRTLLM's deepseek_v4 tokenizer and /v1/chat/completions with TP8, EP8, STP-only conc 1-8 for 1k1k and 8k1k" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1233 diff --git a/utils/build_trtllm_deepseek_v4_image.sh b/utils/build_trtllm_deepseek_v4_image.sh deleted file mode 100755 index dfff4b80e..000000000 --- a/utils/build_trtllm_deepseek_v4_image.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -TRTLLM_REPO="${TRTLLM_REPO:-https://github.com/NVIDIA/TensorRT-LLM.git}" -TRTLLM_REF="${TRTLLM_REF:-feat/deepseek_v4}" -TRTLLM_COMMIT="${TRTLLM_COMMIT:-f1c5fe143febb70cd74f0fb4ccca1516206268d7}" -IMAGE_WITH_TAG="${IMAGE_WITH_TAG:-ghcr.io/semianalysiswork/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1}" -CUDA_ARCHS="${CUDA_ARCHS:-100-real;103-real}" -PUSH="${PUSH:-0}" -KEEP_SRC="${KEEP_SRC:-0}" - -require_cmd() { - if ! command -v "$1" >/dev/null 2>&1; then - echo "Missing required command: $1" >&2 - exit 1 - fi -} - -to_enroot_image() { - local image="$1" - local registry="${image%%/*}" - local rest="${image#*/}" - - if [[ "$image" == "$rest" ]]; then - printf '%s\n' "$image" - elif [[ "$registry" == *.* || "$registry" == *:* || "$registry" == "localhost" ]]; then - printf '%s#%s\n' "$registry" "$rest" - else - printf '%s\n' "$image" - fi -} - -require_cmd docker -require_cmd git -require_cmd make - -if ! docker buildx version >/dev/null 2>&1; then - echo "docker buildx is required to build TensorRT-LLM release images." >&2 - exit 1 -fi - -if ! git lfs version >/dev/null 2>&1; then - echo "git-lfs is required. Install it, then rerun this script." 
>&2 - exit 1 -fi - -WORKDIR="" -if [[ -n "${TRTLLM_SRC_DIR:-}" ]]; then - SRC_DIR="$TRTLLM_SRC_DIR" -else - WORKDIR="$(mktemp -d "${TMPDIR:-/tmp}/trtllm-dsv4-build.XXXXXX")" - SRC_DIR="$WORKDIR/TensorRT-LLM" -fi - -cleanup() { - if [[ -n "$WORKDIR" && "$KEEP_SRC" != "1" ]]; then - rm -rf "$WORKDIR" - elif [[ -n "$WORKDIR" ]]; then - echo "Keeping TensorRT-LLM checkout at $SRC_DIR" - fi -} -trap cleanup EXIT - -if [[ ! -d "$SRC_DIR/.git" ]]; then - git clone --recurse-submodules --branch "$TRTLLM_REF" "$TRTLLM_REPO" "$SRC_DIR" -fi - -cd "$SRC_DIR" -git fetch origin "$TRTLLM_REF" -git checkout -B "$TRTLLM_REF" "origin/$TRTLLM_REF" 2>/dev/null || git checkout "$TRTLLM_REF" -if [[ -n "$TRTLLM_COMMIT" ]]; then - git checkout "$TRTLLM_COMMIT" -fi -git submodule update --init --recursive -git lfs install --local -git lfs pull - -ACTUAL_COMMIT="$(git rev-parse HEAD)" - -echo "Building TensorRT-LLM DeepSeek-V4 image" -echo " source: $TRTLLM_REPO" -echo " ref: $TRTLLM_REF" -echo " commit: $ACTUAL_COMMIT" -echo " image: $IMAGE_WITH_TAG" -echo " archs: $CUDA_ARCHS" - -make -C docker release_build \ - IMAGE_WITH_TAG="$IMAGE_WITH_TAG" \ - CUDA_ARCHS="$CUDA_ARCHS" \ - GIT_COMMIT="$ACTUAL_COMMIT" - -if [[ "$PUSH" == "1" ]]; then - docker push "$IMAGE_WITH_TAG" -fi - -echo -echo "Docker image: $IMAGE_WITH_TAG" -echo "InferenceX/enroot image string: $(to_enroot_image "$IMAGE_WITH_TAG")" From 9488f346184df9991ee8431260a4157583d5051e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 29 Apr 2026 22:07:23 -0700 Subject: [PATCH 05/14] Fix TensorRT-LLM DSv4 runtime wheel build --- benchmarks/single_node/trtllm_dsv4_bootstrap.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/trtllm_dsv4_bootstrap.sh b/benchmarks/single_node/trtllm_dsv4_bootstrap.sh index 0074e08ed..55f0b6491 100644 --- a/benchmarks/single_node/trtllm_dsv4_bootstrap.sh +++ b/benchmarks/single_node/trtllm_dsv4_bootstrap.sh @@ -96,12 +96,13 @@ bootstrap_trtllm_dsv4() { rm -rf "$dist_dir" mkdir -p "$dist_dir" + # setup.py sanity-checks for the generated bindings/ stubs directory. + # Do not use --skip-stubs here, or wheel packaging fails after C++ build. python3 scripts/build_wheel.py \ --cuda_architectures "$archs" \ --build_dir "$build_dir" \ --dist_dir "$dist_dir" \ --clean \ - --skip-stubs \ ${TRTLLM_DSV4_BUILD_ARGS:-} local wheel From b0cc6656a49abe1211bb7d597190d26a60e3a36c Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 29 Apr 2026 22:26:43 -0700 Subject: [PATCH 06/14] Use DeepSeek V4 TRTLLM image --- .github/configs/nvidia-master.yaml | 10 ++--- .github/workflows/benchmark-tmpl.yml | 2 + benchmarks/single_node/dsv4_fp4_b200_trt.sh | 5 ++- .../single_node/trtllm_dsv4_bootstrap.sh | 5 +-- perf-changelog.yaml | 2 +- runners/launch_b200-dgxc.sh | 39 +++++++++++++++++++ runners/launch_b300-nv.sh | 39 +++++++++++++++++++ 7 files changed, 91 insertions(+), 11 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 77e82ee19..126f6f3f1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1727,11 +1727,11 @@ dsv4-fp4-b200-vllm: - { tp: 8, conc-start: 1, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } -# DeepSeek-V4-Pro TRTLLM bring-up. Public release/devel tags do not include -# the DSv4 branch yet, so the benchmark script bootstraps -# NVIDIA/TensorRT-LLM@feat/deepseek_v4 inside the official devel image. +# DeepSeek-V4-Pro TRTLLM bring-up. 
This uses a TensorRT-LLM image built from +# NVIDIA/TensorRT-LLM@feat/deepseek_v4; the benchmark script keeps a guarded +# source-build fallback if the image is missing the required DSv4 support. dsv4-fp4-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/devel:1.3.0rc13 + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -2587,7 +2587,7 @@ dsv4-fp4-b300-vllm: - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } dsv4-fp4-b300-trt: - image: nvcr.io#nvidia/tensorrt-llm/devel:1.3.0rc13 + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index c38082cbe..012c48975 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -89,6 +89,8 @@ env: DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} EVAL_ONLY: ${{ inputs.eval-only }} + GHCR_TOKEN: ${{ secrets.GHCR_TOKEN || secrets.REPO_PAT }} + GHCR_USER: ${{ secrets.GHCR_USER || github.actor }} PYTHONDONTWRITEBYTECODE: '1' PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh index dcdafa479..ae75b5388 100644 --- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -1,8 +1,9 @@ #!/usr/bin/env bash # DeepSeek-V4-Pro single-node TRTLLM bring-up recipe for NVIDIA/TensorRT-LLM -# feat/deepseek_v4. The public release/devel images do not contain this model -# path yet, so the script builds and installs the pinned branch under /tmp. +# feat/deepseek_v4. The configured image should already contain this branch; +# bootstrap_trtllm_dsv4 verifies that and only builds the pinned branch as a +# fallback. source "$(dirname "$0")/../benchmark_lib.sh" source "$(dirname "$0")/trtllm_dsv4_bootstrap.sh" diff --git a/benchmarks/single_node/trtllm_dsv4_bootstrap.sh b/benchmarks/single_node/trtllm_dsv4_bootstrap.sh index 55f0b6491..fb11aee4e 100644 --- a/benchmarks/single_node/trtllm_dsv4_bootstrap.sh +++ b/benchmarks/single_node/trtllm_dsv4_bootstrap.sh @@ -1,8 +1,7 @@ #!/usr/bin/env bash -# Build and install the TensorRT-LLM DeepSeek-V4 feature branch at runtime. -# This avoids relying on a custom prebuilt image while still picking up the -# branch's required C++/CUDA kernels and Python model/tokenizer code. +# Verify TensorRT-LLM DeepSeek-V4 support and, if needed, build/install the +# pinned feature branch at runtime as a fallback. 
trtllm_dsv4_supported() { python3 - <<'PY' diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 47c46a207..9ce2ee998 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2037,6 +2037,6 @@ - dsv4-fp4-b300-trt description: - "Add DeepSeek-V4-Pro FP4 TRTLLM single-node bring-up configs on B200 and B300" - - "Use nvcr.io#nvidia/tensorrt-llm/devel:1.3.0rc13 and build/install NVIDIA/TensorRT-LLM@feat/deepseek_v4 (f1c5fe1) at runtime under /tmp" + - "Use ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1, built from NVIDIA/TensorRT-LLM@feat/deepseek_v4 (f1c5fe1), with a guarded source-build fallback" - "Serve DeepSeek-V4-Pro through TRTLLM's deepseek_v4 tokenizer and /v1/chat/completions with TP8, EP8, STP-only conc 1-8 for 1k1k and 8k1k" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1233 diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index edf5db957..f81ee0027 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -6,6 +6,45 @@ SLURM_ACCOUNT="benchmark" set -x +configure_enroot_ghcr_auth() { + case "$IMAGE" in + ghcr.io#*|ghcr.io/*) ;; + *) return 0 ;; + esac + + if [[ -z "${GHCR_TOKEN:-}" ]]; then + echo "GHCR_TOKEN is not set; attempting anonymous ghcr.io import" + return 0 + fi + + local config_dir="${ENROOT_CONFIG_PATH:-${XDG_CONFIG_HOME:-$HOME/.config}/enroot}" + local credentials_file="$config_dir/.credentials" + local tmp_file + local ghcr_user="${GHCR_USER:-${GITHUB_ACTOR:-oauth2}}" + local xtrace_was_set=0 + + case "$-" in + *x*) xtrace_was_set=1; set +x ;; + esac + + mkdir -p "$config_dir" + touch "$credentials_file" + chmod 600 "$credentials_file" + tmp_file="$(mktemp "${credentials_file}.XXXXXX")" + grep -v '^machine ghcr\.io ' "$credentials_file" > "$tmp_file" || true + printf 'machine ghcr.io login %s password $GHCR_TOKEN\n' "$ghcr_user" >> "$tmp_file" + mv "$tmp_file" "$credentials_file" + chmod 600 "$credentials_file" + + if [[ "$xtrace_was_set" == "1" ]]; then + set -x + fi + + echo "Configured enroot credentials for ghcr.io" +} + +configure_enroot_ghcr_auth + if [[ "$IS_MULTINODE" == "true" ]]; then # Validate framework diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 3c855e805..795ea38fc 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -6,6 +6,45 @@ SLURM_ACCOUNT="benchmark" set -x +configure_enroot_ghcr_auth() { + case "$IMAGE" in + ghcr.io#*|ghcr.io/*) ;; + *) return 0 ;; + esac + + if [[ -z "${GHCR_TOKEN:-}" ]]; then + echo "GHCR_TOKEN is not set; attempting anonymous ghcr.io import" + return 0 + fi + + local config_dir="${ENROOT_CONFIG_PATH:-${XDG_CONFIG_HOME:-$HOME/.config}/enroot}" + local credentials_file="$config_dir/.credentials" + local tmp_file + local ghcr_user="${GHCR_USER:-${GITHUB_ACTOR:-oauth2}}" + local xtrace_was_set=0 + + case "$-" in + *x*) xtrace_was_set=1; set +x ;; + esac + + mkdir -p "$config_dir" + touch "$credentials_file" + chmod 600 "$credentials_file" + tmp_file="$(mktemp "${credentials_file}.XXXXXX")" + grep -v '^machine ghcr\.io ' "$credentials_file" > "$tmp_file" || true + printf 'machine ghcr.io login %s password $GHCR_TOKEN\n' "$ghcr_user" >> "$tmp_file" + mv "$tmp_file" "$credentials_file" + chmod 600 "$credentials_file" + + if [[ "$xtrace_was_set" == "1" ]]; then + set -x + fi + + echo "Configured enroot credentials for ghcr.io" +} + +configure_enroot_ghcr_auth + if [[ "$IS_MULTINODE" == "true" ]]; then # Validate framework From 8ee56afdc050875d9291b2d02c40802c15246e0b Mon Sep 17 00:00:00 2001 From: 
Oseltamivir Date: Thu, 30 Apr 2026 07:54:06 -0700 Subject: [PATCH 07/14] Use anonymous GHCR pulls by default --- .github/workflows/benchmark-tmpl.yml | 2 +- runners/launch_b200-dgxc.sh | 17 ++++++++++------- runners/launch_b300-nv.sh | 17 ++++++++++------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 012c48975..b8138fe35 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -89,7 +89,7 @@ env: DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} EVAL_ONLY: ${{ inputs.eval-only }} - GHCR_TOKEN: ${{ secrets.GHCR_TOKEN || secrets.REPO_PAT }} + GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }} GHCR_USER: ${{ secrets.GHCR_USER || github.actor }} PYTHONDONTWRITEBYTECODE: '1' PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f81ee0027..3014c16fc 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -12,11 +12,6 @@ configure_enroot_ghcr_auth() { *) return 0 ;; esac - if [[ -z "${GHCR_TOKEN:-}" ]]; then - echo "GHCR_TOKEN is not set; attempting anonymous ghcr.io import" - return 0 - fi - local config_dir="${ENROOT_CONFIG_PATH:-${XDG_CONFIG_HOME:-$HOME/.config}/enroot}" local credentials_file="$config_dir/.credentials" local tmp_file @@ -32,7 +27,11 @@ configure_enroot_ghcr_auth() { chmod 600 "$credentials_file" tmp_file="$(mktemp "${credentials_file}.XXXXXX")" grep -v '^machine ghcr\.io ' "$credentials_file" > "$tmp_file" || true - printf 'machine ghcr.io login %s password $GHCR_TOKEN\n' "$ghcr_user" >> "$tmp_file" + + if [[ -n "${GHCR_TOKEN:-}" ]]; then + printf 'machine ghcr.io login %s password $GHCR_TOKEN\n' "$ghcr_user" >> "$tmp_file" + fi + mv "$tmp_file" "$credentials_file" chmod 600 "$credentials_file" @@ -40,7 +39,11 @@ configure_enroot_ghcr_auth() { set -x fi - echo "Configured enroot credentials for ghcr.io" + if [[ -n "${GHCR_TOKEN:-}" ]]; then + echo "Configured enroot credentials for ghcr.io" + else + echo "GHCR_TOKEN is not set; removed stale ghcr.io credentials for anonymous import" + fi } configure_enroot_ghcr_auth diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 795ea38fc..39111556b 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -12,11 +12,6 @@ configure_enroot_ghcr_auth() { *) return 0 ;; esac - if [[ -z "${GHCR_TOKEN:-}" ]]; then - echo "GHCR_TOKEN is not set; attempting anonymous ghcr.io import" - return 0 - fi - local config_dir="${ENROOT_CONFIG_PATH:-${XDG_CONFIG_HOME:-$HOME/.config}/enroot}" local credentials_file="$config_dir/.credentials" local tmp_file @@ -32,7 +27,11 @@ configure_enroot_ghcr_auth() { chmod 600 "$credentials_file" tmp_file="$(mktemp "${credentials_file}.XXXXXX")" grep -v '^machine ghcr\.io ' "$credentials_file" > "$tmp_file" || true - printf 'machine ghcr.io login %s password $GHCR_TOKEN\n' "$ghcr_user" >> "$tmp_file" + + if [[ -n "${GHCR_TOKEN:-}" ]]; then + printf 'machine ghcr.io login %s password $GHCR_TOKEN\n' "$ghcr_user" >> "$tmp_file" + fi + mv "$tmp_file" "$credentials_file" chmod 600 "$credentials_file" @@ -40,7 +39,11 @@ configure_enroot_ghcr_auth() { set -x fi - echo "Configured enroot credentials for ghcr.io" + if [[ -n "${GHCR_TOKEN:-}" ]]; then + echo "Configured enroot credentials for ghcr.io" + else + echo "GHCR_TOKEN is not set; removed stale ghcr.io credentials for anonymous import" + fi } configure_enroot_ghcr_auth From 
2d48f08cffe0c57bee338a6fbf5737f9818b6fb9 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 30 Apr 2026 11:23:16 -0700 Subject: [PATCH 08/14] Fix DSv4 TRT launch env --- benchmarks/single_node/dsv4_fp4_b200_trt.sh | 17 ++++++++++++++--- benchmarks/single_node/dsv4_fp4_b300_trt.sh | 7 ++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh index ae75b5388..28d148299 100644 --- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -26,6 +26,9 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" +export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}" +echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE" + bootstrap_trtllm_dsv4 || exit 1 if [[ "$MODEL" != /* ]]; then @@ -85,7 +88,7 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" set -x -mpirun -n 1 --oversubscribe --allow-run-as-root \ +SERVE_CMD=( trtllm-serve "$MODEL" \ --host 0.0.0.0 \ --port "$PORT" \ @@ -97,8 +100,16 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --tp_size "$TP" \ --ep_size "$EP_SIZE" \ --custom_tokenizer deepseek_v4 \ - --config "$EXTRA_CONFIG_FILE" \ - > "$SERVER_LOG" 2>&1 & + --config "$EXTRA_CONFIG_FILE" +) + +if [[ "${TRTLLM_DSV4_USE_MPIRUN:-1}" == "0" ]]; then + "${SERVE_CMD[@]}" > "$SERVER_LOG" 2>&1 & +else + mpirun -n 1 --oversubscribe --allow-run-as-root \ + "${SERVE_CMD[@]}" \ + > "$SERVER_LOG" 2>&1 & +fi SERVER_PID=$! diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh index 9ced0f972..03791dcd6 100644 --- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh @@ -2,6 +2,11 @@ # B300 uses the same low-concurrency TRTLLM bring-up recipe as B200. The B300 # runner may rewrite MODEL to the pre-staged /data/models/dsv4-pro path before -# this script is invoked. +# this script is invoked. The job itself is already launched under srun; keep +# mpirun local so OpenMPI does not try to use Slurm PMI/PMIx from inside pyxis. + +export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-1}" +export OMPI_MCA_plm="${OMPI_MCA_plm:-isolated}" +export OMPI_MCA_ras="${OMPI_MCA_ras:-^slurm}" bash "$(dirname "$0")/dsv4_fp4_b200_trt.sh" From 6e75819d7b60f8e8c0d9319e137a89ae7e5c5a72 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 30 Apr 2026 11:54:35 -0700 Subject: [PATCH 09/14] Bypass mpirun for B300 DSv4 TRT --- benchmarks/single_node/dsv4_fp4_b300_trt.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh index 03791dcd6..3c1763835 100644 --- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh @@ -2,11 +2,9 @@ # B300 uses the same low-concurrency TRTLLM bring-up recipe as B200. The B300 # runner may rewrite MODEL to the pre-staged /data/models/dsv4-pro path before -# this script is invoked. The job itself is already launched under srun; keep -# mpirun local so OpenMPI does not try to use Slurm PMI/PMIx from inside pyxis. +# this script is invoked. The job itself is already launched under srun/pyxis; +# avoid nested mpirun because this cluster's OpenMPI build lacks Slurm PMIx. 
-export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-1}" -export OMPI_MCA_plm="${OMPI_MCA_plm:-isolated}" -export OMPI_MCA_ras="${OMPI_MCA_ras:-^slurm}" +export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-0}" bash "$(dirname "$0")/dsv4_fp4_b200_trt.sh" From e1e762dfbefe25a65eda6f08f5267a8199479a4a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 30 Apr 2026 12:35:00 -0700 Subject: [PATCH 10/14] larger sweep + mpi --- .github/configs/nvidia-master.yaml | 10 ++++++---- .github/workflows/benchmark-tmpl.yml | 5 +++++ runners/launch_b300-nv.sh | 1 + 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 126f6f3f1..f26e68766 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1742,11 +1742,13 @@ dsv4-fp4-b200-trt: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } + - { tp: 8, conc-start: 1, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } + - { tp: 8, conc-start: 1, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. @@ -2598,11 +2600,11 @@ dsv4-fp4-b300-trt: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 8 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 8 } dsv4-fp4-b300-vllm-mtp: image: vllm/vllm-openai:v0.20.0-cu130 diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index b8138fe35..188aff5cd 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -140,6 +140,11 @@ jobs: rm -f results*.json || true rm -f sample*.jsonl || true + - name: Cleanup stale benchmark outputs (pre-run) + run: | + rm -f server.log || true + rm -f gpu_metrics.csv || true + - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 39111556b..9222bf254 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -346,6 +346,7 @@ else JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) srun --jobid=$JOB_ID \ + --mpi=none \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ --no-container-mount-home \ From 8220f0d785e11c91237d13dfc0d84a1bee7141a2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 30 Apr 2026 15:19:29 -0700 Subject: [PATCH 11/14] mpi --- .github/configs/nvidia-master.yaml | 38 ++++++++++----------- benchmarks/single_node/dsv4_fp4_b200_trt.sh | 17 +++++++++ benchmarks/single_node/dsv4_fp4_b300_trt.sh | 2 ++ 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 72fadb1b8..cbe43d9a9 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1730,25 +1730,25 @@ dsv4-fp4-b200-vllm: # DeepSeek-V4-Pro TRTLLM bring-up. 
This uses a TensorRT-LLM image built from # NVIDIA/TensorRT-LLM@feat/deepseek_v4; the benchmark script keeps a guarded # source-build fallback if the image is missing the required DSv4 support. -dsv4-fp4-b200-trt: - image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: b200-dsv4 - precision: fp4 - framework: trt - multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } +#dsv4-fp4-b200-trt: +# image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 +# model: deepseek-ai/DeepSeek-V4-Pro +# model-prefix: dsv4 +# runner: b200-dsv4 +# precision: fp4 +# framework: trt +# multinode: false +# seq-len-configs: +# - isl: 1024 +# osl: 1024 +# search-space: +# - { tp: 8, conc-start: 1, conc-end: 32 } +# - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } +# - isl: 8192 +# osl: 1024 +# search-space: +# - { tp: 8, conc-start: 1, conc-end: 32 } +# - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 128 } # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh index 28d148299..66c800147 100644 --- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -26,6 +26,23 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" +sanitize_slurm_mpi_env_for_trtllm() { + if [[ "${TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV:-0}" != "1" ]]; then + return 0 + fi + + echo "Sanitizing Slurm/PMI environment for TensorRT-LLM direct launch" + while IFS='=' read -r name _; do + case "$name" in + SLURM_*|PMI*|PMIX*|OMPI_*|OPAL_*|ORTE_*) + unset "$name" + ;; + esac + done < <(env) +} + +sanitize_slurm_mpi_env_for_trtllm + export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}" echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE" diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh index 3c1763835..c143386ec 100644 --- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh @@ -6,5 +6,7 @@ # avoid nested mpirun because this cluster's OpenMPI build lacks Slurm PMIx. 
export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-0}" +export TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV="${TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV:-1}" +export TRTLLM_DSV4_BOOTSTRAP="${TRTLLM_DSV4_BOOTSTRAP:-0}" bash "$(dirname "$0")/dsv4_fp4_b200_trt.sh" From fb5d85f3b3817aa8574b2b34a2ab9b52ee2925bb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 30 Apr 2026 15:27:09 -0700 Subject: [PATCH 12/14] b200 perf ok --- perf-changelog.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5fda5e2c5..8b968fc1a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2039,7 +2039,6 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1027 - config-keys: - - dsv4-fp4-b200-trt - dsv4-fp4-b300-trt description: - "Add DeepSeek-V4-Pro FP4 TRTLLM single-node bring-up configs on B200 and B300" From ef7f42cec3469e9a1b7ed38cb401eee0291cf748 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 30 Apr 2026 16:13:04 -0700 Subject: [PATCH 13/14] OPAL --- benchmarks/single_node/dsv4_fp4_b200_trt.sh | 4 ++-- benchmarks/single_node/dsv4_fp4_b300_trt.sh | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh index 66c800147..3e9f3fe0a 100644 --- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -31,10 +31,10 @@ sanitize_slurm_mpi_env_for_trtllm() { return 0 fi - echo "Sanitizing Slurm/PMI environment for TensorRT-LLM direct launch" + echo "Sanitizing Slurm/PMI environment for TensorRT-LLM launch" while IFS='=' read -r name _; do case "$name" in - SLURM_*|PMI*|PMIX*|OMPI_*|OPAL_*|ORTE_*) + SLURM_*|PMI*|PMIX*|OMPI_*|ORTE_*) unset "$name" ;; esac diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh index c143386ec..fd4b99be3 100644 --- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh @@ -3,9 +3,10 @@ # B300 uses the same low-concurrency TRTLLM bring-up recipe as B200. The B300 # runner may rewrite MODEL to the pre-staged /data/models/dsv4-pro path before # this script is invoked. The job itself is already launched under srun/pyxis; -# avoid nested mpirun because this cluster's OpenMPI build lacks Slurm PMIx. +# scrub Slurm's PMI environment, then use mpirun to give TRTLLM a valid OpenMPI +# runtime instead of direct-launching under srun. 
-export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-0}"
+export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-1}"
 export TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV="${TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV:-1}"
 export TRTLLM_DSV4_BOOTSTRAP="${TRTLLM_DSV4_BOOTSTRAP:-0}"
 

From 5f409ed753b6b1f67ea6b3f49fdda378c17b22bf Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Thu, 30 Apr 2026 16:52:18 -0700
Subject: [PATCH 14/14] sweep

---
 .github/configs/nvidia-master.yaml | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index cbe43d9a9..17ca6a6a9 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2600,11 +2600,17 @@ dsv4-fp4-b300-trt:
     - isl: 1024
       osl: 1024
       search-space:
-        - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 8 }
+        - { tp: 4, conc-start: 1, conc-end: 64 }
+        - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 512 }
+        - { tp: 8, conc-start: 1, conc-end: 32 }
     - isl: 8192
       osl: 1024
       search-space:
-        - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 8 }
+        - { tp: 4, conc-start: 1, conc-end: 64 }
+        - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 512 }
+        - { tp: 8, conc-start: 1, conc-end: 32 }
 
 dsv4-fp4-b300-vllm-mtp:
   image: vllm/vllm-openai:v0.20.0-cu130
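
Closing notes on the series, derived from the scripts above (values marked as illustrative are not CI defaults).

The extra-config heredoc in benchmarks/single_node/dsv4_fp4_b200_trt.sh splices $ATTENTION_DP_CONFIG into the YAML only when DP_ATTENTION=true. As a worked expansion for a hypothetical dp-attn sweep point with CONC=64 and the default KV_CACHE_FREE_MEM_FRACTION of 0.50, the emitted dsv4-fp4-trt.yml would read:

    cuda_graph_config:
      enable_padding: true
      max_batch_size: 64
    enable_attention_dp: true
    attention_dp_config:
      batching_wait_iters: 0
      enable_balance: true
      timeout_iters: 60
    print_iter_log: true
    kv_cache_config:
      tokens_per_block: 128
      dtype: fp8
      free_gpu_memory_fraction: 0.50
      enable_block_reuse: false
    stream_interval: 10
    num_postprocess_workers: 4
    moe_config:
      backend: TRTLLM

For the 8k1k sweep the derived token budgets work out to MAX_NUM_TOKENS = max(8192 + 1024 + 256, 8192) = 9472, while MAX_MODEL_LEN is clamped to at least 8192.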
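
The TRTLLM_DSV4_* knobs introduced across these patches compose into a single launch path. A minimal local smoke-run sketch, assuming a repo checkout; the benchmark inputs below (RANDOM_RANGE_RATIO, the concurrency point, the result name) are illustrative placeholders:

    # 0 = skip the source-build bootstrap, force = always rebuild,
    # anything else = build only if the image lacks DSv4 support.
    export TRTLLM_DSV4_BOOTSTRAP=0
    # 1 = wrap trtllm-serve in a local single-rank mpirun, 0 = launch directly.
    export TRTLLM_DSV4_USE_MPIRUN=1
    # 1 = unset SLURM_*/PMI*/PMIX*/OMPI_*/ORTE_* before launching the server.
    export TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV=1

    MODEL=deepseek-ai/DeepSeek-V4-Pro TP=8 EP_SIZE=8 DP_ATTENTION=true \
    CONC=8 ISL=1024 OSL=1024 MAX_MODEL_LEN=8192 RANDOM_RANGE_RATIO=0.8 \
    RESULT_FILENAME=dsv4_trt_smoke \
    bash benchmarks/single_node/dsv4_fp4_b200_trt.sh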
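
Finally, the configure_enroot_ghcr_auth helpers single-quote their printf format deliberately: the credentials file then stores a parameter reference rather than the raw secret, so the token is resolved only when enroot reads the file (this assumes enroot's parameter expansion of .credentials). With GHCR_USER resolved to its value, the written entry is:

    machine ghcr.io login <resolved GHCR_USER> password $GHCR_TOKEN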