From ad9ac4841b17b20e1bb4d3cac3fc8b17165f6537 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 29 Apr 2026 17:34:36 -0700 Subject: [PATCH 01/14] Add DSv4 TRT B200/300 test --- .github/configs/nvidia-master.yaml | 38 +++++++ benchmarks/single_node/dsv4_fp4_b200_trt.sh | 116 ++++++++++++++++++++ benchmarks/single_node/dsv4_fp4_b300_trt.sh | 7 ++ perf-changelog.yaml | 9 ++ 4 files changed, 170 insertions(+) create mode 100644 benchmarks/single_node/dsv4_fp4_b200_trt.sh create mode 100644 benchmarks/single_node/dsv4_fp4_b300_trt.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a841cb704..7680ba232 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1727,6 +1727,26 @@ dsv4-fp4-b200-vllm: - { tp: 8, conc-start: 1, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } +# DeepSeek-V4-Pro TRTLLM bring-up. Keep this TP-only and below the eval +# threshold until TRTLLM has a DSv4 chat-template/parser path wired in this repo. +dsv4-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b200-dsv4 + precision: fp4 + framework: trt + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4 # B200 SGLang recipe as-is until B300-specific tuning is available. @@ -2540,6 +2560,24 @@ dsv4-fp4-b300-vllm: - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } +dsv4-fp4-b300-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: trt + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + dsv4-fp4-b300-vllm-mtp: image: vllm/vllm-openai:v0.20.0-cu130 model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh new file mode 100644 index 000000000..115fe71f0 --- /dev/null +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash + +# DeepSeek-V4-Pro single-node TRTLLM bring-up recipe. This intentionally starts +# with low-concurrency TP-only STP points; DSv4 has no Jinja chat template and +# TRTLLM does not currently have a DSv4-specific chat parser wired here. + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + MAX_MODEL_LEN \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + DP_ATTENTION \ + EP_SIZE + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" + +if [[ "$MODEL" != /* ]]; then + hf download "$MODEL" +fi + +nvidia-smi + +SERVER_LOG="$PWD/server.log" +PORT=${PORT:-8888} +EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml" + +MOE_BACKEND="TRTLLM" +MAX_BATCH_SIZE=$(( CONC > 8 ? 
CONC : 8 )) +CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" +KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.80}" + +if [[ "$DP_ATTENTION" == "true" ]]; then + echo "DSv4 TRTLLM bring-up only supports TP-only search-space entries for now." >&2 + exit 1 +fi + +cat > "$EXTRA_CONFIG_FILE" << EOF +cuda_graph_config: + enable_padding: true + max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE +enable_attention_dp: false +print_iter_log: true +kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION + enable_block_reuse: false +stream_interval: 10 +num_postprocess_workers: 4 +moe_config: + backend: $MOE_BACKEND +EOF + +echo "Generated config file contents:" +cat "$EXTRA_CONFIG_FILE" + +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) +MAX_NUM_TOKENS=$(( ISL + OSL + 256 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + +start_gpu_monitor --output "$PWD/gpu_metrics.csv" + +set -x +mpirun -n 1 --oversubscribe --allow-run-as-root \ + trtllm-serve "$MODEL" --port="$PORT" \ + --trust_remote_code \ + --backend=pytorch \ + --max_batch_size="$MAX_BATCH_SIZE" \ + --max_seq_len="$MAX_MODEL_LEN" \ + --max_num_tokens="$MAX_NUM_TOKENS" \ + --tp_size="$TP" \ + --ep_size="$EP_SIZE" \ + --extra_llm_api_options="$EXTRA_CONFIG_FILE" \ + > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$(( CONC * 10 ))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir "$PWD/" \ + --dsv4 \ + --trust-remote-code \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh new file mode 100644 index 000000000..9ced0f972 --- /dev/null +++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +# B300 uses the same low-concurrency TRTLLM bring-up recipe as B200. The B300 +# runner may rewrite MODEL to the pre-staged /data/models/dsv4-pro path before +# this script is invoked. 
+ +bash "$(dirname "$0")/dsv4_fp4_b200_trt.sh" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 422d5347f..502d93eda 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2006,3 +2006,12 @@ - "Change image to vllm/vllm-openai:v0.20.0-cu130" - "Use Mega MoE for DEP configs" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1221 + +- config-keys: + - dsv4-fp4-b200-trt + - dsv4-fp4-b300-trt + description: + - "Add DeepSeek-V4-Pro FP4 TRTLLM single-node bring-up configs on B200 and B300" + - "Use TensorRT-LLM release 1.2.0rc6.post2 with TP8, EP1, STP-only conc 1-8 for 1k1k and 8k1k" + - "Benchmark prompts use the repository DSv4 encoder via --dsv4; evals are intentionally avoided by keeping conc below the eval threshold until TRTLLM DSv4 chat handling is wired" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX From 488ab3d3a97cbc92ccf672fad02a3971151e1c1c Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 29 Apr 2026 19:12:49 -0700 Subject: [PATCH 02/14] fix: use TensorRT-LLM DeepSeek-V4 branch image --- .github/configs/nvidia-master.yaml | 17 ++-- benchmarks/benchmark_lib.sh | 12 ++- benchmarks/single_node/dsv4_fp4_b200_trt.sh | 43 +++++---- perf-changelog.yaml | 4 +- utils/bench_serving/backend_request_func.py | 3 +- utils/build_trtllm_deepseek_v4_image.sh | 99 +++++++++++++++++++++ 6 files changed, 148 insertions(+), 30 deletions(-) create mode 100755 utils/build_trtllm_deepseek_v4_image.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c03b54f05..e6a70bbd5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1727,10 +1727,11 @@ dsv4-fp4-b200-vllm: - { tp: 8, conc-start: 1, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } -# DeepSeek-V4-Pro TRTLLM bring-up. Keep this TP-only and below the eval -# threshold until TRTLLM has a DSv4 chat-template/parser path wired in this repo. +# DeepSeek-V4-Pro TRTLLM bring-up. Requires a TensorRT-LLM image built from +# NVIDIA/TensorRT-LLM@feat/deepseek_v4; public release images do not include +# the DSv4 model, sparse attention, tokenizer, and cache-manager code. dsv4-fp4-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -1741,11 +1742,11 @@ dsv4-fp4-b200-trt: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. 
@@ -2586,7 +2587,7 @@ dsv4-fp4-b300-vllm: - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } dsv4-fp4-b300-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -2597,11 +2598,11 @@ dsv4-fp4-b300-trt: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } dsv4-fp4-b300-vllm-mtp: image: vllm/vllm-openai:v0.20.0-cu130 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 268745735..d5323c599 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -165,11 +165,12 @@ wait_for_server_ready() { } # Run benchmark serving with standardized parameters -# All parameters are required except --use-chat-template, --dsv4, and --trust-remote-code +# All parameters are required except --endpoint, --use-chat-template, --dsv4, and --trust-remote-code # Parameters: # --model: Model name # --port: Server port # --backend: Backend type - e.g., 'vllm' or 'openai' +# --endpoint: Optional API endpoint override # --input-len: Random input sequence length # --output-len: Random output sequence length # --random-range-ratio: Random range ratio @@ -194,6 +195,7 @@ run_benchmark_serving() { local model="" local port="" local backend="" + local endpoint="" local input_len="" local output_len="" local random_range_ratio="" @@ -221,6 +223,10 @@ run_benchmark_serving() { backend="$2" shift 2 ;; + --endpoint) + endpoint="$2" + shift 2 + ;; --input-len) input_len="$2" shift 2 @@ -356,6 +362,10 @@ run_benchmark_serving() { --result-dir "$result_dir" --result-filename "$result_filename.json" ) + + if [[ -n "$endpoint" ]]; then + benchmark_cmd+=(--endpoint "$endpoint") + fi # Add --use-chat-template if requested if [[ "$use_chat_template" == true ]]; then diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh index 115fe71f0..d33ce6fde 100644 --- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -1,8 +1,7 @@ #!/usr/bin/env bash -# DeepSeek-V4-Pro single-node TRTLLM bring-up recipe. This intentionally starts -# with low-concurrency TP-only STP points; DSv4 has no Jinja chat template and -# TRTLLM does not currently have a DSv4-specific chat parser wired here. +# DeepSeek-V4-Pro single-node TRTLLM bring-up recipe for NVIDIA/TensorRT-LLM +# feat/deepseek_v4. The public release images do not contain this model path. source "$(dirname "$0")/../benchmark_lib.sh" @@ -35,22 +34,27 @@ PORT=${PORT:-8888} EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml" MOE_BACKEND="TRTLLM" -MAX_BATCH_SIZE=$(( CONC > 8 ? CONC : 8 )) +MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 )) CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" -KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.80}" +KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}" +ATTENTION_DP_CONFIG="" if [[ "$DP_ATTENTION" == "true" ]]; then - echo "DSv4 TRTLLM bring-up only supports TP-only search-space entries for now." 
>&2 - exit 1 + ATTENTION_DP_CONFIG=" +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60" fi cat > "$EXTRA_CONFIG_FILE" << EOF cuda_graph_config: enable_padding: true max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE -enable_attention_dp: false +enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG print_iter_log: true kv_cache_config: + tokens_per_block: 128 dtype: fp8 free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION enable_block_reuse: false @@ -77,15 +81,18 @@ start_gpu_monitor --output "$PWD/gpu_metrics.csv" set -x mpirun -n 1 --oversubscribe --allow-run-as-root \ - trtllm-serve "$MODEL" --port="$PORT" \ + trtllm-serve "$MODEL" \ + --host 0.0.0.0 \ + --port "$PORT" \ --trust_remote_code \ - --backend=pytorch \ - --max_batch_size="$MAX_BATCH_SIZE" \ - --max_seq_len="$MAX_MODEL_LEN" \ - --max_num_tokens="$MAX_NUM_TOKENS" \ - --tp_size="$TP" \ - --ep_size="$EP_SIZE" \ - --extra_llm_api_options="$EXTRA_CONFIG_FILE" \ + --backend pytorch \ + --max_batch_size "$MAX_BATCH_SIZE" \ + --max_seq_len "$MAX_MODEL_LEN" \ + --max_num_tokens "$MAX_NUM_TOKENS" \ + --tp_size "$TP" \ + --ep_size "$EP_SIZE" \ + --custom_tokenizer deepseek_v4 \ + --config "$EXTRA_CONFIG_FILE" \ > "$SERVER_LOG" 2>&1 & SERVER_PID=$! @@ -95,7 +102,8 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ - --backend openai \ + --backend openai-chat \ + --endpoint /v1/chat/completions \ --input-len "$ISL" \ --output-len "$OSL" \ --random-range-ratio "$RANDOM_RANGE_RATIO" \ @@ -103,7 +111,6 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir "$PWD/" \ - --dsv4 \ --trust-remote-code \ --server-pid "$SERVER_PID" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 0ce2f2760..d1280468e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2028,6 +2028,6 @@ - dsv4-fp4-b300-trt description: - "Add DeepSeek-V4-Pro FP4 TRTLLM single-node bring-up configs on B200 and B300" - - "Use TensorRT-LLM release 1.2.0rc6.post2 with TP8, EP1, STP-only conc 1-8 for 1k1k and 8k1k" - - "Benchmark prompts use the repository DSv4 encoder via --dsv4; evals are intentionally avoided by keeping conc below the eval threshold until TRTLLM DSv4 chat handling is wired" + - "Use a TensorRT-LLM image built from NVIDIA/TensorRT-LLM@feat/deepseek_v4 (f1c5fe1) with TP8, EP8, STP-only conc 1-8 for 1k1k and 8k1k" + - "Serve DeepSeek-V4-Pro through TRTLLM's deepseek_v4 tokenizer and /v1/chat/completions; include utils/build_trtllm_deepseek_v4_image.sh for building the required branch image" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1233 diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index af030720e..7f4a93284 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -341,8 +341,9 @@ async def async_request_openai_chat_completions( async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: - content = [{"type": "text", "text": request_func_input.prompt}] + content = request_func_input.prompt if request_func_input.multi_modal_content: + content = [{"type": "text", "text": request_func_input.prompt}] content.append(request_func_input.multi_modal_content) payload = { "model": request_func_input.model_name \ diff --git a/utils/build_trtllm_deepseek_v4_image.sh b/utils/build_trtllm_deepseek_v4_image.sh new file mode 100755 index 
000000000..883f310c0 --- /dev/null +++ b/utils/build_trtllm_deepseek_v4_image.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash + +set -euo pipefail + +TRTLLM_REPO="${TRTLLM_REPO:-https://github.com/NVIDIA/TensorRT-LLM.git}" +TRTLLM_REF="${TRTLLM_REF:-feat/deepseek_v4}" +TRTLLM_COMMIT="${TRTLLM_COMMIT:-f1c5fe143febb70cd74f0fb4ccca1516206268d7}" +IMAGE_WITH_TAG="${IMAGE_WITH_TAG:-ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1}" +CUDA_ARCHS="${CUDA_ARCHS:-100-real;103-real}" +PUSH="${PUSH:-0}" +KEEP_SRC="${KEEP_SRC:-0}" + +require_cmd() { + if ! command -v "$1" >/dev/null 2>&1; then + echo "Missing required command: $1" >&2 + exit 1 + fi +} + +to_enroot_image() { + local image="$1" + local registry="${image%%/*}" + local rest="${image#*/}" + + if [[ "$image" == "$rest" ]]; then + printf '%s\n' "$image" + elif [[ "$registry" == *.* || "$registry" == *:* || "$registry" == "localhost" ]]; then + printf '%s#%s\n' "$registry" "$rest" + else + printf '%s\n' "$image" + fi +} + +require_cmd docker +require_cmd git +require_cmd make + +if ! docker buildx version >/dev/null 2>&1; then + echo "docker buildx is required to build TensorRT-LLM release images." >&2 + exit 1 +fi + +if ! git lfs version >/dev/null 2>&1; then + echo "git-lfs is required. Install it, then rerun this script." >&2 + exit 1 +fi + +WORKDIR="" +if [[ -n "${TRTLLM_SRC_DIR:-}" ]]; then + SRC_DIR="$TRTLLM_SRC_DIR" +else + WORKDIR="$(mktemp -d "${TMPDIR:-/tmp}/trtllm-dsv4-build.XXXXXX")" + SRC_DIR="$WORKDIR/TensorRT-LLM" +fi + +cleanup() { + if [[ -n "$WORKDIR" && "$KEEP_SRC" != "1" ]]; then + rm -rf "$WORKDIR" + elif [[ -n "$WORKDIR" ]]; then + echo "Keeping TensorRT-LLM checkout at $SRC_DIR" + fi +} +trap cleanup EXIT + +if [[ ! -d "$SRC_DIR/.git" ]]; then + git clone --recurse-submodules --branch "$TRTLLM_REF" "$TRTLLM_REPO" "$SRC_DIR" +fi + +cd "$SRC_DIR" +git fetch origin "$TRTLLM_REF" +git checkout -B "$TRTLLM_REF" "origin/$TRTLLM_REF" 2>/dev/null || git checkout "$TRTLLM_REF" +if [[ -n "$TRTLLM_COMMIT" ]]; then + git checkout "$TRTLLM_COMMIT" +fi +git submodule update --init --recursive +git lfs install --local +git lfs pull + +ACTUAL_COMMIT="$(git rev-parse HEAD)" + +echo "Building TensorRT-LLM DeepSeek-V4 image" +echo " source: $TRTLLM_REPO" +echo " ref: $TRTLLM_REF" +echo " commit: $ACTUAL_COMMIT" +echo " image: $IMAGE_WITH_TAG" +echo " archs: $CUDA_ARCHS" + +make -C docker release_build \ + IMAGE_WITH_TAG="$IMAGE_WITH_TAG" \ + CUDA_ARCHS="$CUDA_ARCHS" \ + GIT_COMMIT="$ACTUAL_COMMIT" + +if [[ "$PUSH" == "1" ]]; then + docker push "$IMAGE_WITH_TAG" +fi + +echo +echo "Docker image: $IMAGE_WITH_TAG" +echo "InferenceX/enroot image string: $(to_enroot_image "$IMAGE_WITH_TAG")" From e079fb7495add73def7130a6a977a20685a98805 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 29 Apr 2026 19:26:58 -0700 Subject: [PATCH 03/14] fix: point DeepSeek V4 image to correct org --- .github/configs/nvidia-master.yaml | 4 ++-- utils/build_trtllm_deepseek_v4_image.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e6a70bbd5..e1b3fe5e6 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1731,7 +1731,7 @@ dsv4-fp4-b200-vllm: # NVIDIA/TensorRT-LLM@feat/deepseek_v4; public release images do not include # the DSv4 model, sparse attention, tokenizer, and cache-manager code. 
dsv4-fp4-b200-trt: - image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 + image: ghcr.io#semianalysiswork/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -2587,7 +2587,7 @@ dsv4-fp4-b300-vllm: - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } dsv4-fp4-b300-trt: - image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 + image: ghcr.io#semianalysiswork/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/utils/build_trtllm_deepseek_v4_image.sh b/utils/build_trtllm_deepseek_v4_image.sh index 883f310c0..dfff4b80e 100755 --- a/utils/build_trtllm_deepseek_v4_image.sh +++ b/utils/build_trtllm_deepseek_v4_image.sh @@ -5,7 +5,7 @@ set -euo pipefail TRTLLM_REPO="${TRTLLM_REPO:-https://github.com/NVIDIA/TensorRT-LLM.git}" TRTLLM_REF="${TRTLLM_REF:-feat/deepseek_v4}" TRTLLM_COMMIT="${TRTLLM_COMMIT:-f1c5fe143febb70cd74f0fb4ccca1516206268d7}" -IMAGE_WITH_TAG="${IMAGE_WITH_TAG:-ghcr.io/semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1}" +IMAGE_WITH_TAG="${IMAGE_WITH_TAG:-ghcr.io/semianalysiswork/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1}" CUDA_ARCHS="${CUDA_ARCHS:-100-real;103-real}" PUSH="${PUSH:-0}" KEEP_SRC="${KEEP_SRC:-0}" From 6a949a6eee3919b61020b1224c531d3311ec1f3e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 29 Apr 2026 21:33:45 -0700 Subject: [PATCH 04/14] Use runtime TensorRT-LLM DSv4 bootstrap --- .github/configs/nvidia-master.yaml | 10 +- benchmarks/single_node/dsv4_fp4_b200_trt.sh | 6 +- .../single_node/trtllm_dsv4_bootstrap.sh | 113 ++++++++++++++++++ perf-changelog.yaml | 4 +- utils/build_trtllm_deepseek_v4_image.sh | 99 --------------- 5 files changed, 125 insertions(+), 107 deletions(-) create mode 100644 benchmarks/single_node/trtllm_dsv4_bootstrap.sh delete mode 100755 utils/build_trtllm_deepseek_v4_image.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e1b3fe5e6..d66ddaede 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1727,11 +1727,11 @@ dsv4-fp4-b200-vllm: - { tp: 8, conc-start: 1, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } -# DeepSeek-V4-Pro TRTLLM bring-up. Requires a TensorRT-LLM image built from -# NVIDIA/TensorRT-LLM@feat/deepseek_v4; public release images do not include -# the DSv4 model, sparse attention, tokenizer, and cache-manager code. +# DeepSeek-V4-Pro TRTLLM bring-up. Public release/devel tags do not include +# the DSv4 branch yet, so the benchmark script bootstraps +# NVIDIA/TensorRT-LLM@feat/deepseek_v4 inside the official devel image. 
dsv4-fp4-b200-trt: - image: ghcr.io#semianalysiswork/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 + image: nvcr.io#nvidia/tensorrt-llm/devel:1.3.0rc13 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -2587,7 +2587,7 @@ dsv4-fp4-b300-vllm: - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } dsv4-fp4-b300-trt: - image: ghcr.io#semianalysiswork/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 + image: nvcr.io#nvidia/tensorrt-llm/devel:1.3.0rc13 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh index d33ce6fde..dcdafa479 100644 --- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -1,9 +1,11 @@ #!/usr/bin/env bash # DeepSeek-V4-Pro single-node TRTLLM bring-up recipe for NVIDIA/TensorRT-LLM -# feat/deepseek_v4. The public release images do not contain this model path. +# feat/deepseek_v4. The public release/devel images do not contain this model +# path yet, so the script builds and installs the pinned branch under /tmp. source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/trtllm_dsv4_bootstrap.sh" check_env_vars \ MODEL \ @@ -23,6 +25,8 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" +bootstrap_trtllm_dsv4 || exit 1 + if [[ "$MODEL" != /* ]]; then hf download "$MODEL" fi diff --git a/benchmarks/single_node/trtllm_dsv4_bootstrap.sh b/benchmarks/single_node/trtllm_dsv4_bootstrap.sh new file mode 100644 index 000000000..0074e08ed --- /dev/null +++ b/benchmarks/single_node/trtllm_dsv4_bootstrap.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash + +# Build and install the TensorRT-LLM DeepSeek-V4 feature branch at runtime. +# This avoids relying on a custom prebuilt image while still picking up the +# branch's required C++/CUDA kernels and Python model/tokenizer code. 
+ +trtllm_dsv4_supported() { + python3 - <<'PY' +import importlib +import sys + +try: + import tensorrt_llm # noqa: F401 + import torch + + importlib.import_module("tensorrt_llm._torch.models.modeling_deepseekv4") + importlib.import_module( + "tensorrt_llm._torch.attention_backend.sparse.deepseek_v4.deepseek_v4" + ) + getattr(torch.ops.trtllm, "compressor_prefill_reduction") + getattr(torch.ops.trtllm, "compressor_paged_kv_compress") + getattr(torch.ops.trtllm, "compressor_postprocess_scatter") +except Exception as exc: + print(f"TensorRT-LLM DeepSeek-V4 support check failed: {exc}", file=sys.stderr) + raise SystemExit(1) +PY +} + +bootstrap_trtllm_dsv4() { + if [[ "${TRTLLM_DSV4_BOOTSTRAP:-auto}" == "0" ]]; then + echo "TRTLLM_DSV4_BOOTSTRAP=0; skipping TensorRT-LLM DeepSeek-V4 bootstrap" + return 0 + fi + + if [[ "${TRTLLM_DSV4_BOOTSTRAP:-auto}" != "force" ]] && trtllm_dsv4_supported; then + echo "TensorRT-LLM DeepSeek-V4 support already available" + return 0 + fi + + local repo="${TRTLLM_DSV4_REPO:-https://github.com/NVIDIA/TensorRT-LLM.git}" + local branch="${TRTLLM_DSV4_BRANCH:-feat/deepseek_v4}" + local ref="${TRTLLM_DSV4_REF:-f1c5fe143febb70cd74f0fb4ccca1516206268d7}" + local src="${TRTLLM_DSV4_SRC:-/tmp/trtllm-dsv4-src}" + local build_dir="${TRTLLM_DSV4_BUILD_DIR:-/tmp/trtllm-dsv4-build}" + local dist_dir="${TRTLLM_DSV4_DIST_DIR:-/tmp/trtllm-dsv4-wheel}" + local archs="${TRTLLM_DSV4_CUDA_ARCHITECTURES:-100-real;103-real}" + local lock_file="${TRTLLM_DSV4_LOCK_FILE:-/tmp/trtllm-dsv4-bootstrap.lock}" + + echo "Bootstrapping TensorRT-LLM DeepSeek-V4 support" + echo " repo: $repo" + echo " branch: $branch" + echo " ref: $ref" + echo " archs: $archs" + + if ! command -v git >/dev/null 2>&1; then + if command -v apt-get >/dev/null 2>&1; then + apt-get update + apt-get install -y git + else + echo "git is required to bootstrap TensorRT-LLM DeepSeek-V4 support" >&2 + return 1 + fi + fi + + ( + set -euo pipefail + flock 9 + + if [[ "${TRTLLM_DSV4_BOOTSTRAP:-auto}" != "force" ]] && trtllm_dsv4_supported; then + echo "TensorRT-LLM DeepSeek-V4 support became available while waiting for bootstrap lock" + exit 0 + fi + + if [[ ! 
-d "$src/.git" ]]; then + rm -rf "$src" + git clone \ + --filter=blob:none \ + --single-branch \ + --branch "$branch" \ + "$repo" "$src" + fi + + cd "$src" + git fetch origin "$branch" --depth 1 + git fetch origin "$ref" --depth 1 || true + git checkout "$ref" + git submodule update --init --recursive --depth 1 + + if command -v git-lfs >/dev/null 2>&1; then + git lfs install --local + git lfs pull + else + echo "git-lfs not found; continuing without LFS pull" + fi + + rm -rf "$dist_dir" + mkdir -p "$dist_dir" + + python3 scripts/build_wheel.py \ + --cuda_architectures "$archs" \ + --build_dir "$build_dir" \ + --dist_dir "$dist_dir" \ + --clean \ + --skip-stubs \ + ${TRTLLM_DSV4_BUILD_ARGS:-} + + local wheel + wheel="$(ls -t "$dist_dir"/tensorrt_llm*.whl | head -1)" + python3 -m pip install --force-reinstall --no-deps "$wheel" + ) 9>"$lock_file" + + trtllm_dsv4_supported +} diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d1280468e..3b3bb5d2f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2028,6 +2028,6 @@ - dsv4-fp4-b300-trt description: - "Add DeepSeek-V4-Pro FP4 TRTLLM single-node bring-up configs on B200 and B300" - - "Use a TensorRT-LLM image built from NVIDIA/TensorRT-LLM@feat/deepseek_v4 (f1c5fe1) with TP8, EP8, STP-only conc 1-8 for 1k1k and 8k1k" - - "Serve DeepSeek-V4-Pro through TRTLLM's deepseek_v4 tokenizer and /v1/chat/completions; include utils/build_trtllm_deepseek_v4_image.sh for building the required branch image" + - "Use nvcr.io#nvidia/tensorrt-llm/devel:1.3.0rc13 and build/install NVIDIA/TensorRT-LLM@feat/deepseek_v4 (f1c5fe1) at runtime under /tmp" + - "Serve DeepSeek-V4-Pro through TRTLLM's deepseek_v4 tokenizer and /v1/chat/completions with TP8, EP8, STP-only conc 1-8 for 1k1k and 8k1k" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1233 diff --git a/utils/build_trtllm_deepseek_v4_image.sh b/utils/build_trtllm_deepseek_v4_image.sh deleted file mode 100755 index dfff4b80e..000000000 --- a/utils/build_trtllm_deepseek_v4_image.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -TRTLLM_REPO="${TRTLLM_REPO:-https://github.com/NVIDIA/TensorRT-LLM.git}" -TRTLLM_REF="${TRTLLM_REF:-feat/deepseek_v4}" -TRTLLM_COMMIT="${TRTLLM_COMMIT:-f1c5fe143febb70cd74f0fb4ccca1516206268d7}" -IMAGE_WITH_TAG="${IMAGE_WITH_TAG:-ghcr.io/semianalysiswork/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1}" -CUDA_ARCHS="${CUDA_ARCHS:-100-real;103-real}" -PUSH="${PUSH:-0}" -KEEP_SRC="${KEEP_SRC:-0}" - -require_cmd() { - if ! command -v "$1" >/dev/null 2>&1; then - echo "Missing required command: $1" >&2 - exit 1 - fi -} - -to_enroot_image() { - local image="$1" - local registry="${image%%/*}" - local rest="${image#*/}" - - if [[ "$image" == "$rest" ]]; then - printf '%s\n' "$image" - elif [[ "$registry" == *.* || "$registry" == *:* || "$registry" == "localhost" ]]; then - printf '%s#%s\n' "$registry" "$rest" - else - printf '%s\n' "$image" - fi -} - -require_cmd docker -require_cmd git -require_cmd make - -if ! docker buildx version >/dev/null 2>&1; then - echo "docker buildx is required to build TensorRT-LLM release images." >&2 - exit 1 -fi - -if ! git lfs version >/dev/null 2>&1; then - echo "git-lfs is required. Install it, then rerun this script." 
>&2 - exit 1 -fi - -WORKDIR="" -if [[ -n "${TRTLLM_SRC_DIR:-}" ]]; then - SRC_DIR="$TRTLLM_SRC_DIR" -else - WORKDIR="$(mktemp -d "${TMPDIR:-/tmp}/trtllm-dsv4-build.XXXXXX")" - SRC_DIR="$WORKDIR/TensorRT-LLM" -fi - -cleanup() { - if [[ -n "$WORKDIR" && "$KEEP_SRC" != "1" ]]; then - rm -rf "$WORKDIR" - elif [[ -n "$WORKDIR" ]]; then - echo "Keeping TensorRT-LLM checkout at $SRC_DIR" - fi -} -trap cleanup EXIT - -if [[ ! -d "$SRC_DIR/.git" ]]; then - git clone --recurse-submodules --branch "$TRTLLM_REF" "$TRTLLM_REPO" "$SRC_DIR" -fi - -cd "$SRC_DIR" -git fetch origin "$TRTLLM_REF" -git checkout -B "$TRTLLM_REF" "origin/$TRTLLM_REF" 2>/dev/null || git checkout "$TRTLLM_REF" -if [[ -n "$TRTLLM_COMMIT" ]]; then - git checkout "$TRTLLM_COMMIT" -fi -git submodule update --init --recursive -git lfs install --local -git lfs pull - -ACTUAL_COMMIT="$(git rev-parse HEAD)" - -echo "Building TensorRT-LLM DeepSeek-V4 image" -echo " source: $TRTLLM_REPO" -echo " ref: $TRTLLM_REF" -echo " commit: $ACTUAL_COMMIT" -echo " image: $IMAGE_WITH_TAG" -echo " archs: $CUDA_ARCHS" - -make -C docker release_build \ - IMAGE_WITH_TAG="$IMAGE_WITH_TAG" \ - CUDA_ARCHS="$CUDA_ARCHS" \ - GIT_COMMIT="$ACTUAL_COMMIT" - -if [[ "$PUSH" == "1" ]]; then - docker push "$IMAGE_WITH_TAG" -fi - -echo -echo "Docker image: $IMAGE_WITH_TAG" -echo "InferenceX/enroot image string: $(to_enroot_image "$IMAGE_WITH_TAG")" From 9488f346184df9991ee8431260a4157583d5051e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 29 Apr 2026 22:07:23 -0700 Subject: [PATCH 05/14] Fix TensorRT-LLM DSv4 runtime wheel build --- benchmarks/single_node/trtllm_dsv4_bootstrap.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/trtllm_dsv4_bootstrap.sh b/benchmarks/single_node/trtllm_dsv4_bootstrap.sh index 0074e08ed..55f0b6491 100644 --- a/benchmarks/single_node/trtllm_dsv4_bootstrap.sh +++ b/benchmarks/single_node/trtllm_dsv4_bootstrap.sh @@ -96,12 +96,13 @@ bootstrap_trtllm_dsv4() { rm -rf "$dist_dir" mkdir -p "$dist_dir" + # setup.py sanity-checks for the generated bindings/ stubs directory. + # Do not use --skip-stubs here, or wheel packaging fails after C++ build. python3 scripts/build_wheel.py \ --cuda_architectures "$archs" \ --build_dir "$build_dir" \ --dist_dir "$dist_dir" \ --clean \ - --skip-stubs \ ${TRTLLM_DSV4_BUILD_ARGS:-} local wheel From b0cc6656a49abe1211bb7d597190d26a60e3a36c Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Wed, 29 Apr 2026 22:26:43 -0700 Subject: [PATCH 06/14] Use DeepSeek V4 TRTLLM image --- .github/configs/nvidia-master.yaml | 10 ++--- .github/workflows/benchmark-tmpl.yml | 2 + benchmarks/single_node/dsv4_fp4_b200_trt.sh | 5 ++- .../single_node/trtllm_dsv4_bootstrap.sh | 5 +-- perf-changelog.yaml | 2 +- runners/launch_b200-dgxc.sh | 39 +++++++++++++++++++ runners/launch_b300-nv.sh | 39 +++++++++++++++++++ 7 files changed, 91 insertions(+), 11 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 77e82ee19..126f6f3f1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1727,11 +1727,11 @@ dsv4-fp4-b200-vllm: - { tp: 8, conc-start: 1, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } -# DeepSeek-V4-Pro TRTLLM bring-up. Public release/devel tags do not include -# the DSv4 branch yet, so the benchmark script bootstraps -# NVIDIA/TensorRT-LLM@feat/deepseek_v4 inside the official devel image. +# DeepSeek-V4-Pro TRTLLM bring-up. 
This uses a TensorRT-LLM image built from +# NVIDIA/TensorRT-LLM@feat/deepseek_v4; the benchmark script keeps a guarded +# source-build fallback if the image is missing the required DSv4 support. dsv4-fp4-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/devel:1.3.0rc13 + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -2587,7 +2587,7 @@ dsv4-fp4-b300-vllm: - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } dsv4-fp4-b300-trt: - image: nvcr.io#nvidia/tensorrt-llm/devel:1.3.0rc13 + image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index c38082cbe..012c48975 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -89,6 +89,8 @@ env: DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} EVAL_ONLY: ${{ inputs.eval-only }} + GHCR_TOKEN: ${{ secrets.GHCR_TOKEN || secrets.REPO_PAT }} + GHCR_USER: ${{ secrets.GHCR_USER || github.actor }} PYTHONDONTWRITEBYTECODE: '1' PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh index dcdafa479..ae75b5388 100644 --- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -1,8 +1,9 @@ #!/usr/bin/env bash # DeepSeek-V4-Pro single-node TRTLLM bring-up recipe for NVIDIA/TensorRT-LLM -# feat/deepseek_v4. The public release/devel images do not contain this model -# path yet, so the script builds and installs the pinned branch under /tmp. +# feat/deepseek_v4. The configured image should already contain this branch; +# bootstrap_trtllm_dsv4 verifies that and only builds the pinned branch as a +# fallback. source "$(dirname "$0")/../benchmark_lib.sh" source "$(dirname "$0")/trtllm_dsv4_bootstrap.sh" diff --git a/benchmarks/single_node/trtllm_dsv4_bootstrap.sh b/benchmarks/single_node/trtllm_dsv4_bootstrap.sh index 55f0b6491..fb11aee4e 100644 --- a/benchmarks/single_node/trtllm_dsv4_bootstrap.sh +++ b/benchmarks/single_node/trtllm_dsv4_bootstrap.sh @@ -1,8 +1,7 @@ #!/usr/bin/env bash -# Build and install the TensorRT-LLM DeepSeek-V4 feature branch at runtime. -# This avoids relying on a custom prebuilt image while still picking up the -# branch's required C++/CUDA kernels and Python model/tokenizer code. +# Verify TensorRT-LLM DeepSeek-V4 support and, if needed, build/install the +# pinned feature branch at runtime as a fallback. 
trtllm_dsv4_supported() { python3 - <<'PY' diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 47c46a207..9ce2ee998 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2037,6 +2037,6 @@ - dsv4-fp4-b300-trt description: - "Add DeepSeek-V4-Pro FP4 TRTLLM single-node bring-up configs on B200 and B300" - - "Use nvcr.io#nvidia/tensorrt-llm/devel:1.3.0rc13 and build/install NVIDIA/TensorRT-LLM@feat/deepseek_v4 (f1c5fe1) at runtime under /tmp" + - "Use ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1, built from NVIDIA/TensorRT-LLM@feat/deepseek_v4 (f1c5fe1), with a guarded source-build fallback" - "Serve DeepSeek-V4-Pro through TRTLLM's deepseek_v4 tokenizer and /v1/chat/completions with TP8, EP8, STP-only conc 1-8 for 1k1k and 8k1k" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1233 diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index edf5db957..f81ee0027 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -6,6 +6,45 @@ SLURM_ACCOUNT="benchmark" set -x +configure_enroot_ghcr_auth() { + case "$IMAGE" in + ghcr.io#*|ghcr.io/*) ;; + *) return 0 ;; + esac + + if [[ -z "${GHCR_TOKEN:-}" ]]; then + echo "GHCR_TOKEN is not set; attempting anonymous ghcr.io import" + return 0 + fi + + local config_dir="${ENROOT_CONFIG_PATH:-${XDG_CONFIG_HOME:-$HOME/.config}/enroot}" + local credentials_file="$config_dir/.credentials" + local tmp_file + local ghcr_user="${GHCR_USER:-${GITHUB_ACTOR:-oauth2}}" + local xtrace_was_set=0 + + case "$-" in + *x*) xtrace_was_set=1; set +x ;; + esac + + mkdir -p "$config_dir" + touch "$credentials_file" + chmod 600 "$credentials_file" + tmp_file="$(mktemp "${credentials_file}.XXXXXX")" + grep -v '^machine ghcr\.io ' "$credentials_file" > "$tmp_file" || true + printf 'machine ghcr.io login %s password $GHCR_TOKEN\n' "$ghcr_user" >> "$tmp_file" + mv "$tmp_file" "$credentials_file" + chmod 600 "$credentials_file" + + if [[ "$xtrace_was_set" == "1" ]]; then + set -x + fi + + echo "Configured enroot credentials for ghcr.io" +} + +configure_enroot_ghcr_auth + if [[ "$IS_MULTINODE" == "true" ]]; then # Validate framework diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 3c855e805..795ea38fc 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -6,6 +6,45 @@ SLURM_ACCOUNT="benchmark" set -x +configure_enroot_ghcr_auth() { + case "$IMAGE" in + ghcr.io#*|ghcr.io/*) ;; + *) return 0 ;; + esac + + if [[ -z "${GHCR_TOKEN:-}" ]]; then + echo "GHCR_TOKEN is not set; attempting anonymous ghcr.io import" + return 0 + fi + + local config_dir="${ENROOT_CONFIG_PATH:-${XDG_CONFIG_HOME:-$HOME/.config}/enroot}" + local credentials_file="$config_dir/.credentials" + local tmp_file + local ghcr_user="${GHCR_USER:-${GITHUB_ACTOR:-oauth2}}" + local xtrace_was_set=0 + + case "$-" in + *x*) xtrace_was_set=1; set +x ;; + esac + + mkdir -p "$config_dir" + touch "$credentials_file" + chmod 600 "$credentials_file" + tmp_file="$(mktemp "${credentials_file}.XXXXXX")" + grep -v '^machine ghcr\.io ' "$credentials_file" > "$tmp_file" || true + printf 'machine ghcr.io login %s password $GHCR_TOKEN\n' "$ghcr_user" >> "$tmp_file" + mv "$tmp_file" "$credentials_file" + chmod 600 "$credentials_file" + + if [[ "$xtrace_was_set" == "1" ]]; then + set -x + fi + + echo "Configured enroot credentials for ghcr.io" +} + +configure_enroot_ghcr_auth + if [[ "$IS_MULTINODE" == "true" ]]; then # Validate framework From 8ee56afdc050875d9291b2d02c40802c15246e0b Mon Sep 17 00:00:00 2001 From: 
Oseltamivir Date: Thu, 30 Apr 2026 07:54:06 -0700 Subject: [PATCH 07/14] Use anonymous GHCR pulls by default --- .github/workflows/benchmark-tmpl.yml | 2 +- runners/launch_b200-dgxc.sh | 17 ++++++++++------- runners/launch_b300-nv.sh | 17 ++++++++++------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 012c48975..b8138fe35 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -89,7 +89,7 @@ env: DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} EVAL_ONLY: ${{ inputs.eval-only }} - GHCR_TOKEN: ${{ secrets.GHCR_TOKEN || secrets.REPO_PAT }} + GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }} GHCR_USER: ${{ secrets.GHCR_USER || github.actor }} PYTHONDONTWRITEBYTECODE: '1' PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f81ee0027..3014c16fc 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -12,11 +12,6 @@ configure_enroot_ghcr_auth() { *) return 0 ;; esac - if [[ -z "${GHCR_TOKEN:-}" ]]; then - echo "GHCR_TOKEN is not set; attempting anonymous ghcr.io import" - return 0 - fi - local config_dir="${ENROOT_CONFIG_PATH:-${XDG_CONFIG_HOME:-$HOME/.config}/enroot}" local credentials_file="$config_dir/.credentials" local tmp_file @@ -32,7 +27,11 @@ configure_enroot_ghcr_auth() { chmod 600 "$credentials_file" tmp_file="$(mktemp "${credentials_file}.XXXXXX")" grep -v '^machine ghcr\.io ' "$credentials_file" > "$tmp_file" || true - printf 'machine ghcr.io login %s password $GHCR_TOKEN\n' "$ghcr_user" >> "$tmp_file" + + if [[ -n "${GHCR_TOKEN:-}" ]]; then + printf 'machine ghcr.io login %s password $GHCR_TOKEN\n' "$ghcr_user" >> "$tmp_file" + fi + mv "$tmp_file" "$credentials_file" chmod 600 "$credentials_file" @@ -40,7 +39,11 @@ configure_enroot_ghcr_auth() { set -x fi - echo "Configured enroot credentials for ghcr.io" + if [[ -n "${GHCR_TOKEN:-}" ]]; then + echo "Configured enroot credentials for ghcr.io" + else + echo "GHCR_TOKEN is not set; removed stale ghcr.io credentials for anonymous import" + fi } configure_enroot_ghcr_auth diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 795ea38fc..39111556b 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -12,11 +12,6 @@ configure_enroot_ghcr_auth() { *) return 0 ;; esac - if [[ -z "${GHCR_TOKEN:-}" ]]; then - echo "GHCR_TOKEN is not set; attempting anonymous ghcr.io import" - return 0 - fi - local config_dir="${ENROOT_CONFIG_PATH:-${XDG_CONFIG_HOME:-$HOME/.config}/enroot}" local credentials_file="$config_dir/.credentials" local tmp_file @@ -32,7 +27,11 @@ configure_enroot_ghcr_auth() { chmod 600 "$credentials_file" tmp_file="$(mktemp "${credentials_file}.XXXXXX")" grep -v '^machine ghcr\.io ' "$credentials_file" > "$tmp_file" || true - printf 'machine ghcr.io login %s password $GHCR_TOKEN\n' "$ghcr_user" >> "$tmp_file" + + if [[ -n "${GHCR_TOKEN:-}" ]]; then + printf 'machine ghcr.io login %s password $GHCR_TOKEN\n' "$ghcr_user" >> "$tmp_file" + fi + mv "$tmp_file" "$credentials_file" chmod 600 "$credentials_file" @@ -40,7 +39,11 @@ configure_enroot_ghcr_auth() { set -x fi - echo "Configured enroot credentials for ghcr.io" + if [[ -n "${GHCR_TOKEN:-}" ]]; then + echo "Configured enroot credentials for ghcr.io" + else + echo "GHCR_TOKEN is not set; removed stale ghcr.io credentials for anonymous import" + fi } configure_enroot_ghcr_auth From 
2d48f08cffe0c57bee338a6fbf5737f9818b6fb9 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 30 Apr 2026 11:23:16 -0700 Subject: [PATCH 08/14] Fix DSv4 TRT launch env --- benchmarks/single_node/dsv4_fp4_b200_trt.sh | 17 ++++++++++++++--- benchmarks/single_node/dsv4_fp4_b300_trt.sh | 7 ++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh index ae75b5388..28d148299 100644 --- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -26,6 +26,9 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" +export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}" +echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE" + bootstrap_trtllm_dsv4 || exit 1 if [[ "$MODEL" != /* ]]; then @@ -85,7 +88,7 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" set -x -mpirun -n 1 --oversubscribe --allow-run-as-root \ +SERVE_CMD=( trtllm-serve "$MODEL" \ --host 0.0.0.0 \ --port "$PORT" \ @@ -97,8 +100,16 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --tp_size "$TP" \ --ep_size "$EP_SIZE" \ --custom_tokenizer deepseek_v4 \ - --config "$EXTRA_CONFIG_FILE" \ - > "$SERVER_LOG" 2>&1 & + --config "$EXTRA_CONFIG_FILE" +) + +if [[ "${TRTLLM_DSV4_USE_MPIRUN:-1}" == "0" ]]; then + "${SERVE_CMD[@]}" > "$SERVER_LOG" 2>&1 & +else + mpirun -n 1 --oversubscribe --allow-run-as-root \ + "${SERVE_CMD[@]}" \ + > "$SERVER_LOG" 2>&1 & +fi SERVER_PID=$! diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh index 9ced0f972..03791dcd6 100644 --- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh @@ -2,6 +2,11 @@ # B300 uses the same low-concurrency TRTLLM bring-up recipe as B200. The B300 # runner may rewrite MODEL to the pre-staged /data/models/dsv4-pro path before -# this script is invoked. +# this script is invoked. The job itself is already launched under srun; keep +# mpirun local so OpenMPI does not try to use Slurm PMI/PMIx from inside pyxis. + +export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-1}" +export OMPI_MCA_plm="${OMPI_MCA_plm:-isolated}" +export OMPI_MCA_ras="${OMPI_MCA_ras:-^slurm}" bash "$(dirname "$0")/dsv4_fp4_b200_trt.sh" From 6e75819d7b60f8e8c0d9319e137a89ae7e5c5a72 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 30 Apr 2026 11:54:35 -0700 Subject: [PATCH 09/14] Bypass mpirun for B300 DSv4 TRT --- benchmarks/single_node/dsv4_fp4_b300_trt.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh index 03791dcd6..3c1763835 100644 --- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh @@ -2,11 +2,9 @@ # B300 uses the same low-concurrency TRTLLM bring-up recipe as B200. The B300 # runner may rewrite MODEL to the pre-staged /data/models/dsv4-pro path before -# this script is invoked. The job itself is already launched under srun; keep -# mpirun local so OpenMPI does not try to use Slurm PMI/PMIx from inside pyxis. +# this script is invoked. The job itself is already launched under srun/pyxis; +# avoid nested mpirun because this cluster's OpenMPI build lacks Slurm PMIx. 
-export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-1}" -export OMPI_MCA_plm="${OMPI_MCA_plm:-isolated}" -export OMPI_MCA_ras="${OMPI_MCA_ras:-^slurm}" +export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-0}" bash "$(dirname "$0")/dsv4_fp4_b200_trt.sh" From e1e762dfbefe25a65eda6f08f5267a8199479a4a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 30 Apr 2026 12:35:00 -0700 Subject: [PATCH 10/14] larger sweep + mpi --- .github/configs/nvidia-master.yaml | 10 ++++++---- .github/workflows/benchmark-tmpl.yml | 5 +++++ runners/launch_b300-nv.sh | 1 + 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 126f6f3f1..f26e68766 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1742,11 +1742,13 @@ dsv4-fp4-b200-trt: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } + - { tp: 8, conc-start: 1, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } + - { tp: 8, conc-start: 1, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. @@ -2598,11 +2600,11 @@ dsv4-fp4-b300-trt: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 8 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 1, conc-end: 8 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 8 } dsv4-fp4-b300-vllm-mtp: image: vllm/vllm-openai:v0.20.0-cu130 diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index b8138fe35..188aff5cd 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -140,6 +140,11 @@ jobs: rm -f results*.json || true rm -f sample*.jsonl || true + - name: Cleanup stale benchmark outputs (pre-run) + run: | + rm -f server.log || true + rm -f gpu_metrics.csv || true + - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 39111556b..9222bf254 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -346,6 +346,7 @@ else JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) srun --jobid=$JOB_ID \ + --mpi=none \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ --no-container-mount-home \ From 8220f0d785e11c91237d13dfc0d84a1bee7141a2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 30 Apr 2026 15:19:29 -0700 Subject: [PATCH 11/14] mpi --- .github/configs/nvidia-master.yaml | 38 ++++++++++----------- benchmarks/single_node/dsv4_fp4_b200_trt.sh | 17 +++++++++ benchmarks/single_node/dsv4_fp4_b300_trt.sh | 2 ++ 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 72fadb1b8..cbe43d9a9 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1730,25 +1730,25 @@ dsv4-fp4-b200-vllm: # DeepSeek-V4-Pro TRTLLM bring-up. 
This uses a TensorRT-LLM image built from # NVIDIA/TensorRT-LLM@feat/deepseek_v4; the benchmark script keeps a guarded # source-build fallback if the image is missing the required DSv4 support. -dsv4-fp4-b200-trt: - image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: b200-dsv4 - precision: fp4 - framework: trt - multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } +#dsv4-fp4-b200-trt: +# image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1 +# model: deepseek-ai/DeepSeek-V4-Pro +# model-prefix: dsv4 +# runner: b200-dsv4 +# precision: fp4 +# framework: trt +# multinode: false +# seq-len-configs: +# - isl: 1024 +# osl: 1024 +# search-space: +# - { tp: 8, conc-start: 1, conc-end: 32 } +# - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 } +# - isl: 8192 +# osl: 1024 +# search-space: +# - { tp: 8, conc-start: 1, conc-end: 32 } +# - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 128 } # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh index 28d148299..66c800147 100644 --- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -26,6 +26,23 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" +sanitize_slurm_mpi_env_for_trtllm() { + if [[ "${TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV:-0}" != "1" ]]; then + return 0 + fi + + echo "Sanitizing Slurm/PMI environment for TensorRT-LLM direct launch" + while IFS='=' read -r name _; do + case "$name" in + SLURM_*|PMI*|PMIX*|OMPI_*|OPAL_*|ORTE_*) + unset "$name" + ;; + esac + done < <(env) +} + +sanitize_slurm_mpi_env_for_trtllm + export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}" echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE" diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh index 3c1763835..c143386ec 100644 --- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh @@ -6,5 +6,7 @@ # avoid nested mpirun because this cluster's OpenMPI build lacks Slurm PMIx. 
export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-0}" +export TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV="${TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV:-1}" +export TRTLLM_DSV4_BOOTSTRAP="${TRTLLM_DSV4_BOOTSTRAP:-0}" bash "$(dirname "$0")/dsv4_fp4_b200_trt.sh" From fb5d85f3b3817aa8574b2b34a2ab9b52ee2925bb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 30 Apr 2026 15:27:09 -0700 Subject: [PATCH 12/14] b200 perf ok --- perf-changelog.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5fda5e2c5..8b968fc1a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2039,7 +2039,6 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1027 - config-keys: - - dsv4-fp4-b200-trt - dsv4-fp4-b300-trt description: - "Add DeepSeek-V4-Pro FP4 TRTLLM single-node bring-up configs on B200 and B300" From ef7f42cec3469e9a1b7ed38cb401eee0291cf748 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 30 Apr 2026 16:13:04 -0700 Subject: [PATCH 13/14] OPAL --- benchmarks/single_node/dsv4_fp4_b200_trt.sh | 4 ++-- benchmarks/single_node/dsv4_fp4_b300_trt.sh | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/dsv4_fp4_b200_trt.sh index 66c800147..3e9f3fe0a 100644 --- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_trt.sh @@ -31,10 +31,10 @@ sanitize_slurm_mpi_env_for_trtllm() { return 0 fi - echo "Sanitizing Slurm/PMI environment for TensorRT-LLM direct launch" + echo "Sanitizing Slurm/PMI environment for TensorRT-LLM launch" while IFS='=' read -r name _; do case "$name" in - SLURM_*|PMI*|PMIX*|OMPI_*|OPAL_*|ORTE_*) + SLURM_*|PMI*|PMIX*|OMPI_*|ORTE_*) unset "$name" ;; esac diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh index c143386ec..fd4b99be3 100644 --- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh @@ -3,9 +3,10 @@ # B300 uses the same low-concurrency TRTLLM bring-up recipe as B200. The B300 # runner may rewrite MODEL to the pre-staged /data/models/dsv4-pro path before # this script is invoked. The job itself is already launched under srun/pyxis; -# avoid nested mpirun because this cluster's OpenMPI build lacks Slurm PMIx. +# scrub Slurm's PMI environment, then use mpirun to give TRTLLM a valid OpenMPI +# runtime instead of direct-launching under srun. 
-export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-0}"
+export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-1}"
 export TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV="${TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV:-1}"
 export TRTLLM_DSV4_BOOTSTRAP="${TRTLLM_DSV4_BOOTSTRAP:-0}"
 

From 5f409ed753b6b1f67ea6b3f49fdda378c17b22bf Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Thu, 30 Apr 2026 16:52:18 -0700
Subject: [PATCH 14/14] sweep

---
 .github/configs/nvidia-master.yaml | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index cbe43d9a9..17ca6a6a9 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2600,11 +2600,17 @@ dsv4-fp4-b300-trt:
     - isl: 1024
       osl: 1024
       search-space:
-        - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 8 }
+        - { tp: 4, conc-start: 1, conc-end: 64 }
+        - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 512 }
+        - { tp: 8, conc-start: 1, conc-end: 32 }
     - isl: 8192
       osl: 1024
       search-space:
-        - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 8 }
+        - { tp: 4, conc-start: 1, conc-end: 64 }
+        - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 512 }
+        - { tp: 8, conc-start: 1, conc-end: 32 }
 
 dsv4-fp4-b300-vllm-mtp:
   image: vllm/vllm-openai:v0.20.0-cu130
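
Closing notes on the series, derived from the scripts above (values marked as illustrative are not CI defaults).

The extra-config heredoc in benchmarks/single_node/dsv4_fp4_b200_trt.sh splices $ATTENTION_DP_CONFIG into the YAML only when DP_ATTENTION=true. As a worked expansion for a hypothetical dp-attn sweep point with CONC=64 and the default KV_CACHE_FREE_MEM_FRACTION of 0.50, the emitted dsv4-fp4-trt.yml would read:

    cuda_graph_config:
      enable_padding: true
      max_batch_size: 64
    enable_attention_dp: true
    attention_dp_config:
      batching_wait_iters: 0
      enable_balance: true
      timeout_iters: 60
    print_iter_log: true
    kv_cache_config:
      tokens_per_block: 128
      dtype: fp8
      free_gpu_memory_fraction: 0.50
      enable_block_reuse: false
    stream_interval: 10
    num_postprocess_workers: 4
    moe_config:
      backend: TRTLLM

For the 8k1k sweep the derived token budgets work out to MAX_NUM_TOKENS = max(8192 + 1024 + 256, 8192) = 9472, while MAX_MODEL_LEN is clamped to at least 8192.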
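
The TRTLLM_DSV4_* knobs introduced across these patches compose into a single launch path. A minimal local smoke-run sketch, assuming a repo checkout; the benchmark inputs below (RANDOM_RANGE_RATIO, the concurrency point, the result name) are illustrative placeholders:

    # 0 = skip the source-build bootstrap, force = always rebuild,
    # anything else = build only if the image lacks DSv4 support.
    export TRTLLM_DSV4_BOOTSTRAP=0
    # 1 = wrap trtllm-serve in a local single-rank mpirun, 0 = launch directly.
    export TRTLLM_DSV4_USE_MPIRUN=1
    # 1 = unset SLURM_*/PMI*/PMIX*/OMPI_*/ORTE_* before launching the server.
    export TRTLLM_DSV4_SANITIZE_SLURM_MPI_ENV=1

    MODEL=deepseek-ai/DeepSeek-V4-Pro TP=8 EP_SIZE=8 DP_ATTENTION=true \
    CONC=8 ISL=1024 OSL=1024 MAX_MODEL_LEN=8192 RANDOM_RANGE_RATIO=0.8 \
    RESULT_FILENAME=dsv4_trt_smoke \
    bash benchmarks/single_node/dsv4_fp4_b200_trt.sh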
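
Finally, the configure_enroot_ghcr_auth helpers single-quote their printf format deliberately: the credentials file then stores a parameter reference rather than the raw secret, so the token is resolved only when enroot reads the file (this assumes enroot's parameter expansion of .credentials). With GHCR_USER resolved to its value, the written entry is:

    machine ghcr.io login <resolved GHCR_USER> password $GHCR_TOKEN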