diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 6b029001d..58f297834 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -74,6 +74,57 @@ dsr1-fp4-b200-trt:
       - { tp: 8, conc-start: 4, conc-end: 32 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
 
+dsr1-fp4-b200-trt-mtp:
+  image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  model: nvidia/DeepSeek-R1-0528-FP4-V2
+  model-prefix: dsr1
+  runner: b200-trt
+  precision: fp4
+  framework: trt
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # If TP=4:
+        #   If CONC >= 16, then EP=4
+        #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+        - { tp: 4, conc-start: 4, conc-end: 8, spec-decoding: mtp }
+        - { tp: 4, ep: 4, conc-start: 16, conc-end: 64, spec-decoding: mtp }
+        - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+        # If TP=8:
+        #   If CONC >= 16, then EP=8
+        #   If CONC >= 64, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+        - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
+        - { tp: 8, ep: 8, conc-start: 16, conc-end: 32, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+    - isl: 1024
+      osl: 8192
+      search-space:
+        # If TP=4:
+        #   If CONC >= 32, then EP=4
+        #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+        - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
+        - { tp: 4, ep: 4, conc-start: 32, conc-end: 64, spec-decoding: mtp }
+        - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+        # If TP=8:
+        #   If CONC >= 8, then EP=8
+        #   If CONC >= 128, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+        - { tp: 8, conc-start: 4, conc-end: 4, spec-decoding: mtp }
+        - { tp: 8, ep: 8, conc-start: 8, conc-end: 64, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        # If TP=4:
+        #   If CONC >= 32, then EP=4, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+        - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
+        - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp }
+        # If TP=8:
+        #   If CONC >= 32, then EP=8, DP_ATTN=true, MOE_BACKEND=CUTLASS, MTP=1
+        - { tp: 8, conc-start: 4, conc-end: 16, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256, spec-decoding: mtp }
+
 dsr1-fp8-b200-sglang:
   image: lmsysorg/sglang:v0.5.6-cu129-amd64
   model: deepseek-ai/DeepSeek-R1-0528
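Reading note for the search-space blocks above and below: each row pins one parallelism layout (tp/ep/dp-attn plus spec-decoding) to a concurrency window, and the comments restate the thresholds at which the layout switches. A minimal sketch of how one row presumably expands into individual runs, assuming the harness steps concurrency in powers of two from conc-start to conc-end (the stepping policy is not part of this diff):

```bash
# Hypothetical expansion of one row from dsr1-fp4-b200-trt-mtp (isl=1024, osl=1024):
#   - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
tp=4 ep=4 dp_attn=true conc_start=128 conc_end=256 spec=mtp
for (( conc = conc_start; conc <= conc_end; conc *= 2 )); do
    echo "run: TP=$tp EP=$ep DP_ATTENTION=$dp_attn CONC=$conc SPEC_DECODING=$spec"
done
# run: TP=4 EP=4 DP_ATTENTION=true CONC=128 SPEC_DECODING=mtp
# run: TP=4 EP=4 DP_ATTENTION=true CONC=256 SPEC_DECODING=mtp
```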
@@ -123,6 +174,35 @@ dsr1-fp8-b200-trt:
       # If CONC > 64, then DP_ATTN=true
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
 
+dsr1-fp8-b200-trt-mtp:
+  image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: b200-trt
+  precision: fp8
+  framework: trt
+  multinode: false
+  seq-len-configs:
+    # For all sequence lengths, EP=TP, MOE_BACKEND=DEEPGEMM, MTP=3 (or MTP=1 when DP_ATTN=true)
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # If CONC >= 64, then DP_ATTN=true, MTP=1
+        - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+    - isl: 1024
+      osl: 8192
+      search-space:
+        # If CONC >= 128, then DP_ATTN=true, MTP=1
+        - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        # If CONC >= 64, then DP_ATTN=true, MTP=1
+        - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+
 dsr1-fp8-h200-sglang:
   image: lmsysorg/sglang:v0.5.6-cu129-amd64
   model: deepseek-ai/DeepSeek-R1-0528
@@ -172,6 +252,35 @@ dsr1-fp8-h200-trt:
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 }
 
+dsr1-fp8-h200-trt-mtp:
+  image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: h200
+  precision: fp8
+  framework: trt
+  multinode: false
+  # For all sequence lengths, EP=TP, MOE_BACKEND=CUTLASS, MTP=3 (or MTP=1 when DP_ATTN=true)
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # If CONC >= 128, then DP_ATTN=true, MTP=1
+        - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+    - isl: 1024
+      osl: 8192
+      search-space:
+        # If CONC >= 256, then DP_ATTN=true, MTP=1
+        - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        # If CONC >= 64, then DP_ATTN=true, MTP=1
+        - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+
 gptoss-fp4-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2
   model: openai/gpt-oss-120b
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index cc7b81553..fad1c5064 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -92,7 +92,7 @@ wait_for_server_ready() {
 }
 
 # Run benchmark serving with standardized parameters
-# All parameters are required
+# All parameters are required except --use-chat-template
 # Parameters:
 # --model: Model name
 # --port: Server port
@@ -104,6 +104,7 @@ wait_for_server_ready() {
 # --max-concurrency: Max concurrency
 # --result-filename: Result filename without extension
 # --result-dir: Result directory
+# --use-chat-template: Optional flag to enable chat template
 run_benchmark_serving() {
     set +x
     local model=""
@@ -116,6 +117,7 @@ run_benchmark_serving() {
     local max_concurrency=""
     local result_filename=""
     local result_dir=""
+    local use_chat_template=false
 
     # Parse arguments
     while [[ $# -gt 0 ]]; do
@@ -160,6 +162,10 @@ run_benchmark_serving() {
                 result_dir="$2"
                 shift 2
                 ;;
+            --use-chat-template)
+                use_chat_template=true
+                shift
+                ;;
             *)
                 echo "Unknown parameter: $1"
                 return 1
@@ -224,23 +230,33 @@ run_benchmark_serving() {
     local BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
     git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR"
 
+    # Build benchmark command
+    local benchmark_cmd=(
+        python3 "$BENCH_SERVING_DIR/benchmark_serving.py"
+        --model "$model"
+        --backend "$backend"
+        --base-url "http://0.0.0.0:$port"
+        --dataset-name random
+        --random-input-len "$input_len"
+        --random-output-len "$output_len"
+        --random-range-ratio "$random_range_ratio"
+        --num-prompts "$num_prompts"
+        --max-concurrency "$max_concurrency"
+        --request-rate inf
+        --ignore-eos
+        --save-result
+        --percentile-metrics 'ttft,tpot,itl,e2el'
+        --result-dir "$result_dir"
+        --result-filename "$result_filename.json"
+    )
+
+    # Add --use-chat-template if requested
+    if [[ "$use_chat_template" == true ]]; then
+        benchmark_cmd+=(--use-chat-template)
+    fi
+
     # Run benchmark
     set -x
-    python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \
-        --model "$model" \
-        --backend "$backend" \
-        --base-url "http://0.0.0.0:$port" \
-        --dataset-name random \
-        --random-input-len "$input_len" \
-        --random-output-len "$output_len" \
-        --random-range-ratio "$random_range_ratio" \
-        --num-prompts "$num_prompts" \
-        --max-concurrency "$max_concurrency" \
-        --request-rate inf \
-        --ignore-eos \
-        --save-result \
-        --percentile-metrics 'ttft,tpot,itl,e2el' \
-        --result-dir "$result_dir" \
-        --result-filename "$result_filename.json"
+    "${benchmark_cmd[@]}"
     set +x
 }
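With run_benchmark_serving now assembling its command as a bash array, optional flags can be appended without duplicating the long backslash-continued invocation, and the array form avoids the quoting pitfalls of building the command as a string and running it through eval. A hypothetical call site with illustrative values (only --use-chat-template is new in this PR; the MTP scripts below do not pass it):

```bash
source benchmarks/benchmark_lib.sh

run_benchmark_serving \
    --model deepseek-ai/DeepSeek-R1-0528 \
    --port 8888 \
    --backend openai \
    --input-len 1024 \
    --output-len 1024 \
    --random-range-ratio 1.0 \
    --num-prompts 320 \
    --max-concurrency 32 \
    --result-filename dsr1_example \
    --result-dir /workspace/ \
    --use-chat-template
```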
diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
new file mode 100644
index 000000000..52ca1e9e1
--- /dev/null
+++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+
+# === Required Env Vars ===
+# MODEL
+# TP
+# CONC
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# RESULT_FILENAME
+# PORT_OFFSET
+# DP_ATTENTION
+# EP_SIZE
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
+
+hf download $MODEL
+
+# ========= Determine MOE_BACKEND and MTP based on DP_ATTENTION =========
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MOE_BACKEND="CUTLASS"
+    MTP=1
+else
+    MOE_BACKEND="TRTLLM"
+    MTP=3
+fi
+
+echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'"
+
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + $PORT_OFFSET ))
+EXTRA_CONFIG_FILE="dsr1-fp4-mtp.yml"
+
+cat > $EXTRA_CONFIG_FILE << EOF
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: 512
+enable_attention_dp: $DP_ATTENTION
+print_iter_log: true
+kv_cache_config:
+  dtype: fp8
+  free_gpu_memory_fraction: 0.8
+  enable_block_reuse: false
+stream_interval: 10
+moe_config:
+  backend: $MOE_BACKEND
+speculative_config:
+  decoding_type: MTP
+  num_nextn_predict_layers: ${MTP}
+EOF
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    cat << EOF >> $EXTRA_CONFIG_FILE
+attention_dp_config:
+  batching_wait_iters: 0
+  enable_balance: true
+  timeout_iters: 60
+EOF
+fi
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MAX_BATCH_SIZE=$((CONC/TP))
+else
+    MAX_BATCH_SIZE=$CONC
+fi
+
+MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 ))
+
+set -x
+# Launch TRT-LLM server
+mpirun -n 1 --oversubscribe --allow-run-as-root \
+    trtllm-serve $MODEL --port=$PORT \
+    --trust_remote_code \
+    --backend=pytorch \
+    --max_batch_size=$MAX_BATCH_SIZE \
+    --max_seq_len=$MAX_MODEL_LEN \
+    --max_num_tokens=$MAX_NUM_TOKENS \
+    --tp_size=$TP --ep_size=$EP_SIZE \
+    --extra_llm_api_options=$EXTRA_CONFIG_FILE \
+    > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+# Source benchmark utilities
+source "$(dirname "$0")/benchmark_lib.sh"
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend openai \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $(( $CONC * 10 )) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
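The MAX_NUM_TOKENS arithmetic in the script above is dense, so here is one hedged reading: it appears to budget (MTP+1) tokens per sequence for a speculative decode step across the whole batch, plus one full-length prefill (ISL) and 64 tokens of headroom, then round up to a multiple of 64 (the +63 before the integer division performs the round-up). A worked example with values this config actually produces (TP=8, DP_ATTENTION=true, CONC=256, so MTP=1 and MAX_BATCH_SIZE=256/8=32):

```bash
MTP=1 MAX_BATCH_SIZE=32 ISL=1024
echo $(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 ))
# (2*32 + 1024 + 64 + 63) = 1215; 1215/64 = 18 (integer division); 18*64 = 1152
```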
diff --git a/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
new file mode 100644
index 000000000..519e49089
--- /dev/null
+++ b/benchmarks/dsr1_fp8_b200_trt_mtp_slurm.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+
+# === Required Env Vars ===
+# MODEL
+# TP
+# CONC
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# RESULT_FILENAME
+# PORT_OFFSET
+# DP_ATTENTION
+# EP_SIZE
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
+
+hf download $MODEL
+
+# ========= MOE_BACKEND is fixed to DEEPGEMM; MTP depends on DP_ATTENTION =========
+MOE_BACKEND="DEEPGEMM"
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MTP=1
+else
+    MTP=3
+fi
+
+echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'"
+
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + $PORT_OFFSET ))
+EXTRA_CONFIG_FILE="dsr1-fp8-mtp.yml"
+
+cat > $EXTRA_CONFIG_FILE << EOF
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: 256
+enable_attention_dp: $DP_ATTENTION
+print_iter_log: true
+kv_cache_config:
+  dtype: fp8
+  free_gpu_memory_fraction: 0.8
+  enable_block_reuse: false
+stream_interval: 10
+moe_config:
+  backend: $MOE_BACKEND
+speculative_config:
+  decoding_type: MTP
+  num_nextn_predict_layers: ${MTP}
+EOF
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    cat << EOF >> $EXTRA_CONFIG_FILE
+attention_dp_config:
+  batching_wait_iters: 0
+  enable_balance: true
+  timeout_iters: 60
+EOF
+fi
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MAX_BATCH_SIZE=$((CONC/TP))
+else
+    MAX_BATCH_SIZE=$CONC
+fi
+
+MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 ))
+
+set -x
+# Launch TRT-LLM server
+mpirun -n 1 --oversubscribe --allow-run-as-root \
+    trtllm-serve $MODEL --port=$PORT \
+    --trust_remote_code \
+    --backend=pytorch \
+    --max_batch_size=$MAX_BATCH_SIZE \
+    --max_seq_len=$MAX_MODEL_LEN \
+    --max_num_tokens=$MAX_NUM_TOKENS \
+    --tp_size=$TP --ep_size=$EP_SIZE \
+    --extra_llm_api_options=$EXTRA_CONFIG_FILE \
+    > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+# Source benchmark utilities
+source "$(dirname "$0")/benchmark_lib.sh"
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend openai \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $(( $CONC * 10 )) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
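Note the MAX_BATCH_SIZE branch shared by all three scripts: with attention DP each rank batches its own share of the traffic, so the global concurrency is divided by TP; without it the server sees the full concurrency as one batch. Hand-computing the shell arithmetic for the DP-attention concurrencies these configs enable (all have CONC >= 64 at TP=8, so the integer division never floors to zero):

```bash
# DP_ATTENTION=true  -> per-rank batch: MAX_BATCH_SIZE = CONC / TP
# DP_ATTENTION=false -> global batch:   MAX_BATCH_SIZE = CONC
TP=8
for CONC in 64 128 256; do
    echo "CONC=$CONC TP=$TP -> per-rank MAX_BATCH_SIZE=$(( CONC / TP ))"
done
# CONC=64  TP=8 -> per-rank MAX_BATCH_SIZE=8
# CONC=128 TP=8 -> per-rank MAX_BATCH_SIZE=16
# CONC=256 TP=8 -> per-rank MAX_BATCH_SIZE=32
```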
diff --git a/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
new file mode 100644
index 000000000..ee252098b
--- /dev/null
+++ b/benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+
+# === Required Env Vars ===
+# MODEL
+# TP
+# CONC
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# RESULT_FILENAME
+# PORT_OFFSET
+# DP_ATTENTION
+# EP_SIZE
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
+
+hf download $MODEL
+
+# ========= MOE_BACKEND is fixed to CUTLASS; MTP depends on DP_ATTENTION =========
+MOE_BACKEND="CUTLASS"
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MTP=1
+else
+    MTP=3
+fi
+
+echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'"
+
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + $PORT_OFFSET ))
+EXTRA_CONFIG_FILE="dsr1-fp8-mtp.yml"
+
+# If ISL=8192 and DP_ATTENTION=true, export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:8192
+if [[ "$ISL" == "8192" && "$DP_ATTENTION" == "true" ]]; then
+    export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:8192"
+fi
+
+cat > $EXTRA_CONFIG_FILE << EOF
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: 128
+enable_attention_dp: $DP_ATTENTION
+print_iter_log: true
+kv_cache_config:
+  dtype: fp8
+  free_gpu_memory_fraction: 0.75
+  enable_block_reuse: false
+stream_interval: 10
+moe_config:
+  backend: $MOE_BACKEND
+speculative_config:
+  decoding_type: MTP
+  num_nextn_predict_layers: ${MTP}
+EOF
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    cat << EOF >> $EXTRA_CONFIG_FILE
+attention_dp_config:
+  batching_wait_iters: 0
+  enable_balance: true
+  timeout_iters: 60
+EOF
+fi
+
+if [[ "$DP_ATTENTION" == "true" ]]; then
+    MAX_BATCH_SIZE=$((CONC/TP))
+else
+    MAX_BATCH_SIZE=$CONC
+fi
+
+MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 ))
+
+set -x
+# Launch TRT-LLM server
+PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \
+    trtllm-serve $MODEL --port=$PORT \
+    --trust_remote_code \
+    --backend=pytorch \
+    --max_batch_size=$MAX_BATCH_SIZE \
+    --max_seq_len=$MAX_MODEL_LEN \
+    --max_num_tokens=$MAX_NUM_TOKENS \
+    --tp_size=$TP --ep_size=$EP_SIZE \
+    --extra_llm_api_options=$EXTRA_CONFIG_FILE \
+    > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+# Source benchmark utilities
+source "$(dirname "$0")/benchmark_lib.sh"
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend openai \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $(( $CONC * 10 )) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
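The three scripts differ mainly in the heredoc-generated extra-options file. For reference, this is the dsr1-fp8-mtp.yml the H200 script above would emit in the DP-attention case (MOE_BACKEND=CUTLASS, MTP=1, with the attention_dp_config block appended), expanded by hand from the two heredocs; the preview path is arbitrary and the real file is generated at runtime:

```bash
cat << 'EOF' > /tmp/dsr1-fp8-mtp-preview.yml
cuda_graph_config:
  enable_padding: true
  max_batch_size: 128
enable_attention_dp: true
print_iter_log: true
kv_cache_config:
  dtype: fp8
  free_gpu_memory_fraction: 0.75
  enable_block_reuse: false
stream_interval: 10
moe_config:
  backend: CUTLASS
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 1
attention_dp_config:
  batching_wait_iters: 0
  enable_balance: true
  timeout_iters: 60
EOF
```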
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 21f2e8655..0a87e23c4 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -162,3 +162,12 @@
   description:
     - "Updating MI355x Deepseek-R1 FP4 SGLang Image to upstream v0.5.7"
   pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/395
+
+- config-keys:
+    - dsr1-fp4-b200-trt-mtp
+    - dsr1-fp8-b200-trt-mtp
+    - dsr1-fp8-h200-trt-mtp
+  description:
+    - Add MTP (Multi-Token Prediction) support for single-node TRT configs
+    - Add spec-decoding field to config entries and update launch scripts to select MTP benchmark scripts
+  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/412
diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh
index 1cb5c3dd1..08ed1e455 100644
--- a/runners/launch_b200-nb.sh
+++ b/runners/launch_b200-nb.sh
@@ -3,6 +3,7 @@ HF_HUB_CACHE_MOUNT="/mnt/data/hf-hub-cache-${USER: -1}/"
 PARTITION="main"
 
 FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 UCX_NET_DEVICES=eth0
 
@@ -19,4 +20,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \
     --container-writable \
     --container-workdir=/workspace/ \
     --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1},UCX_NET_DEVICES=$UCX_NET_DEVICES \
-bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh
\ No newline at end of file
+bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh
\ No newline at end of file
diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh
index 28286e2be..1305c6848 100644
--- a/runners/launch_b200-nv.sh
+++ b/runners/launch_b200-nv.sh
@@ -5,6 +5,7 @@ export PORT_OFFSET=0 # Doesn't matter when --exclusive
 
 MODEL_CODE="${EXP_NAME%%_*}"
 FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 PARTITION="dgx-b200"
 SQUASH_FILE="/raid/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
@@ -22,6 +23,6 @@ srun --jobid=$JOB_ID \
     --container-writable \
     --container-workdir=/workspace/ \
     --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh
+bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh
 
 scancel $JOB_ID
diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh
index 0b6740d7b..f72b8bb0d 100644
--- a/runners/launch_h200-cw.sh
+++ b/runners/launch_h200-cw.sh
@@ -5,6 +5,7 @@ export PORT_OFFSET=${USER: -1}
 
 MODEL_CODE="${EXP_NAME%%_*}"
 FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 PARTITION="h200"
 SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
@@ -31,7 +32,7 @@ srun --jobid=$JOB_ID \
     --container-mount-home \
     --container-workdir=/workspace/ \
     --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh
+bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh
 
 rmdir $SAGEMAKER_SHM_PATH
 scancel $JOB_ID
diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh
index 15b6fa6c5..703bcf231 100644
--- a/runners/launch_h200-nb.sh
+++ b/runners/launch_h200-nb.sh
@@ -5,6 +5,7 @@ export PORT_OFFSET=${USER: -1}
 
 MODEL_CODE="${EXP_NAME%%_*}"
 FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 PARTITION="main"
 SQUASH_FILE="/home/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
@@ -29,6 +30,6 @@ srun --jobid=$JOB_ID \
     --container-mount-home \
     --container-workdir=/workspace/ \
     --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh
+bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh
 
 scancel $JOB_ID
diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh
index 3282be1a8..0434f880b 100644
--- a/runners/launch_h200-nv.sh
+++ b/runners/launch_h200-nv.sh
@@ -5,6 +5,7 @@ export PORT_OFFSET=0 # Doesn't matter when --exclusive
 
 MODEL_CODE="${EXP_NAME%%_*}"
 FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 PARTITION="dgx-h200"
 SQUASH_FILE="/raid/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
@@ -22,6 +23,6 @@ srun --jobid=$JOB_ID \
     --container-mount-home \
     --container-workdir=/workspace/ \
     --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh
+bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh
 
 scancel $JOB_ID
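All five launchers now compose the benchmark script path from the same pieces of run metadata, so the MTP configs reuse the existing dispatch with one extra suffix. Tracing the substitutions for one of the new configs (the EXP_NAME value here is illustrative; the launchers only use the part before the first underscore via ${EXP_NAME%%_*}):

```bash
EXP_NAME="dsr1_fp8_1024_1024"   # hypothetical experiment name; only the "dsr1" prefix matters
PRECISION="fp8"
FRAMEWORK="trt"
SPEC_DECODING="mtp"

MODEL_CODE="${EXP_NAME%%_*}"                                                   # dsr1
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')

echo "benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}_slurm.sh"
# benchmarks/dsr1_fp8_h200_trt_mtp_slurm.sh  <- the new script added in this PR
```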