diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 8ab211cb5..1f7f527ef 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1977,7 +1977,7 @@ dsr1-fp8-h200-dynamo-trt: dsr1-fp8-h100-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3 model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: DeepSeek-R1-0528 + model-prefix: dsr1 runner: h100-multinode-slurm precision: fp8 framework: dynamo-trt @@ -2314,21 +2314,23 @@ dsr1-fp8-h100-dynamo-trt: tp: 16 ep: 16 dp-attn: true - - spec-decoding: "mtp" - conc-list: [78] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: false + # commenting out cuz it persistently causes problems + # https://github.com/InferenceMAX/InferenceMAX/actions/runs/21769314582/job/62813105509 + # - spec-decoding: "mtp" + # conc-list: [78] + # prefill: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # additional-settings: + # # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml + # - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml" + # decode: + # num-worker: 2 + # tp: 16 + # ep: 16 + # dp-attn: false - spec-decoding: "mtp" conc-list: [154] prefill: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index eaae0dafc..05d016aa7 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -459,3 +459,9 @@ description: - "New B300 FP8 Dynamo TRT configurations" pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/638 +- config-keys: + - dsr1-fp8-h100-dynamo-trt + description: + - "Add 
DeepSeek R1 FP8 H100 Dynamo TRT-LLM disaggregated multinode configurations" - "fix model_prefix bug from https://github.com/InferenceMAX/InferenceMAX/pull/651" - pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/663 diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index fb0ae69f7..d461e4a94 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -37,9 +37,10 @@ CONTAINER_KEY=$(echo "$IMAGE" | sed 's|nvcr.io/|nvcr.io#|') # Map container image to local squash file SQUASH_FILE="/mnt/nfs/sa-shared/containers/$(echo "$IMAGE" | sed 's|nvcr.io/||' | sed 's/[\/:@#]/+/g').sqsh" -if [[ $MODEL_PREFIX == "DeepSeek-R1-0528" ]]; then +if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/mnt/numa1/shared/models/dsr1-fp8" export SERVED_MODEL_NAME="DeepSeek-R1-0528" + export SRT_SLURM_MODEL_PREFIX="DeepSeek-R1-0528" else echo "Unsupported model prefix/precision: $MODEL_PREFIX/$PRECISION. Supported combinations are: dsr1 (fp8)" exit 1 @@ -64,7 +65,7 @@ network_interface: "" srtctl_root: "${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" # Model path aliases model_paths: - "${MODEL_PREFIX}": "${MODEL_PATH}" + "${SRT_SLURM_MODEL_PREFIX}": "${MODEL_PATH}" containers: latest: "${SQUASH_FILE}" "${CONTAINER_KEY}": "${SQUASH_FILE}"