From 0383696f7c6173378dee1ab115b4151f51f47ccf Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 16 Mar 2026 08:36:19 +0000 Subject: [PATCH 01/55] [AMD] add dsr1 mxfp4 v2 sweep points --- .github/configs/amd-master.yaml | 56 +++++++++++++++++++++ benchmarks/multi_node/amd_utils/models.yaml | 31 ++++++++++++ 2 files changed, 87 insertions(+) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 5551860f2..61c842f58 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1446,6 +1446,62 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" +dsr1-fp4-mi355x-sglang-disagg-mtp-v2: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # 1P1D pure TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + + + # FIXME(billishyahao): disable FP4 1k8k for now # - isl: 1024 diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 2bbdd91d6..4c6611571 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -222,3 +222,34 @@ DeepSeek-R1-0528-MXFP4: max_running_requests: 128 chunked_prefill_size: 262144 cuda_graph_bs_range: "1-128" + +DeepSeek-R1-0528-MXFP4-v2: + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + dp: + max_running_requests: 24 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE" + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 16384 + cuda_graph_bs_range: "1-128" + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 4096 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" + cuda_graph_bs_range: "1-160" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" From 18e05b1cbb097497a63800291b6015e8cd37e250 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Tue, 17 Mar 2026 06:36:04 +0000 Subject: [PATCH 02/55] fix --- .github/configs/amd-master.yaml | 3 --- benchmarks/multi_node/amd_utils/models.yaml | 14 +++++++------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 61c842f58..f20ed38fd 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1499,9 +1499,6 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-v2: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=3" - - - # FIXME(billishyahao): disable FP4 1k8k for now # - isl: 1024 diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 4c6611571..07668659d 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -38,7 +38,7 @@ # cuda_graph_bs_range: str DeepSeek-V3: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -69,7 +69,7 @@ DeepSeek-V3: cuda_graph_bs_range: "1-128" DeepSeek-V3-0324: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -100,7 +100,7 @@ DeepSeek-V3-0324: cuda_graph_bs_range: "1-128" DeepSeek-R1: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -131,7 +131,7 @@ DeepSeek-R1: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -162,7 +162,7 @@ DeepSeek-R1-0528: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-Preview: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -193,7 +193,7 @@ DeepSeek-R1-0528-MXFP4-Preview: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -224,7 +224,7 @@ DeepSeek-R1-0528-MXFP4: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-v2: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: From 32b5d3d00cce991eb9e7a3b298c69ee7b9cf28cd Mon Sep 17 00:00:00 2001 From: Zhai Feiyue Date: Tue, 24 Mar 2026 14:59:35 +0000 Subject: [PATCH 03/55] Fix tokenizer mismatch between benchmark client and sglang server on transformers v5 Transformers v5 incorrectly rebuilds pre_tokenizer/decoder components for models like DeepSeek-R1 that use LlamaTokenizerFast with a non-Llama tokenizer architecture. The sglang server fixes this at startup, but the benchmark client loads the tokenizer without these fixes, causing a ~5x token count inflation (e.g. 7000 tokens -> 35000 tokens) and false performance regressions in TTFT and throughput benchmarks. Apply the same tokenizer fixes (pre_tokenizer/decoder restoration and add_bos_token recovery) that sglang server applies, so client and server tokenize identically. No-op on transformers v4. Made-with: Cursor --- utils/bench_serving/backend_request_func.py | 72 ++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index 32331a398..4990ef5fa 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -439,6 +439,75 @@ def get_model(pretrained_model_name_or_path: str) -> str: return pretrained_model_name_or_path +def _fix_tokenizer_for_sglang(tokenizer, model_path): + """Fix transformers v5 tokenizer to match sglang server-side behavior. + + Root cause: transformers v5 (>= 5.0) changed how tokenizers are loaded. + Specifically, LlamaTokenizerFast.__init__ in v5 rebuilds the pre_tokenizer + and decoder from scratch using class-specific components, discarding the + originals from tokenizer.json. For models like DeepSeek-R1 that declare + LlamaTokenizerFast but actually use a ByteLevel/Sequence tokenizer + architecture, v5 incorrectly replaces the original Sequence pre_tokenizer + with Metaspace, and the original ByteLevel decoder with Sequence. + + The sglang server applies fixes for this in hf_transformers_utils.py + (_fix_v5_tokenizer_components and _fix_v5_add_bos_eos_token), but the + benchmark client loads the tokenizer directly via AutoTokenizer without + these fixes. This mismatch causes the client to encode text differently + from the server -- e.g. a 7000-token prompt on the client becomes ~35000 + tokens on the server, leading to ~5x TTFT inflation and false performance + regressions in benchmarks. + + This function replicates the same fixes so the benchmark client tokenizes + identically to the sglang server. It is a no-op on transformers v4. + """ + import json + from pathlib import Path + + backend = getattr(tokenizer, "_tokenizer", None) + if backend is not None: + try: + from tokenizers import Tokenizer as RawTokenizer + tok_file = Path(model_path) / "tokenizer.json" + if tok_file.is_file(): + raw = RawTokenizer.from_file(str(tok_file)) + raw_pre = type(raw.pre_tokenizer).__name__ if raw.pre_tokenizer else None + loaded_pre = type(backend.pre_tokenizer).__name__ if backend.pre_tokenizer else None + if raw_pre and loaded_pre and raw_pre != loaded_pre: + backend.pre_tokenizer = raw.pre_tokenizer + backend.decoder = raw.decoder + except Exception: + pass + + try: + config_file = Path(model_path) / "tokenizer_config.json" + if config_file.is_file(): + with open(config_file) as f: + config = json.load(f) + tok_class = config.get("tokenizer_class", "") + bos_eos_classes = { + "LlamaTokenizer", "LlamaTokenizerFast", + "CodeLlamaTokenizer", "CodeLlamaTokenizerFast", + "GemmaTokenizer", "GemmaTokenizerFast", "CohereTokenizerFast", + } + if tok_class in bos_eos_classes: + defaults = {"add_bos_token": True, "add_eos_token": False} + changed = False + for attr in ("add_bos_token", "add_eos_token"): + val = config.get(attr) + if val is None: + val = defaults.get(attr, False) + if getattr(tokenizer, attr, None) != val: + setattr(tokenizer, f"_{attr}", val) + changed = True + if changed and hasattr(tokenizer, "update_post_processor"): + tokenizer.update_post_processor() + except Exception: + pass + + return tokenizer + + def get_tokenizer( pretrained_model_name_or_path: str, tokenizer_mode: str = "auto", @@ -464,11 +533,12 @@ def get_tokenizer( return MistralTokenizer.from_pretrained( str(pretrained_model_name_or_path)) else: - return AutoTokenizer.from_pretrained( + tokenizer = AutoTokenizer.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs, ) + return _fix_tokenizer_for_sglang(tokenizer, pretrained_model_name_or_path) ASYNC_REQUEST_FUNCS = { From 0bd347fe71d6689269c81569b797985618ffad7f Mon Sep 17 00:00:00 2001 From: billishyahao Date: Wed, 25 Mar 2026 15:28:03 +0000 Subject: [PATCH 04/55] change mtp model to fp8 --- .github/configs/amd-master.yaml | 369 +++++++++++++++++++- benchmarks/multi_node/amd_utils/models.yaml | 2 +- 2 files changed, 369 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index f20ed38fd..525595b7b 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1446,8 +1446,218 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" + +dsr1-fp4-mi355x-sglang-disagg-v2: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D TP8 + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP4 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1*DEP4+ 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D pure TP8 + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP4 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 4*DEP4 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + dsr1-fp4-mi355x-sglang-disagg-mtp-v2: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1456,6 +1666,106 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-v2: multinode: true disagg: true seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # 1P1D TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1*DEP4+ 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + - isl: 8192 osl: 1024 search-space: @@ -1499,6 +1809,63 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-v2: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=3" + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 4*DEP4 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + # FIXME(billishyahao): disable FP4 1k8k for now # - isl: 1024 diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 07668659d..6bca6b52a 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -225,7 +225,7 @@ DeepSeek-R1-0528-MXFP4: DeepSeek-R1-0528-MXFP4-v2: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" - mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 From 754e53c00fd834dcc6093c8b164966b8019b0605 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Wed, 25 Mar 2026 15:32:42 +0000 Subject: [PATCH 05/55] change fp8 image --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 525595b7b..2cea84d01 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -596,7 +596,7 @@ dsr1-fp8-mi355x-atom-mtp: - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } dsr1-fp8-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg @@ -751,7 +751,7 @@ dsr1-fp8-mi355x-sglang-disagg: dsr1-fp8-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg From f29f2d01ea990161dfc6bc79401a30a30bda9502 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 27 Mar 2026 11:20:18 +0000 Subject: [PATCH 06/55] bump image to 0327 --- .github/configs/amd-master.yaml | 4 ++-- benchmarks/multi_node/amd_utils/env.sh | 4 +++- benchmarks/multi_node/amd_utils/models.yaml | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 2cea84d01..a0112d479 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1448,7 +1448,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: dsr1-fp4-mi355x-sglang-disagg-v2: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1657,7 +1657,7 @@ dsr1-fp4-mi355x-sglang-disagg-v2: dsr1-fp4-mi355x-sglang-disagg-mtp-v2: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 5565c5b3b..f4b631673 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -34,7 +34,6 @@ export IBDEVICES export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) -set +x export NCCL_IB_HCA=$IBDEVICES @@ -123,4 +122,7 @@ fi # FIXME: WA for latest upstream 0305 image export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} +export SGLANG_ENABLE_SPEC_V2=1 +export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 +set +x diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 6bca6b52a..eed59bdab 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -225,7 +225,7 @@ DeepSeek-R1-0528-MXFP4: DeepSeek-R1-0528-MXFP4-v2: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" - mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1" + mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode " dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 From a44c7eb8759e8527d8af192e5aa2ffc7f7e65fb0 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 27 Mar 2026 14:09:11 +0000 Subject: [PATCH 07/55] remove specv2 --- benchmarks/multi_node/amd_utils/env.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index f4b631673..02cb77a91 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -122,7 +122,5 @@ fi # FIXME: WA for latest upstream 0305 image export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} -export SGLANG_ENABLE_SPEC_V2=1 -export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 set +x From 25141364c930fc59e455aeca97eeeebd81e750fa Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 30 Mar 2026 01:57:19 +0000 Subject: [PATCH 08/55] consolidate dsr1 fp4 configs --- .github/configs/amd-master.yaml | 422 +------------------------------- 1 file changed, 1 insertion(+), 421 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a0112d479..6a96a4af2 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1027,427 +1027,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: # - "DECODE_NODES=2" # - "DECODE_MTP_SIZE=0" - dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 - model: amd/DeepSeek-R1-0528-MXFP4 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # non-MTP configurations - # 1P1D TP8 - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP4 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1*DEP4+ 1*DEP8 - - spec-decoding: "none" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # 1P1D pure TP8 - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP4 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 4*DEP4 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - -dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 - model: amd/DeepSeek-R1-0528-MXFP4 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - # 1P1D TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1P2D TP4 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1*DEP4+ 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # 1P1D pure TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1P2D TP4 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 4*DEP4 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - -dsr1-fp4-mi355x-sglang-disagg-v2: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 @@ -1656,7 +1236,7 @@ dsr1-fp4-mi355x-sglang-disagg-v2: - "DECODE_MTP_SIZE=0" -dsr1-fp4-mi355x-sglang-disagg-mtp-v2: +dsr1-fp4-mi355x-sglang-disagg-mtp: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 From 682a4ab4ec3d42c73cd5c54b9aede2ba1fc33a54 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 30 Mar 2026 02:03:58 +0000 Subject: [PATCH 09/55] bump fp8 image to 0327 --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 27518d40b..a139ca560 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -486,7 +486,7 @@ dsr1-fp8-mi355x-atom-mtp: - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } dsr1-fp8-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg @@ -641,7 +641,7 @@ dsr1-fp8-mi355x-sglang-disagg: dsr1-fp8-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg From 64bf10078c4e4f9f19486dc0f6727dc6ef1902d2 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 30 Mar 2026 04:36:27 +0000 Subject: [PATCH 10/55] fix crash --- benchmarks/multi_node/amd_utils/server.sh | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 7f174b760..7340ef51c 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -187,18 +187,8 @@ else decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP fi -# Use Decode configuration to configure different TP/DP size between P and D -PREFILL_DECODE_DIFFERENT_TP="" -if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then - if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}" - else - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1" - fi -fi - # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}" +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi From c44e1755ea6cc81f5e6f59b071ed20ddb7abefe4 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 30 Mar 2026 15:26:57 +0000 Subject: [PATCH 11/55] fix env --- benchmarks/multi_node/amd_utils/env.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 02cb77a91..88ea2ac84 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -88,17 +88,21 @@ $1 == "DSCP" && $2 == ":" && $NF == p { if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then TC=$(( 4 * ND_DSCP )) export MORI_RDMA_SL=$ND_PRIO + export MORI_IO_SL=$ND_PRIO export MORI_RDMA_TC=$TC - echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL" + export MORI_IO_TC=$TC + echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL" else echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." # Fall back to hostname-based detection NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 + export MORI_IO_TC=96 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 + export MORI_IO_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." @@ -109,9 +113,11 @@ else NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 + export MORI_IO_TC=96 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 + export MORI_IO_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." From 0a41f8980559717d1e2544ac013048dbb85b8c94 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Tue, 31 Mar 2026 06:50:28 +0000 Subject: [PATCH 12/55] cleanup --- .github/configs/amd-master.yaml | 123 -------------------------------- 1 file changed, 123 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a139ca560..14eec1583 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -794,129 +794,6 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" - # FIXME(billishyahao): disable 1k8k for now - # - isl: 1024 - # osl: 8192 - # search-space: - # # MTP configurations - # # "Top of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - # - spec-decoding: "mtp" - # conc-list: [ 2048 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 1 - # tp: 1 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - - # # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8) - # - spec-decoding: "mtp" - # conc-list: [ 256, 512, 1024 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "mtp" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - # # non-MTP configurations - # # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) - # - spec-decoding: "none" - # conc-list: [ 2048 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 1 - # tp: 1 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - - # # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - # - spec-decoding: "none" - # conc-list: [ 256, 512, 1024 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "none" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - dsr1-fp4-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 model: amd/DeepSeek-R1-0528-MXFP4-v2 From 7282748ed6da9d902f737b8843f0599f01546d26 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Tue, 31 Mar 2026 06:54:30 +0000 Subject: [PATCH 13/55] add perf change log --- perf-changelog.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3dbc5eccc..1cd22211a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1213,3 +1213,13 @@ - "Uses nvidia/GLM-5-NVFP4 model with modelopt_fp4 quantization" - "Image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/973 + +- config-keys: + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp + description: + - "Bump SGL mori image to March 27" + - "Add more low latency sweep configs" + - "Enable v2 mxfp4 DSR1 0528 model" + - "Enable fp4 disp feature on mori" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/983 From e6d4b3255d079f7de7ad13367120d521cb5d02a7 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Tue, 31 Mar 2026 08:14:42 +0000 Subject: [PATCH 14/55] add deprecate comments --- benchmarks/multi_node/amd_utils/env.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 88ea2ac84..0aa2d0c20 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -63,6 +63,8 @@ export MORI_MAX_DISPATCH_TOKENS_DECODE=160 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) export MORI_EP_LAUNCH_CONFIG_MODE=AUTO + +#TODO(billishyahao): The following IO env will be deprecated soon. export MORI_IO_QP_MAX_SEND_WR=16384 export MORI_IO_QP_MAX_CQE=32768 export MORI_IO_QP_MAX_SGE=4 From b7dd65f146b3aeea9d0592a0164d003312ece3c1 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Wed, 1 Apr 2026 13:21:09 +0000 Subject: [PATCH 15/55] add spec v2 env --- .github/configs/amd-master.yaml | 4 ++-- benchmarks/multi_node/amd_utils/env.sh | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 14eec1583..14577525c 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-2 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg: dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-2 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 0aa2d0c20..d0fa8aa9d 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -52,6 +52,10 @@ fi export SGLANG_MORI_FP4_DISP=False export SGLANG_MORI_FP8_COMB=False +# Enable spec v2 +export SGLANG_ENABLE_SPEC_V2=1 +export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 + # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 if [[ "$MODEL_NAME" == *mxfp4* ]]; then From 12a4ba0ab618385daf26355aa9bfa28cd9432a4f Mon Sep 17 00:00:00 2001 From: billishyahao Date: Thu, 2 Apr 2026 15:04:45 +0000 Subject: [PATCH 16/55] bump the docker image --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 14577525c..18131ee9f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-2 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg: dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-2 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg From 597a458e352859d036c8bef4c0f37145736fe58b Mon Sep 17 00:00:00 2001 From: billishyahao Date: Thu, 9 Apr 2026 08:55:37 +0000 Subject: [PATCH 17/55] add stream control to eliminate cpu overhead --- .github/configs/amd-master.yaml | 113 ++++++++++++++++++++++ benchmarks/multi_node/amd_utils/server.sh | 4 +- 2 files changed, 115 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 18131ee9f..f5bc7390f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1213,3 +1213,116 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" +dsr1-fp4-mi355x-sglang-disagg-exp: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # non-MTP configurations + # 1*DEP4+ 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 4*DEP4 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + +dsr1-fp4-mi355x-sglang-disagg-mtp-exp: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # 1*DEP4+ 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # 4*DEP4 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 7340ef51c..e27e036f9 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -188,12 +188,12 @@ else fi # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} --tokenizer-worker-num 32 " if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi -DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}" +DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 32 --stream-interval 2" if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" fi From f715e47ba79972dfd5035ac0ba6ded0adb4e9452 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 10 Apr 2026 15:01:07 +0000 Subject: [PATCH 18/55] tune the config --- .github/configs/amd-master.yaml | 4 ++-- benchmarks/multi_node/amd_utils/env.sh | 8 +++++++- benchmarks/multi_node/amd_utils/models.yaml | 4 ++-- benchmarks/multi_node/amd_utils/server.sh | 4 ++-- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index f5bc7390f..e4bc8178d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1252,7 +1252,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp: # non-MTP configurations # 4*DEP4 + 1*DEP8 - spec-decoding: "none" - conc-list: [ 1024, 2048, 4096 ] + conc-list: [ 512, 1024, 2048 ] prefill: num-worker: 4 tp: 4 @@ -1310,7 +1310,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: # MTP configurations # 4*DEP4 + 1*DEP8 - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] + conc-list: [ 512, 1024, 2048 ] prefill: num-worker: 4 tp: 4 diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index d0fa8aa9d..ee9cd0087 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -56,12 +56,17 @@ export SGLANG_MORI_FP8_COMB=False export SGLANG_ENABLE_SPEC_V2=1 export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 +export SGLANG_LOG_MS=true +export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 +export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600 +export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600 + # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 if [[ "$MODEL_NAME" == *mxfp4* ]]; then export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 fi -export MORI_MAX_DISPATCH_TOKENS_DECODE=160 +export MORI_MAX_DISPATCH_TOKENS_DECODE=512 # set MTP size=1 when EP16 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) @@ -73,6 +78,7 @@ export MORI_IO_QP_MAX_SEND_WR=16384 export MORI_IO_QP_MAX_CQE=32768 export MORI_IO_QP_MAX_SGE=4 + export MORI_APP_LOG_LEVEL=INFO # Router logging control: diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index eed59bdab..3e8af0266 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -231,7 +231,7 @@ DeepSeek-R1-0528-MXFP4-v2: mem_fraction_static: 0.8 disable_radix_cache: true dp: - max_running_requests: 24 + max_running_requests: 32 chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE" cuda_graph_bs: "1 2 3" no_dp: @@ -244,7 +244,7 @@ DeepSeek-R1-0528-MXFP4-v2: dp: max_running_requests: 4096 chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" - cuda_graph_bs_range: "1-160" + cuda_graph_bs_range: "1-512" ep_only: max_running_requests: 256 chunked_prefill_size: 262144 diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index e27e036f9..141dc0d7d 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -188,12 +188,12 @@ else fi # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} --tokenizer-worker-num 32 " +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} --tokenizer-worker-num 8 " if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi -DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 32 --stream-interval 2" +DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 8 --stream-interval 3" if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" fi From 2ea82d5a9f18d88f059bc1c30a606d8028effb48 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sat, 11 Apr 2026 01:28:36 +0000 Subject: [PATCH 19/55] bump image --- .github/configs/amd-master.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index e4bc8178d..924975932 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -985,7 +985,7 @@ dsr1-fp4-mi355x-sglang-disagg: # 4*DEP4 + 1*DEP8 - spec-decoding: "none" - conc-list: [ 1024, 2048, 4096 ] + conc-list: [ 512, 1024, 2048 ] prefill: num-worker: 4 tp: 4 @@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg: dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1196,7 +1196,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: # 4*DEP4 + 1*DEP8 - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] + conc-list: [ 512, 1024, 2048 ] prefill: num-worker: 4 tp: 4 From 16384e7f11bc253d993c619c9cc27cecc5ef61c0 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sat, 11 Apr 2026 16:15:29 +0000 Subject: [PATCH 20/55] tune config --- .github/configs/amd-master.yaml | 155 +++++++++++++++++++--- benchmarks/multi_node/amd_utils/server.sh | 6 +- 2 files changed, 140 insertions(+), 21 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 924975932..064312b7d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1214,7 +1214,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=1" dsr1-fp4-mi355x-sglang-disagg-exp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1227,36 +1227,94 @@ dsr1-fp4-mi355x-sglang-disagg-exp: osl: 1024 search-space: # non-MTP configurations - # 1*DEP4+ 1*DEP8 + # 1*DEP8+ 2*DEP8 - spec-decoding: "none" - conc-list: [ 1024, 2048 ] + conc-list: [ 512, 1024, 2048] prefill: num-worker: 1 - tp: 4 - ep: 4 + tp: 8 + ep: 8 dp-attn: true additional-settings: - "PREFILL_NODES=1" decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1*DEP8+ 3*DEP8 + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048] + prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "DECODE_NODES=1" + - "PREFILL_NODES=1" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=3" - "DECODE_MTP_SIZE=0" + - isl: 8192 osl: 1024 search-space: # non-MTP configurations - # 4*DEP4 + 1*DEP8 + # 2*DEP8 + 1*DEP8 - spec-decoding: "none" - conc-list: [ 512, 1024, 2048 ] + conc-list: [ 512, 1024, 2048] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 3*DEP8 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=3" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 4*DEP8 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048] prefill: num-worker: 4 - tp: 4 - ep: 4 + tp: 8 + ep: 8 dp-attn: true additional-settings: - "PREFILL_NODES=4" @@ -1271,7 +1329,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp: dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1284,23 +1342,42 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: osl: 1024 search-space: # MTP configurations - # 1*DEP4+ 1*DEP8 + # 1*DEP8+ 2*DEP8 - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] + conc-list: [ 512, 1024, 2048] prefill: num-worker: 1 - tp: 4 - ep: 4 + tp: 8 + ep: 8 dp-attn: true additional-settings: - "PREFILL_NODES=1" decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1*DEP8+ 3*DEP8 + - spec-decoding: "mtp" + conc-list: [ 512, 1024, 2048] + prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "DECODE_NODES=1" + - "PREFILL_NODES=1" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=3" - "DECODE_MTP_SIZE=1" @@ -1308,13 +1385,51 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: osl: 1024 search-space: # MTP configurations - # 4*DEP4 + 1*DEP8 + # 2*DEP8 + 1*DEP8 - spec-decoding: "mtp" - conc-list: [ 512, 1024, 2048 ] + conc-list: [ 512, 1024, 2048] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + # 3*DEP8 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 512, 1024, 2048] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=3" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + # 4*DEP8 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048] prefill: num-worker: 4 - tp: 4 - ep: 4 + tp: 8 + ep: 8 dp-attn: true additional-settings: - "PREFILL_NODES=4" diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 141dc0d7d..e8d1f09f9 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -193,7 +193,11 @@ if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_ PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi -DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 8 --stream-interval 3" +DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 8" +if [[ "$DECODE_ENABLE_DP" == "true" ]]; then + DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --stream-interval 3" +fi + if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" fi From 4d733e783d2a4dc91b0f506058982c6f2462b90a Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 13 Apr 2026 05:34:57 +0000 Subject: [PATCH 21/55] add new exp config --- .github/configs/amd-master.yaml | 71 ++++++++++++++++++++++- benchmarks/multi_node/amd_utils/server.sh | 4 +- 2 files changed, 71 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 064312b7d..f76176406 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1213,7 +1213,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" -dsr1-fp4-mi355x-sglang-disagg-exp: +dsr1-fp4-mi355x-sglang-disagg-exp-april12: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 @@ -1328,7 +1328,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp: - "DECODE_MTP_SIZE=0" -dsr1-fp4-mi355x-sglang-disagg-mtp-exp: +dsr1-fp4-mi355x-sglang-disagg-mtp-exp-april12: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 @@ -1433,6 +1433,73 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: dp-attn: true additional-settings: - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + +dsr1-fp4-mi355x-sglang-disagg-exp: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 4*DEP4 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + +dsr1-fp4-mi355x-sglang-disagg-mtp-exp: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # 4*DEP4 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 512, 1024, 2048 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" decode: num-worker: 1 tp: 8 diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index e8d1f09f9..025b943cc 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -188,12 +188,12 @@ else fi # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} --tokenizer-worker-num 8 " +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} --tokenizer-worker-num 32 " if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi -DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 8" +DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 32" if [[ "$DECODE_ENABLE_DP" == "true" ]]; then DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --stream-interval 3" fi From 83af74381c01a3282b99415391bab27390a966d8 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 13 Apr 2026 08:27:15 +0000 Subject: [PATCH 22/55] enable log level info --- .github/configs/amd-master.yaml | 107 ++++++++++++++++++++ benchmarks/multi_node/amd_utils/models.yaml | 14 +-- benchmarks/multi_node/amd_utils/server.sh | 9 +- 3 files changed, 117 insertions(+), 13 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index f76176406..401f2ed3e 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1442,6 +1442,75 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp-april12: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" +dsr1-fp4-mi355x-sglang-disagg-exp-april13: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 4*DEP4 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + +dsr1-fp4-mi355x-sglang-disagg-mtp-exp-april13: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # 4*DEP4 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 512, 1024, 2048 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + + dsr1-fp4-mi355x-sglang-disagg-exp: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 model: amd/DeepSeek-R1-0528-MXFP4-v2 @@ -1475,6 +1544,25 @@ dsr1-fp4-mi355x-sglang-disagg-exp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" + # 2*DEP8 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + dsr1-fp4-mi355x-sglang-disagg-mtp-exp: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 @@ -1500,6 +1588,25 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: dp-attn: true additional-settings: - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + # 2*DEP8 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" decode: num-worker: 1 tp: 8 diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 3e8af0266..36c1ea707 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -38,7 +38,7 @@ # cuda_graph_bs_range: str DeepSeek-V3: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -69,7 +69,7 @@ DeepSeek-V3: cuda_graph_bs_range: "1-128" DeepSeek-V3-0324: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -100,7 +100,7 @@ DeepSeek-V3-0324: cuda_graph_bs_range: "1-128" DeepSeek-R1: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -131,7 +131,7 @@ DeepSeek-R1: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -162,7 +162,7 @@ DeepSeek-R1-0528: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-Preview: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -193,7 +193,7 @@ DeepSeek-R1-0528-MXFP4-Preview: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -224,7 +224,7 @@ DeepSeek-R1-0528-MXFP4: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-v2: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode " dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 025b943cc..9e3714c6f 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -368,8 +368,7 @@ if [ "$NODE_RANK" -eq 0 ]; then --host 0.0.0.0 \ --port 8000 \ --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} \ - --log-level-http warning" + ${PREFILL_SERVER_CONFIG} " if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0" @@ -498,8 +497,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then --host 0.0.0.0 \ --port 8000 \ --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} \ - --log-level-http warning" + ${PREFILL_SERVER_CONFIG} " if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER)) @@ -561,8 +559,7 @@ else --host 0.0.0.0 \ --port 8000 \ --trust-remote-code \ - ${DECODE_SERVER_CONFIG} \ - --log-level-http warning" + ${DECODE_SERVER_CONFIG} " if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then rank=$((RANK % DECODE_NODES_PER_WORKER)) From 0c3083e29d7f742ae09845aa39981fde029df954 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 13 Apr 2026 09:50:26 +0000 Subject: [PATCH 23/55] fix mori env --- benchmarks/multi_node/amd_utils/env.sh | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index ee9cd0087..c84af0055 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -38,19 +38,15 @@ export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head export NCCL_IB_HCA=$IBDEVICES export SGLANG_USE_AITER=1 -export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200 -export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200 -# Disable allocating memory in one pass -export MORI_SHMEM_MODE=ISOLATION -export SGLANG_MORI_FP8_DISP=True +export SGLANG_MORI_DISPATCH_DTYPE=auto +export SGLANG_MORI_FP8_COMB=true -if [[ "$MODEL_NAME" == *mxfp4* ]]; then -export SGLANG_MORI_FP8_DISP=False -fi +export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600 +export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600 -export SGLANG_MORI_FP4_DISP=False -export SGLANG_MORI_FP8_COMB=False +# Disable allocating memory in one pass +export MORI_SHMEM_MODE=ISOLATION # Enable spec v2 export SGLANG_ENABLE_SPEC_V2=1 @@ -58,12 +54,11 @@ export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 export SGLANG_LOG_MS=true export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 -export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600 -export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600 + # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 -if [[ "$MODEL_NAME" == *mxfp4* ]]; then +if [[ "$MODEL_NAME" == *mxfp4* || "$MODEL_NAME" == *MXFP4* ]]; then export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 fi export MORI_MAX_DISPATCH_TOKENS_DECODE=512 @@ -73,11 +68,6 @@ export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_T export MORI_EP_LAUNCH_CONFIG_MODE=AUTO -#TODO(billishyahao): The following IO env will be deprecated soon. -export MORI_IO_QP_MAX_SEND_WR=16384 -export MORI_IO_QP_MAX_CQE=32768 -export MORI_IO_QP_MAX_SGE=4 - export MORI_APP_LOG_LEVEL=INFO From 1c61622541b61707961ef478e031a66d680ef9c2 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 13 Apr 2026 12:36:08 +0000 Subject: [PATCH 24/55] bump image --- .github/configs/amd-master.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 401f2ed3e..239fc399e 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg: dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1512,7 +1512,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp-april13: dsr1-fp4-mi355x-sglang-disagg-exp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1565,7 +1565,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp: dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg From e2d2ac99cef73197204f55351eb45016505f4cdf Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 13 Apr 2026 12:57:07 +0000 Subject: [PATCH 25/55] fix log --- benchmarks/multi_node/amd_utils/models.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 36c1ea707..06ede97dd 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -38,7 +38,7 @@ # cuda_graph_bs_range: str DeepSeek-V3: - base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -69,7 +69,7 @@ DeepSeek-V3: cuda_graph_bs_range: "1-128" DeepSeek-V3-0324: - base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -100,7 +100,7 @@ DeepSeek-V3-0324: cuda_graph_bs_range: "1-128" DeepSeek-R1: - base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -131,7 +131,7 @@ DeepSeek-R1: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528: - base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -162,7 +162,7 @@ DeepSeek-R1-0528: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-Preview: - base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -193,7 +193,7 @@ DeepSeek-R1-0528-MXFP4-Preview: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4: - base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -224,7 +224,7 @@ DeepSeek-R1-0528-MXFP4: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-v2: - base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode " dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: From d2a7988e47c7ab71febe9f09a3860262234db662 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Tue, 14 Apr 2026 13:58:15 +0000 Subject: [PATCH 26/55] bump the image --- .github/configs/amd-master.yaml | 406 +------------------- benchmarks/multi_node/amd_utils/env.sh | 15 +- benchmarks/multi_node/amd_utils/models.yaml | 6 +- benchmarks/multi_node/amd_utils/server.sh | 11 +- 4 files changed, 20 insertions(+), 418 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 239fc399e..44c8df96d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg: dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1213,405 +1213,3 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" -dsr1-fp4-mi355x-sglang-disagg-exp-april12: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # non-MTP configurations - # 1*DEP8+ 2*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1*DEP8+ 3*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=3" - - "DECODE_MTP_SIZE=0" - - - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # 2*DEP8 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 3*DEP8 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=3" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 4*DEP8 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048] - prefill: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - -dsr1-fp4-mi355x-sglang-disagg-mtp-exp-april12: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - # 1*DEP8+ 2*DEP8 - - spec-decoding: "mtp" - conc-list: [ 512, 1024, 2048] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1*DEP8+ 3*DEP8 - - spec-decoding: "mtp" - conc-list: [ 512, 1024, 2048] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=3" - - "DECODE_MTP_SIZE=1" - - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # 2*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 512, 1024, 2048] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # 3*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 512, 1024, 2048] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=3" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # 4*DEP8 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048] - prefill: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - -dsr1-fp4-mi355x-sglang-disagg-exp-april13: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - seq-len-configs: - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # 4*DEP4 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - -dsr1-fp4-mi355x-sglang-disagg-mtp-exp-april13: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - seq-len-configs: - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # 4*DEP4 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 512, 1024, 2048 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - - -dsr1-fp4-mi355x-sglang-disagg-exp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - seq-len-configs: - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # 4*DEP4 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 2*DEP8 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - -dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - seq-len-configs: - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # 4*DEP4 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 512, 1024, 2048 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # 2*DEP8 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index c84af0055..a4fe1d72b 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -41,6 +41,12 @@ export SGLANG_USE_AITER=1 export SGLANG_MORI_DISPATCH_DTYPE=auto export SGLANG_MORI_FP8_COMB=true +export SGLANG_MORI_QP_PER_TRANSFER=2 +export SGLANG_MORI_NUM_WORKERS=2 + +export MORI_IO_QP_MAX_SEND_WR=16384 +export MORI_IO_QP_MAX_CQE=32768 +export MORI_IO_QP_MAX_SGE=4 export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600 export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600 @@ -57,10 +63,11 @@ export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) -export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 -if [[ "$MODEL_NAME" == *mxfp4* || "$MODEL_NAME" == *MXFP4* ]]; then - export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 -fi +# export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 +# if [[ "$MODEL_NAME" == *mxfp4* || "$MODEL_NAME" == *MXFP4* ]]; then +# export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 +# fi +export MORI_MAX_DISPATCH_TOKENS_PREFILL=2048 export MORI_MAX_DISPATCH_TOKENS_DECODE=512 # set MTP size=1 when EP16 diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 06ede97dd..f5faf5935 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -226,7 +226,7 @@ DeepSeek-R1-0528-MXFP4: DeepSeek-R1-0528-MXFP4-v2: base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode " - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 3 --tokenizer-worker-num 32 " prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -242,9 +242,9 @@ DeepSeek-R1-0528-MXFP4-v2: mem_fraction_static: 0.85 prefill_round_robin_balance: true dp: - max_running_requests: 4096 + max_running_requests: 2048 chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" - cuda_graph_bs_range: "1-512" + cuda_graph_bs_range: "1-256" ep_only: max_running_requests: 256 chunked_prefill_size: 262144 diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 9e3714c6f..3c224a872 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -188,15 +188,12 @@ else fi # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} --tokenizer-worker-num 32 " +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi -DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 32" -if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --stream-interval 3" -fi +DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} " if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" @@ -356,8 +353,8 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}" echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" - echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK: ${MORI_MAX_DISPATCH_TOKENS_PREFILL}" - echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}" + echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}" + echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} " echo "================================================" # start the head prefill server From b09ae6cb96952c1dd3c1b166b301152881cd14ce Mon Sep 17 00:00:00 2001 From: billishyahao Date: Tue, 14 Apr 2026 14:09:55 +0000 Subject: [PATCH 27/55] fix --- benchmarks/multi_node/amd_utils/models.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index f5faf5935..317352365 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -38,7 +38,7 @@ # cuda_graph_bs_range: str DeepSeek-V3: - base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -69,7 +69,7 @@ DeepSeek-V3: cuda_graph_bs_range: "1-128" DeepSeek-V3-0324: - base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -100,7 +100,7 @@ DeepSeek-V3-0324: cuda_graph_bs_range: "1-128" DeepSeek-R1: - base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -131,7 +131,7 @@ DeepSeek-R1: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528: - base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -162,7 +162,7 @@ DeepSeek-R1-0528: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-Preview: - base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -193,7 +193,7 @@ DeepSeek-R1-0528-MXFP4-Preview: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4: - base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -224,7 +224,7 @@ DeepSeek-R1-0528-MXFP4: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-v2: - base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode " dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 3 --tokenizer-worker-num 32 " prefill: From 2c3ee04fe5913dfd72eef3ec40740bc315682c7d Mon Sep 17 00:00:00 2001 From: billishyahao Date: Tue, 14 Apr 2026 14:27:19 +0000 Subject: [PATCH 28/55] fix --- benchmarks/multi_node/amd_utils/env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index a4fe1d72b..a944a3d00 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -68,7 +68,7 @@ export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 # export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 # fi export MORI_MAX_DISPATCH_TOKENS_PREFILL=2048 -export MORI_MAX_DISPATCH_TOKENS_DECODE=512 +export MORI_MAX_DISPATCH_TOKENS_DECODE=256 # set MTP size=1 when EP16 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) From 69102f7a4838303253cc705b3f4021a26f6ecc09 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Wed, 15 Apr 2026 15:28:42 +0000 Subject: [PATCH 29/55] fix --- .github/configs/amd-master.yaml | 4 ++-- benchmarks/multi_node/amd_utils/server.sh | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 44c8df96d..ece23090d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0415 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg: dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0415 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 3c224a872..18b0bc7ea 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -355,6 +355,8 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}" echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} " + echo "Decode env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${DECODE_CUDA_GRAPH_BS_DP_END} " + echo "================================================" # start the head prefill server @@ -549,7 +551,7 @@ else echo "Decode node rank: $RANK" echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" - DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + DECODE_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${DECODE_CUDA_GRAPH_BS_DP_END} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ --model-path ${MODEL_DIR}/${MODEL_NAME} \ --disaggregation-mode decode \ --disaggregation-ib-device ${IBDEVICES} \ From 668068c8c8669de8e99466af2bec944f9d89355a Mon Sep 17 00:00:00 2001 From: billishyahao Date: Thu, 16 Apr 2026 05:22:08 +0000 Subject: [PATCH 30/55] fix --- benchmarks/multi_node/amd_utils/env.sh | 1 + benchmarks/multi_node/amd_utils/server.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index a944a3d00..5071ec62d 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -69,6 +69,7 @@ export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 # fi export MORI_MAX_DISPATCH_TOKENS_PREFILL=2048 export MORI_MAX_DISPATCH_TOKENS_DECODE=256 +export SGLANG_MORI_MOE_MAX_INPUT_TOKENS=2048 # set MTP size=1 when EP16 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 18b0bc7ea..18518f4d2 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -551,7 +551,7 @@ else echo "Decode node rank: $RANK" echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" - DECODE_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${DECODE_CUDA_GRAPH_BS_DP_END} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ --model-path ${MODEL_DIR}/${MODEL_NAME} \ --disaggregation-mode decode \ --disaggregation-ib-device ${IBDEVICES} \ From 776fd425871777ad4cfb180896aa88ddb290e0d1 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Thu, 16 Apr 2026 15:28:49 +0000 Subject: [PATCH 31/55] bump image to 0416 --- .github/configs/amd-master.yaml | 4 ++-- benchmarks/multi_node/amd_utils/server.sh | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index ece23090d..f5705293b 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0415 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg: dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0415 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 18518f4d2..4e1164b24 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -201,6 +201,7 @@ fi if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) + SGLANG_MORI_MOE_MAX_INPUT_TOKENS=$((SGLANG_MORI_MOE_MAX_INPUT_TOKENS * (DECODE_MTP_SIZE + 1))) fi # ============================================================================= @@ -355,7 +356,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}" echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} " - echo "Decode env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${DECODE_CUDA_GRAPH_BS_DP_END} " + echo "Decode env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${SGLANG_MORI_MOE_MAX_INPUT_TOKENS} " echo "================================================" @@ -551,7 +552,7 @@ else echo "Decode node rank: $RANK" echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" - DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + DECODE_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${SGLANG_MORI_MOE_MAX_INPUT_TOKENS} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ --model-path ${MODEL_DIR}/${MODEL_NAME} \ --disaggregation-mode decode \ --disaggregation-ib-device ${IBDEVICES} \ From 2471379ae51e9d85370ace8c3717e0104d589232 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 17 Apr 2026 06:29:55 +0000 Subject: [PATCH 32/55] fix --- .github/configs/amd-master.yaml | 114 ++++++++++++++++++++++ benchmarks/multi_node/amd_utils/env.sh | 2 +- benchmarks/multi_node/amd_utils/server.sh | 6 +- 3 files changed, 118 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index f5705293b..ad4b3f559 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1213,3 +1213,117 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" +dsr1-fp4-mi355x-sglang-disagg-exp: + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # non-MTP configurations + # 1*DEP4+ 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 4*DEP4 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + +dsr1-fp4-mi355x-sglang-disagg-mtp-exp: + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # 1*DEP4+ 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # 4*DEP4 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 512, 1024, 2048 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 5071ec62d..36361bb7b 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -69,7 +69,7 @@ export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 # fi export MORI_MAX_DISPATCH_TOKENS_PREFILL=2048 export MORI_MAX_DISPATCH_TOKENS_DECODE=256 -export SGLANG_MORI_MOE_MAX_INPUT_TOKENS=2048 +export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2048 # set MTP size=1 when EP16 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 4e1164b24..c7ab4d4ac 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -201,7 +201,7 @@ fi if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) - SGLANG_MORI_MOE_MAX_INPUT_TOKENS=$((SGLANG_MORI_MOE_MAX_INPUT_TOKENS * (DECODE_MTP_SIZE + 1))) + MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) fi # ============================================================================= @@ -356,7 +356,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}" echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} " - echo "Decode env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${SGLANG_MORI_MOE_MAX_INPUT_TOKENS} " + echo "Decode env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} " echo "================================================" @@ -552,7 +552,7 @@ else echo "Decode node rank: $RANK" echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" - DECODE_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${SGLANG_MORI_MOE_MAX_INPUT_TOKENS} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + DECODE_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ --model-path ${MODEL_DIR}/${MODEL_NAME} \ --disaggregation-mode decode \ --disaggregation-ib-device ${IBDEVICES} \ From c80997fddcd5f3c8a23c37d477f7bb6caf231277 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 17 Apr 2026 10:31:52 +0000 Subject: [PATCH 33/55] set si to 100 --- benchmarks/multi_node/amd_utils/models.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 317352365..eefc93920 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -226,7 +226,7 @@ DeepSeek-R1-0528-MXFP4: DeepSeek-R1-0528-MXFP4-v2: base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode " - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 3 --tokenizer-worker-num 32 " + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 " prefill: mem_fraction_static: 0.8 disable_radix_cache: true From 616c57deaa094dafa358931daea2b9c703e97cf7 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sat, 18 Apr 2026 08:08:03 +0000 Subject: [PATCH 34/55] bump the image --- .github/configs/amd-master.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index ad4b3f559..e3995511e 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg: dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1214,7 +1214,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=1" dsr1-fp4-mi355x-sglang-disagg-exp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1271,7 +1271,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp: dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg From 3d62e2c6d8ce0066abeb6eb2426f0150ea000f82 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sun, 19 Apr 2026 15:06:50 +0000 Subject: [PATCH 35/55] revert old image --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index e3995511e..8f1fb5efa 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1214,7 +1214,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=1" dsr1-fp4-mi355x-sglang-disagg-exp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0408-high-concurrency model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1271,7 +1271,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp: dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0408-high-concurrency model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg From 2c4c09d97ac0efd2a3dd1255f9a063923e2faed4 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sun, 19 Apr 2026 15:09:23 +0000 Subject: [PATCH 36/55] revert old image --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8f1fb5efa..d844c8ecf 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1214,7 +1214,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=1" dsr1-fp4-mi355x-sglang-disagg-exp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0408-high-concurrency + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1271,7 +1271,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp: dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0408-high-concurrency + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg From 1c9b8d2a95259b4325e579cb10f40b84a78f05e5 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 20 Apr 2026 06:14:43 +0000 Subject: [PATCH 37/55] increase DISPATCH_TOKENS_PREFILL to 5120 --- benchmarks/multi_node/amd_utils/env.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 36361bb7b..c751078ec 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -67,7 +67,8 @@ export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 # if [[ "$MODEL_NAME" == *mxfp4* || "$MODEL_NAME" == *MXFP4* ]]; then # export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 # fi -export MORI_MAX_DISPATCH_TOKENS_PREFILL=2048 + +export MORI_MAX_DISPATCH_TOKENS_PREFILL=5120 export MORI_MAX_DISPATCH_TOKENS_DECODE=256 export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2048 From 8e6104eb526bf5ddd6023e3f3616e761a52ef7a1 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 20 Apr 2026 06:17:56 +0000 Subject: [PATCH 38/55] bump image to 0417 --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index d844c8ecf..e3995511e 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1214,7 +1214,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=1" dsr1-fp4-mi355x-sglang-disagg-exp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1271,7 +1271,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp: dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg From 7cc5d81728877f4f2067e85fd6de18a7039c6a3a Mon Sep 17 00:00:00 2001 From: billishyahao Date: Tue, 21 Apr 2026 15:06:04 +0000 Subject: [PATCH 39/55] add exp config --- .github/configs/amd-master.yaml | 116 +++++++++++++++++++++++++------- 1 file changed, 91 insertions(+), 25 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index e3995511e..de6ee6239 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1214,7 +1214,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=1" dsr1-fp4-mi355x-sglang-disagg-exp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1223,20 +1223,39 @@ dsr1-fp4-mi355x-sglang-disagg-exp: multinode: true disagg: true seq-len-configs: - - isl: 1024 + - isl: 8192 osl: 1024 search-space: # non-MTP configurations - # 1*DEP4+ 1*DEP8 + # 4*DEP4 + 1*DEP8 - spec-decoding: "none" - conc-list: [ 1024, 2048 ] + conc-list: [ 512, 1024, 2048 ] prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 3*DEP4 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048 ] + prefill: + num-worker: 3 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_NODES=1" + - "PREFILL_NODES=3" decode: num-worker: 1 tp: 8 @@ -1246,20 +1265,35 @@ dsr1-fp4-mi355x-sglang-disagg-exp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # 4*DEP4 + 1*DEP8 + # 2*DEP4 + 1*DEP8 - spec-decoding: "none" conc-list: [ 512, 1024, 2048 ] prefill: - num-worker: 4 + num-worker: 2 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_NODES=4" + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1*DEP8 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" decode: num-worker: 1 tp: 8 @@ -1271,7 +1305,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp: dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1280,20 +1314,20 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: multinode: true disagg: true seq-len-configs: - - isl: 1024 + - isl: 8192 osl: 1024 search-space: # MTP configurations - # 1*DEP4+ 1*DEP8 + # 4*DEP4 + 1*DEP8 - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] + conc-list: [ 512, 1024, 2048 ] prefill: - num-worker: 1 + num-worker: 4 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_NODES=1" + - "PREFILL_NODES=4" decode: num-worker: 1 tp: 8 @@ -1303,21 +1337,35 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" + # 3*DEP4 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 512, 1024, 2048 ] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=3" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # 4*DEP4 + 1*DEP8 + # 2*DEP4 + 1*DEP8 - spec-decoding: "mtp" conc-list: [ 512, 1024, 2048 ] prefill: - num-worker: 4 + num-worker: 2 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_NODES=4" + - "PREFILL_NODES=2" decode: num-worker: 1 tp: 8 @@ -1327,3 +1375,21 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" + # 1*DEP8 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 512, 1024, 2048 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" \ No newline at end of file From a1c05da8c0db2f43e9c346c22b4eaf608694da84 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Wed, 22 Apr 2026 11:06:42 +0000 Subject: [PATCH 40/55] add exp config --- .github/configs/amd-master.yaml | 96 +++------------------ benchmarks/multi_node/amd_utils/models.yaml | 2 +- 2 files changed, 11 insertions(+), 87 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index de6ee6239..31bdcc6a5 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1227,53 +1227,15 @@ dsr1-fp4-mi355x-sglang-disagg-exp: osl: 1024 search-space: # non-MTP configurations - # 4*DEP4 + 1*DEP8 + # 2*DEP8 + 1*DEP8 - spec-decoding: "none" conc-list: [ 512, 1024, 2048 ] prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 3*DEP4 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048 ] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=3" - decode: - num-worker: 1 + num-worker: 2 tp: 8 ep: 8 dp-attn: true additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 2*DEP4 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - "PREFILL_NODES=2" decode: num-worker: 1 @@ -1284,13 +1246,13 @@ dsr1-fp4-mi355x-sglang-disagg-exp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - # 1*DEP8 + 1*DEP8 + # 1*DEP4 + 1*DEP8 - spec-decoding: "none" conc-list: [ 512, 1024, 2048 ] prefill: num-worker: 1 - tp: 8 - ep: 8 + tp: 4 + ep: 4 dp-attn: true additional-settings: - "PREFILL_NODES=1" @@ -1318,53 +1280,15 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: osl: 1024 search-space: # MTP configurations - # 4*DEP4 + 1*DEP8 + # 2*DEP8 + 1*DEP8 - spec-decoding: "mtp" conc-list: [ 512, 1024, 2048 ] prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # 3*DEP4 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 512, 1024, 2048 ] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=3" - decode: - num-worker: 1 + num-worker: 2 tp: 8 ep: 8 dp-attn: true additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # 2*DEP4 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 512, 1024, 2048 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - "PREFILL_NODES=2" decode: num-worker: 1 @@ -1375,13 +1299,13 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - # 1*DEP8 + 1*DEP8 + # 1*DEP4 + 1*DEP8 - spec-decoding: "mtp" conc-list: [ 512, 1024, 2048 ] prefill: num-worker: 1 - tp: 8 - ep: 8 + tp: 4 + ep: 4 dp-attn: true additional-settings: - "PREFILL_NODES=1" diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index eefc93920..b2b013244 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -226,7 +226,7 @@ DeepSeek-R1-0528-MXFP4: DeepSeek-R1-0528-MXFP4-v2: base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode " - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 " + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 --num-continuous-decode-steps=4 " prefill: mem_fraction_static: 0.8 disable_radix_cache: true From a915729352b1a1ead26d1b594fd362d89e09d0e3 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Thu, 23 Apr 2026 05:37:26 +0000 Subject: [PATCH 41/55] add exp config --- .github/configs/amd-master.yaml | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 31bdcc6a5..355fa141e 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1227,16 +1227,16 @@ dsr1-fp4-mi355x-sglang-disagg-exp: osl: 1024 search-space: # non-MTP configurations - # 2*DEP8 + 1*DEP8 + # 4*DEP4 + 1*DEP8 - spec-decoding: "none" conc-list: [ 512, 1024, 2048 ] prefill: - num-worker: 2 - tp: 8 - ep: 8 + num-worker: 4 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "PREFILL_NODES=2" + - "PREFILL_NODES=4" decode: num-worker: 1 tp: 8 @@ -1246,16 +1246,16 @@ dsr1-fp4-mi355x-sglang-disagg-exp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - # 1*DEP4 + 1*DEP8 + # 3*DEP4 + 1*DEP8 - spec-decoding: "none" conc-list: [ 512, 1024, 2048 ] prefill: - num-worker: 1 + num-worker: 3 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_NODES=1" + - "PREFILL_NODES=3" decode: num-worker: 1 tp: 8 @@ -1280,16 +1280,16 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: osl: 1024 search-space: # MTP configurations - # 2*DEP8 + 1*DEP8 + # 4*DEP4 + 1*DEP8 - spec-decoding: "mtp" conc-list: [ 512, 1024, 2048 ] prefill: - num-worker: 2 - tp: 8 - ep: 8 + num-worker: 4 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "PREFILL_NODES=2" + - "PREFILL_NODES=4" decode: num-worker: 1 tp: 8 @@ -1299,16 +1299,16 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - # 1*DEP4 + 1*DEP8 + # 3*DEP4 + 1*DEP8 - spec-decoding: "mtp" conc-list: [ 512, 1024, 2048 ] prefill: - num-worker: 1 + num-worker: 3 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_NODES=1" + - "PREFILL_NODES=3" decode: num-worker: 1 tp: 8 From 44d10a1adf7ebb5fb2114d57b9bd72140f028baf Mon Sep 17 00:00:00 2001 From: billishyahao Date: Thu, 23 Apr 2026 15:44:35 +0000 Subject: [PATCH 42/55] add exp config --- .github/configs/amd-master.yaml | 20 ++++++++++---------- benchmarks/multi_node/amd_utils/models.yaml | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 355fa141e..2550f9b70 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1246,16 +1246,16 @@ dsr1-fp4-mi355x-sglang-disagg-exp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - # 3*DEP4 + 1*DEP8 + # 2*DEP8 + 1*DEP8 - spec-decoding: "none" conc-list: [ 512, 1024, 2048 ] prefill: - num-worker: 3 - tp: 4 - ep: 4 + num-worker: 2 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "PREFILL_NODES=3" + - "PREFILL_NODES=2" decode: num-worker: 1 tp: 8 @@ -1299,16 +1299,16 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - # 3*DEP4 + 1*DEP8 + # 2*DEP8 + 1*DEP8 - spec-decoding: "mtp" conc-list: [ 512, 1024, 2048 ] prefill: - num-worker: 3 - tp: 4 - ep: 4 + num-worker: 2 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "PREFILL_NODES=3" + - "PREFILL_NODES=2" decode: num-worker: 1 tp: 8 diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index b2b013244..c547b1174 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -226,7 +226,7 @@ DeepSeek-R1-0528-MXFP4: DeepSeek-R1-0528-MXFP4-v2: base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode " - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 --num-continuous-decode-steps=4 " + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 --num-continuous-decode-steps=4 --scheduler-recv-interval=4 " prefill: mem_fraction_static: 0.8 disable_radix_cache: true From f09820e7c5bb46805743ff5bf1fecd6d116aea6b Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 24 Apr 2026 02:16:20 +0000 Subject: [PATCH 43/55] add exp configs --- .github/configs/amd-master.yaml | 135 +++++++++++++++++++- benchmarks/multi_node/amd_utils/models.yaml | 2 +- 2 files changed, 129 insertions(+), 8 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 2550f9b70..8b9d7a594 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1266,6 +1266,112 @@ dsr1-fp4-mi355x-sglang-disagg-exp: - "DECODE_MTP_SIZE=0" +# dsr1-fp4-mi355x-sglang-disagg-mtp-exp: +# image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 +# model: amd/DeepSeek-R1-0528-MXFP4-v2 +# model-prefix: dsr1 +# runner: mi355x-disagg +# precision: fp4 +# framework: sglang-disagg +# multinode: true +# disagg: true +# seq-len-configs: +# - isl: 8192 +# osl: 1024 +# search-space: +# # MTP configurations +# # 4*DEP4 + 1*DEP8 +# - spec-decoding: "mtp" +# conc-list: [ 512, 1024, 2048 ] +# prefill: +# num-worker: 4 +# tp: 4 +# ep: 4 +# dp-attn: true +# additional-settings: +# - "PREFILL_NODES=4" +# decode: +# num-worker: 1 +# tp: 8 +# ep: 8 +# dp-attn: true +# additional-settings: +# - "DECODE_NODES=1" +# - "DECODE_MTP_SIZE=1" + +# # 2*DEP8 + 1*DEP8 +# - spec-decoding: "mtp" +# conc-list: [ 512, 1024, 2048 ] +# prefill: +# num-worker: 2 +# tp: 8 +# ep: 8 +# dp-attn: true +# additional-settings: +# - "PREFILL_NODES=2" +# decode: +# num-worker: 1 +# tp: 8 +# ep: 8 +# dp-attn: true +# additional-settings: +# - "DECODE_NODES=1" +# - "DECODE_MTP_SIZE=1" + + +# dsr1-fp4-mi355x-sglang-disagg-exp: +# image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 +# model: amd/DeepSeek-R1-0528-MXFP4-v2 +# model-prefix: dsr1 +# runner: mi355x-disagg +# precision: fp4 +# framework: sglang-disagg +# multinode: true +# disagg: true +# seq-len-configs: +# - isl: 8192 +# osl: 1024 +# search-space: +# # non-MTP configurations +# # 4*DEP4 + 1*DEP8 +# - spec-decoding: "none" +# conc-list: [ 512, 1024, 2048 ] +# prefill: +# num-worker: 4 +# tp: 4 +# ep: 4 +# dp-attn: true +# additional-settings: +# - "PREFILL_NODES=4" +# decode: +# num-worker: 1 +# tp: 8 +# ep: 8 +# dp-attn: true +# additional-settings: +# - "DECODE_NODES=1" +# - "DECODE_MTP_SIZE=0" + +# # 2*DEP8 + 1*DEP8 +# - spec-decoding: "none" +# conc-list: [ 512, 1024, 2048 ] +# prefill: +# num-worker: 2 +# tp: 8 +# ep: 8 +# dp-attn: true +# additional-settings: +# - "PREFILL_NODES=2" +# decode: +# num-worker: 1 +# tp: 8 +# ep: 8 +# dp-attn: true +# additional-settings: +# - "DECODE_NODES=1" +# - "DECODE_MTP_SIZE=0" + + dsr1-fp4-mi355x-sglang-disagg-mtp-exp: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 model: amd/DeepSeek-R1-0528-MXFP4-v2 @@ -1280,16 +1386,16 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: osl: 1024 search-space: # MTP configurations - # 4*DEP4 + 1*DEP8 + # 2*DEP8 + 1*DEP8 - spec-decoding: "mtp" conc-list: [ 512, 1024, 2048 ] prefill: - num-worker: 4 - tp: 4 - ep: 4 + num-worker: 2 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "PREFILL_NODES=4" + - "PREFILL_NODES=2" decode: num-worker: 1 tp: 8 @@ -1299,8 +1405,23 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" + +dsr1-fp4-mi355x-sglang-disagg-exp: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations # 2*DEP8 + 1*DEP8 - - spec-decoding: "mtp" + - spec-decoding: "none" conc-list: [ 512, 1024, 2048 ] prefill: num-worker: 2 @@ -1316,4 +1437,4 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: dp-attn: true additional-settings: - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" \ No newline at end of file + - "DECODE_MTP_SIZE=0" \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index c547b1174..eefc93920 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -226,7 +226,7 @@ DeepSeek-R1-0528-MXFP4: DeepSeek-R1-0528-MXFP4-v2: base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode " - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 --num-continuous-decode-steps=4 --scheduler-recv-interval=4 " + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 " prefill: mem_fraction_static: 0.8 disable_radix_cache: true From 5144ca12507ebc6aceddde3afe48cdfb1d2953f3 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 24 Apr 2026 06:58:53 +0000 Subject: [PATCH 44/55] add exp configs --- .github/configs/amd-master.yaml | 42 +++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8b9d7a594..ae40a0633 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1373,7 +1373,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp: dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0424 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1405,9 +1405,28 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" + # 4*DEP4 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 512, 1024, 2048 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + dsr1-fp4-mi355x-sglang-disagg-exp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0424 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1430,6 +1449,25 @@ dsr1-fp4-mi355x-sglang-disagg-exp: dp-attn: true additional-settings: - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 4*DEP4 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" decode: num-worker: 1 tp: 8 From d9e2eefa0a6d17d9193654e8289a8552bd985a29 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Tue, 28 Apr 2026 16:30:00 +0000 Subject: [PATCH 45/55] bump image --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index ae40a0633..ed165452d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg: dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417 + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg From ee33925882723f97bafea7ebadda6564e8cd36b5 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Thu, 30 Apr 2026 10:43:33 +0000 Subject: [PATCH 46/55] sync arguments --- .github/configs/amd-master.yaml | 271 +------------------- benchmarks/multi_node/amd_utils/env.sh | 12 +- benchmarks/multi_node/amd_utils/models.yaml | 4 +- 3 files changed, 9 insertions(+), 278 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index ed165452d..887c81a58 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -886,7 +886,7 @@ dsr1-fp4-mi355x-sglang-disagg: # 1*DEP4+ 1*DEP8 - spec-decoding: "none" - conc-list: [ 1024, 2048 ] + conc-list: [ 1024, 2048, 4096 ] prefill: num-worker: 1 tp: 4 @@ -985,7 +985,7 @@ dsr1-fp4-mi355x-sglang-disagg: # 4*DEP4 + 1*DEP8 - spec-decoding: "none" - conc-list: [ 512, 1024, 2048 ] + conc-list: [ 1024, 2048 ] prefill: num-worker: 4 tp: 4 @@ -1095,7 +1095,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: # 1*DEP4+ 1*DEP8 - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] + conc-list: [ 1024, 2048, 4096 ] prefill: num-worker: 1 tp: 4 @@ -1196,218 +1196,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: # 4*DEP4 + 1*DEP8 - spec-decoding: "mtp" - conc-list: [ 512, 1024, 2048 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - -dsr1-fp4-mi355x-sglang-disagg-exp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - seq-len-configs: - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # 4*DEP4 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 2*DEP8 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - -# dsr1-fp4-mi355x-sglang-disagg-mtp-exp: -# image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 -# model: amd/DeepSeek-R1-0528-MXFP4-v2 -# model-prefix: dsr1 -# runner: mi355x-disagg -# precision: fp4 -# framework: sglang-disagg -# multinode: true -# disagg: true -# seq-len-configs: -# - isl: 8192 -# osl: 1024 -# search-space: -# # MTP configurations -# # 4*DEP4 + 1*DEP8 -# - spec-decoding: "mtp" -# conc-list: [ 512, 1024, 2048 ] -# prefill: -# num-worker: 4 -# tp: 4 -# ep: 4 -# dp-attn: true -# additional-settings: -# - "PREFILL_NODES=4" -# decode: -# num-worker: 1 -# tp: 8 -# ep: 8 -# dp-attn: true -# additional-settings: -# - "DECODE_NODES=1" -# - "DECODE_MTP_SIZE=1" - -# # 2*DEP8 + 1*DEP8 -# - spec-decoding: "mtp" -# conc-list: [ 512, 1024, 2048 ] -# prefill: -# num-worker: 2 -# tp: 8 -# ep: 8 -# dp-attn: true -# additional-settings: -# - "PREFILL_NODES=2" -# decode: -# num-worker: 1 -# tp: 8 -# ep: 8 -# dp-attn: true -# additional-settings: -# - "DECODE_NODES=1" -# - "DECODE_MTP_SIZE=1" - - -# dsr1-fp4-mi355x-sglang-disagg-exp: -# image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411 -# model: amd/DeepSeek-R1-0528-MXFP4-v2 -# model-prefix: dsr1 -# runner: mi355x-disagg -# precision: fp4 -# framework: sglang-disagg -# multinode: true -# disagg: true -# seq-len-configs: -# - isl: 8192 -# osl: 1024 -# search-space: -# # non-MTP configurations -# # 4*DEP4 + 1*DEP8 -# - spec-decoding: "none" -# conc-list: [ 512, 1024, 2048 ] -# prefill: -# num-worker: 4 -# tp: 4 -# ep: 4 -# dp-attn: true -# additional-settings: -# - "PREFILL_NODES=4" -# decode: -# num-worker: 1 -# tp: 8 -# ep: 8 -# dp-attn: true -# additional-settings: -# - "DECODE_NODES=1" -# - "DECODE_MTP_SIZE=0" - -# # 2*DEP8 + 1*DEP8 -# - spec-decoding: "none" -# conc-list: [ 512, 1024, 2048 ] -# prefill: -# num-worker: 2 -# tp: 8 -# ep: 8 -# dp-attn: true -# additional-settings: -# - "PREFILL_NODES=2" -# decode: -# num-worker: 1 -# tp: 8 -# ep: 8 -# dp-attn: true -# additional-settings: -# - "DECODE_NODES=1" -# - "DECODE_MTP_SIZE=0" - - -dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0424 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - seq-len-configs: - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # 2*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 512, 1024, 2048 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # 4*DEP4 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 512, 1024, 2048 ] + conc-list: [ 1024, 2048 ] prefill: num-worker: 4 tp: 4 @@ -1424,55 +1213,3 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - -dsr1-fp4-mi355x-sglang-disagg-exp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0424 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - seq-len-configs: - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # 2*DEP8 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 4*DEP4 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index c751078ec..9cc96738b 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -41,8 +41,9 @@ export SGLANG_USE_AITER=1 export SGLANG_MORI_DISPATCH_DTYPE=auto export SGLANG_MORI_FP8_COMB=true -export SGLANG_MORI_QP_PER_TRANSFER=2 -export SGLANG_MORI_NUM_WORKERS=2 +export SGLANG_MORI_QP_PER_TRANSFER=4 +export SGLANG_MORI_NUM_WORKERS=4 +export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000 export MORI_IO_QP_MAX_SEND_WR=16384 export MORI_IO_QP_MAX_CQE=32768 @@ -61,13 +62,6 @@ export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 export SGLANG_LOG_MS=true export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 - -# Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) -# export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 -# if [[ "$MODEL_NAME" == *mxfp4* || "$MODEL_NAME" == *MXFP4* ]]; then -# export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 -# fi - export MORI_MAX_DISPATCH_TOKENS_PREFILL=5120 export MORI_MAX_DISPATCH_TOKENS_DECODE=256 export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2048 diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index eefc93920..6ed51fc41 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -242,9 +242,9 @@ DeepSeek-R1-0528-MXFP4-v2: mem_fraction_static: 0.85 prefill_round_robin_balance: true dp: - max_running_requests: 2048 + max_running_requests: 4096 chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" - cuda_graph_bs_range: "1-256" + cuda_graph_bs_range: "1-512" ep_only: max_running_requests: 256 chunked_prefill_size: 262144 From 2b1ff6b5d15e98103ff0e6ef272fad76b195c02c Mon Sep 17 00:00:00 2001 From: billishyahao Date: Thu, 30 Apr 2026 10:57:14 +0000 Subject: [PATCH 47/55] fix --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 887c81a58..cd11bb7d5 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -486,7 +486,7 @@ dsr1-fp8-mi355x-atom-mtp: - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } dsr1-fp8-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg @@ -641,7 +641,7 @@ dsr1-fp8-mi355x-sglang-disagg: dsr1-fp8-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg From 05487731866203b099ce155831791cdd1084f330 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 1 May 2026 03:53:01 +0000 Subject: [PATCH 48/55] fix config --- .github/configs/amd-master.yaml | 113 ++++++++++++++++++++ benchmarks/multi_node/amd_utils/env.sh | 8 +- benchmarks/multi_node/amd_utils/models.yaml | 5 +- benchmarks/multi_node/amd_utils/server.sh | 22 +++- 4 files changed, 142 insertions(+), 6 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index cd11bb7d5..cfa23a210 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1213,3 +1213,116 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" +dsr1-fp4-mi355x-sglang-disagg-exp: + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # non-MTP configurations + # 1*DEP4+ 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 2*DEP8 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + +dsr1-fp4-mi355x-sglang-disagg-mtp-exp: + image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # 1*DEP4+ 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # 2*DEP8 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 9cc96738b..472e9b0de 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -62,9 +62,11 @@ export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 export SGLANG_LOG_MS=true export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 -export MORI_MAX_DISPATCH_TOKENS_PREFILL=5120 -export MORI_MAX_DISPATCH_TOKENS_DECODE=256 -export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2048 +export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 +export MORI_MAX_DISPATCH_TOKENS_DECODE=512 + +export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768 +export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703 # set MTP size=1 when EP16 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 6ed51fc41..fbe60d0ec 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -231,9 +231,12 @@ DeepSeek-R1-0528-MXFP4-v2: mem_fraction_static: 0.8 disable_radix_cache: true dp: - max_running_requests: 32 + max_running_requests: 4096 chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE" cuda_graph_bs: "1 2 3" + context_length: 9217 + max_total_tokens: 131072 + enable_two_batch_overlap: true no_dp: max_running_requests: 128 chunked_prefill_size: 16384 diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index c7ab4d4ac..e6c24909b 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -127,6 +127,9 @@ no_dp = prefill.get('no_dp', {}) print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') +print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"') +print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"') +print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"') print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) @@ -169,10 +172,16 @@ if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP + prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP + prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP + prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP else prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP + prefill_context_length="" + prefill_max_total_tokens="" + prefill_enable_two_batch_overlap="false" fi # Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) @@ -192,6 +201,15 @@ PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-r if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi +if [[ -n "$prefill_context_length" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length ${prefill_context_length}" +fi +if [[ -n "$prefill_max_total_tokens" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --max-total-tokens ${prefill_max_total_tokens}" +fi +if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap" +fi DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} " @@ -361,7 +379,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "================================================" # start the head prefill server - PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/$MODEL_NAME \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ @@ -490,7 +508,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then echo "Using prefill config: $PREFILL_SERVER_CONFIG" echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" - PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/${MODEL_NAME} \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ From 724bd61a824e7d69b2ffd42de3d3f7aef6d05ec4 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 1 May 2026 06:12:54 +0000 Subject: [PATCH 49/55] add exp configs --- .github/configs/amd-master.yaml | 92 ++++++++++++++++----------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index cfa23a210..a5688b92f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1223,28 +1223,28 @@ dsr1-fp4-mi355x-sglang-disagg-exp: multinode: true disagg: true seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # non-MTP configurations - # 1*DEP4+ 1*DEP8 - - spec-decoding: "none" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" + # - isl: 1024 + # osl: 1024 + # search-space: + # # non-MTP configurations + # # 1*DEP4+ 1*DEP8 + # - spec-decoding: "none" + # conc-list: [ 1024, 2048, 4096 ] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=1" + # - "DECODE_MTP_SIZE=0" - isl: 8192 osl: 1024 @@ -1252,7 +1252,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp: # non-MTP configurations # 2*DEP8 + 1*DEP8 - spec-decoding: "none" - conc-list: [ 1024, 2048 ] + conc-list: [ 512, 1024, 2048, 4096 ] prefill: num-worker: 2 tp: 8 @@ -1280,28 +1280,28 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: multinode: true disagg: true seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - # 1*DEP4+ 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" + # - isl: 1024 + # osl: 1024 + # search-space: + # # MTP configurations + # # 1*DEP4+ 1*DEP8 + # - spec-decoding: "mtp" + # conc-list: [ 1024, 2048, 4096 ] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=1" + # - "DECODE_MTP_SIZE=1" - isl: 8192 @@ -1310,7 +1310,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp: # MTP configurations # 2*DEP8 + 1*DEP8 - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] + conc-list: [ 512, 1024, 2048, 4096 ] prefill: num-worker: 2 tp: 8 From f8f0a3a0fd606b45a7a405535858a36e8b1fc2b2 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 1 May 2026 06:46:17 +0000 Subject: [PATCH 50/55] enable sdma --- benchmarks/multi_node/amd_utils/env.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 472e9b0de..cb094d1e3 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -49,12 +49,17 @@ export MORI_IO_QP_MAX_SEND_WR=16384 export MORI_IO_QP_MAX_CQE=32768 export MORI_IO_QP_MAX_SGE=4 +export MORI_IO_TC_DISABLE=0 + export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600 export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600 # Disable allocating memory in one pass export MORI_SHMEM_MODE=ISOLATION +# Enable SDMA +export MORI_ENABLE_SDMA=true + # Enable spec v2 export SGLANG_ENABLE_SPEC_V2=1 export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 From feb6c7d327e8f55269cf2c1d71fec9b4b2dc2133 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 1 May 2026 08:19:39 +0000 Subject: [PATCH 51/55] fix --- benchmarks/multi_node/amd_utils/env.sh | 3 --- benchmarks/multi_node/amd_utils/server.sh | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index cb094d1e3..d0b99eddc 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -57,9 +57,6 @@ export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600 # Disable allocating memory in one pass export MORI_SHMEM_MODE=ISOLATION -# Enable SDMA -export MORI_ENABLE_SDMA=true - # Enable spec v2 export SGLANG_ENABLE_SPEC_V2=1 export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index e6c24909b..537c0812e 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -209,6 +209,7 @@ if [[ -n "$prefill_max_total_tokens" ]]; then fi if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap" + export MORI_ENABLE_SDMA=true fi DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} " From f501a3e110cb364d7ef4eea54e0870356e080af1 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 1 May 2026 08:33:16 +0000 Subject: [PATCH 52/55] fix --- benchmarks/multi_node/amd_utils/server.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 537c0812e..89d0f223a 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -209,7 +209,7 @@ if [[ -n "$prefill_max_total_tokens" ]]; then fi if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap" - export MORI_ENABLE_SDMA=true + PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true" fi DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} " @@ -380,7 +380,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "================================================" # start the head prefill server - PREFILL_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_CMD="${PREFILL_SDMA_ENV} SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/$MODEL_NAME \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ @@ -509,7 +509,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then echo "Using prefill config: $PREFILL_SERVER_CONFIG" echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" - PREFILL_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_CMD="${PREFILL_SDMA_ENV} SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/${MODEL_NAME} \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ From 217d89277427627d7c76cc5e9932174d7932d221 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 1 May 2026 14:23:54 +0000 Subject: [PATCH 53/55] cleanup --- .github/configs/amd-master.yaml | 24 ++++++++++----------- benchmarks/multi_node/amd_utils/models.yaml | 14 ++++++------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a5688b92f..b98d1bb97 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -983,16 +983,16 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" - # 4*DEP4 + 1*DEP8 + # 2*DEP8 + 1*DEP8 - spec-decoding: "none" - conc-list: [ 1024, 2048 ] + conc-list: [ 1024, 2048, 4096 ] prefill: - num-worker: 4 - tp: 4 - ep: 4 + num-worker: 2 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "PREFILL_NODES=4" + - "PREFILL_NODES=2" decode: num-worker: 1 tp: 8 @@ -1194,16 +1194,16 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=1" - # 4*DEP4 + 1*DEP8 + # 2*DEP8 + 1*DEP8 - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] + conc-list: [ 1024, 2048, 4096 ] prefill: - num-worker: 4 - tp: 4 - ep: 4 + num-worker: 2 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "PREFILL_NODES=4" + - "PREFILL_NODES=2" decode: num-worker: 1 tp: 8 diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index fbe60d0ec..436c32d27 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -38,7 +38,7 @@ # cuda_graph_bs_range: str DeepSeek-V3: - base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -69,7 +69,7 @@ DeepSeek-V3: cuda_graph_bs_range: "1-128" DeepSeek-V3-0324: - base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -100,7 +100,7 @@ DeepSeek-V3-0324: cuda_graph_bs_range: "1-128" DeepSeek-R1: - base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -131,7 +131,7 @@ DeepSeek-R1: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528: - base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -162,7 +162,7 @@ DeepSeek-R1-0528: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-Preview: - base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -193,7 +193,7 @@ DeepSeek-R1-0528-MXFP4-Preview: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4: - base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: @@ -224,7 +224,7 @@ DeepSeek-R1-0528-MXFP4: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-v2: - base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode " dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 " prefill: From a5a822a122d466bda675ec6db1b10194b5731564 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sat, 2 May 2026 00:57:12 +0000 Subject: [PATCH 54/55] bump image --- .github/configs/amd-master.yaml | 118 +------------------------------- perf-changelog.yaml | 9 +-- 2 files changed, 7 insertions(+), 120 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index b98d1bb97..866642fff 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428 + image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg: dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428 + image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501 model: amd/DeepSeek-R1-0528-MXFP4-v2 model-prefix: dsr1 runner: mi355x-disagg @@ -1212,117 +1212,3 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: additional-settings: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - -dsr1-fp4-mi355x-sglang-disagg-exp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - seq-len-configs: - # - isl: 1024 - # osl: 1024 - # search-space: - # # non-MTP configurations - # # 1*DEP4+ 1*DEP8 - # - spec-decoding: "none" - # conc-list: [ 1024, 2048, 4096 ] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=1" - # - "DECODE_MTP_SIZE=0" - - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # 2*DEP8 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048, 4096 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - -dsr1-fp4-mi355x-sglang-disagg-mtp-exp: - image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - seq-len-configs: - # - isl: 1024 - # osl: 1024 - # search-space: - # # MTP configurations - # # 1*DEP4+ 1*DEP8 - # - spec-decoding: "mtp" - # conc-list: [ 1024, 2048, 4096 ] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=1" - # - "DECODE_MTP_SIZE=1" - - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # 2*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 512, 1024, 2048, 4096 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" \ No newline at end of file diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1cd22211a..ffd2a64e1 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1218,8 +1218,9 @@ - dsr1-fp4-mi355x-sglang-disagg - dsr1-fp4-mi355x-sglang-disagg-mtp description: - - "Bump SGL mori image to March 27" - - "Add more low latency sweep configs" + - "Bump SGL mori image to lmsysorg/sglang-rocm" + - "Add more high tput / low latency sweep configs" - "Enable v2 mxfp4 DSR1 0528 model" - - "Enable fp4 disp feature on mori" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/983 + - "Enable fp4 disp / fp8 combine feature on mori" + - "Enable Mori SDMA + two batch overlapping feature" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1236 From f570ea7ae03f6dc9fff46cf4d45741321de1588f Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sat, 2 May 2026 01:10:57 +0000 Subject: [PATCH 55/55] fix yaml --- .github/configs/amd-master.yaml | 144 +++++++++++----------- benchmarks/multi_node/amd_utils/server.sh | 5 + 2 files changed, 77 insertions(+), 72 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 727bfc0b9..b1a9b1227 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1238,24 +1238,24 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" - # 1*DEP4+ 1*DEP8 - - spec-decoding: "none" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" + # 1*DEP4+ 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" - isl: 8192 osl: 1024 @@ -1337,24 +1337,24 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" - # 2*DEP8 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" + # 2*DEP8 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" dsr1-fp4-mi355x-sglang-disagg-mtp: @@ -1448,24 +1448,24 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=1" - # 1*DEP4+ 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" + # 1*DEP4+ 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" - isl: 8192 @@ -1549,24 +1549,24 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=1" - # 2*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" + # 2*DEP8 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" dsv4-fp8-mi355x-sglang: diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 63c1d3c48..4da9b56eb 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -338,6 +338,11 @@ if [[ -n "$MODEL_NAME" ]]; then echo "Using model-specific configuration for: $MODEL_NAME" fi +if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]; then + PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') + DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') +fi + # ============================================================================= # Container Synchronization # =============================================================================