diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 5bc61f53a..b1a9b1227 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1147,10 +1147,9 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=2"
 
-
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
-  model: amd/DeepSeek-R1-0528-MXFP4
+  image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
   precision: fp4
@@ -1241,7 +1240,7 @@ dsr1-fp4-mi355x-sglang-disagg:
 
   # 1*DEP4+ 1*DEP8
   - spec-decoding: "none"
-    conc-list: [ 1024, 2048 ]
+    conc-list: [ 1024, 2048, 4096 ]
     prefill:
       num-worker: 1
       tp: 4
@@ -1338,16 +1337,16 @@ dsr1-fp4-mi355x-sglang-disagg:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
 
-  # 4*DEP4 + 1*DEP8
+  # 2*DEP8 + 1*DEP8
   - spec-decoding: "none"
     conc-list: [ 1024, 2048, 4096 ]
     prefill:
-      num-worker: 4
-      tp: 4
-      ep: 4
+      num-worker: 2
+      tp: 8
+      ep: 8
       dp-attn: true
       additional-settings:
-        - "PREFILL_NODES=4"
+        - "PREFILL_NODES=2"
    decode:
       num-worker: 1
       tp: 8
@@ -1357,9 +1356,10 @@ dsr1-fp4-mi355x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
+
 dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
-  model: amd/DeepSeek-R1-0528-MXFP4
+  image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
   precision: fp4
@@ -1450,7 +1450,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
 
   # 1*DEP4+ 1*DEP8
   - spec-decoding: "mtp"
-    conc-list: [ 1024, 2048 ]
+    conc-list: [ 1024, 2048, 4096 ]
     prefill:
       num-worker: 1
       tp: 4
@@ -1549,16 +1549,16 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=1"
 
-  # 4*DEP4 + 1*DEP8
+  # 2*DEP8 + 1*DEP8
   - spec-decoding: "mtp"
     conc-list: [ 1024, 2048, 4096 ]
     prefill:
-      num-worker: 4
-      tp: 4
-      ep: 4
+      num-worker: 2
+      tp: 8
+      ep: 8
       dp-attn: true
       additional-settings:
-        - "PREFILL_NODES=4"
+        - "PREFILL_NODES=2"
    decode:
       num-worker: 1
       tp: 8
@@ -1567,6 +1567,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
       additional-settings:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
+
 
 dsv4-fp8-mi355x-sglang:
   image: rocm/sgl-dev:deepseek-v4-mi35x
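Note on the reshaped sweep entries above: total prefill GPUs are unchanged (4 workers x TP4 = 2 workers x TP8 = 16), but packing each worker onto whole nodes halves the node count, which is why PREFILL_NODES drops from 4 to 2. A sketch of the node math (bash), assuming 8 GPUs per MI355X node and at least one node per worker; the variable names here are illustrative, not config keys:

    # ceil(tp / gpus_per_node) nodes per worker, times num-worker
    gpus_per_node=8
    old_nodes=$(( 4 * ( (4 + gpus_per_node - 1) / gpus_per_node ) ))   # 4 workers x TP4 -> 4 nodes
    new_nodes=$(( 2 * ( (8 + gpus_per_node - 1) / gpus_per_node ) ))   # 2 workers x TP8 -> 2 nodes
    echo "old=${old_nodes} new=${new_nodes}"   # total prefill GPUs stay 4*4 = 2*8 = 16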
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 5565c5b3b..d0b99eddc 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -34,39 +34,47 @@ export IBDEVICES
 export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
 export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
-set +x
 export NCCL_IB_HCA=$IBDEVICES
 export SGLANG_USE_AITER=1
-export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200
-export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200
+
+export SGLANG_MORI_DISPATCH_DTYPE=auto
+export SGLANG_MORI_FP8_COMB=true
+export SGLANG_MORI_QP_PER_TRANSFER=4
+export SGLANG_MORI_NUM_WORKERS=4
+export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000
+
+export MORI_IO_QP_MAX_SEND_WR=16384
+export MORI_IO_QP_MAX_CQE=32768
+export MORI_IO_QP_MAX_SGE=4
+
+export MORI_IO_TC_DISABLE=0
+
+export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600
+export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600
 
 # Disable allocating memory in one pass
 export MORI_SHMEM_MODE=ISOLATION
-export SGLANG_MORI_FP8_DISP=True
-if [[ "$MODEL_NAME" == *mxfp4* ]]; then
-export SGLANG_MORI_FP8_DISP=False
-fi
+# Enable spec v2
+export SGLANG_ENABLE_SPEC_V2=1
+export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
 
-export SGLANG_MORI_FP4_DISP=False
-export SGLANG_MORI_FP8_COMB=False
+export SGLANG_LOG_MS=true
+export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
 
-# Per-role dispatch token limits (prefill uses higher throughput, decode uses lower)
-export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384
-if [[ "$MODEL_NAME" == *mxfp4* ]]; then
-  export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288
-fi
-export MORI_MAX_DISPATCH_TOKENS_DECODE=160
+export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192
+export MORI_MAX_DISPATCH_TOKENS_DECODE=512
+
+export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768
+export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703 # set MTP size=1 when EP16
 
 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
 export MORI_EP_LAUNCH_CONFIG_MODE=AUTO
-export MORI_IO_QP_MAX_SEND_WR=16384
-export MORI_IO_QP_MAX_CQE=32768
-export MORI_IO_QP_MAX_SGE=4
+
 
 export MORI_APP_LOG_LEVEL=INFO
@@ -89,17 +97,21 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
 if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then
     TC=$(( 4 * ND_DSCP ))
     export MORI_RDMA_SL=$ND_PRIO
+    export MORI_IO_SL=$ND_PRIO
     export MORI_RDMA_TC=$TC
-    echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL"
+    export MORI_IO_TC=$TC
+    echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL"
 else
     echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
     # Fall back to hostname-based detection
     NODENAME=$(hostname -s)
     if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
         export MORI_RDMA_TC=96
+        export MORI_IO_TC=96
         echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
     elif [[ $NODENAME == mia1* ]]; then
         export MORI_RDMA_TC=104
+        export MORI_IO_TC=104
         echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
     else
         echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
@@ -110,9 +122,11 @@ else
     NODENAME=$(hostname -s)
     if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
         export MORI_RDMA_TC=96
+        export MORI_IO_TC=96
         echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
     elif [[ $NODENAME == mia1* ]]; then
         export MORI_RDMA_TC=104
+        export MORI_IO_TC=104
         echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
     else
         echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
@@ -124,3 +138,4 @@ fi
 
 export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH}
 
+set +x
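For reference, the QoS block above derives TC as 4 * DSCP from nicctl output and now mirrors the RDMA values into MORI_IO_TC/MORI_IO_SL. The hostname fallbacks 96 and 104 are consistent with DSCP 24 and 26; a small sketch of that arithmetic (bash), with the DSCP values inferred from the hardcoded fallbacks rather than stated anywhere in the script:

    for ND_DSCP in 24 26; do
        # same formula as the nicctl path: TC = 4 * DSCP
        echo "DSCP=${ND_DSCP} -> MORI_RDMA_TC=MORI_IO_TC=$(( 4 * ND_DSCP ))"   # prints 96, then 104
    done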
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 2bbdd91d6..436c32d27 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -222,3 +222,37 @@ DeepSeek-R1-0528-MXFP4:
       max_running_requests: 128
       chunked_prefill_size: 262144
       cuda_graph_bs_range: "1-128"
+
+DeepSeek-R1-0528-MXFP4-v2:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode "
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 "
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
+      cuda_graph_bs: "1 2 3"
+      context_length: 9217
+      max_total_tokens: 131072
+      enable_two_batch_overlap: true
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 16384
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
+      cuda_graph_bs_range: "1-512"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
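The chunked_prefill_size entries above are formulas rather than literals; server.sh resolves them through its eval_formula helper against the environment set in env.sh. A sketch of the values they produce under the new env.sh defaults, assuming the 2*DEP8 prefill / 1*DEP8 decode layout (TP sizes are assumptions of this example):

    MORI_MAX_DISPATCH_TOKENS_PREFILL=8192
    MORI_MAX_DISPATCH_TOKENS_DECODE=512
    PREFILL_TP_SIZE=8; DECODE_TP_SIZE=8
    echo "prefill chunked_prefill_size=$(( MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE ))"   # 65536
    echo "decode  chunked_prefill_size=$(( MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE ))"     # 4096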
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index bf8163e2b..4da9b56eb 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -127,6 +127,9 @@ no_dp = prefill.get('no_dp', {})
 print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"')
 print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"')
 print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"')
+print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"')
+print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"')
+print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"')
 print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"')
 print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"')
 s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128)
@@ -169,10 +172,16 @@ if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then
     prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP)
     prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP
     prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP
+    prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP
+    prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP
+    prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP
 else
     prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END))
     prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP
     prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP
+    prefill_context_length=""
+    prefill_max_total_tokens=""
+    prefill_enable_two_batch_overlap="false"
 fi
 
 # Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp)
@@ -187,29 +196,31 @@ else
     decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP
 fi
 
-# Use Decode configuration to configure different TP/DP size between P and D
-PREFILL_DECODE_DIFFERENT_TP=""
-if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then
-    if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
-        PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}"
-    else
-        PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1"
-    fi
-fi
-
 # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
-PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}"
+PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
 if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
     PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
 fi
+if [[ -n "$prefill_context_length" ]]; then
+    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length ${prefill_context_length}"
+fi
+if [[ -n "$prefill_max_total_tokens" ]]; then
+    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --max-total-tokens ${prefill_max_total_tokens}"
+fi
+if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then
+    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap"
+    PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true"
+fi
+
+DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} "
 
-DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}"
 if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then
     DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance"
 fi
 if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then
     MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
+    MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
 fi
 
 # =============================================================================
@@ -327,6 +338,11 @@ if [[ -n "$MODEL_NAME" ]]; then
     echo "Using model-specific configuration for: $MODEL_NAME"
 fi
 
+if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]; then
+    PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
+    DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
+fi
+
 # =============================================================================
 # Container Synchronization
 # =============================================================================
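The MTP branch above scales both MORI token budgets by (DECODE_MTP_SIZE + 1), since with speculative decoding each decode step carries the verified token plus the draft tokens. A worked sketch (bash) using the env.sh defaults and DECODE_MTP_SIZE=1:

    DECODE_MTP_SIZE=1
    MORI_MAX_DISPATCH_TOKENS_DECODE=512
    MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703
    echo $(( MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1) ))    # 1024
    echo $(( MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1) ))   # 5406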
@@ -362,20 +378,21 @@ if [ "$NODE_RANK" -eq 0 ]; then
     echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}"
     echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}"
     echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}"
-    echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK: ${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
-    echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}"
+    echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
+    echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} "
+    echo "Decode env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} "
+
     echo "================================================"
 
     # start the head prefill server
-    PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+    PREFILL_CMD="${PREFILL_SDMA_ENV} SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
         --model-path $MODEL_DIR/$MODEL_NAME \
         --disaggregation-mode prefill \
         --disaggregation-ib-device ${IBDEVICES} \
         --host 0.0.0.0 \
         --port 8000 \
         --trust-remote-code \
-        ${PREFILL_SERVER_CONFIG} \
-        --log-level-http warning"
+        ${PREFILL_SERVER_CONFIG} "
 
     if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
         PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0"
@@ -589,15 +606,14 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
     echo "Using prefill config: $PREFILL_SERVER_CONFIG"
     echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}"
 
-    PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+    PREFILL_CMD="${PREFILL_SDMA_ENV} SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
         --model-path $MODEL_DIR/${MODEL_NAME} \
         --disaggregation-mode prefill \
         --disaggregation-ib-device ${IBDEVICES} \
         --host 0.0.0.0 \
         --port 8000 \
         --trust-remote-code \
-        ${PREFILL_SERVER_CONFIG} \
-        --log-level-http warning"
+        ${PREFILL_SERVER_CONFIG} "
 
     if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
         rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER))
@@ -652,15 +668,14 @@ else
     echo "Decode node rank: $RANK"
     echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}"
 
-    DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
+    DECODE_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
         --model-path ${MODEL_DIR}/${MODEL_NAME} \
         --disaggregation-mode decode \
         --disaggregation-ib-device ${IBDEVICES} \
         --host 0.0.0.0 \
         --port 8000 \
         --trust-remote-code \
-        ${DECODE_SERVER_CONFIG} \
-        --log-level-http warning"
+        ${DECODE_SERVER_CONFIG} "
 
     if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then
         rank=$((RANK % DECODE_NODES_PER_WORKER))
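Shape of the resulting prefill launch line, a sketch with illustrative values (${PREFILL_SDMA_ENV} expands to MORI_ENABLE_SDMA=true only when two-batch overlap is enabled, and is empty otherwise; the explicit --log-level-http warning is dropped since the v2 base_flags already carry --log-level warning):

    MORI_ENABLE_SDMA=true SGLANG_MORI_MOE_MAX_INPUT_TOKENS=32768 \
    SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=8192 \
    python3 -m sglang.launch_server \
        --model-path "$MODEL_DIR/$MODEL_NAME" \
        --disaggregation-mode prefill \
        --disaggregation-ib-device "$IBDEVICES" \
        --host 0.0.0.0 --port 8000 --trust-remote-code \
        $PREFILL_SERVER_CONFIG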
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 647ec35f9..bdd15795e 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2087,3 +2087,14 @@
     - "Dynamic scheduler-recv-interval: 30 for CONC>4, 10 otherwise"
     - "Remove --max-running-requests, reduce prefill/chunked from 81920 to 16384"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1018
+
+- config-keys:
+    - dsr1-fp4-mi355x-sglang-disagg
+    - dsr1-fp4-mi355x-sglang-disagg-mtp
+  description:
+    - "Bump the SGLang mori image to lmsysorg/sglang-rocm"
+    - "Add more high-throughput / low-latency sweep configs"
+    - "Enable the v2 MXFP4 DSR1-0528 model"
+    - "Enable fp4 dispatch / fp8 combine on mori"
+    - "Enable mori SDMA and two-batch overlap"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1236