SemiAnalysisAI · Oseltamivir · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026
diff --git a/...ks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/...ks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -1,123 +1,203 @@
-name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16"
-
-# High-concurrency 4096/8192 topology. Same TP=8 + DP-attn + no-EP
-# shape as the 1p1d siblings — see ./disagg-gb200-1p1d-dep8-tep8.yaml
-# header for the full constraint chain.
-#
-# Both EP backends available upstream (deepep, flashinfer) are dead on
-# this image:
-#   * deepep — mxfp4_deepseek.py:347 reads dispatch_output.topk_output;
-#     neither DeepEPNormalDispatchOutput nor DeepEPLLDispatchOutput
-#     exposes that field in this fork.
-#   * flashinfer — `_handle_a2a_moe` in server_args.py asserts
-#     "Flashinfer MoE A2A is only supported with flashinfer_cutlass
-#     moe runner backend", and flashinfer_cutlass is FP8-only — won't
-#     load DSV4-Pro's MXFP4 weights.
-# Adds prefill capacity (3 workers vs 1) for the high-conc tail —
-# single prefill saturates around conc 4096 at 1k prompts.
-#
-# Topology: 3 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 8 nodes.
+name: "dsv4-pro-gb300-fp4"
 
-model:
-  path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
-  precision: "fp4"
+slurm:
+  partition: hpc-mid
+  time_limit: "03:00:00"
+
+sbatch_directives:
+  cpus-per-task: "144"
+  mem: "0"
 
-# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale.
 dynamo:
-  hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b
-  install: true
+  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
 
-slurm:
-  time_limit: "8:00:00"
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 8
+  nginx_container: /mnt/home/yangminl/containers/nginx-1.27.4.sqsh
 
-health_check:
-  max_attempts: 1440
-  interval_seconds: 10
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "fp4"
 
 resources:
-  gpu_type: "gb200"
+  gpu_type: "gb300"
   gpus_per_node: 4
-  prefill_nodes: 6
-  decode_nodes: 2
-  prefill_workers: 3
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 8
+  # prefill_nodes / prefill_workers / decode_nodes / decode_workers are
+  # set per-override; not duplicated in base.
 
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
+extra_mount:
+  - "/mnt/home/yangminl/sglang-patched/sglang:/sgl-workspace/sglang"
+  - "/mnt/home/yangminl/sglang-patched/sglang:/workspace/sglang"
+
+# setup_script: "install_sglang.sh"
 
 backend:
   type: sglang
 
   prefill_environment:
+    # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests
     PYTHONUNBUFFERED: "1"
     SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
 
   decode_environment:
+    # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests
     PYTHONUNBUFFERED: "1"
     SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
+    # is single-node only and corrupts results in 2-node decode setups.
 
   sglang_config:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
       trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 30 # pr50 sets it, let's do it
+      # tokenizer-worker-num: 16  # need this if we run tokenizer
+
+      # Parallel
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+
       enable-dp-attention: true
-      dp-size: 8
-      moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
-      disable-flashinfer-autotune: true
-      disable-radix-cache: true
-      mem-fraction-static: 0.82
-      context-length: 3072
-      max-running-requests: 16
-      stream-interval: 50
-      decode-log-interval: 1000
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
       disaggregation-mode: "prefill"
-      disaggregation-bootstrap-port: 30001
-      disaggregation-transfer-backend: nixl
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.90
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      chunked-prefill-size: 32768
+      # disable-radix-cache: true # NOTE try to enable radix cache
 
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
       trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
-      enable-dp-attention: true
-      dp-size: 8
-      moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
-      disable-flashinfer-autotune: true
-      disable-radix-cache: true
-      mem-fraction-static: 0.82
-      context-length: 3072
-      max-running-requests: 1024
-      cuda-graph-max-bs: 1024
-      stream-interval: 50
-      decode-log-interval: 1000
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 30 # pr50 sets it, let's do it
+      # tokenizer-worker-num: 16  # need this if we run tokenizer
+      # disable-radix-cache: true # NOTE try to enable radix cache
+
       disaggregation-mode: "decode"
-      disaggregation-bootstrap-port: 30001
-      disaggregation-transfer-backend: nixl
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4096x8192"
-  req_rate: "inf"
-  use_chat_template: false
+      disaggregation-transfer-backend: mooncake
+
+      # tensor-parallel-size / data-parallel-size / expert-parallel-size
+      # / max-running-requests / cuda-graph-max-bs are set per-override.
+
+      mem-fraction-static: 0.94
+      swa-full-tokens-ratio: 0.15
+      context-length: 16384
+
+benchmark:  # top-level section, sibling of backend (was mis-nested under backend)
+  type: custom
+  command: |
+    set -e
+    REPO=/configs/upstream-sa-bench/InferenceX
+    [ -d "$REPO" ] || git clone https://github.com/fzyzcjy/InferenceX.git "$REPO"
+    cd "$REPO/utils/bench_serving"
+    python3 benchmark_serving.py \
+      --backend sglang --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \
+      --host 127.0.0.1 --port 8000 --endpoint /v1/completions \
+      --dataset-name random \
+      --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \
+      --random-num-workers 96 \
+      --num-prompts 40960 --max-concurrency 4096 --request-rate 48 \
+      --num-warmups 512 \
+      --ignore-eos --trust-remote-code \
+      --percentile-metrics ttft,tpot,itl,e2el \
+      --save-result --result-dir /logs --result-filename results.json
+  # concurrencies set per-override
+
+############ 1k1k ##############
+# In each list below, [0] is the wide-EP variant and [1] is the narrow-EP variant.
+zip_override_1k1k_hightpt:
+  resources:
+    prefill_nodes:   [7, 1]
+    prefill_workers: [7, 1]
+    decode_nodes:    [2, 2]
+    decode_workers:  [1, 1]
+  backend:
+    sglang_config:
+      decode:
+        tensor-parallel-size:     [8, 8] # NOTE change from 16gpu to 8gpu
+        data-parallel-size:       [8, 8] # NOTE change from 16gpu to 8gpu
+        expert-parallel-size:     [8, 8] # NOTE change from 16gpu to 8gpu
+
+        enable-dp-attention: true
+        enable-dp-lm-head: true
+
+        # ep-num-redundant-experts + ep-dispatch-algorithm intentionally
+        # removed: no static dispatching file available yet.
+
+        moe-a2a-backend: "deepep"
+        deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+        max-running-requests: [9216, 256] # NOTE change from 16gpu to 8gpu
+        cuda-graph-max-bs:    [1152,  32]
+
+        # benchmark:
+        # isl: 1024
+        # osl: 1024
+        # concurrencies: "16384"