diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index fdf705be7..7e975fdba 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7639,7 +7639,7 @@ dsv4-fp4-gb200-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - # Three validated 8k/1k points mirrored from NVIDIA/srt-slurm + # Validated 8k/1k points mirrored from NVIDIA/srt-slurm # aflowers/vllm-gb200-v0.20.0 history. conc-list values match each # recipe's benchmark.concurrencies. @@ -7659,6 +7659,22 @@ dsv4-fp4-gb200-dynamo-vllm: ep: 1 dp-attn: false + # Low-middle curve: 1 prefill (DEP=8) + 4 decode (TP=8). 11 nodes total + # with a dedicated NATS/etcd infra node. + - conc-list: [256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + # Mid curve: 1 prefill (DEP=8) + 1 decode (DEP=8). 5 nodes total with # a dedicated NATS/etcd infra node. - conc-list: [256] @@ -7690,3 +7706,19 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 8 ep: 8 dp-attn: true + + # MegaMOE max throughput: same 3 prefill (DEP=8 each) + 1 decode (DEP=8) + # shape, but uses deep_gemm_mega_moe on both workers and disables offload. + - conc-list: [4096] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-max-tpt-megamoe.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve.yaml new file mode 100644 index 000000000..20672bfdf --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve.yaml @@ -0,0 +1,150 @@ +name: "svf-vllm-disagg-gb200-low-middle-curve" + +# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch: +# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-low-middle-curve.yaml +# +# Topology: 1 prefill (DEP=8) + 4 decode (TP=8). 11 nodes total with a +# dedicated NATS/etcd infra node. Low-middle curve points at concurrencies +# 256 and 512. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-max-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-max-tpt-megamoe.yaml new file mode 100644 index 000000000..fe3840109 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-max-tpt-megamoe.yaml @@ -0,0 +1,154 @@ +name: "svf-vllm-disagg-gb200-max-tpt-megamoe" + +# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch: +# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-max-tpt-megamoe.yaml +# +# Topology: 3 prefill (DEP=8 each) + 1 decode (DEP=8). 9 nodes total with a +# dedicated NATS/etcd infra node. MegaMOE max-throughput point at concurrency +# 4096 with no CPU/NVMe offload. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.95 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + max-model-len: 9280 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2be38aefe..2bd14e776 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1969,3 +1969,19 @@ - "Keeps the three validated 8k/1k points: low-latency 1P/1D TP8 conc=1, mid-curve 1P/1D DEP8 conc=256, and max-tpt 3P/1D DEP8 conc=4096" - "All three recipes run NATS/etcd on a dedicated infra node and use compute-node local NVMe model weights via /mnt/numa1/models/deepseek-v4-pro/" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1163 + +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "Add GB200 Dynamo vLLM MegaMOE max-throughput recipe at conc=4096" + - "Topology matches max-tpt: 3 prefill DEP8 workers and 1 decode DEP8 worker with dedicated NATS/etcd" + - "Uses deep_gemm_mega_moe on prefill/decode, TORCH_SYMMMEM=NVSHMEM, and no offload" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218 + +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "Add GB200 Dynamo vLLM low-middle curve recipe at conc=256/512" + - "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd" + - "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218