# Patch overview (text ahead of the first "diff --git" header is skipped by git apply / patch):
# - .github/configs/nvidia-master.yaml: adds the dsv4-fp4-gb200-dynamo-vllm-mtp2 entry with four
#   8k/1k search-space points (conc 1, 16/32/64, 128, 1024), each naming a recipe via CONFIG_FILE.
# - benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/: adds the four *-mtp2.yaml
#   recipes those points name.
# - perf-changelog.yaml: records the new config key.
# NOTE(review): leading indentation below was re-derived from the YAML structure; confirm the
#   context lines against the target files before applying.
# NOTE(review): in disagg-gb200-low-latency-mtp2.yaml the UCX 'n'/'y' values are quoted so that a
#   YAML 1.1 loader cannot read them as booleans (the other recipes here already quote them);
#   the blob id on that file's index line was not recomputed.
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 15df0cc5f..56521e394 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7749,6 +7749,87 @@ dsv4-fp4-gb200-dynamo-vllm:
         ep: 8
         dp-attn: true
 
+# MTP2 variant of dsv4-fp4-gb200-dynamo-vllm. Uses the vLLM nightly image
+# and hand-picked 8k/1k Pareto points mirrored from NVIDIA/srt-slurm.
+dsv4-fp4-gb200-dynamo-vllm-mtp2:
+  image: vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: gb200
+  precision: fp4
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # Aggregate low latency: TP=8, max-num-seqs=4.
+    - conc-list: [1]
+      spec-decoding: mtp
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-low-latency-mtp2.yaml"
+      decode:
+        num-worker: 0
+        tp: 8
+        ep: 1
+        dp-attn: false
+
+    # Low-latency bridge: 1 prefill (DEP=8) + 4 decode (TP=8), no offload.
+    - conc-list: [16, 32, 64]
+      spec-decoding: mtp
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml"
+      decode:
+        num-worker: 4
+        tp: 8
+        ep: 1
+        dp-attn: false
+
+    # MegaMOE mid curve: 1 prefill (DEP=8) + 1 decode (DEP=8).
+    # 5 nodes total with a dedicated NATS/etcd infra node.
+    - conc-list: [128]
+      spec-decoding: mtp
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+
+    # MegaMOE high throughput: 2 prefill (DEP=8 each) + 1 decode (DEP=8).
+    # 7 nodes total with a dedicated NATS/etcd infra node.
+    - conc-list: [1024]
+      spec-decoding: mtp
+      prefill:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+
 dsv4-fp4-gb300-dynamo-sglang:
   image: lmsysorg/sglang:deepseek-v4-grace-blackwell
   model: deepseek-ai/DeepSeek-V4-Pro
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-low-latency-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-low-latency-mtp2.yaml
new file mode 100644
index 000000000..a5c3877d1
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-low-latency-mtp2.yaml
@@ -0,0 +1,86 @@
+name: "svf-vllm-agg-gb200-low-latency-mtp2"
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+  gpus_per_agg: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: NVL
+  vllm_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 8
+      pipeline-parallel-size: 1
+      speculative-config: '{"method":"mtp","num_speculative_tokens":2}'
+      compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
+      attention-config: '{"use_fp4_indexer_cache":true}'
+      tokenizer-mode: deepseek_v4
+      max-model-len: 9280
+      max-num-seqs: 4
+      max-num-batched-tokens: 8192
+      max-cudagraph-capture-size: 4
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      block-size: 256
+      gpu-memory-utilization: 0.9
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1"
+  req_rate: "inf"
+  use_chat_template: true
+  custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"
+
+identity:
+  model:
+    repo: "deepseek-ai/DeepSeek-V4-Pro"
+    revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
+  container:
+    image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f"
+  frameworks:
+    dynamo: "1.2.0.dev20260426"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml
new file mode 100644
index 000000000..1464135e5
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml
@@ -0,0 +1,145 @@
+name: "svf-vllm-disagg-gb200-high-tpt-megamoe-mtp2"
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 4
+  decode_nodes: 2
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+
+infra:
+  etcd_nats_dedicated_node: true
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
+    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
+    # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: NVL
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: NVL
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enable-ep-weight-filter: true
+      moe-backend: deep_gemm_mega_moe
+      enforce-eager: true
+      speculative-config: '{"method":"mtp","num_speculative_tokens":2}'
+      attention-config: '{"use_fp4_indexer_cache":true}'
+      max-model-len: 9280
+      max-num-seqs: 16
+      max-num-batched-tokens: 32768
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.94
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      numa-bind: true
+      tokenizer-mode: deepseek_v4
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enable-ep-weight-filter: true
+      moe-backend: deep_gemm_mega_moe
+      speculative-config: '{"method":"mtp","num_speculative_tokens":2}'
+      attention-config: '{"use_fp4_indexer_cache":true}'
+      max-model-len: 9280
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 1024
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1024"
+  req_rate: "inf"
+  use_chat_template: true
+  custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"
+
+identity:
+  model:
+    repo: "deepseek-ai/DeepSeek-V4-Pro"
+    revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
+  container:
+    image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f"
+  frameworks:
+    dynamo: "1.2.0.dev20260426"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml
new file mode 100644
index 000000000..016f03755
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml
@@ -0,0 +1,131 @@
+name: svf-vllm-disagg-gb200-low-latency-mtp2
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: gb200
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 8
+  prefill_workers: 1
+  decode_workers: 4
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+infra:
+  etcd_nats_dedicated_node: true
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: '3600'
+    TILELANG_CLEANUP_TEMP_FILES: '1'
+    VLLM_USE_NCCL_SYMM_MEM: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_NVLS_ENABLE: '1'
+    VLLM_SERVER_DEV_MODE: '1'
+    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: '1024'
+    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: '2048'
+    UCX_MEMTYPE_CACHE: 'n'
+    UCX_MEMTYPE_REG_WHOLE: 'n'
+    UCX_TLS: cuda_copy,cuda_ipc,tcp
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'y'
+    NCCL_P2P_LEVEL: NVL
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: '3600'
+    TILELANG_CLEANUP_TEMP_FILES: '1'
+    VLLM_USE_NCCL_SYMM_MEM: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_NVLS_ENABLE: '1'
+    VLLM_SERVER_DEV_MODE: '1'
+    UCX_MEMTYPE_CACHE: 'n'
+    UCX_MEMTYPE_REG_WHOLE: 'n'
+    UCX_TLS: cuda_copy,cuda_ipc,tcp
+    UCX_CUDA_IPC_ENABLE_MNNVL: 'y'
+    NCCL_P2P_LEVEL: NVL
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: deepseek-ai/DeepSeek-V4-Pro
+      kv-cache-dtype: fp8
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-hybrid-lb: true
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      speculative-config: '{"method":"mtp","num_speculative_tokens":2}'
+      attention-config: '{"use_fp4_indexer_cache":true}'
+      max-model-len: 9280
+      max-num-seqs: 8
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      numa-bind: true
+      tokenizer-mode: deepseek_v4
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: deepseek-ai/DeepSeek-V4-Pro
+      kv-cache-dtype: fp8
+      tensor-parallel-size: 8
+      pipeline-parallel-size: 1
+      speculative-config: '{"method":"mtp","num_speculative_tokens":2}'
+      attention-config: '{"use_fp4_indexer_cache":true}'
+      max-model-len: 9280
+      max-num-seqs: 256
+      max-cudagraph-capture-size: 256
+      max-num-batched-tokens: 256
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+benchmark:
+  type: sa-bench
+  isl: 8192
+  osl: 1024
+  concurrencies: 16x32x64
+  req_rate: inf
+  use_chat_template: true
+  custom_tokenizer: sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer
+
+identity:
+  model:
+    repo: "deepseek-ai/DeepSeek-V4-Pro"
+    revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
+  container:
+    image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f"
+  frameworks:
+    dynamo: "1.2.0.dev20260426"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml
new file mode 100644
index 000000000..001101525
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml
@@ -0,0 +1,145 @@
+name: "svf-vllm-disagg-gb200-mid-curve-megamoe-mtp2"
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+
+infra:
+  etcd_nats_dedicated_node: true
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
+    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: NVL
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: NVL
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-hybrid-lb: true
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enable-ep-weight-filter: true
+      moe-backend: deep_gemm_mega_moe
+      enforce-eager: true
+      speculative-config: '{"method":"mtp","num_speculative_tokens":2}'
+      attention-config: '{"use_fp4_indexer_cache":true}'
+      max-model-len: 9280
+      max-num-seqs: 8
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      numa-bind: true
+      tokenizer-mode: deepseek_v4
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-hybrid-lb: true
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enable-ep-weight-filter: true
+      moe-backend: deep_gemm_mega_moe
+      speculative-config: '{"method":"mtp","num_speculative_tokens":2}'
+      attention-config: '{"use_fp4_indexer_cache":true}'
+      max-model-len: 9280
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 512
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "128"
+  req_rate: "inf"
+  use_chat_template: true
+  custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"
+
+identity:
+  model:
+    repo: "deepseek-ai/DeepSeek-V4-Pro"
+    revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
+  container:
+    image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f"
+  frameworks:
+    dynamo: "1.2.0.dev20260426"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 0e13cb570..2dfcda9fe 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2056,3 +2056,10 @@
     - "Add --gpu-memory-utilization 0.9 to server launch"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1126
 
+- config-keys:
+    - dsv4-fp4-gb200-dynamo-vllm-mtp2
+  description:
+    - "Add final DeepSeek-V4-Pro FP4 GB200 Dynamo vLLM MTP2 Pareto recipes using vLLM nightly image"
+    - "Recipes cover 8k/1k aggregate TP8 low-latency conc=1, low-latency bridge 1P DEP8 + 4D TP8 no-offload conc=16/32/64, mid 1P/1D DEP8 MegaMOE conc=128, and high-throughput 2P/1D DEP8 MegaMOE conc=1024"
+    - "All recipes enable FP4 indexer cache and speculative-config mtp with num_speculative_tokens=2"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1242