From 2d47b4d22b00d4618c205d86e24a29f8a5bb25d8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 20 Apr 2026 08:46:05 -0500 Subject: [PATCH] Revert "Add B300 config: kimi-k2.5-fp4-vllm (#1056)" [skip-sweep] This reverts commit a35e536941a4ebc3322c409c59fa8541738438a5. --- .github/configs/nvidia-master.yaml | 23 ------ benchmarks/single_node/kimik2.5_fp4_b300.sh | 80 --------------------- perf-changelog.yaml | 8 --- 3 files changed, 111 deletions(-) delete mode 100755 benchmarks/single_node/kimik2.5_fp4_b300.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d6202608d..90a430b9d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2237,29 +2237,6 @@ kimik2.5-fp4-b200-vllm: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -kimik2.5-fp4-b300-vllm: - image: vllm/vllm-openai:v0.19.0-cu130 - model: nvidia/Kimi-K2.5-NVFP4 - model-prefix: kimik2.5 - runner: b300 - precision: fp4 - framework: vllm - multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } - dsr1-fp8-b200-sglang-mtp: image: lmsysorg/sglang:v0.5.9-cu130 model: deepseek-ai/DeepSeek-R1-0528 diff --git a/benchmarks/single_node/kimik2.5_fp4_b300.sh b/benchmarks/single_node/kimik2.5_fp4_b300.sh deleted file mode 100755 index ad636f6ed..000000000 --- a/benchmarks/single_node/kimik2.5_fp4_b300.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env bash - -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this script reuses the existing -# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - MAX_MODEL_LEN \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -hf download "$MODEL" - -nvidia-smi - -export TORCH_CUDA_ARCH_LIST="10.0" -export PYTHONNOUSERSITE=1 - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" -fi -# Start GPU monitoring (power, temperature, clocks every second) -start_gpu_monitor - -set -x -vllm serve $MODEL --host 0.0.0.0 --port $PORT \ ---tensor-parallel-size=$TP \ ---gpu-memory-utilization 0.90 \ ---max-model-len $MAX_MODEL_LEN \ ---max-num-seqs $CONC \ ---reasoning-parser kimi_k2 \ ---tool-call-parser kimi_k2 \ ---compilation_config.pass_config.fuse_allreduce_rms true \ ---no-enable-prefix-caching \ ---trust-remote-code > $SERVER_LOG 2>&1 & - -SERVER_PID=$! - -# Wait for server to be ready -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_benchmark_serving \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts $(( CONC * 10 )) \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --trust-remote-code - -# After throughput, run evaluation only if RUN_EVAL is true -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -# Stop GPU monitoring -stop_gpu_monitor -set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 16fea938d..d3bc645be 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1471,14 +1471,6 @@ - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP8 B200 vLLM recipe as-is" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1054 -- config-keys: - - kimik2.5-fp4-b300-vllm - description: - - "Add Kimi-K2.5 FP4 (NVFP4) B300 vLLM benchmark" - - "Image: vllm/vllm-openai:v0.19.0-cu130" - - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html does not have a B300-specific recipe, so this reuses the existing Kimi-K2.5 FP4 B200 vLLM recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1056 - - config-keys: - gptoss-fp4-mi300x-vllm description: