23 changes: 23 additions & 0 deletions .github/configs/nvidia-master.yaml
@@ -2237,6 +2237,29 @@ kimik2.5-fp4-b200-vllm:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }

# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
# does not have a B300-specific recipe, so this config reuses the existing
# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
kimik2.5-fp4-b300-vllm:
image: vllm/vllm-openai:v0.19.0-cu130
model: nvidia/Kimi-K2.5-NVFP4
model-prefix: kimik2.5
runner: b300
precision: fp4
framework: vllm
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }

dsr1-fp8-b200-sglang-mtp:
image: lmsysorg/sglang:v0.5.9-cu130
model: deepseek-ai/DeepSeek-R1-0528
2 changes: 1 addition & 1 deletion benchmarks/benchmark_lib.sh
@@ -647,7 +647,7 @@ compute_eval_context_length() {
# Call directly (not in a subshell) so the export persists.
# Scripts then wire $EVAL_MAX_MODEL_LEN into whichever server variable they need.
setup_eval_context() {
EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$((ISL + OSL + 200))")
EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$((ISL + OSL + 256))")
export EVAL_MAX_MODEL_LEN
}

2 changes: 1 addition & 1 deletion benchmarks/single_node/gptoss_fp4_b200.sh
@@ -24,7 +24,7 @@ nvidia-smi
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20))
elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then
CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200))
CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 256))
else
CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
fi
2 changes: 1 addition & 1 deletion benchmarks/single_node/gptoss_fp4_h200.sh
@@ -27,7 +27,7 @@ pip install datasets pandas
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20))
elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then
CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200))
CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 256))
else
CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
fi
80 changes: 80 additions & 0 deletions benchmarks/single_node/kimik2.5_fp4_b300.sh
@@ -0,0 +1,80 @@
#!/usr/bin/env bash

# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
# does not have a B300-specific recipe, so this script reuses the existing
# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
MODEL \
TP \
CONC \
ISL \
OSL \
MAX_MODEL_LEN \
RANDOM_RANGE_RATIO \
RESULT_FILENAME

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

hf download "$MODEL"

nvidia-smi

export TORCH_CUDA_ARCH_LIST="10.0"
export PYTHONNOUSERSITE=1

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x
vllm serve $MODEL --host 0.0.0.0 --port $PORT \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.90 \
--max-model-len $MAX_MODEL_LEN \
--max-num-seqs $CONC \
--reasoning-parser kimi_k2 \
--tool-call-parser kimi_k2 \
--compilation_config.pass_config.fuse_allreduce_rms true \
--no-enable-prefix-caching \
--trust-remote-code > $SERVER_LOG 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend vllm \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts $(( CONC * 10 )) \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/ \
--trust-remote-code

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
8 changes: 8 additions & 0 deletions perf-changelog.yaml
@@ -1471,6 +1471,14 @@
- "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP8 B200 vLLM recipe as-is"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1054

- config-keys:
- kimik2.5-fp4-b300-vllm
description:
- "Add Kimi-K2.5 FP4 (NVFP4) B300 vLLM benchmark"
- "Image: vllm/vllm-openai:v0.19.0-cu130"
- "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html does not have a B300-specific recipe, so this reuses the existing Kimi-K2.5 FP4 B200 vLLM recipe as-is"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1056
Contributor

🟡 The perf-changelog.yaml entry for kimik2.5-fp4-b300-vllm has pr-link pointing to PR #1056, which was explicitly reverted by PR #1099. Since this PR (#1100) is the one that will actually land the change, the pr-link should reference https://github.com/SemiAnalysisAI/InferenceX/pull/1100 instead.

Extended reasoning...

What the bug is and how it manifests

The new perf-changelog.yaml entry for kimik2.5-fp4-b300-vllm (added in this PR) sets pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1056. PR #1056 was the original submission of this B300 config, but it was subsequently reverted by PR #1099 due to an error. The PR description for #1100 explicitly states: "This PR is a reopen of #1056, which was reverted in #1099 due to an error with the first PR." This means PR #1056 exists on GitHub in a reverted/superseded state, while PR #1100 is the change that will actually merge the configuration into main.

The specific code path that triggers it

In the diff, the new changelog block reads:

pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1056

This was carried over from the original PR #1056 submission without being updated to reflect the current PR number (#1100).

Why existing code does not prevent it

The changelog is a manually maintained YAML file with no automated validation that cross-checks pr-link values against the actual PR being merged. There is no CI check to enforce that the link matches the current PR. The error is a straightforward copy-paste oversight when reopening the PR.
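As an illustration only, a guard along the following lines could enforce this. The step below is a hypothetical sketch (no such workflow step exists in the repo today), and its placement, trigger, and file path are assumptions:

# Hypothetical GitHub Actions step: fail a pull_request run when
# perf-changelog.yaml never references the PR actually being merged.
# In practice this would be gated on the changelog file having changed.
- name: Check perf-changelog pr-link
  if: github.event_name == 'pull_request'
  run: |
    pr="${{ github.event.pull_request.number }}"
    # Heuristic only: require the current PR number to appear in some pr-link.
    if ! grep -q "pull/${pr}" perf-changelog.yaml; then
      echo "perf-changelog.yaml does not reference PR #${pr}" >&2
      exit 1
    fi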

What the impact would be

When anyone later looks up the changelog entry for kimik2.5-fp4-b300-vllm to understand when and how it was introduced, they will follow the pr-link and land on PR #1056, which GitHub shows as reverted. This is misleading — it appears the feature was reverted rather than merged. The actual merging PR (#1100) would not be linked anywhere in the changelog, making historical tracking inaccurate.

How to fix it

Change the pr-link in the new changelog entry from pull/1056 to pull/1100.
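Concretely, the corrected entry would read as below; the config-keys and description lines are copied from this PR's diff, the indentation is reconstructed, and only the pr-link value changes:

- config-keys:
    - kimik2.5-fp4-b300-vllm
  description:
    - "Add Kimi-K2.5 FP4 (NVFP4) B300 vLLM benchmark"
    - "Image: vllm/vllm-openai:v0.19.0-cu130"
    - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html does not have a B300-specific recipe, so this reuses the existing Kimi-K2.5 FP4 B200 vLLM recipe as-is"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1100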

Step-by-step proof

  1. PR #1056 ("Add B300 config: kimi-k2.5-fp4-vllm") was opened and merged, adding the kimik2.5-fp4-b300-vllm config.
  2. PR #1099 ("Revert 'Add B300 config: kimi-k2.5-fp4-vllm (#1056)' [skip-sweep]") explicitly reverted PR #1056.
  3. PR #1100 (this PR) reopens the same change; its description confirms: "This PR is a reopen of #1056, which was reverted in #1099".
  4. The new changelog entry in this PR's diff sets pr-link to /pull/1056 (the reverted PR) instead of /pull/1100 (this PR).
  5. Comparable B300 entries in the same file correctly reference their own PR numbers: minimaxm2.5-fp8-b300-vllm -> #1054, minimaxm2.5-fp4-b300-vllm -> #1055, dsr1-fp8-b300-sglang-mtp -> #1059.
  6. The pr-link should therefore be updated to https://github.com/SemiAnalysisAI/InferenceX/pull/1100.


- config-keys:
- gptoss-fp4-mi300x-vllm
description:
8 changes: 4 additions & 4 deletions utils/matrix_logic/generate_sweep_configs.py
@@ -281,7 +281,7 @@ def generate_full_sweep(args, all_config_data, runner_data):
Fields.PREFILL.value: prefill,
Fields.DECODE.value: decode,
Fields.CONC.value: conc_values, # Pass the entire list for multinode
Fields.MAX_MODEL_LEN.value: isl + osl + 200,
Fields.MAX_MODEL_LEN.value: isl + osl + 256,
Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}",
Fields.DISAGG.value: disagg,
Fields.RUN_EVAL.value: False, # Default, may be overridden by mark_eval_entries
@@ -350,7 +350,7 @@ def generate_full_sweep(args, all_config_data, runner_data):
Fields.OSL.value: osl,
Fields.TP.value: tp,
Fields.CONC.value: conc,
Fields.MAX_MODEL_LEN.value: isl + osl + 200,
Fields.MAX_MODEL_LEN.value: isl + osl + 256,
Fields.EP.value: 1, # Default
Fields.DP_ATTN.value: False, # Default
Fields.SPEC_DECODING.value: spec_decoding,
@@ -616,7 +616,7 @@ def generate_test_config_sweep(args, all_config_data):
Fields.PREFILL.value: prefill,
Fields.DECODE.value: decode,
Fields.CONC.value: conc_values,
Fields.MAX_MODEL_LEN.value: isl + osl + 200,
Fields.MAX_MODEL_LEN.value: isl + osl + 256,
Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}",
Fields.DISAGG.value: disagg,
Fields.RUN_EVAL.value: False,
@@ -664,7 +664,7 @@ def generate_test_config_sweep(args, all_config_data):
Fields.OSL.value: osl,
Fields.TP.value: tp,
Fields.CONC.value: conc,
Fields.MAX_MODEL_LEN.value: isl + osl + 200,
Fields.MAX_MODEL_LEN.value: isl + osl + 256,
Fields.EP.value: ep if ep is not None else 1,
Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False,
Fields.SPEC_DECODING.value: spec_decoding,
4 changes: 2 additions & 2 deletions utils/matrix_logic/test_generate_sweep_configs.py
@@ -643,14 +643,14 @@ def test_exp_name_format(self, sample_single_node_config, sample_runner_config,
assert all(entry["exp-name"] == "dsr1_1k1k" for entry in result)

def test_max_model_len_calculation(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node):
"""max-model-len should be isl + osl + 200."""
"""max-model-len should be isl + osl + 256."""
result = generate_full_sweep(
full_sweep_args_single_node,
sample_single_node_config,
sample_runner_config
)
for entry in result:
expected_max_model_len = entry["isl"] + entry["osl"] + 200
expected_max_model_len = entry["isl"] + entry["osl"] + 256
assert entry["max-model-len"] == expected_max_model_len

def test_runner_node_filter(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node):