diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 337047e57..17614991d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -315,28 +315,6 @@ kimik2.5-int4-mi355x-vllm: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } -kimik2.5-int4-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.16.0 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - runner: mi325x - precision: int4 - framework: vllm - multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - kimik2.5-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.16.0 model: amd/Kimi-K2.5-MXFP4 diff --git a/benchmarks/single_node/kimik2.5_int4_mi325x.sh b/benchmarks/single_node/kimik2.5_int4_mi325x.sh deleted file mode 100755 index a607b4492..000000000 --- a/benchmarks/single_node/kimik2.5_int4_mi325x.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - MAX_MODEL_LEN \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -hf download "$MODEL" - -# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ -if [ -n "$ROCR_VISIBLE_DEVICES" ]; then - export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" -fi - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -# following AMD andy luo's recipe -# https://x.com/linluo77/status/2017024513595301985 -set -x -vllm serve $MODEL --port $PORT \ ---tensor-parallel-size=$TP \ ---gpu-memory-utilization 0.95 \ ---max-model-len $MAX_MODEL_LEN \ ---block-size=64 \ ---disable-log-requests \ ---trust-remote-code \ ---mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & - -SERVER_PID=$! - -# Wait for server to be ready -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -run_benchmark_serving \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --trust-remote-code - -# After throughput, run evaluation only if RUN_EVAL is true -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC - append_lm_eval_summary -fi -set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c19ddbd1a..e6a85927f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,10 +1,3 @@ -- config-keys: - - kimik2.5-int4-mi325x-vllm - description: - - "Add Kimi K2.5 INT4 single-node MI325X vLLM benchmark (TP8)" - - "Uses vLLM ROCm v0.16.0 image following AMD Andy Luo's recipe" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/857 - - config-keys: - 70b-fp8-*-vllm description: