From 4e9d5c57b2e6e995e1b3af10d97f0c8b749dfce0 Mon Sep 17 00:00:00 2001
From: Cameron Quilici
Date: Wed, 1 Oct 2025 14:58:26 -0500
Subject: [PATCH] remove unused benchmarks

---
 benchmarks/70b_fp4_b200_slurm.sh    | 82 ---------------------------
 benchmarks/70b_fp8_b200_slurm.sh    | 86 ----------------------------
 benchmarks/dsr1_fp4_b200_slurm.sh   | 80 --------------------------
 benchmarks/dsr1_fp8_b200_slurm.sh   | 69 ----------------------
 benchmarks/gptoss_fp4_b200_slurm.sh | 88 -----------------------------
 5 files changed, 405 deletions(-)
 delete mode 100644 benchmarks/70b_fp4_b200_slurm.sh
 delete mode 100644 benchmarks/70b_fp8_b200_slurm.sh
 delete mode 100644 benchmarks/dsr1_fp4_b200_slurm.sh
 delete mode 100644 benchmarks/dsr1_fp8_b200_slurm.sh
 delete mode 100644 benchmarks/gptoss_fp4_b200_slurm.sh

diff --git a/benchmarks/70b_fp4_b200_slurm.sh b/benchmarks/70b_fp4_b200_slurm.sh
deleted file mode 100644
index e4550e18c..000000000
--- a/benchmarks/70b_fp4_b200_slurm.sh
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/usr/bin/env bash
-
-# === Required Env Vars ===
-# HF_TOKEN
-# HF_HUB_CACHE
-# IMAGE
-# MODEL
-# ISL
-# OSL
-# MAX_MODEL_LEN
-# RANDOM_RANGE_RATIO
-# TP
-# CONC
-# RESULT_FILENAME
-# PORT_OFFSET
-
-echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
-
-hf download $MODEL
-
-SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
-PORT=$(( 8888 + $PORT_OFFSET ))
-
-pip install datasets pandas
-
-nvidia-smi
-
-pip install --upgrade --force-reinstall flashinfer-python==0.3.0post1
-
-# Calculate max-model-len based on ISL and OSL
-if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
-    CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20))
-elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then
-    CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200))
-else
-    CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
-fi
-
-cat > config.yaml << EOF
-kv-cache-dtype: fp8
-compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}'
-async-scheduling: true
-no-enable-prefix-caching: true
-max-num-batched-tokens: 8192
-max-model-len: $CALCULATED_MAX_MODEL_LEN
-EOF
-
-export TORCH_CUDA_ARCH_LIST="10.0"
-export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}'
-
-set -x
-PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
---gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \
---disable-log-requests > $SERVER_LOG 2>&1 &
-
-set +x
-while IFS= read -r line; do
-    printf '%s\n' "$line"
-    # Ignore intel_extension_for_pytorch import errors
-    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]] && [[ ! "$line" =~ "intel_extension_for_pytorch" ]]; then
"$line" =~ "intel_extension_for_pytorch" ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_b200_slurm.sh b/benchmarks/70b_fp8_b200_slurm.sh deleted file mode 100644 index a16217eac..000000000 --- a/benchmarks/70b_fp8_b200_slurm.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL - - -pip install datasets pandas - -nvidia-smi - -pip install --upgrade --force-reinstall flashinfer-python==0.3.0post1 - -FUSION_FLAG='{'\ -'"pass_config": {"enable_fi_allreduce_fusion": true, "enable_attn_fusion": true, "enable_noop": true},'\ -'"custom_ops": ["+quant_fp8", "+rms_norm"],'\ -'"cudagraph_mode": "FULL_DECODE_ONLY",'\ -'"splitting_ops": []'\ -'}' -cat > config.yaml <<-EOF -kv-cache-dtype: fp8 -compilation-config: '$FUSION_FLAG' -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $MAX_MODEL_LEN -EOF - -cat config.yaml # Debugging - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - - -export TORCH_CUDA_ARCH_LIST="10.0" -export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' - -set -x -PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=512 \ ---config config.yaml \ ---disable-log-requests > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - # Ignore intel_extension_for_pytorch import errors - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]] && [[ ! 
"$line" =~ "intel_extension_for_pytorch" ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/dsr1_fp4_b200_slurm.sh b/benchmarks/dsr1_fp4_b200_slurm.sh deleted file mode 100644 index 88d968d7d..000000000 --- a/benchmarks/dsr1_fp4_b200_slurm.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env bash - -echo "JOB $SLURM_JOB_ID running on NODE $SLURMD_NODENAME" - -huggingface-cli download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) - -set -x -PORT=$(( 8888 + $PORT_OFFSET )) - - -pip install --upgrade --force-reinstall flashinfer-python==0.3.0post1 - -# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. -if [[ $CONC -ge 16 ]]; then - SCHEDULER_RECV_INTERVAL=30 -else - SCHEDULER_RECV_INTERVAL=10 -fi - - -echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" - - -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ ---tensor-parallel-size=$TP --data-parallel-size=1 \ ---cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ ---chunked-prefill-size 16384 \ ---enable-ep-moe --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ ---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 10 \ -> $SERVER_LOG 2>&1 & - -set +x -IGNORE_PAT="Ignore import error when loading sglang.srt.models.glm4v_moe: No module named 'transformers.models.glm4v_moe'" - -while IFS= read -r line; do - printf '%s\n' "$line" - - # Skip the known benign "Ignore import error ..." 
-    if [[ "$line" == *"$IGNORE_PAT"* ]]; then
-        continue
-    fi
-
-    # Keep your original "error" trap for everything else
-    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
-        sleep 5
-        tail -n100 "$SERVER_LOG"
-        echo "JOB ${SLURM_JOB_ID:-NA} ran on NODE ${SLURMD_NODENAME:-unknown}"
-        exit 1
-    fi
-
-    # Break when server is ready
-    if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then
-        break
-    fi
-# Start tail from the beginning so we don't miss early lines
-done < <(tail -n +1 -F "$SERVER_LOG")
-
-set -x
-git clone https://github.com/kimbochen/bench_serving.git
-
-# warmup for JIT kernels
-python3 bench_serving/benchmark_serving.py \
---model $MODEL --backend vllm \
---base-url http://0.0.0.0:$PORT \
---dataset-name random \
---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
---num-prompts $CONC --max-concurrency $CONC \
---request-rate inf --ignore-eos
-
-python3 bench_serving/benchmark_serving.py \
---model $MODEL --backend vllm \
---base-url http://0.0.0.0:$PORT \
---dataset-name random \
---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \
---request-rate inf --ignore-eos \
---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
---result-dir /workspace/ \
---result-filename $RESULT_FILENAME.json
diff --git a/benchmarks/dsr1_fp8_b200_slurm.sh b/benchmarks/dsr1_fp8_b200_slurm.sh
deleted file mode 100644
index 8cced9c80..000000000
--- a/benchmarks/dsr1_fp8_b200_slurm.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/usr/bin/env bash
-
-echo "JOB $SLURM_JOB_ID running on NODE $SLURMD_NODENAME"
-
-huggingface-cli download $MODEL
-SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
-
-set -x
-PORT=$(( 8888 + $PORT_OFFSET ))
-
-pip install --upgrade --force-reinstall flashinfer-python==0.3.0post1
-
-export SGL_ENABLE_JIT_DEEPGEMM=false
-export SGLANG_ENABLE_FLASHINFER_GEMM=true
-
-# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
-if [[ $CONC -ge 16 ]]; then
-    SCHEDULER_RECV_INTERVAL=30
-else
-    SCHEDULER_RECV_INTERVAL=10
-fi
-
-set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
---tensor-parallel-size=$TP --data-parallel-size=1 \
---cuda-graph-max-bs 128 --max-running-requests 128 \
---mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 --chunked-prefill-size 32768 --max-prefill-tokens 32768 \
---enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \
---attention-backend trtllm_mla --stream-interval 30 --enable-flashinfer-trtllm-moe --quantization fp8 \
-> $SERVER_LOG 2>&1 &
-
-set +x
-IGNORE_PAT="Ignore import error when loading sglang.srt.models.glm4v_moe: No module named 'transformers.models.glm4v_moe'"
-
-while IFS= read -r line; do
-    printf '%s\n' "$line"
-
-    # Skip the known benign "Ignore import error ..." line
-    if [[ "$line" == *"$IGNORE_PAT"* ]]; then
-        continue
-    fi
-
-    # Keep your original "error" trap for everything else
-    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
-        sleep 5
-        tail -n100 "$SERVER_LOG"
-        echo "JOB ${SLURM_JOB_ID:-NA} ran on NODE ${SLURMD_NODENAME:-unknown}"
-        exit 1
-    fi
-
-    # Break when server is ready
-    if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then
-        break
-    fi
-# Start tail from the beginning so we don't miss early lines
-done < <(tail -n +1 -F "$SERVER_LOG")
-
-set -x
-git clone https://github.com/kimbochen/bench_serving.git
-python3 bench_serving/benchmark_serving.py \
---model $MODEL --backend vllm \
---base-url http://0.0.0.0:$PORT \
---dataset-name random \
---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \
---request-rate inf --ignore-eos \
---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
---result-dir /workspace/ \
---result-filename $RESULT_FILENAME.json
diff --git a/benchmarks/gptoss_fp4_b200_slurm.sh b/benchmarks/gptoss_fp4_b200_slurm.sh
deleted file mode 100644
index 702d28f02..000000000
--- a/benchmarks/gptoss_fp4_b200_slurm.sh
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/env bash
-
-# === Required Env Vars ===
-# HF_TOKEN
-# HF_HUB_CACHE
-# IMAGE
-# MODEL
-# ISL
-# OSL
-# MAX_MODEL_LEN
-# RANDOM_RANGE_RATIO
-# TP
-# CONC
-# RESULT_FILENAME
-# PORT_OFFSET
-
-echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
-
-hf download $MODEL
-
-
-pip install datasets pandas
-
-nvidia-smi
-
-# fixes race condition when downloading cubins
-pip install --upgrade --force-reinstall flashinfer-python==0.3.0post1
-
-# Calculate max-model-len based on ISL and OSL
-if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
-    CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20))
-elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then
-    CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200))
-else
-    CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
-fi
-
-cat > config.yaml << EOF
-compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+rms_norm"],"cudagraph_mode":"FULL_AND_PIECEWISE"}'
-async-scheduling: true
-no-enable-prefix-caching: true
-cuda-graph-sizes: 2048
-max-num-batched-tokens: 8192
-max-model-len: $CALCULATED_MAX_MODEL_LEN
-EOF
-
-SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
-PORT=$(( 8888 + $PORT_OFFSET ))
-
-
-export TORCH_CUDA_ARCH_LIST="10.0"
-export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}'
-export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1
-
-set -x
-
-
-PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
-    --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \
-    --disable-log-requests > $SERVER_LOG 2>&1 &
-
-set +x
-while IFS= read -r line; do
-    printf '%s\n' "$line"
-    # Ignore intel_extension_for_pytorch import errors
-    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]] && [[ ! "$line" =~ "intel_extension_for_pytorch" ]]; then
"$line" =~ "intel_extension_for_pytorch" ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json