diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
new file mode 100644
index 000000000..0458cfb78
--- /dev/null
+++ b/benchmarks/benchmark_lib.sh
@@ -0,0 +1,214 @@
+#!/usr/bin/env bash
+
+# Shared benchmarking utilities for InferenceMAX
+
+# Wait for the server to become ready by polling its health endpoint
+# All parameters are required except --sleep-interval
+# Parameters:
+# --port: Server port
+# --server-log: Path to server log file
+# --server-pid: Server process ID
+# --sleep-interval: Sleep interval between health checks (optional, default: 5)
+wait_for_server_ready() {
+    set +x
+    local port=""
+    local server_log=""
+    local server_pid=""
+    local sleep_interval=5
+
+    # Parse arguments
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --port)
+                port="$2"
+                shift 2
+                ;;
+            --server-log)
+                server_log="$2"
+                shift 2
+                ;;
+            --server-pid)
+                server_pid="$2"
+                shift 2
+                ;;
+            --sleep-interval)
+                sleep_interval="$2"
+                shift 2
+                ;;
+            *)
+                echo "Unknown parameter: $1"
+                return 1
+                ;;
+        esac
+    done
+
+    # Validate required parameters
+    if [[ -z "$port" ]]; then
+        echo "Error: --port is required"
+        return 1
+    fi
+    if [[ -z "$server_log" ]]; then
+        echo "Error: --server-log is required"
+        return 1
+    fi
+    if [[ -z "$server_pid" ]]; then
+        echo "Error: --server-pid is required"
+        return 1
+    fi
+
+    # Show logs until server is ready
+    tail -f "$server_log" &
+    local TAIL_PID=$!
+    until curl --output /dev/null --silent --fail http://0.0.0.0:$port/health; do
+        if ! kill -0 "$server_pid" 2>/dev/null; then
+            echo "Server died before becoming healthy. Exiting."
+            kill $TAIL_PID
+            exit 1
+        fi
+        sleep "$sleep_interval"
+    done
+    kill $TAIL_PID
+}
+
+# Run benchmark serving with standardized parameters
+# All parameters are required
+# Parameters:
+# --model: Model name
+# --port: Server port
+# --backend: Backend type, e.g. 'vllm' or 'openai'
+# --input-len: Random input sequence length
+# --output-len: Random output sequence length
+# --random-range-ratio: Random range ratio
+# --num-prompts: Number of prompts
+# --max-concurrency: Max concurrency
+# --result-filename: Result filename without extension
+# --result-dir: Result directory
+run_benchmark_serving() {
+    set +x
+    local model=""
+    local port=""
+    local backend=""
+    local input_len=""
+    local output_len=""
+    local random_range_ratio=""
+    local num_prompts=""
+    local max_concurrency=""
+    local result_filename=""
+    local result_dir=""
+
+    # Parse arguments
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --model)
+                model="$2"
+                shift 2
+                ;;
+            --port)
+                port="$2"
+                shift 2
+                ;;
+            --backend)
+                backend="$2"
+                shift 2
+                ;;
+            --input-len)
+                input_len="$2"
+                shift 2
+                ;;
+            --output-len)
+                output_len="$2"
+                shift 2
+                ;;
+            --random-range-ratio)
+                random_range_ratio="$2"
+                shift 2
+                ;;
+            --num-prompts)
+                num_prompts="$2"
+                shift 2
+                ;;
+            --max-concurrency)
+                max_concurrency="$2"
+                shift 2
+                ;;
+            --result-filename)
+                result_filename="$2"
+                shift 2
+                ;;
+            --result-dir)
+                result_dir="$2"
+                shift 2
+                ;;
+            *)
+                echo "Unknown parameter: $1"
+                return 1
+                ;;
+        esac
+    done
+
+    # Validate all required parameters
+    if [[ -z "$model" ]]; then
+        echo "Error: --model is required"
+        return 1
+    fi
+    if [[ -z "$port" ]]; then
+        echo "Error: --port is required"
+        return 1
+    fi
+    if [[ -z "$backend" ]]; then
+        echo "Error: --backend is required"
+        return 1
+    fi
+    if [[ -z "$input_len" ]]; then
+        echo "Error: --input-len is required"
+        return 1
+    fi
+    if [[ -z "$output_len" ]]; then
+        echo "Error: --output-len is required"
+        return 1
+    fi
+    if [[ -z "$random_range_ratio" ]]; then
+        echo "Error: 
--random-range-ratio is required" + return 1 + fi + if [[ -z "$num_prompts" ]]; then + echo "Error: --num-prompts is required" + return 1 + fi + if [[ -z "$max_concurrency" ]]; then + echo "Error: --max-concurrency is required" + return 1 + fi + if [[ -z "$result_filename" ]]; then + echo "Error: --result-filename is required" + return 1 + fi + if [[ -z "$result_dir" ]]; then + echo "Error: --result-dir is required" + return 1 + fi + + # Clone benchmark serving repo + local BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX) + git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR" + + # Run benchmark + set -x + python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \ + --model "$model" \ + --backend "$backend" \ + --base-url "http://0.0.0.0:$port" \ + --dataset-name random \ + --random-input-len "$input_len" \ + --random-output-len "$output_len" \ + --random-range-ratio "$random_range_ratio" \ + --num-prompts "$num_prompts" \ + --max-concurrency "$max_concurrency" \ + --request-rate inf \ + --ignore-eos \ + --save-result \ + --percentile-metrics 'ttft,tpot,itl,e2el' \ + --result-dir "$result_dir" \ + --result-filename "$result_filename.json" + set +x +} diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index 3c8232072..4ff123a32 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -1,11 +1,25 @@ #!/usr/bin/env bash +# === Required Env Vars === +# MODEL +# PORT +# TP +# CONC +# ISL +# OSL +# RANDOM_RANGE_RATIO +# RESULT_FILENAME +# EP_SIZE +# NUM_PROMPTS + nvidia-smi # To improve CI stability, we patch this helper function to prevent a race condition that # happens 1% of the time. ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. if [[ $CONC -ge 16 ]]; then SCHEDULER_RECV_INTERVAL=30 @@ -22,5 +36,27 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0. --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 16384 \ --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ ---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 +--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
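+# $! holds the PID of the server just backgrounded above; wait_for_server_ready
+# polls it with kill -0 and aborts the job if the server dies before it is healthy.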
+ +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index 6f4f814a0..aa2be7648 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME # PORT_OFFSET # DP_ATTENTION @@ -100,24 +97,22 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -git clone https://github.com/kimbochen/bench_serving.git -set -x -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +SERVER_PID=$! + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index 4d3ed084c..ca1255802 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -1,14 +1,15 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO +# PORT # TP # CONC -# PORT +# ISL +# OSL +# RANDOM_RANGE_RATIO +# RESULT_FILENAME +# NUM_PROMPTS export SGLANG_USE_AITER=1 PREFILL_SIZE=196608 @@ -18,6 +19,8 @@ if [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then fi fi +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --host=0.0.0.0 --port=$PORT \ @@ -27,5 +30,24 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --disable-radix-cache \ --num-continuous-decode-steps=4 \ --max-prefill-tokens=$PREFILL_SIZE \ ---cuda-graph-max-bs=128 +--cuda-graph-max-bs=128 > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
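+# "$(dirname "$0")" resolves to this script's own directory, so the source call
+# below finds benchmark_lib.sh regardless of the caller's working directory.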
+ +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index b88a90f46..0983b7ddf 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL +# PORT +# TP +# CONC # ISL # OSL -# MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC -# PORT # RESULT_FILENAME export SGLANG_USE_AITER=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) @@ -34,23 +31,23 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --cuda-graph-max-bs=128 \ > $SERVER_LOG 2>&1 & -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +SERVER_PID=$! -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index 361b6f1f6..4d8a9ff18 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -1,13 +1,16 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL # PORT # TP # CONC -# MAX_MODEL_LEN +# ISL +# OSL +# RANDOM_RANGE_RATIO +# RESULT_FILENAME +# EP_SIZE +# NUM_PROMPTS nvidia-smi @@ -17,6 +20,7 @@ sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_l export SGL_ENABLE_JIT_DEEPGEMM=false export SGLANG_ENABLE_FLASHINFER_GEMM=true +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. if [[ $CONC -ge 16 ]]; then @@ -34,4 +38,26 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. 
--cuda-graph-max-bs 128 --max-running-requests 128 \ --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 --chunked-prefill-size 32768 --max-prefill-tokens 32768 \ --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ ---attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 +--attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 58d4525f1..58d24a7ed 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME # PORT_OFFSET # DP_ATTENTION @@ -70,24 +67,22 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & +SERVER_PID=$! -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" -git clone https://github.com/kimbochen/bench_serving.git -set -x -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 86ea0024f..f84d741d6 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -1,16 +1,12 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL -# MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME # PORT_OFFSET @@ -44,23 +40,22 @@ else > $SERVER_LOG 2>&1 & fi -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ 
"$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +SERVER_PID=$! -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 7b566c0ab..ac6bc167c 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME # PORT_OFFSET # DP_ATTENTION @@ -69,25 +66,23 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ --tp_size=$TP --ep_size=$EP_SIZE \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -git clone https://github.com/kimbochen/bench_serving.git -set -x -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index fca44bcf1..e92765ebb 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -1,13 +1,14 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL # PORT # TP # CONC -# MAX_MODEL_LEN +# ISL +# OSL +# RANDOM_RANGE_RATIO +# RESULT_FILENAME # Reference # https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-sglang-deepseek-r1-fp8.html#run-the-inference-benchmark @@ -24,6 +25,8 @@ fi export SGLANG_USE_AITER=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x python3 -m sglang.launch_server \ --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \ @@ -33,4 +36,24 @@ python3 -m sglang.launch_server \ --chunked-prefill-size=196608 \ --num-continuous-decode-steps=4 \ --max-prefill-tokens=196608 \ ---disable-radix-cache +--disable-radix-cache > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index 90babeaee..662f4bdfb 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -1,16 +1,12 @@ #!/usr/bin/bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL -# MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -47,23 +43,22 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +SERVER_PID=$! 
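+# Note: run_benchmark_serving passes --ignore-eos, so each request decodes its
+# full sampled output length instead of stopping early at an EOS token.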
-set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm \ ---base-url="http://0.0.0.0:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index f39a8dbbd..a8cdf566a 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -1,19 +1,22 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL # PORT # TP # CONC -# MAX_MODEL_LEN +# ISL +# OSL +# RANDOM_RANGE_RATIO +# RESULT_FILENAME # Reference # https://rocm.docs.amd.com/en/docs-7.0-docker/benchmark-docker/inference-sglang-deepseek-r1-fp8.html export SGLANG_USE_AITER=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + python3 -m sglang.launch_server \ --model-path $MODEL \ --host=0.0.0.0 \ @@ -24,5 +27,24 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.8 --disable-radix-cache \ --num-continuous-decode-steps 4 \ --max-prefill-tokens 196608 \ - --cuda-graph-max-bs 128 - + --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 09dae4dbb..fb5e07df9 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -1,5 +1,14 @@ #!/usr/bin/bash +# === Required Env Vars === +# MODEL +# TP +# CONC +# ISL +# OSL +# RANDOM_RANGE_RATIO +# RESULT_FILENAME + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) @@ -23,23 +32,22 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ > $SERVER_LOG 2>&1 & -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +SERVER_PID=$! 
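+# Server stdout and stderr both land in $SERVER_LOG; wait_for_server_ready tails
+# that file, so startup progress still appears in the job output.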
-set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh index f39a8dbbd..8c5038cee 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -1,19 +1,22 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL # PORT # TP # CONC -# MAX_MODEL_LEN +# ISL +# OSL +# RANDOM_RANGE_RATIO +# RESULT_FILENAME # Reference # https://rocm.docs.amd.com/en/docs-7.0-docker/benchmark-docker/inference-sglang-deepseek-r1-fp8.html export SGLANG_USE_AITER=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + python3 -m sglang.launch_server \ --model-path $MODEL \ --host=0.0.0.0 \ @@ -24,5 +27,35 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.8 --disable-radix-cache \ --num-continuous-decode-steps 4 \ --max-prefill-tokens 196608 \ - --cuda-graph-max-bs 128 - + --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
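+# Unlike most scripts here, NUM_PROMPTS is derived below from MODEL, OSL, and
+# CONC rather than read from the environment.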
+ +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then + if [[ "$OSL" == "8192" ]]; then + NUM_PROMPTS=$(( CONC * 20 )) + else + NUM_PROMPTS=$(( CONC * 50 )) + fi +else + NUM_PROMPTS=$(( CONC * 10 )) +fi + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ + diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index bf5d60e9c..921f08a4c 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL +# PORT +# TP +# CONC # ISL # OSL -# MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC -# PORT # RESULT_FILENAME export HF_MODULES_CACHE="/tmp/hf_modules_cache/" @@ -32,22 +29,22 @@ python3 -m sglang.launch_server \ --max-prefill-tokens 196608 \ --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 & -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +SERVER_PID=$! -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index fd6ac15c5..4fbf4f50c 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -1,18 +1,16 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# PORT +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME -# PORT_OFFSET +# NUM_PROMPTS nvidia-smi @@ -44,7 +42,31 @@ export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ ---disable-log-requests +--disable-log-requests 
> $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 349930dfb..44e9dbf4c 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME # PORT_OFFSET # DP_ATTENTION @@ -78,24 +75,22 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --extra_llm_api_options=$EXTRA_CONFIG_FILE \ > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -git clone https://github.com/kimbochen/bench_serving.git -set -x -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +SERVER_PID=$! + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index a8bb57c16..48b548e37 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -1,12 +1,15 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL -# MAX_MODEL_LEN +# PORT # TP # CONC +# ISL +# OSL +# RANDOM_RANGE_RATIO +# RESULT_FILENAME + cat > config.yaml << EOF compilation-config: '{"cudagraph_mode":"PIECEWISE"}' @@ -18,6 +21,7 @@ max-model-len: 10240 EOF export PYTHONNOUSERSITE=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ @@ -25,4 +29,26 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---disable-log-requests +--disable-log-requests > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
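+# The benchmark submits prompts at --request-rate inf, so offered load is
+# bounded only by --max-concurrency.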
+
+# Source benchmark utilities
+source "$(dirname "$0")/benchmark_lib.sh"
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $(( $CONC * 10 )) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
\ No newline at end of file
diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh
index d2819b5b3..a004f8892 100644
--- a/benchmarks/gptoss_fp4_h100_slurm.sh
+++ b/benchmarks/gptoss_fp4_h100_slurm.sh
@@ -1,18 +1,14 @@
 #!/usr/bin/env bash
 
-# === Required Env Vars ===
-# HF_TOKEN
-# HF_HUB_CACHE
-# IMAGE
+# === Required Env Vars ===
 # MODEL
+# PORT
+# TP
+# CONC
 # ISL
 # OSL
-# MAX_MODEL_LEN
 # RANDOM_RANGE_RATIO
-# TP
-# CONC
 # RESULT_FILENAME
-# PORT_OFFSET
 
 echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 
@@ -36,25 +32,24 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
 --max-num-seqs=$CONC \
 --disable-log-requests > $SERVER_LOG 2>&1 &
 
-set +x
-while IFS= read -r line; do
-    printf '%s\n' "$line"
-    if [[ "$line" == *"Application startup complete"* ]]; then
-        break
-    fi
-done < <(tail -F -n0 "$SERVER_LOG")
+SERVER_PID=$!
+
+# Source benchmark utilities
+source "$(dirname "$0")/benchmark_lib.sh"
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
 pip install -q datasets pandas
-git clone https://github.com/kimbochen/bench_serving.git
-set -x
-python3 bench_serving/benchmark_serving.py \
---model=$MODEL \
---backend=vllm \
---base-url="http://0.0.0.0:$PORT" \
---dataset-name=random \
---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \
---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \
---request-rate=inf --ignore-eos \
---save-result --percentile-metrics='ttft,tpot,itl,e2el' \
---result-dir=/workspace/ \
---result-filename=$RESULT_FILENAME.json
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $(( $CONC * 10 )) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh
index f92c60425..970b7ad35 100644
--- a/benchmarks/gptoss_fp4_h200_slurm.sh
+++ b/benchmarks/gptoss_fp4_h200_slurm.sh
@@ -1,16 +1,12 @@
 #!/usr/bin/env bash
 
-# === Required Env Vars ===
-# HF_TOKEN
-# HF_HUB_CACHE
-# IMAGE
+# === Required Env Vars ===
 # MODEL
+# TP
+# CONC
 # ISL
 # OSL
-# MAX_MODEL_LEN
 # RANDOM_RANGE_RATIO
-# TP
-# CONC
 # RESULT_FILENAME
 # PORT_OFFSET
 
@@ -48,23 +44,22 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config
 --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \
 --disable-log-requests > $SERVER_LOG 2>&1 &
 
-set +x
-while IFS= read -r line; do
-    printf '%s\n' "$line"
-    if [[ "$line" == *"Application startup complete"* ]]; then
-        break
-    fi
-done < <(tail -F -n0 "$SERVER_LOG")
+SERVER_PID=$!
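+# CONC * 10 prompts works out to roughly ten requests per concurrency slot over
+# the course of the run.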
-set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index c148a3cb7..12a6af5b7 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -1,16 +1,12 @@ #!/usr/bin/env bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL -# MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME # PORT_OFFSET # DP_ATTENTION @@ -44,27 +40,37 @@ print_iter_log: true stream_interval: 20 EOF -#mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --max_batch_size $CONC --max_num_tokens 20000 --backend pytorch --extra_llm_api_options gptoss-config.yml --ep_size=$EP_SIZE --trust_remote_code --gpus_per_node 8 --host 0.0.0.0 --port $PORT --tp_size=$TP --pp_size=1 > $SERVER_LOG 2>&1 & +mpirun -n 1 --oversubscribe --allow-run-as-root \ +trtllm-serve $MODEL \ +--max_batch_size $CONC \ +--max_num_tokens 20000 \ +--backend pytorch \ +--extra_llm_api_options gptoss-config.yml \ +--ep_size=$EP_SIZE \ +--trust_remote_code \ +--gpus_per_node 8 \ +--host 0.0.0.0 \ +--port $PORT \ +--tp_size=$TP \ +--pp_size=1 \ +> $SERVER_LOG 2>&1 & +SERVER_PID=$! 
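+# Here $! is the PID of the mpirun wrapper rather than trtllm-serve itself;
+# mpirun exits when the server process dies, so the liveness check still fires.
+# trtllm-serve speaks the OpenAI-compatible API, hence --backend openai below.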
-set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 66a8642bd..50d86b52a 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -1,13 +1,15 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL # PORT # TP # CONC +# ISL +# OSL # MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# RESULT_FILENAME # If the machine runs a MEC FW older than 177, RCCL # cannot reclaim some memory. @@ -24,6 +26,8 @@ export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -34,4 +38,24 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling +--async-scheduling > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index 0ab5a250f..a86e66b3d 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -48,23 +45,22 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +SERVER_PID=$! 
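+# Readiness is detected via the server's /health endpoint: curl --fail keeps
+# returning non-zero until the endpoint answers with a 2xx status.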
-set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 05250267f..2117f787e 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -1,13 +1,15 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL # PORT # TP # CONC +# ISL +# OSL # MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# RESULT_FILENAME # If the machine runs a MEC FW older than 177, RCCL # cannot reclaim some memory. @@ -23,6 +25,8 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -33,4 +37,24 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling +--async-scheduling > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index cab549cbc..56c7651ed 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/bash -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE +# === Required Env Vars === # MODEL +# TP +# CONC # ISL # OSL # MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC # RESULT_FILENAME @@ -48,23 +45,22 @@ vllm serve $MODEL --port $PORT \ --async-scheduling \ > $SERVER_LOG 2>&1 & -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") +SERVER_PID=$! 
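+# Reported percentile metrics: ttft (time to first token), tpot (time per
+# output token), itl (inter-token latency), and e2el (end-to-end latency).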
-set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 103e77fe3..68fc59f8c 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -1,13 +1,16 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL # PORT # TP # CONC +# ISL +# OSL # MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# RESULT_FILENAME +# NUM_PROMPTS cat > config.yaml << EOF compilation-config: '{"compile_sizes":[1,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,256,512,1024,2048,8192] , "cudagraph_capture_sizes":[1,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,122,124,126,128,136,144,152,160,168,176,184,192,200,208,216,224,232,240,248,256,264,272,280,288,296,304,312,320,328,336,344,352,360,368,376,384,392,400,408,416,424,432,440,448,456,464,472,480,488,496,504,512,520,528,536,544,552,560,568,576,584,592,600,608,616,624,632,640,648,656,664,672,680,688,696,704,712,720,728,736,744,752,760,768,776,784,792,800,808,816,824,832,840,848,856,864,872,880,888,896,904,912,920,928,936,944,952,960,968,976,984,992,1000,1008,1016,1024,2048,4096,8192] , "cudagraph_mode": "FULL_AND_PIECEWISE"}' @@ -20,6 +23,8 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -30,4 +35,24 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling +--async-scheduling > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
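+# Once the health check passes, wait_for_server_ready kills its tail -f so the
+# benchmark output below is not interleaved with server logs.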
+ +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$NUM_PROMPTS" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 657bc1fdf..342b7dde3 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -1,16 +1,13 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE +# === Required Env Vars === # MODEL +# PORT +# TP +# CONC # ISL # OSL -# MAX_MODEL_LEN # RANDOM_RANGE_RATIO -# TP -# CONC -# PORT # RESULT_FILENAME SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) @@ -38,22 +35,22 @@ vllm serve $MODEL --port $PORT \ --disable-log-requests \ --async-scheduling > $SERVER_LOG 2>&1 & -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json +SERVER_PID=$! + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index 9a3dfa909..44392e3aa 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -12,4 +12,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \ --no-container-mount-home --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1} \ -bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh +bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh \ No newline at end of file diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index 21a10d48f..c5216b006 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -25,55 +25,27 @@ set -x # Disabling it can reduce perf but will improve CI stability. i.e. we won't see vLLM/Sglang crashes. 
# Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register - -docker run --rm -d --init --network host --name $server_name \ ---runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ --v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ --e NCCL_GRAPH_REGISTER=0 \ --e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ ---entrypoint=/bin/bash \ -$(echo "$IMAGE" | sed 's/#/\//') \ -benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - - if [[ "$MODEL" == "nvidia/DeepSeek-R1-0528-FP4" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then if [[ "$OSL" == "8192" ]]; then - NUM_PROMPTS=$(( CONC * 20 )) + export NUM_PROMPTS=$(( CONC * 20 )) else - NUM_PROMPTS=$(( CONC * 50 )) + export NUM_PROMPTS=$(( CONC * 50 )) fi else - NUM_PROMPTS=$(( CONC * 10 )) + export NUM_PROMPTS=$(( CONC * 10 )) fi -set -x -docker run --rm --network host --name $client_name \ +docker run --rm --init --network host --name $server_name \ +--runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ +-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ +-e NCCL_GRAPH_REGISTER=0 \ +-e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ --lc "pip install -q datasets pandas && \ -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $NUM_PROMPTS \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json" +benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" # Try graceful first docker stop -t 90 "$server_name" || true diff --git a/runners/launch_b200-tg.sh b/runners/launch_b200-tg.sh deleted file mode 100644 index 9f313396c..000000000 --- a/runners/launch_b200-tg.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/bash - -HF_HUB_CACHE_MOUNT="/dev/shm/hf_hub_cache/" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') -PORT=8888 - -server_name="bmk-server" -client_name="bmk-client" - -set -x -docker run --rm -d --network host --name $server_name \ ---runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ --v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ --e 
TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ ---entrypoint=/bin/bash \ -$(echo "$IMAGE" | sed 's/#/\//') \ -benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network host --name $client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=/bin/bash \ -$(echo "$IMAGE" | sed 's/#/\//') \ --lc "pip install -q datasets pandas && \ -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json" - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - sleep 5 -done diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 47b350128..d1ddc26de 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -4,51 +4,14 @@ HF_HUB_CACHE_MOUNT="/home/ubuntu/hf_hub_cache/" PORT=8888 server_name="bmk-server" -client_name="bmk-client" set -x -docker run --rm -d --network=host --name=$server_name \ +docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT \ --e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -if ! docker ps --format "{{.Names}}" | grep -q "$server_name"; then - echo "Server container launch failed." 
diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh
index 47b350128..d1ddc26de 100644
--- a/runners/launch_h100-cr.sh
+++ b/runners/launch_h100-cr.sh
@@ -4,51 +4,14 @@
 HF_HUB_CACHE_MOUNT="/home/ubuntu/hf_hub_cache/"
 PORT=8888
 
 server_name="bmk-server"
-client_name="bmk-client"
 
 set -x
-docker run --rm -d --network=host --name=$server_name \
+docker run --rm --network=host --name=$server_name \
 --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
 -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
 -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
--e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT \
--e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
+-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \
+-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
 --entrypoint=/bin/bash \
 $IMAGE \
 benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh"
-
-set +x
-while IFS= read -r line; do
-    printf '%s\n' "$line"
-    if [[ "$line" =~ Application\ startup\ complete ]]; then
-        break
-    fi
-done < <(docker logs -f --tail=0 $server_name 2>&1)
-
-if ! docker ps --format "{{.Names}}" | grep -q "$server_name"; then
-    echo "Server container launch failed."
-    exit 1
-fi
-
-git clone https://github.com/kimbochen/bench_serving.git
-
-set -x
-docker run --rm --network=host --name=$client_name \
--v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
--e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \
---entrypoint=/bin/bash \
-$IMAGE \
--lc "pip install -q datasets pandas && \
-python3 bench_serving/benchmark_serving.py \
---model=$MODEL \
---backend=vllm \
---base-url=\"http://localhost:$PORT\" \
---dataset-name=random \
---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \
---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \
---request-rate=inf --ignore-eos \
---save-result --percentile-metrics='ttft,tpot,itl,e2el' \
---result-dir=/workspace/ \
---result-filename=$RESULT_FILENAME.json"
-
-docker stop $server_name
diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh
index 51e059d4c..780e5a2f0 100644
--- a/runners/launch_mi300x-amd.sh
+++ b/runners/launch_mi300x-amd.sh
@@ -5,52 +5,16 @@
 sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
 
 HF_HUB_CACHE_MOUNT="/shareddata/hf_hub_cache_$(hostname)/"
 PORT=8888
 
-network_name="bmk-net"
 server_name="bmk-server"
-client_name="bmk-client"
-
-docker network create $network_name
 
 set -x
-docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \
+docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \
 --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \
 --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
 -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
 -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
 -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \
--e ISL -e OSL \
+-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \
 --entrypoint=/bin/bash \
 $IMAGE \
 benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh"
-
-set +x
-while IFS= read -r line; do
-    printf '%s\n' "$line"
-    if [[ "$line" =~ Application\ startup\ complete ]]; then
-        break
-    fi
-done < <(docker logs -f --tail=0 $server_name 2>&1)
-
-git clone https://github.com/kimbochen/bench_serving.git
-
-set -x
-docker run --rm --network=$network_name --name=$client_name \
--v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
--e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \
---entrypoint=python3 \
-$IMAGE \
-bench_serving/benchmark_serving.py \
---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \
---dataset-name=random \
---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \
---num-prompts=$(( $CONC * 10 )) \
---max-concurrency=$CONC \
---request-rate=inf --ignore-eos \
---save-result --percentile-metrics="ttft,tpot,itl,e2el" \
---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json
-
-while [ -n "$(docker ps -aq)" ]; do
-    docker stop $server_name
-    docker network rm $network_name
-    sleep 5
-done
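With the client container gone, the user-defined bridge network (docker network create bmk-net) and its name-based addressing have no remaining purpose, which is why the mi300x launchers move to host networking. Under the old setup the client reached the server at http://$server_name:$PORT through Docker's embedded DNS; with --network=host every process shares the host's network stack and plain localhost works. A quick reachability check under that assumption:

# Any process on the host, or in a --network=host container, can probe the server directly.
curl --silent --fail "http://localhost:${PORT}/health" >/dev/null && echo "server reachable"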
diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh
index 48be17610..8fbdaee63 100644
--- a/runners/launch_mi300x-cr.sh
+++ b/runners/launch_mi300x-cr.sh
@@ -5,52 +5,16 @@
 sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
 
 HF_HUB_CACHE_MOUNT="/mnt/vdb/gha_cache/hf_hub_cache/"
 PORT=8888
 
-network_name="bmk-net"
 server_name="bmk-server"
-client_name="bmk-client"
-
-docker network create $network_name
 
 set -x
-docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \
+docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \
 --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \
 --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
 -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
 -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
 -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \
--e ISL -e OSL \
+-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \
 --entrypoint=/bin/bash \
 $IMAGE \
 benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh"
-
-set +x
-while IFS= read -r line; do
-    printf '%s\n' "$line"
-    if [[ "$line" =~ Application\ startup\ complete ]]; then
-        break
-    fi
-done < <(docker logs -f --tail=0 $server_name 2>&1)
-
-git clone https://github.com/kimbochen/bench_serving.git
-
-set -x
-docker run --rm --network=$network_name --name=$client_name \
--v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
--e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \
---entrypoint=python3 \
-$IMAGE \
-bench_serving/benchmark_serving.py \
---model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \
---dataset-name=random \
---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \
---num-prompts=$(( $CONC * 10 )) \
---max-concurrency=$CONC \
---request-rate=inf --ignore-eos \
---save-result --percentile-metrics="ttft,tpot,itl,e2el" \
---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json
-
-while [ -n "$(docker ps -aq)" ]; do
-    docker stop $server_name
-    docker network rm $network_name
-    sleep 5
-done
diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh
index b622ee2e8..1065167d7 100644
--- a/runners/launch_mi325x-amd.sh
+++ b/runners/launch_mi325x-amd.sh
@@ -21,4 +21,4 @@
 srun --jobid=$JOB_ID \
     --no-container-entrypoint --export=ALL \
     bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_mi325x_slurm.sh
 
-scancel $JOB_ID
\ No newline at end of file
+scancel $JOB_ID
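The mi355x hunk below completes the same consolidation: RANDOM_RANGE_RATIO, RESULT_FILENAME, and the exported NUM_PROMPTS are forwarded into the single server container, and the in-container benchmarks/*_docker.sh is expected to run the client side itself. A plausible sketch of what such a script might do with the forwarded variables, assuming it sources the shared helpers from benchmarks/benchmark_lib.sh; launch_server is a hypothetical stand-in for the real per-hardware server command:

# Inside the container: start the server, wait for /health, then benchmark.
source benchmarks/benchmark_lib.sh

launch_server > server.log 2>&1 &   # hypothetical; each *_docker.sh has its own launch line
SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log server.log --server-pid "$SERVER_PID"

run_benchmark_serving \
    --model "$MODEL" --port "$PORT" --backend vllm \
    --input-len "$ISL" --output-len "$OSL" --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts "$NUM_PROMPTS" --max-concurrency "$CONC" \
    --result-filename "$RESULT_FILENAME" --result-dir /workspace/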
diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh
index 87ee8cbd2..5f3cbb290 100644
--- a/runners/launch_mi355x-amd.sh
+++ b/runners/launch_mi355x-amd.sh
@@ -17,68 +17,31 @@
 HF_HUB_CACHE_MOUNT="/nfsdata/hf_hub_cache-1/" # Temp solution
 PORT=8888
 
-network_name="bmk-net"
 server_name="bmk-server"
-client_name="bmk-client"
-
-docker network create $network_name
-
-set -x
-docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \
---privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \
---cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
--v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
--e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \
--e ISL -e OSL \
---entrypoint=/bin/bash \
-$IMAGE \
-benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh"
-
-set +x
-while IFS= read -r line; do
-    printf '%s\n' "$line"
-    if [[ "$line" =~ Application\ startup\ complete ]]; then
-        break
-    fi
-done < <(docker logs -f --tail=0 $server_name 2>&1)
 
 if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then
     if [[ "$OSL" == "8192" ]]; then
-        NUM_PROMPTS=$(( CONC * 20 ))
+        export NUM_PROMPTS=$(( CONC * 20 ))
     else
-        NUM_PROMPTS=$(( CONC * 50 ))
+        export NUM_PROMPTS=$(( CONC * 50 ))
     fi
 else
-    NUM_PROMPTS=$(( CONC * 10 ))
+    export NUM_PROMPTS=$(( CONC * 10 ))
 fi
 
-git clone https://github.com/kimbochen/bench_serving.git
-
 set -x
-docker run --rm --network=$network_name --name=$client_name \
+docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \
+--privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \
+--cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
+-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
 -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
--e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \
---entrypoint=python3 \
+-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT -e NUM_PROMPTS \
+-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \
+--entrypoint=/bin/bash \
 $IMAGE \
-bench_serving/benchmark_serving.py \
---model=$MODEL --backend=vllm --base-url="http://$server_name:$PORT" \
---dataset-name=random \
---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \
---num-prompts=$NUM_PROMPTS \
---max-concurrency=$CONC \
---request-rate=inf --ignore-eos \
---save-result --percentile-metrics="ttft,tpot,itl,e2el" \
---result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json
+benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x_docker.sh"
 
 if ls gpucore.* 1> /dev/null 2>&1; then
     echo "gpucore files exist. not good"
     rm -f gpucore.*
 fi
-
-
-while [ -n "$(docker ps -aq)" ]; do
-    docker stop $server_name
-    docker network rm $network_name
-    sleep 5
-done
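A closing note on the gpucore.* sweep kept in the mi355x launcher: it clears ROCm GPU core dumps that a crashed server can leave in the workspace, and since the docker run above now executes in the foreground, the sweep runs once the container exits. If the runner scripts ever adopt set -e, wrapping the cleanup in a trap would keep it unconditional; a sketch, not part of this change:

# Run cleanup on every exit path, success or failure.
cleanup() {
    docker stop -t 90 "$server_name" 2>/dev/null || true
    rm -f gpucore.*
}
trap cleanup EXIT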