Merged

Commits (61)
f4c7925
initial poc
cquil11 Nov 12, 2025
d068894
remove -d flag when launching docker container
cquil11 Nov 12, 2025
0a90d87
syntax error
cquil11 Nov 12, 2025
79c49cf
compatibility fixes
cquil11 Nov 12, 2025
de78e18
add correct endpoint prefix
cquil11 Nov 12, 2025
6a38bfb
remove reference env var
cquil11 Nov 12, 2025
f2e40ee
run vllm serve in background
cquil11 Nov 12, 2025
272c81d
unescape sequences
cquil11 Nov 12, 2025
4d27a9d
stop vllm to stdout after it stops
cquil11 Nov 12, 2025
5f549dc
stop vllm to stdout after it stops pt 2
cquil11 Nov 12, 2025
a3e7064
get rid of docker stop as no longer in detatched
cquil11 Nov 12, 2025
cabe362
clone bench serving to tmp dir
cquil11 Nov 12, 2025
3020926
clone bench serving to tmp dir pt 2
cquil11 Nov 12, 2025
142923b
add explanatory comment
cquil11 Nov 12, 2025
39fdbce
cleaning up
cquil11 Nov 12, 2025
6e10058
cleaning up
cquil11 Nov 13, 2025
710793d
adding mi355x refactor
cquil11 Nov 13, 2025
6b844a1
adding h200 initial refactor
cquil11 Nov 13, 2025
7ed5aa9
different way to see server logs
cquil11 Nov 13, 2025
fac8864
cleanup
cquil11 Nov 13, 2025
7c8f5a5
now fail if server fails
cquil11 Nov 13, 2025
9dd2b14
starting on b200
cquil11 Nov 13, 2025
69991c6
doign b200
cquil11 Nov 13, 2025
bdb3d39
reverting erroneous change
cquil11 Nov 13, 2025
a2a3db8
fixing b200
cquil11 Nov 14, 2025
f682f09
fixing b200 pt 2
cquil11 Nov 14, 2025
9eee9fd
updating mi300
cquil11 Nov 14, 2025
a692940
updating mi300 pt 2
cquil11 Nov 14, 2025
6869b36
updating mi300 pt 3 -- remove detached mode
cquil11 Nov 14, 2025
b8b6b46
cleaning up mi355x
cquil11 Nov 14, 2025
df64d8f
fixing mi300x and updating 325x
cquil11 Nov 14, 2025
def851c
reverting max conc to 512 on gptoss fp4 b200 docker
cquil11 Nov 14, 2025
ad97a0b
fixing mi300x and updating 325x
cquil11 Nov 14, 2025
284c11f
cleanng up
cquil11 Nov 14, 2025
5ca8d5b
add wait for h200 slurm dsr1
cquil11 Nov 14, 2025
6d300f4
max num seqs back to 512 for gptoss fpr b200 docker
cquil11 Nov 14, 2025
b026d28
fix port issue for dsr1 mi300x docker
cquil11 Nov 14, 2025
9e38f87
fix mi355x docker NUM_PROMPTS
cquil11 Nov 14, 2025
ee2105b
adding prop of failure for server logs
cquil11 Nov 14, 2025
9dd3e1a
add utils function for benchmark
cquil11 Nov 14, 2025
da39840
add utils function for benchmark
cquil11 Nov 14, 2025
ce96ec7
function-ize the waiting for server to start
cquil11 Nov 14, 2025
b02b77e
dont show arg parsing set -x
cquil11 Nov 14, 2025
01e8561
dont show arg parsing set +x oops
cquil11 Nov 14, 2025
c2c6c3c
dont show arg parsing set +x oops
cquil11 Nov 14, 2025
a56be97
capture server pid
cquil11 Nov 14, 2025
32a0d23
nebdius dont scancel
cquil11 Nov 17, 2025
2a085dc
changes to comments in benchmark lib . sh
cquil11 Nov 17, 2025
18cf708
Update benchmarks/dsr1_fp4_mi355x_docker.sh
cquil11 Nov 17, 2025
3809194
Update .github/workflows/benchmark-tmpl.yml
cquil11 Nov 17, 2025
0c75238
adding back whitespace
cquil11 Nov 17, 2025
6dad972
adding back whitespace
cquil11 Nov 17, 2025
80b2cbb
adding back whitespace
cquil11 Nov 17, 2025
227db82
remove tg launch script
cquil11 Nov 17, 2025
97edfee
Update benchmarks/gptoss_fp4_h100_docker.sh
cquil11 Nov 21, 2025
b6a80fb
Update benchmarks/dsr1_fp8_mi325x_docker.sh
cquil11 Nov 21, 2025
fa1e2c0
Update benchmarks/dsr1_fp8_mi355x_docker.sh
cquil11 Nov 21, 2025
a5ebc4a
Update benchmarks/gptoss_fp4_b200_trt_slurm.sh
cquil11 Nov 21, 2025
9ff641e
Audit and correct required environment variables documentation in all…
Copilot Nov 21, 2025
6eb8285
removing oci node rebase with main
cquil11 Nov 21, 2025
da48d92
Merge branch 'main' into refactor-docker-runner-launch
cquil11 Nov 21, 2025
214 changes: 214 additions & 0 deletions benchmarks/benchmark_lib.sh
@@ -0,0 +1,214 @@
#!/usr/bin/env bash

# Shared benchmarking utilities for InferenceMAX

# Wait for server to be ready by polling the health endpoint
# All parameters are required
# Parameters:
# --port: Server port
# --server-log: Path to server log file
# --server-pid: Server process ID (required)
# --sleep-interval: Sleep interval between health checks (optional, default: 5)
Comment on lines +6 to +11 — Copilot AI, Nov 21, 2025:
The comment states "All parameters are required" but --sleep-interval is listed as "(optional, default: 5)" in the parameter list. This is inconsistent. The comment should either say "Parameters (all required unless marked optional):" or the first line should be updated to reflect that not all parameters are required.
wait_for_server_ready() {
    set +x
    local port=""
    local server_log=""
    local server_pid=""
    local sleep_interval=5

    # Parse arguments
    while [[ $# -gt 0 ]]; do
        case $1 in
            --port)
                port="$2"
                shift 2
                ;;
            --server-log)
                server_log="$2"
                shift 2
                ;;
            --server-pid)
                server_pid="$2"
                shift 2
                ;;
            --sleep-interval)
                sleep_interval="$2"
                shift 2
                ;;
            *)
                echo "Unknown parameter: $1"
                return 1
                ;;
        esac
    done

    # Validate required parameters
    if [[ -z "$port" ]]; then
        echo "Error: --port is required"
        return 1
    fi
    if [[ -z "$server_log" ]]; then
        echo "Error: --server-log is required"
        return 1
    fi
    if [[ -z "$server_pid" ]]; then
        echo "Error: --server-pid is required"
        return 1
    fi

    # Show logs until server is ready
    tail -f "$server_log" &
    local TAIL_PID=$!
    until curl --output /dev/null --silent --fail http://0.0.0.0:$port/health; do
        if ! kill -0 "$server_pid" 2>/dev/null; then
            echo "Server died before becoming healthy. Exiting."
            kill $TAIL_PID
            exit 1
        fi
        sleep "$sleep_interval"
    done
    kill $TAIL_PID
Comment on lines +60 to +70 — Copilot AI, Nov 21, 2025:
The tail -f background process could become orphaned if the kill $TAIL_PID command fails. Consider using kill $TAIL_PID 2>/dev/null || true to ensure the function doesn't exit with an error status, or add a trap to ensure cleanup happens.
Comment on lines +65 to +70 — Copilot AI, Nov 21, 2025:
The kill $TAIL_PID command at lines 65 and 70 should include error suppression (e.g., kill $TAIL_PID 2>/dev/null || true) to prevent the function from failing if the tail process has already terminated or doesn't exist. This is particularly important at line 65 where the script continues to exit 1 afterward.

Suggested change:
    kill $TAIL_PID
becomes:
    kill $TAIL_PID 2>/dev/null || true
}
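Both review comments above point at the same weakness: the background `tail -f` is cleaned up by hand on every exit path. A trap-based variant makes the cleanup unconditional. The following is a minimal sketch, not the code merged in this PR; the function name `wait_for_server_ready_trapped` and the positional-argument signature are illustrative assumptions.

```shell
# Sketch: trap-based cleanup for the log-streaming tail process.
# Hypothetical variant of wait_for_server_ready; not part of the PR.
wait_for_server_ready_trapped() {
    local port="$1" server_log="$2" server_pid="$3" sleep_interval="${4:-5}"

    tail -f "$server_log" &
    local tail_pid=$!
    # The RETURN trap fires on every exit path of this function, so the tail
    # process cannot be orphaned; "|| true" swallows the error if tail has
    # already exited on its own.
    trap 'kill "$tail_pid" 2>/dev/null || true; trap - RETURN' RETURN

    until curl --output /dev/null --silent --fail "http://0.0.0.0:$port/health"; do
        if ! kill -0 "$server_pid" 2>/dev/null; then
            echo "Server died before becoming healthy." >&2
            return 1
        fi
        sleep "$sleep_interval"
    done
}
```

Returning 1 instead of calling exit also lets the caller decide whether a dead server is fatal, which the merged version (which calls exit 1) does not.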

# Run benchmark serving with standardized parameters
# All parameters are required
# Parameters:
# --model: Model name
# --port: Server port
# --backend: Backend type - e.g., 'vllm' or 'openai'
# --input-len: Random input sequence length
# --output-len: Random output sequence length
# --random-range-ratio: Random range ratio
# --num-prompts: Number of prompts
# --max-concurrency: Max concurrency
# --result-filename: Result filename without extension
# --result-dir: Result directory
run_benchmark_serving() {
    set +x
    local model=""
    local port=""
    local backend=""
    local input_len=""
    local output_len=""
    local random_range_ratio=""
    local num_prompts=""
    local max_concurrency=""
    local result_filename=""
    local result_dir=""

    # Parse arguments
    while [[ $# -gt 0 ]]; do
        case $1 in
            --model)
                model="$2"
                shift 2
                ;;
            --port)
                port="$2"
                shift 2
                ;;
            --backend)
                backend="$2"
                shift 2
                ;;
            --input-len)
                input_len="$2"
                shift 2
                ;;
            --output-len)
                output_len="$2"
                shift 2
                ;;
            --random-range-ratio)
                random_range_ratio="$2"
                shift 2
                ;;
            --num-prompts)
                num_prompts="$2"
                shift 2
                ;;
            --max-concurrency)
                max_concurrency="$2"
                shift 2
                ;;
            --result-filename)
                result_filename="$2"
                shift 2
                ;;
            --result-dir)
                result_dir="$2"
                shift 2
                ;;
            *)
                echo "Unknown parameter: $1"
                return 1
                ;;
        esac
    done

    # Validate all required parameters
    if [[ -z "$model" ]]; then
        echo "Error: --model is required"
        return 1
    fi
    if [[ -z "$port" ]]; then
        echo "Error: --port is required"
        return 1
    fi
    if [[ -z "$backend" ]]; then
        echo "Error: --backend is required"
        return 1
    fi
    if [[ -z "$input_len" ]]; then
        echo "Error: --input-len is required"
        return 1
    fi
    if [[ -z "$output_len" ]]; then
        echo "Error: --output-len is required"
        return 1
    fi
    if [[ -z "$random_range_ratio" ]]; then
        echo "Error: --random-range-ratio is required"
        return 1
    fi
    if [[ -z "$num_prompts" ]]; then
        echo "Error: --num-prompts is required"
        return 1
    fi
    if [[ -z "$max_concurrency" ]]; then
        echo "Error: --max-concurrency is required"
        return 1
    fi
    if [[ -z "$result_filename" ]]; then
        echo "Error: --result-filename is required"
        return 1
    fi
    if [[ -z "$result_dir" ]]; then
        echo "Error: --result-dir is required"
        return 1
    fi

    # Clone benchmark serving repo
    local BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)
    git clone https://github.com/kimbochen/bench_serving.git "$BENCH_SERVING_DIR"

    # Run benchmark
    set -x
    python3 "$BENCH_SERVING_DIR/benchmark_serving.py" \
        --model "$model" \
        --backend "$backend" \
        --base-url "http://0.0.0.0:$port" \
        --dataset-name random \
        --random-input-len "$input_len" \
        --random-output-len "$output_len" \
        --random-range-ratio "$random_range_ratio" \
        --num-prompts "$num_prompts" \
        --max-concurrency "$max_concurrency" \
        --request-rate inf \
        --ignore-eos \
        --save-result \
        --percentile-metrics 'ttft,tpot,itl,e2el' \
        --result-dir "$result_dir" \
        --result-filename "$result_filename.json"
    set +x
}
Comment on lines +1 to +214 — Copilot AI, Nov 17, 2025:
The wait_for_server_ready function disables set -x at line 13 but the run_benchmark_serving function also disables it at line 87. However, neither function re-enables set -x after completing, which could cause inconsistent debug output behavior for code that follows these function calls.
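One way to address this review comment is to record whether xtrace was active on entry and restore it on the way out. The wrapper below is a hedged sketch, not code from the PR; the name with_xtrace_preserved is a hypothetical helper.

```shell
# Sketch: run a command with xtrace suppressed, then restore the caller's
# xtrace state. Hypothetical helper; not part of benchmark_lib.sh in the PR.
with_xtrace_preserved() {
    local had_xtrace=0
    # $- lists the shell's currently active single-letter option flags.
    if [[ $- == *x* ]]; then
        had_xtrace=1
    fi
    set +x

    # Run the wrapped command quietly, capturing its exit status.
    local rc=0
    "$@" || rc=$?

    # Restore xtrace only if the caller had it enabled.
    if (( had_xtrace )); then
        set -x
    fi
    return "$rc"
}
```

The library functions could then call `with_xtrace_preserved parse_and_validate ...` instead of issuing bare `set +x`, leaving the caller's debug output setting untouched.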
38 changes: 37 additions & 1 deletion benchmarks/dsr1_fp4_b200_docker.sh
@@ -1,11 +1,25 @@
#!/usr/bin/env bash

# === Required Env Vars ===
# MODEL
# PORT
# TP
# CONC
# ISL
# OSL
# RANDOM_RANGE_RATIO
# RESULT_FILENAME
# EP_SIZE
# NUM_PROMPTS

nvidia-smi

# To improve CI stability, we patch this helper function to prevent a race condition that
# happens 1% of the time. ref: https://github.com/flashinfer-ai/flashinfer/pull/1779
sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py

SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)

# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
if [[ $CONC -ge 16 ]]; then
SCHEDULER_RECV_INTERVAL=30
@@ -22,5 +36,27 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.
--cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \
--chunked-prefill-size 16384 \
--ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10
--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 > $SERVER_LOG 2>&1 &

SERVER_PID=$!

# Source benchmark utilities
source "$(dirname "$0")/benchmark_lib.sh"

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas
Collaborator comment:
nitpick: maybe put the install command in benchmark_lib.sh?
run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend vllm \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts "$NUM_PROMPTS" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/

Copilot AI, Nov 17, 2025:
[nitpick] Extra blank line at the end of the file (line 50). This should be removed for consistency.
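The collaborator's nitpick above — moving `pip install -q datasets pandas` into benchmark_lib.sh — could look something like the sketch below. The helper name ensure_python_deps is an assumption, not code from the PR, and it assumes each pip package name matches its Python import name (true for datasets and pandas, but not in general).

```shell
# Sketch: install Python dependencies only when they are missing, so scripts
# that source benchmark_lib.sh repeatedly stay fast. Hypothetical helper.
ensure_python_deps() {
    local pkg
    for pkg in "$@"; do
        # Assumes the pip package name matches the import name.
        python3 -c "import $pkg" 2>/dev/null || pip install -q "$pkg"
    done
}
```

Each launch script could then replace its ad-hoc `pip install` line with `ensure_python_deps datasets pandas`.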
49 changes: 22 additions & 27 deletions benchmarks/dsr1_fp4_b200_trt_slurm.sh
@@ -1,16 +1,13 @@
#!/usr/bin/env bash

# === Required Env Vars ===
# HF_TOKEN
# HF_HUB_CACHE
# IMAGE
# === Required Env Vars ===
# MODEL
# TP
# CONC
# ISL
# OSL
# MAX_MODEL_LEN
# RANDOM_RANGE_RATIO
# TP
# CONC
# RESULT_FILENAME
# PORT_OFFSET
# DP_ATTENTION
@@ -100,24 +97,22 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \
--extra_llm_api_options=$EXTRA_CONFIG_FILE \
> $SERVER_LOG 2>&1 &


set +x
while IFS= read -r line; do
printf '%s\n' "$line"
if [[ "$line" == *"Application startup complete"* ]]; then
break
fi
done < <(tail -F -n0 "$SERVER_LOG")

git clone https://github.com/kimbochen/bench_serving.git
set -x
python3 bench_serving/benchmark_serving.py \
--model $MODEL --backend openai \
--base-url http://0.0.0.0:$PORT \
--dataset-name random \
--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \
--request-rate inf --ignore-eos \
--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
--result-dir /workspace/ \
--result-filename $RESULT_FILENAME.json
SERVER_PID=$!

# Source benchmark utilities
source "$(dirname "$0")/benchmark_lib.sh"

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend openai \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts $(( $CONC * 10 )) \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/
36 changes: 29 additions & 7 deletions benchmarks/dsr1_fp4_mi355x_docker.sh
@@ -1,14 +1,15 @@
#!/usr/bin/env bash

# ========= Required Env Vars =========
# HF_TOKEN
# HF_HUB_CACHE
# === Required Env Vars ===
# MODEL
# MAX_MODEL_LEN
# RANDOM_RANGE_RATIO
# PORT
# TP
# CONC
# PORT
# ISL
# OSL
# RANDOM_RANGE_RATIO
# RESULT_FILENAME
# NUM_PROMPTS
export SGLANG_USE_AITER=1

PREFILL_SIZE=196608
@@ -18,6 +19,8 @@ if [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
fi
fi

SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)

set -x
python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \
--host=0.0.0.0 --port=$PORT \
Expand All @@ -27,5 +30,24 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \
--disable-radix-cache \
--num-continuous-decode-steps=4 \
--max-prefill-tokens=$PREFILL_SIZE \
--cuda-graph-max-bs=128
--cuda-graph-max-bs=128 > $SERVER_LOG 2>&1 &

SERVER_PID=$!

# Source benchmark utilities
source "$(dirname "$0")/benchmark_lib.sh"

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend vllm \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts "$NUM_PROMPTS" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/
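Each launch script in this PR documents its required environment variables only in a header comment (MODEL, PORT, TP, CONC, ISL, OSL, and so on), and an unset variable surfaces late as a server launch failure. A small guard in benchmark_lib.sh could fail fast instead. This is a speculative sketch; require_env_vars is a hypothetical helper, not part of the PR.

```shell
# Sketch: fail fast when a documented required environment variable is
# unset or empty. Hypothetical helper; not part of benchmark_lib.sh.
require_env_vars() {
    local missing=0 var
    for var in "$@"; do
        # ${!var} is bash indirect expansion: the value of the variable
        # whose name is stored in $var.
        if [[ -z "${!var:-}" ]]; then
            echo "Error: required env var $var is not set" >&2
            missing=1
        fi
    done
    return "$missing"
}

# Example, mirroring the header of dsr1_fp4_mi355x_docker.sh:
# require_env_vars MODEL PORT TP CONC ISL OSL RANDOM_RANGE_RATIO RESULT_FILENAME NUM_PROMPTS
```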