Skip to content

Commit afb1d7d

Browse files
refactor: move qwen3.5_bf16_b200.sh to benchmarks/single_node/
Align with the multinode reorganization (PR #666):
- Move script to benchmarks/single_node/ directory
- Update source path to reference parent benchmark_lib.sh
- Add EP_SIZE and data-parallel-size params for consistency
- Reformat server launch command to match other B200 SGLang scripts

Co-authored-by: Kedar Potdar <kedarpotdar-nv@users.noreply.github.com>
1 parent a0f59ef commit afb1d7d

1 file changed

Lines changed: 13 additions & 24 deletions

File tree

benchmarks/qwen3.5_bf16_b200.sh renamed to benchmarks/single_node/qwen3.5_bf16_b200.sh

Lines changed: 13 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env bash
22

3-
source "$(dirname "$0")/benchmark_lib.sh"
3+
source "$(dirname "$0")/../benchmark_lib.sh"
44

55
check_env_vars \
66
MODEL \
@@ -9,7 +9,8 @@ check_env_vars \
99
ISL \
1010
OSL \
1111
RANDOM_RANGE_RATIO \
12-
RESULT_FILENAME
12+
RESULT_FILENAME \
13+
EP_SIZE
1314

1415
if [[ -n "$SLURM_JOB_ID" ]]; then
1516
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
@@ -27,7 +28,7 @@ export PYTHONUNBUFFERED=1
2728
SERVER_LOG=/workspace/server.log
2829
PORT=${PORT:-8888}
2930

30-
# Low latency (conc 4,8): recv interval 10; max throughput (conc 16+): recv interval 30
31+
# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
3132
if [[ $CONC -ge 16 ]]; then
3233
SCHEDULER_RECV_INTERVAL=30
3334
else
@@ -46,27 +47,15 @@ echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL,
4647
ps aux
4748

4849
set -x
49-
PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \
50-
--model-path=$MODEL \
51-
--served-model-name "Qwen/Qwen3.5-397B-A17B" \
52-
--host=0.0.0.0 \
53-
--port=$PORT \
54-
--trust-remote-code \
55-
--tensor-parallel-size=$TP \
56-
--disable-radix-cache \
57-
--mem-fraction-static $MEM_FRAC_STATIC \
58-
--chunked-prefill-size $CHUNKED_PREFILL_SIZE \
59-
--max-prefill-tokens $MAX_PREFILL_TOKENS \
60-
--cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE \
61-
--max-running-requests $MAX_RUNNING_REQUESTS \
62-
--context-length $CONTEXT_LENGTH \
63-
--attention-backend trtllm_mha \
64-
--moe-runner-backend flashinfer_trtllm \
65-
--tokenizer-worker-num 6 \
66-
--stream-interval 30 \
67-
--scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
68-
--enable-flashinfer-allreduce-fusion \
69-
> $SERVER_LOG 2>&1 &
50+
PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
51+
--served-model-name "Qwen/Qwen3.5-397B-A17B" --trust-remote-code \
52+
--tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \
53+
--cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \
54+
--mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \
55+
--context-length $CONTEXT_LENGTH --disable-radix-cache \
56+
--attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \
57+
--enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
58+
--tokenizer-worker-num 6 --stream-interval 30 > $SERVER_LOG 2>&1 &
7059

7160
SERVER_PID=$!
7261

0 commit comments

Comments (0)