-
Notifications
You must be signed in to change notification settings - Fork 156
Add B300 config: qwen3.5-fp8-sglang-mtp #1035
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,87 @@ | ||
#!/usr/bin/env bash
# B300 benchmark config: qwen3.5-fp8 served by SGLang with MTP speculative
# decoding (EAGLE). Launches the server in the background, runs the serving
# throughput benchmark against it, and optionally an lm-eval accuracy run.
#
# Required env vars (validated by check_env_vars below):
#   MODEL TP CONC ISL OSL RANDOM_RANGE_RATIO RESULT_FILENAME EP_SIZE
# Optional env vars:
#   PORT (default 8888), EVAL_ONLY ("true" to size context for eval),
#   RUN_EVAL ("true" to run lm-eval after the throughput run),
#   SLURM_JOB_ID / SLURMD_NODENAME (set by SLURM when scheduled).

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
    MODEL \
    TP \
    CONC \
    ISL \
    OSL \
    RANDOM_RANGE_RATIO \
    RESULT_FILENAME \
    EP_SIZE

# Log SLURM placement when running under the scheduler. ${VAR:-} guards keep
# this safe even if the script is ever run with `set -u` outside SLURM.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown-node}"
fi

nvidia-smi

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

# Context window: request shape (input + output) plus a small slack margin.
# Eval-only runs instead use the max model length required by the eval harness.
CONTEXT_LENGTH=$((ISL + OSL + 20))
if [[ "${EVAL_ONLY:-}" == "true" ]]; then
    setup_eval_context
    CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN"
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x
# Launch the SGLang server in the background. All expansions are quoted so
# model paths or values containing spaces/globs cannot word-split (SC2086).
SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path="$MODEL" --host=0.0.0.0 --port="$PORT" \
    --trust-remote-code \
    --tensor-parallel-size="$TP" --data-parallel-size=1 --expert-parallel-size="$EP_SIZE" \
    --enable-symm-mem \
    --disable-radix-cache \
    --quantization fp8 \
    --kv-cache-dtype fp8_e4m3 \
    --mamba-ssm-dtype bfloat16 \
    --attention-backend trtllm_mha \
    --moe-runner-backend flashinfer_trtllm \
    --cuda-graph-max-bs "$CONC" \
    --max-running-requests "$CONC" \
    --max-prefill-tokens 16384 \
    --chunked-prefill-size 16384 \
    --mem-fraction-static 0.8 \
    --stream-interval 50 \
    --scheduler-recv-interval 10 \
    --tokenizer-worker-num 6 \
    --tokenizer-path "$MODEL" \
    --speculative-algorithm EAGLE \
    --speculative-num-steps 3 \
    --speculative-eagle-topk 1 \
    --speculative-num-draft-tokens 4 \
    --context-length "$CONTEXT_LENGTH" > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Benchmark client dependencies (quiet install; image may not ship them).
pip install -q datasets pandas

# 10x CONC prompts gives each concurrency slot ~10 requests for stable stats.
run_benchmark_serving \
    --model "$MODEL" \
    --port "$PORT" \
    --backend vllm \
    --input-len "$ISL" \
    --output-len "$OSL" \
    --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts "$((CONC * 10))" \
    --max-concurrency "$CONC" \
    --result-filename "$RESULT_FILENAME" \
    --result-dir /workspace/ \
    --use-chat-template

# After throughput, run evaluation only if RUN_EVAL is true
if [[ "${RUN_EVAL:-}" == "true" ]]; then
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,8 @@ SLURM_ACCOUNT="benchmark" | |
|
|
||
| set -x | ||
|
|
||
| if [[ "$IS_MULTINODE" == "true" ]]; then | ||
|
|
||
| # Validate framework | ||
| if [[ $FRAMEWORK != "dynamo-sglang" && $FRAMEWORK != "dynamo-trt" ]]; then | ||
| echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang" | ||
|
|
@@ -211,3 +213,26 @@ for i in 1 2 3 4 5; do | |
| sleep 10 | ||
| done | ||
| find . -name '.nfs*' -delete 2>/dev/null || true | ||
|
|
||
| else | ||
|
|
||
| HF_HUB_CACHE_MOUNT="/scratch/models" | ||
| export MODEL="/scratch/models/${MODEL#*/}" | ||
| SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" | ||
| FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') | ||
| SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') | ||
|
|
||
| salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" | ||
| JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) | ||
|
|
||
| srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" | ||
|
Comment on lines
+220
to
+228
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 The newly added single-node else branch in Extended reasoning...What the bug is and how it manifests The single-node The specific code path that triggers it The script has no Why existing code doesn't prevent it The multinode branch (earlier in the same file, around lines 115-118) handles a similar situation—extracting a job ID from Addressing the refutation One verifier correctly notes that this exact pattern (no empty-JOB_ID check after squeue) exists across other scripts: What the impact would be A failed salloc results in srun failing with a SLURM error that does not explain the root cause. The benchmark CI job will eventually fail (the result file won't be produced), but the error will appear to be an srun problem rather than an allocation problem, making it harder and slower to diagnose. No data is corrupted; the impact is purely diagnostic and operational. How to fix it After the squeue line, add: if [ -z "$JOB_ID" ]; then
echo "Error: Failed to extract JOB_ID. salloc may have failed."
exit 1
fiStep-by-step proof
|
||
|
|
||
| srun --jobid=$JOB_ID \ | ||
| --container-image=$SQUASH_FILE \ | ||
| --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ | ||
| --no-container-mount-home \ | ||
| --container-workdir=/workspace/ \ | ||
| --no-container-entrypoint --export=ALL,PORT=8888 \ | ||
| bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b300${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh | ||
|
|
||
| fi | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Ankur-singh did u set up multiple runners on the b300 cluster now too?
+viz @SemiAnalysisAI/core