From 1ada0dcbc6e6f9d8dc3ad767215492a42020b1e9 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 14:04:19 -0500 Subject: [PATCH 001/149] initial commit based on kimbos edits --- .github/workflows/1k1k-sweep.yml | 37 ++++++++++ .github/workflows/benchmark-tmpl.yml | 92 ++++++++----------------- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 35 +++------- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 25 ++----- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 25 ++----- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 26 ++++--- 6 files changed, 98 insertions(+), 142 deletions(-) create mode 100644 .github/workflows/1k1k-sweep.yml diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml new file mode 100644 index 000000000..e1a103f83 --- /dev/null +++ b/.github/workflows/1k1k-sweep.yml @@ -0,0 +1,37 @@ +name: '1K/1K Sweep' + +on: + workflow_dispatch: + +jobs: + get-1k1k-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-1k1k-configs.outputs.search-space-config }} + steps: + - id: get-1k1k-configs + run: python utils/print_configs_json.py configs.json 1k1k + + benchmark: + needs: get-1k1k-configs + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-1k1k-configs.outputs.search-space-config) }} + secrets: inherit + with: + isl: 1024 + osl: 1024 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + tp: ${{ matrix.config.tp }} + conc: ${{ matrix.config.conc }} + +# collect-results: +# needs: benchmark +# steps: \ No newline at end of file diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 313087946..c78dcb602 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -11,10 +11,10 @@ on: model: 
required: true type: string - framework: + precision: required: true type: string - precision: + framework: required: true type: string exp-name: @@ -26,18 +26,24 @@ on: osl: required: true type: string - max-model-len: - required: true - type: string random-range-ratio: - required: true + required: false type: string - tp-list: + default: '0.2' + tp: required: true type: string - conc-list: + ep: + required: false + type: string + default: '' + dp-attn: + required: false + type: boolean + default: false + conc: + required: true type: string - default: '[4, 8, 16, 32, 64]' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} @@ -46,69 +52,32 @@ env: MODEL: ${{ inputs.model }} ISL: ${{ inputs.isl }} OSL: ${{ inputs.osl }} - MAX_MODEL_LEN: ${{ inputs.max-model-len }} RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }} IMAGE: ${{ inputs.image }} FRAMEWORK: ${{ inputs.framework }} PRECISION: ${{ inputs.precision }} + TP: ${{ inputs.tp }} + EP_SIZE: ${{ inputs.ep }} + DP_ATTENTION: ${{ inputs.dp-attn }} + CONC: ${{ inputs.conc }} jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - - strategy: - fail-fast: false - matrix: - tp: ${{ fromJson(inputs.tp-list) }} - conc: ${{ fromJson(inputs.conc-list) }} - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.tp }} conc${{ matrix.conc }}' - - env: - TP: ${{ matrix.tp }} - CONC: ${{ matrix.conc }} + name: '${{ inputs.runner }} ${{ inputs.exp-name }}-${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} dpa-${{ inputs.dp-attn }} conc${{ inputs.conc }}' steps: - name: Resource cleanup run: | if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then - host=$(hostname) - - if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then - echo "[INFO] Running container-by-container cleanup on $host" - - for cid in $(docker ps -aq); do - echo "[INFO] Cleaning container $cid" - - # Try graceful first - docker stop -t 90 "$cid" || true - - # Wait until 
it's really dead - docker wait "$cid" >/dev/null 2>&1 || true - - # Force remove if anything lingers - docker rm -f "$cid" >/dev/null 2>&1 || true - done - - # Give a moment for GPU processes to fully terminate - sleep 2 - - # Verify GPUs are now idle - if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then - echo "[WARN] After stop, GPU still busy:" - nvidia-smi - # Last resort if driver allows and GPUs appear idle otherwise: - # nvidia-smi --gpu-reset -i 0,1,2,3,4,5,6,7 2>/dev/null || true - fi - else - echo "[Docker] Cleaning up resources ..." - docker ps -aq | xargs -r docker rm -f - docker network prune -f - while [ -n "$(docker ps -aq)" ]; do - docker ps -a - sleep 5 - done - fi + echo "[Docker] Cleaning up resources ..." + docker ps -aq | xargs -r docker rm -f + docker network prune -f + while [ -n "$(docker ps -aq)" ]; do + docker ps -a + sleep 5 + done fi if command -v squeue >/dev/null 2>&1; then echo "[Slurm] Cleaning up resources ..." @@ -127,7 +96,7 @@ jobs: - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} - RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ runner.name }} + RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP }}_conc${{ env.CONC }}_${{ runner.name }} run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh if [ -f "$RESULT_FILENAME.json" ]; then @@ -139,10 +108,9 @@ jobs: - name: Process result run: | - python3 utils/process_result.py ${{ inputs.runner }} $TP $RESULT_FILENAME $FRAMEWORK $PRECISION - + python3 utils/process_result.py ${{ inputs.runner }} - name: Upload result uses: actions/upload-artifact@v4 with: name: ${{ env.RESULT_FILENAME }} - path: agg_${{ env.RESULT_FILENAME }}.json + path: agg_${{ env.RESULT_FILENAME }}.json \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index ffdae541c..d13584078 
100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -13,69 +13,50 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +# Default +MOE_BACKEND="TRTLLM" -hf download $MODEL +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="1" -MOE_BACKEND="TRTLLM" -DP_ATTENTION=false +hf download $MODEL +# ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= if [[ "$TP" == "4" ]]; then if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi fi elif [[ "$TP" == "8" ]]; then if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 8 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 16 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi fi fi -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +echo "MOE_BACKEND set to '$MOE_BACKEND'" SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git 
a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index e909b954a..6bc8c9fa7 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -13,33 +13,16 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" - -hf download $MODEL - -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="$TP" MOE_BACKEND="DEEPGEMM" -DP_ATTENTION=false -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -fi +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 20101e466..5dfdf8617 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -13,33 +13,16 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" - -hf download $MODEL - -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="$TP" MOE_BACKEND="CUTLASS" -DP_ATTENTION=false -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 64 ]]; then 
- DP_ATTENTION=true - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - DP_ATTENTION=true - fi -fi +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND: $MOE_BACKEND" -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index f85f5c13f..4f17d4d4f 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -13,36 +13,34 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION # GPTOSS TRTLLM Deployment Guide: # https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION" hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="1" +# ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= +# Default MOE_BACKEND="TRTLLM" -DP_ATTENTION=false # Higher concurrencies: Concurrency >= 256 # MoE Backend = CUTLASS -# Use DP attention with expert parallel MoE if [[ $CONC -ge 256 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true + MOE_BACKEND="CUTLASS" fi -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +echo "MOE_BACKEND set to $MOE_BACKEND" EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 -export NCCL_GRAPH_REGISTER=0 cat > $EXTRA_CONFIG_FILE << EOF cuda_graph_config: @@ -50,7 +48,7 @@ cuda_graph_config: 
max_batch_size: $CONC enable_attention_dp: $DP_ATTENTION kv_cache_config: - dtype: fp8 + dtype: auto enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true @@ -90,6 +88,12 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ set +x while IFS= read -r line; do printf '%s\n' "$line" + if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then + sleep 5 + tail -n100 $SERVER_LOG + echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" + exit 1 + fi if [[ "$line" == *"Application startup complete"* ]]; then break fi @@ -106,4 +110,4 @@ python3 bench_serving/benchmark_serving.py \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ --result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +--result-filename $RESULT_FILENAME.json \ No newline at end of file From a1b74760a88bf4576504f2d6d65bd1450392df31 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 14:57:41 -0500 Subject: [PATCH 002/149] adding config and python script: --- .github/configs/master.json | 1025 +++++++++++++++++++++++++++++++++++ utils/get_configs.py | 29 + 2 files changed, 1054 insertions(+) create mode 100644 .github/configs/master.json create mode 100644 utils/get_configs.py diff --git a/.github/configs/master.json b/.github/configs/master.json new file mode 100644 index 000000000..1706be9ab --- /dev/null +++ b/.github/configs/master.json @@ -0,0 +1,1025 @@ +{ + "70b-fp8-h100-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 2, "conc-start": 64, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 2, "conc-start": 64, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + 
}, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "70b-fp8-h200-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 64, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 16, "conc-end": 64}, + {"tp": 2, "conc-start": 16, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "70b-fp8-b200-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 64, "conc-end": 64}, + {"tp": 4, "conc-start": 16, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 16, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + 
{"tp": 8, "conc-start": 4, "conc-end": 32} + ] + } + ] + }, + "70b-fp8-h200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "precision": "fp8", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 128, "conc-end": 128}, + {"tp": 2, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "conc-start": 4, "conc-end": 128}, + {"tp": 8, "conc-start": 4, "conc-end": 32} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 128, "conc-end": 128}, + {"tp": 2, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 32} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 16, "conc-end": 128}, + {"tp": 4, "conc-start": 4, "conc-end": 128}, + {"tp": 8, "conc-start": 4, "conc-end": 32} + ] + } + ] + }, + "70b-fp8-b200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "precision": "fp8", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 128, "conc-end": 128}, + {"tp": 2, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "conc-start": 4, "conc-end": 128}, + {"tp": 8, "conc-start": 4, "conc-end": 32} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 128, "conc-end": 128}, + {"tp": 2, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "conc-start": 16, "conc-end": 128}, + {"tp": 8, "conc-start": 4, "conc-end": 32} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 128}, + {"tp": 2, "conc-start": 16, "conc-end": 128}, + {"tp": 4, "conc-start": 4, "conc-end": 128}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + } + ] + }, + "70b-fp8-mi300x-vllm": { + "image": 
"rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", + "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 64, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "70b-fp8-mi325x-vllm": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", + "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 32, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 64, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 16, "conc-end": 64}, + {"tp": 2, "conc-start": 4, "conc-end": 32}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "70b-fp8-mi355x-vllm": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", + 
"model": "amd/Llama-3.3-70B-Instruct-FP8-KV", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "70b-fp4-b200-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "nvidia/Llama-3.3-70B-Instruct-FP4", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 32} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 16, "conc-end": 64}, + {"tp": 2, "conc-start": 16, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 32}, + {"tp": 8, "conc-start": 4, "conc-end": 8} + ] + } + ] + }, + "70b-fp4-b200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "nvidia/Llama-3.3-70B-Instruct-FP4", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 
1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 128, "conc-end": 128}, + {"tp": 2, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 128, "conc-end": 128}, + {"tp": 2, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "conc-start": 16, "conc-end": 128}, + {"tp": 8, "conc-start": 4, "conc-end": 32} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 128}, + {"tp": 2, "conc-start": 16, "conc-end": 128}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + } + ] + }, + "70b-fp4-mi355x-vllm": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", + "model": "amd/Llama-3.3-70B-Instruct-MXFP4-Preview", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + } + ] + }, + "dsr1-fp8-h200-sgl": { + "image": "lmsysorg/sglang:v0.5.2rc2-cu126", + "model": "deepseek-ai/DeepSeek-R1-0528", + "precision": "fp8", + "framework": "sglang", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, 
"conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "dsr1-fp8-b200-sgl": { + "image": "lmsysorg/sglang:v0.5.3rc1-cu129-b200", + "model": "deepseek-ai/DeepSeek-R1-0528", + "precision": "fp8", + "framework": "sglang", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "dsr1-fp8-h200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "deepseek-ai/DeepSeek-R1-0528", + "precision": "fp8", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 32}, + {"tp": 8, "ep": 8, "dp-attn": true, "conc-start": 64, "conc-end": 64} + ] + } + ] + }, + "dsr1-fp8-b200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "deepseek-ai/DeepSeek-R1-0528", + "precision": "fp8", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 32}, + {"tp": 8, "ep": 8, "dp-attn": true, "conc-start": 64, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 
8192, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "dsr1-fp8-mi300x-sgl": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", + "model": "deepseek-ai/DeepSeek-R1-0528", + "precision": "fp8", + "framework": "sglang", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "dsr1-fp8-mi325x-sgl": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", + "model": "deepseek-ai/DeepSeek-R1-0528", + "precision": "fp8", + "framework": "sglang", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "dsr1-fp8-mi355x-sgl": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", + "model": "deepseek-ai/DeepSeek-R1-0528", + "precision": "fp8", + "framework": "sglang", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "dsr1-fp4-b200-sgl": { + "image": "lmsysorg/sglang:v0.5.3rc1-cu129-b200", + "model": "nvidia/DeepSeek-R1-0528-FP4-V2", 
+ "precision": "fp4", + "framework": "sglang", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 128}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 128} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 128}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 128} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 128}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 16} + ] + } + ] + }, + "dsr1-fp4-b200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "nvidia/DeepSeek-R1-0528-FP4-V2", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 32}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 16, "conc-end": 128}, + {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 32}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 16}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 32, "conc-end": 128}, + {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 4, "ep": 4, "dp-attn": false, 
"conc-start": 4, "conc-end": 32}, + {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 64, "conc-end": 256}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 32}, + {"tp": 8, "ep": 8, "dp-attn": true, "conc-start": 64, "conc-end": 256} + ] + } + ] + }, + "dsr1-fp4-mi355x-sgl": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", + "model": "amd/DeepSeek-R1-0528-MXFP4-Preview", + "precision": "fp4", + "framework": "sglang", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "gptoss-fp4-h100-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 32} + ] + } + ] + }, + "gptoss-fp4-h200-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, + 
{"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 16}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 32} + ] + } + ] + }, + "gptoss-fp4-b200-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 8} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 8} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "gptoss-fp4-h200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + 
{"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 32}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "gptoss-fp4-b200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 8}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 16, "conc-end": 64}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 8}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 16, "conc-end": 64}, + {"tp": 8, "ep": 8, 
"dp-attn": false, "conc-start": 4, "conc-end": 8} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} + ] + } + ] + }, + "gptoss-fp4-mi300x-vllm": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 16} + ] + } + ] + }, + "gptoss-fp4-mi325x-vllm": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + 
"bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "conc-start": 64, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 2, "conc-start": 4, "conc-end": 8}, + {"tp": 4, "conc-start": 4, "conc-end": 8}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + } + ] + }, + "gptoss-fp4-mi355x-vllm": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 16}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 16}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 16}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + } + ] + } +} \ No newline at end of file diff --git a/utils/get_configs.py b/utils/get_configs.py new file mode 100644 index 000000000..cf160895b --- /dev/null +++ b/utils/get_configs.py @@ -0,0 +1,29 @@ +import json +import sys + +seq_len_stoi = { + "1k1k": (1024, 1024), + "1k8k": (1024, 8192), + "8k1k": (8192, 1024) +} + +def main(): + if len(sys.argv) < 3: + print(f"Usage: python3 {sys.argv[0]} {{config-file}} {{isl-osl}}") + exit(1) + + config_file = sys.argv[1] + seq_len = sys.argv[2] + + isl, osl = seq_len_stoi.get(seq_len) or (None, None) + if not (isl or osl): + raise ValueError(f"Input 'isl-osl' must be one of '{', '.join(seq_len_stoi.keys())}'.") + + try: + with open(config_file, 'r') as f: + config_data = json.load(f) + 
except Exception as e: + raise ValueError(f"Input file '{config_file}' does not exist.") + +if __name__ == "__main__": + main() \ No newline at end of file From d4747184bf40ab5446b22f7c8dec16d5a4596686 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 15:48:25 -0500 Subject: [PATCH 003/149] adding runner field --- .github/configs/master.json | 29 ++++++++++++++++ utils/get_configs.py | 66 +++++++++++++++++++++++++++++++++++-- 2 files changed, 93 insertions(+), 2 deletions(-) diff --git a/.github/configs/master.json b/.github/configs/master.json index 1706be9ab..274c22512 100644 --- a/.github/configs/master.json +++ b/.github/configs/master.json @@ -2,6 +2,7 @@ "70b-fp8-h100-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "runner": "h100", "precision": "fp8", "framework": "vllm", "seq-len-configs": [ @@ -37,6 +38,7 @@ "70b-fp8-h200-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "runner": "h200", "precision": "fp8", "framework": "vllm", "seq-len-configs": [ @@ -75,6 +77,7 @@ "70b-fp8-b200-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "runner": "b200", "precision": "fp8", "framework": "vllm", "seq-len-configs": [ @@ -113,6 +116,7 @@ "70b-fp8-h200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "runner": "h200-trt", "precision": "fp8", "framework": "trt", "seq-len-configs": [ @@ -150,6 +154,7 @@ "70b-fp8-b200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "runner": "b200-trt", "precision": "fp8", "framework": "trt", "seq-len-configs": [ @@ -188,6 +193,7 @@ "70b-fp8-mi300x-vllm": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", + "runner": "mi300x", "precision": "fp8", "framework": "vllm", 
"seq-len-configs": [ @@ -226,6 +232,7 @@ "70b-fp8-mi325x-vllm": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", + "runner": "mi325x", "precision": "fp8", "framework": "vllm", "seq-len-configs": [ @@ -264,6 +271,7 @@ "70b-fp8-mi355x-vllm": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", + "runner": "mi355x", "precision": "fp8", "framework": "vllm", "seq-len-configs": [ @@ -302,6 +310,7 @@ "70b-fp4-b200-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "nvidia/Llama-3.3-70B-Instruct-FP4", + "runner": "b200", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ @@ -340,6 +349,7 @@ "70b-fp4-b200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", "model": "nvidia/Llama-3.3-70B-Instruct-FP4", + "runner": "b200-trt", "precision": "fp4", "framework": "trt", "seq-len-configs": [ @@ -378,6 +388,7 @@ "70b-fp4-mi355x-vllm": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", "model": "amd/Llama-3.3-70B-Instruct-MXFP4-Preview", + "runner": "mi355x", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ @@ -416,6 +427,7 @@ "dsr1-fp8-h200-sgl": { "image": "lmsysorg/sglang:v0.5.2rc2-cu126", "model": "deepseek-ai/DeepSeek-R1-0528", + "runner": "h200", "precision": "fp8", "framework": "sglang", "seq-len-configs": [ @@ -445,6 +457,7 @@ "dsr1-fp8-b200-sgl": { "image": "lmsysorg/sglang:v0.5.3rc1-cu129-b200", "model": "deepseek-ai/DeepSeek-R1-0528", + "runner": "b200", "precision": "fp8", "framework": "sglang", "seq-len-configs": [ @@ -474,6 +487,7 @@ "dsr1-fp8-h200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", "model": "deepseek-ai/DeepSeek-R1-0528", + "runner": "h200-trt", "precision": "fp8", "framework": "trt", "seq-len-configs": [ @@ -504,6 +518,7 @@ "dsr1-fp8-b200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", 
"model": "deepseek-ai/DeepSeek-R1-0528", + "runner": "b200-trt", "precision": "fp8", "framework": "trt", "seq-len-configs": [ @@ -534,6 +549,7 @@ "dsr1-fp8-mi300x-sgl": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", "model": "deepseek-ai/DeepSeek-R1-0528", + "runner": "mi300x", "precision": "fp8", "framework": "sglang", "seq-len-configs": [ @@ -563,6 +579,7 @@ "dsr1-fp8-mi325x-sgl": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", "model": "deepseek-ai/DeepSeek-R1-0528", + "runner": "mi325x", "precision": "fp8", "framework": "sglang", "seq-len-configs": [ @@ -592,6 +609,7 @@ "dsr1-fp8-mi355x-sgl": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", "model": "deepseek-ai/DeepSeek-R1-0528", + "runner": "mi355x", "precision": "fp8", "framework": "sglang", "seq-len-configs": [ @@ -621,6 +639,7 @@ "dsr1-fp4-b200-sgl": { "image": "lmsysorg/sglang:v0.5.3rc1-cu129-b200", "model": "nvidia/DeepSeek-R1-0528-FP4-V2", + "runner": "b200", "precision": "fp4", "framework": "sglang", "seq-len-configs": [ @@ -653,6 +672,7 @@ "dsr1-fp4-b200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", "model": "nvidia/DeepSeek-R1-0528-FP4-V2", + "runner": "b200-trt", "precision": "fp4", "framework": "trt", "seq-len-configs": [ @@ -695,6 +715,7 @@ "dsr1-fp4-mi355x-sgl": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", "model": "amd/DeepSeek-R1-0528-MXFP4-Preview", + "runner": "mi355x", "precision": "fp4", "framework": "sglang", "seq-len-configs": [ @@ -725,6 +746,7 @@ "gptoss-fp4-h100-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "openai/gpt-oss-120b", + "runner": "h100", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ @@ -760,6 +782,7 @@ "gptoss-fp4-h200-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "openai/gpt-oss-120b", + "runner": "h200", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ @@ -798,6 
+821,7 @@ "gptoss-fp4-b200-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "openai/gpt-oss-120b", + "runner": "b200", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ @@ -836,6 +860,7 @@ "gptoss-fp4-h200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", "model": "openai/gpt-oss-120b", + "runner": "h200-trt", "precision": "fp4", "framework": "trt", "seq-len-configs": [ @@ -874,6 +899,7 @@ "gptoss-fp4-b200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", "model": "openai/gpt-oss-120b", + "runner": "b200-trt", "precision": "fp4", "framework": "trt", "seq-len-configs": [ @@ -914,6 +940,7 @@ "gptoss-fp4-mi300x-vllm": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", "model": "openai/gpt-oss-120b", + "runner": "mi300x", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ @@ -952,6 +979,7 @@ "gptoss-fp4-mi325x-vllm": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", "model": "openai/gpt-oss-120b", + "runner": "mi325x", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ @@ -990,6 +1018,7 @@ "gptoss-fp4-mi355x-vllm": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", "model": "openai/gpt-oss-120b", + "runner": "mi355x", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ diff --git a/utils/get_configs.py b/utils/get_configs.py index cf160895b..787db0b72 100644 --- a/utils/get_configs.py +++ b/utils/get_configs.py @@ -9,11 +9,12 @@ def main(): if len(sys.argv) < 3: - print(f"Usage: python3 {sys.argv[0]} {{config-file}} {{isl-osl}}") + print(f"Usage: python3 {sys.argv[0]} {{config-file}} {{isl-osl}} [step-size]") exit(1) config_file = sys.argv[1] seq_len = sys.argv[2] + step_size = int(sys.argv[3]) if len(sys.argv) > 3 else 2 isl, osl = seq_len_stoi.get(seq_len) or (None, None) if not (isl or osl): @@ -22,8 +23,69 @@ def main(): try: with open(config_file, 'r') as f: config_data = json.load(f) - except 
Exception as e: + assert isinstance(config_data, dict) + except FileNotFoundError: raise ValueError(f"Input file '{config_file}' does not exist.") + + matrix_values = [] + for key, val in config_data.items(): + seq_len_configs = val.get('seq-len-configs') + assert seq_len_configs, f"Missing 'seq-len-configs' for key '{key}'" + + image = val.get('image') + model = val.get('model') + precision = val.get('precision') + framework = val.get('framework') + runner = val.get('runner') + bmk_space = val.get('bmk-space') + + assert None not in (image, model, precision, framework, runner), \ + f"Missing required fields for key '{key}'" + assert bmk_space, f"Missing 'bmk-space' for key '{key}'" + + # Check if this config has matching sequence lengths + matching_seq_config = None + for slq in seq_len_configs: + if slq.get('isl') == isl and slq.get('osl') == osl: + matching_seq_config = slq + break + + if not matching_seq_config: + continue # Skip this config if no matching sequence length + + # Now flatten the bmk-space + for bmk in bmk_space: + tp = bmk.get('tp') + conc_start = bmk.get('conc-start') + conc_end = bmk.get('conc-end') + + assert None not in (tp, conc_start, conc_end), \ + f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{key}'" + + # Generate entries for each concurrency value in the range + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc + } + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= step_size + if conc > conc_end: + conc = conc_end # Ensure we hit the end value + + print(json.dumps(matrix_values)) + return matrix_values if __name__ == "__main__": main() \ No newline at end of file From 346b10d97de81703f77adf18c0f35e095ca1ef76 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 16:15:18 -0500 Subject: [PATCH 004/149] finishing up 
script, ready for testing --- .github/workflows/1k1k-sweep.yml | 7 ++++++- utils/get_configs.py | 19 ++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index e1a103f83..3d01ceabf 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -9,8 +9,13 @@ jobs: outputs: search-space-config: ${{ steps.get-1k1k-configs.outputs.search-space-config }} steps: + - name: Checkout code + uses: actions/checkout@v4 + - id: get-1k1k-configs - run: python utils/print_configs_json.py configs.json 1k1k + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/print_configs_json.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark: needs: get-1k1k-configs diff --git a/utils/get_configs.py b/utils/get_configs.py index 787db0b72..9fa911d3e 100644 --- a/utils/get_configs.py +++ b/utils/get_configs.py @@ -37,11 +37,9 @@ def main(): precision = val.get('precision') framework = val.get('framework') runner = val.get('runner') - bmk_space = val.get('bmk-space') assert None not in (image, model, precision, framework, runner), \ f"Missing required fields for key '{key}'" - assert bmk_space, f"Missing 'bmk-space' for key '{key}'" # Check if this config has matching sequence lengths matching_seq_config = None @@ -51,13 +49,17 @@ def main(): break if not matching_seq_config: - continue # Skip this config if no matching sequence length + continue # Skip this config if no matching sequence length, this is possible + + bmk_space = matching_seq_config.get('bmk-space') + assert bmk_space, f"Missing 'bmk-space' in matching seq-len-config for key '{key}'" - # Now flatten the bmk-space for bmk in bmk_space: tp = bmk.get('tp') conc_start = bmk.get('conc-start') conc_end = bmk.get('conc-end') + ep = bmk.get('ep') + dp_attn = bmk.get('dp-attn') assert None not in (tp, conc_start, conc_end), \ f"Missing 
'tp', 'conc-start', or 'conc-end' in bmk-space for key '{key}'" @@ -76,13 +78,20 @@ def main(): 'tp': tp, 'conc': conc } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + matrix_values.append(entry) if conc == conc_end: break conc *= step_size if conc > conc_end: - conc = conc_end # Ensure we hit the end value + conc = conc_end print(json.dumps(matrix_values)) return matrix_values From 0dc246c85eafc50da9b2c5b434e5e7d5c495c43d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 16:22:09 -0500 Subject: [PATCH 005/149] testing purposes --- .github/workflows/1k1k-sweep.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 3d01ceabf..c0bbbf57e 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,6 +1,7 @@ name: '1K/1K Sweep' on: + pull_request: workflow_dispatch: jobs: From 02f57924c32cd142ba8a7064e35358e015bebf39 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 16:22:50 -0500 Subject: [PATCH 006/149] testing purposes --- .github/workflows/1k1k-sweep.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index c0bbbf57e..73bb9d5a6 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -15,7 +15,7 @@ jobs: - id: get-1k1k-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/print_configs_json.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark: From e93d20bb7f6716daa0a161fa94cafba37555d5de Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 16:43:09 -0500 Subject: [PATCH 007/149] refactoring more --- 
.github/workflows/1k1k-sweep.yml | 90 ++++++++++++++++++++++++++++---- utils/get_configs.py | 13 +++-- 2 files changed, 88 insertions(+), 15 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 73bb9d5a6..3e199572d 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -5,26 +5,92 @@ on: workflow_dispatch: jobs: - get-1k1k-configs: + get-70b-configs: runs-on: ubuntu-latest outputs: - search-space-config: ${{ steps.get-1k1k-configs.outputs.search-space-config }} + search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} steps: - name: Checkout code uses: actions/checkout@v4 - - - id: get-1k1k-configs + + - id: get-70b-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k 70b) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + get-dsr1-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-dsr1-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - - benchmark: - needs: get-1k1k-configs + + get-gptoss-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-gptoss-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k gptoss) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + benchmark-70b: + needs: get-70b-configs + uses: 
./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} + secrets: inherit + with: + isl: 1024 + osl: 1024 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + tp: ${{ matrix.config.tp }} + conc: ${{ matrix.config.conc }} + + benchmark-dsr1: + needs: get-dsr1-configs + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + secrets: inherit + with: + isl: 1024 + osl: 1024 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + tp: ${{ matrix.config.tp }} + conc: ${{ matrix.config.conc }} + + benchmark-gptoss: + needs: get-gptoss-configs uses: ./.github/workflows/benchmark-tmpl.yml strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.get-1k1k-configs.outputs.search-space-config) }} + config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} secrets: inherit with: isl: 1024 @@ -39,5 +105,7 @@ jobs: conc: ${{ matrix.config.conc }} # collect-results: -# needs: benchmark -# steps: \ No newline at end of file +# needs: [benchmark-70b, benchmark-dsr1, benchmark-gptoss] +# uses: ./.github/workflows/collect-results.yml +# with: +# exp-name: 1k1k \ No newline at end of file diff --git a/utils/get_configs.py b/utils/get_configs.py index 9fa911d3e..7aec991b0 100644 --- a/utils/get_configs.py +++ b/utils/get_configs.py @@ -8,13 +8,14 @@ } def main(): - if len(sys.argv) < 3: - print(f"Usage: python3 {sys.argv[0]} {{config-file}} {{isl-osl}} [step-size]") + if len(sys.argv) 
< 4: + print(f"Usage: python3 {sys.argv[0]} {{config-file}} {{isl-osl}} {{model-prefix}} [step-size]") exit(1) - + config_file = sys.argv[1] seq_len = sys.argv[2] - step_size = int(sys.argv[3]) if len(sys.argv) > 3 else 2 + model_prefix = sys.argv[3] + step_size = int(sys.argv[4]) if len(sys.argv) > 4 else 2 isl, osl = seq_len_stoi.get(seq_len) or (None, None) if not (isl or osl): @@ -29,6 +30,10 @@ def main(): matrix_values = [] for key, val in config_data.items(): + # Filter by model prefix + if not key.startswith(model_prefix): + continue + seq_len_configs = val.get('seq-len-configs') assert seq_len_configs, f"Missing 'seq-len-configs' for key '{key}'" From 88239ac6400dff146621c62fb8d164f4fba73b34 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 16:48:51 -0500 Subject: [PATCH 008/149] refactoring more --- .github/workflows/benchmark-tmpl.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index c78dcb602..4a21825d5 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -65,8 +65,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ inputs.runner }} ${{ inputs.exp-name }}-${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} dpa-${{ inputs.dp-attn }} conc${{ inputs.conc }}' - + name: '${{ inputs.exp-name }} ${{ inputs.runner }}-${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} dpa-${{ inputs.dp-attn }} conc${{ inputs.conc }}' steps: - name: Resource cleanup run: | From f00e47da557329fbbf0ae373de290a9e2e7b8628 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 16:53:21 -0500 Subject: [PATCH 009/149] refactoring more --- .github/workflows/1k1k-sweep.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 3e199572d..512ae819c 100644 --- 
a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -53,6 +53,7 @@ jobs: config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} secrets: inherit with: + exp-name: "70b_1k1k" isl: 1024 osl: 1024 runner: ${{ matrix.config.runner }} @@ -60,7 +61,6 @@ jobs: model: ${{ matrix.config.model }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} tp: ${{ matrix.config.tp }} conc: ${{ matrix.config.conc }} @@ -73,6 +73,7 @@ jobs: config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} secrets: inherit with: + exp-name: "dsr1_1k1k" isl: 1024 osl: 1024 runner: ${{ matrix.config.runner }} @@ -80,7 +81,6 @@ jobs: model: ${{ matrix.config.model }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} tp: ${{ matrix.config.tp }} conc: ${{ matrix.config.conc }} @@ -93,6 +93,7 @@ jobs: config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} secrets: inherit with: + exp-name: "gptoss_1k1k" isl: 1024 osl: 1024 runner: ${{ matrix.config.runner }} @@ -100,7 +101,6 @@ jobs: model: ${{ matrix.config.model }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} tp: ${{ matrix.config.tp }} conc: ${{ matrix.config.conc }} From 8cc9eebd6ab0879092c759d120747886b1e5771a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 16:54:48 -0500 Subject: [PATCH 010/149] refactoring more --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 4a21825d5..5f52c94ef 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -65,7 +65,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ 
inputs.exp-name }} ${{ inputs.runner }}-${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} dpa-${{ inputs.dp-attn }} conc${{ inputs.conc }}' + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} conc${{ inputs.conc }}' steps: - name: Resource cleanup run: | From 7be26739febd03147baf233dd82ab1b280679751 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sat, 25 Oct 2025 15:17:29 -0500 Subject: [PATCH 011/149] refactoring more --- .github/workflows/1k1k-sweep.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 512ae819c..85a175a4a 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -47,6 +47,7 @@ jobs: benchmark-70b: needs: get-70b-configs uses: ./.github/workflows/benchmark-tmpl.yml + name: 70b 1k1k strategy: fail-fast: false matrix: @@ -67,6 +68,7 @@ jobs: benchmark-dsr1: needs: get-dsr1-configs uses: ./.github/workflows/benchmark-tmpl.yml + name: dsr1 1k1k strategy: fail-fast: false matrix: @@ -87,6 +89,7 @@ jobs: benchmark-gptoss: needs: get-gptoss-configs uses: ./.github/workflows/benchmark-tmpl.yml + name: gptoss 1k1k strategy: fail-fast: false matrix: From f9c5e2757e1d617a2f79d80db9a124f91e500b3b Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sat, 25 Oct 2025 15:24:29 -0500 Subject: [PATCH 012/149] refactoring more --- .github/workflows/1k1k-sweep.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 85a175a4a..2e6c3cffa 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -57,6 +57,7 @@ jobs: exp-name: "70b_1k1k" isl: 1024 osl: 1024 + max-model-len: 2048 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -78,6 +79,7 @@ jobs: exp-name: "dsr1_1k1k" isl: 1024 osl: 1024 + max-model-len: 2048 runner: ${{ 
matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -99,6 +101,7 @@ jobs: exp-name: "gptoss_1k1k" isl: 1024 osl: 1024 + max-model-len: 2048 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} From bb460c7d3516b772e072f3830c0a7a91f385ad18 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sat, 25 Oct 2025 15:25:14 -0500 Subject: [PATCH 013/149] refactoring more --- .github/workflows/benchmark-tmpl.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 5f52c94ef..d785e32e5 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -41,6 +41,9 @@ on: required: false type: boolean default: false + max-model-len: + required: true + type: string conc: required: true type: string @@ -52,6 +55,7 @@ env: MODEL: ${{ inputs.model }} ISL: ${{ inputs.isl }} OSL: ${{ inputs.osl }} + MAX_MODEL_LEN: ${{ inputs.max-model-len }} RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }} IMAGE: ${{ inputs.image }} FRAMEWORK: ${{ inputs.framework }} From 2a5658adbbd971ec7400676301890e3a03fc352b Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sat, 25 Oct 2025 15:29:49 -0500 Subject: [PATCH 014/149] refactoring more --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index d785e32e5..90df56641 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -36,7 +36,7 @@ on: ep: required: false type: string - default: '' + default: '1' dp-attn: required: false type: boolean From 15da179aef6662a3101e462337d19ffd8e104553 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sat, 25 Oct 2025 15:39:29 -0500 Subject: [PATCH 015/149] refactoring more --- .github/workflows/benchmark-tmpl.yml | 2 +- 
utils/process_result.py | 23 +++++++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 90df56641..66373a2f5 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -111,7 +111,7 @@ jobs: - name: Process result run: | - python3 utils/process_result.py ${{ inputs.runner }} + python3 utils/process_result.py ${{ inputs.runner }} $TP $EP_SIZE $DP_ATTENTION $RESULT_FILENAME $FRAMEWORK $PRECISION - name: Upload result uses: actions/upload-artifact@v4 with: diff --git a/utils/process_result.py b/utils/process_result.py index aaf8ac0d2..a59d1f7f3 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -5,31 +5,30 @@ hw = sys.argv[1] tp_size = int(sys.argv[2]) -result_filename = sys.argv[3] -framework = sys.argv[4] -precision = sys.argv[5] +ep_size = int(sys.argv[3]) +dp_attention = sys.argv[4] +result_filename = sys.argv[5] +framework = sys.argv[6] +precision = sys.argv[7] with open(f'{result_filename}.json') as f: bmk_result = json.load(f) -tput_per_gpu = float(bmk_result['total_token_throughput']) / tp_size -output_tput_per_gpu = float(bmk_result['output_throughput']) / tp_size -input_tput_per_gpu = tput_per_gpu - output_tput_per_gpu - data = { 'hw': hw, 'tp': tp_size, + 'ep': ep_size, 'conc': int(bmk_result['max_concurrency']), + 'dp_attention': dp_attention, # true or false 'model': bmk_result['model_id'], 'framework': framework, 'precision': precision, - 'tput_per_gpu': tput_per_gpu, - 'output_tput_per_gpu': output_tput_per_gpu, - 'input_tput_per_gpu': input_tput_per_gpu + 'tput_per_gpu': float(bmk_result['total_token_throughput']) / tp_size, + 'output_tput_per_gpu': float(bmk_result['output_throughput']) / tp_size } -if len(sys.argv) == 7: # MTP - data['mtp'] = sys.argv[6] +if len(sys.argv) == 9: # MTP + data['mtp'] = sys.argv[8] for key, value in bmk_result.items(): if key.endswith('ms'): From 
9bf6b1fdf5facb079b8a5d6d76eb473bfdeed8a9 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sat, 25 Oct 2025 16:07:09 -0500 Subject: [PATCH 016/149] refactoring more --- .github/configs/master.json | 6 ++---- .github/workflows/benchmark-tmpl.yml | 2 +- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 25 ++++++++++--------------- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/.github/configs/master.json b/.github/configs/master.json index 274c22512..00a2c0a17 100644 --- a/.github/configs/master.json +++ b/.github/configs/master.json @@ -909,8 +909,7 @@ "bmk-space": [ {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 8}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 16, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} ] }, @@ -920,8 +919,7 @@ "bmk-space": [ {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 8}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 16, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} ] }, diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 66373a2f5..6cfb692fe 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -99,7 +99,7 @@ jobs: - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} - RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP }}_conc${{ env.CONC }}_${{ runner.name }} + RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION 
}}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_conc${{ env.CONC }}_${{ runner.name }} run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh if [ -f "$RESULT_FILENAME.json" ]; then diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 4f17d4d4f..6b2f251dd 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -21,26 +21,27 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION" +MOE_BACKEND="TRTLLM" + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, DP_ATTENTION: $DP_ATTENTION, MOE_BACKEND: $MOE_BACKEND" hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) -# ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= -# Default -MOE_BACKEND="TRTLLM" +# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= # Higher concurrencies: Concurrency >= 256 # MoE Backend = CUTLASS +# Use DP attention with expert parallel MoE if [[ $CONC -ge 256 ]]; then - MOE_BACKEND="CUTLASS" + EP_SIZE="$TP" + DP_ATTENTION=true fi -echo "MOE_BACKEND set to $MOE_BACKEND" - EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 +export NCCL_GRAPH_REGISTER=0 cat > $EXTRA_CONFIG_FILE << EOF cuda_graph_config: @@ -48,7 +49,7 @@ cuda_graph_config: max_batch_size: $CONC enable_attention_dp: $DP_ATTENTION kv_cache_config: - dtype: auto + dtype: fp8 enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true @@ -88,12 +89,6 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi @@ -110,4 +105,4 @@ python3 
bench_serving/benchmark_serving.py \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ --result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json \ No newline at end of file +--result-filename $RESULT_FILENAME.json From 8d330cd0a7b00a6ca1457e2d3a10fe44623d8209 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sat, 25 Oct 2025 16:09:17 -0500 Subject: [PATCH 017/149] refactoring more --- .github/configs/master.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/master.json b/.github/configs/master.json index 00a2c0a17..d42b98b46 100644 --- a/.github/configs/master.json +++ b/.github/configs/master.json @@ -897,7 +897,7 @@ ] }, "gptoss-fp4-b200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1", "model": "openai/gpt-oss-120b", "runner": "b200-trt", "precision": "fp4", From 8f665ddade9354db612b008f75e59be7b5ec8e6c Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 12:13:39 -0500 Subject: [PATCH 018/149] updating the benchmark files with logic --- .github/configs/master.json | 1052 ----------------------- .github/configs/master.yaml | 784 +++++++++++++++++ benchmarks/dsr1_fp4_b200_trt_slurm.sh | 11 +- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 11 +- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 11 +- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 15 +- utils/get_configs.py | 3 +- 7 files changed, 809 insertions(+), 1078 deletions(-) delete mode 100644 .github/configs/master.json create mode 100644 .github/configs/master.yaml diff --git a/.github/configs/master.json b/.github/configs/master.json deleted file mode 100644 index d42b98b46..000000000 --- a/.github/configs/master.json +++ /dev/null @@ -1,1052 +0,0 @@ -{ - "70b-fp8-h100-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP8", - "runner": "h100", - "precision": "fp8", - "framework": "vllm", - 
"seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 2, "conc-start": 64, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 2, "conc-start": 64, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "70b-fp8-h200-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP8", - "runner": "h200", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 64, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 16, "conc-end": 64}, - {"tp": 2, "conc-start": 16, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "70b-fp8-b200-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP8", - "runner": "b200", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 
64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 64, "conc-end": 64}, - {"tp": 4, "conc-start": 16, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 16, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - } - ] - }, - "70b-fp8-h200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP8", - "runner": "h200-trt", - "precision": "fp8", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 128, "conc-end": 128}, - {"tp": 2, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "conc-start": 4, "conc-end": 128}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 128, "conc-end": 128}, - {"tp": 2, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 16, "conc-end": 128}, - {"tp": 4, "conc-start": 4, "conc-end": 128}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - } - ] - }, - "70b-fp8-b200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP8", - "runner": "b200-trt", - "precision": "fp8", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 128, "conc-end": 128}, - {"tp": 2, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "conc-start": 4, "conc-end": 128}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 
128, "conc-end": 128}, - {"tp": 2, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "conc-start": 16, "conc-end": 128}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 128}, - {"tp": 2, "conc-start": 16, "conc-end": 128}, - {"tp": 4, "conc-start": 4, "conc-end": 128}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "70b-fp8-mi300x-vllm": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", - "runner": "mi300x", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 64, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "70b-fp8-mi325x-vllm": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", - "runner": "mi325x", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 32, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 
64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 64, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 16, "conc-end": 64}, - {"tp": 2, "conc-start": 4, "conc-end": 32}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "70b-fp8-mi355x-vllm": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", - "runner": "mi355x", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "70b-fp4-b200-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP4", - "runner": "b200", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, 
"conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 16, "conc-end": 64}, - {"tp": 2, "conc-start": 16, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 32}, - {"tp": 8, "conc-start": 4, "conc-end": 8} - ] - } - ] - }, - "70b-fp4-b200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP4", - "runner": "b200-trt", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 128, "conc-end": 128}, - {"tp": 2, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 128, "conc-end": 128}, - {"tp": 2, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "conc-start": 16, "conc-end": 128}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 128}, - {"tp": 2, "conc-start": 16, "conc-end": 128}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "70b-fp4-mi355x-vllm": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "amd/Llama-3.3-70B-Instruct-MXFP4-Preview", - "runner": "mi355x", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 
64}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "dsr1-fp8-h200-sgl": { - "image": "lmsysorg/sglang:v0.5.2rc2-cu126", - "model": "deepseek-ai/DeepSeek-R1-0528", - "runner": "h200", - "precision": "fp8", - "framework": "sglang", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "dsr1-fp8-b200-sgl": { - "image": "lmsysorg/sglang:v0.5.3rc1-cu129-b200", - "model": "deepseek-ai/DeepSeek-R1-0528", - "runner": "b200", - "precision": "fp8", - "framework": "sglang", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "dsr1-fp8-h200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "deepseek-ai/DeepSeek-R1-0528", - "runner": "h200-trt", - "precision": "fp8", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, 
"conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 32}, - {"tp": 8, "ep": 8, "dp-attn": true, "conc-start": 64, "conc-end": 64} - ] - } - ] - }, - "dsr1-fp8-b200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "deepseek-ai/DeepSeek-R1-0528", - "runner": "b200-trt", - "precision": "fp8", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 32}, - {"tp": 8, "ep": 8, "dp-attn": true, "conc-start": 64, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "dsr1-fp8-mi300x-sgl": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", - "model": "deepseek-ai/DeepSeek-R1-0528", - "runner": "mi300x", - "precision": "fp8", - "framework": "sglang", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "dsr1-fp8-mi325x-sgl": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", - "model": "deepseek-ai/DeepSeek-R1-0528", - "runner": "mi325x", - "precision": "fp8", - "framework": "sglang", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - 
"bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "dsr1-fp8-mi355x-sgl": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", - "model": "deepseek-ai/DeepSeek-R1-0528", - "runner": "mi355x", - "precision": "fp8", - "framework": "sglang", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "dsr1-fp4-b200-sgl": { - "image": "lmsysorg/sglang:v0.5.3rc1-cu129-b200", - "model": "nvidia/DeepSeek-R1-0528-FP4-V2", - "runner": "b200", - "precision": "fp4", - "framework": "sglang", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 128}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 128} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 128}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 128} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 128}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "dsr1-fp4-b200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "nvidia/DeepSeek-R1-0528-FP4-V2", - "runner": "b200-trt", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 32}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256}, - {"tp": 8, 
"ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 16, "conc-end": 128}, - {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 32}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 16}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 32, "conc-end": 128}, - {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 32}, - {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 64, "conc-end": 256}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 32}, - {"tp": 8, "ep": 8, "dp-attn": true, "conc-start": 64, "conc-end": 256} - ] - } - ] - }, - "dsr1-fp4-mi355x-sgl": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", - "model": "amd/DeepSeek-R1-0528-MXFP4-Preview", - "runner": "mi355x", - "precision": "fp4", - "framework": "sglang", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "gptoss-fp4-h100-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "openai/gpt-oss-120b", - "runner": "h100", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - 
{"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 32} - ] - } - ] - }, - "gptoss-fp4-h200-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "openai/gpt-oss-120b", - "runner": "h200", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 16}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 32} - ] - } - ] - }, - "gptoss-fp4-b200-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "openai/gpt-oss-120b", - "runner": "b200", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, 
"conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 8} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 8} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "gptoss-fp4-h200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "openai/gpt-oss-120b", - "runner": "h200-trt", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 32}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "gptoss-fp4-b200-trt": { - "image": 
"nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1", - "model": "openai/gpt-oss-120b", - "runner": "b200-trt", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} - ] - } - ] - }, - "gptoss-fp4-mi300x-vllm": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "openai/gpt-oss-120b", - "runner": "mi300x", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, 
"conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "gptoss-fp4-mi325x-vllm": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "openai/gpt-oss-120b", - "runner": "mi325x", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "conc-start": 64, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "conc-start": 4, "conc-end": 8}, - {"tp": 4, "conc-start": 4, "conc-end": 8}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "gptoss-fp4-mi355x-vllm": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "openai/gpt-oss-120b", - "runner": "mi355x", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 16}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 16}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 8192, - "osl": 1024, - 
"bmk-space": [ - {"tp": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 16}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - } - ] - } -} \ No newline at end of file diff --git a/.github/configs/master.yaml b/.github/configs/master.yaml new file mode 100644 index 000000000..e83df34c0 --- /dev/null +++ b/.github/configs/master.yaml @@ -0,0 +1,784 @@ +70b-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/Llama-3.3-70B-Instruct-FP4 + runner: b200-trt + precision: fp4 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 16, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 128 } + - { tp: 2, conc-start: 16, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +70b-fp4-b200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP4 + runner: b200 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 64 } + - { tp: 2, conc-start: 16, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 32 } + - { 
tp: 8, conc-start: 4, conc-end: 8 } + +70b-fp4-mi355x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-MXFP4-Preview + runner: mi355x + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +70b-fp8-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: b200-trt + precision: fp8 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 16, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 128 } + - { tp: 2, conc-start: 16, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +70b-fp8-b200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: b200 + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 
2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 16, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 16, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } + +70b-fp8-h100-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: h100 + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +70b-fp8-h200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: h200-trt + precision: fp8 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, 
conc-start: 4, conc-end: 32 } + +70b-fp8-h200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: h200 + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 64 } + - { tp: 2, conc-start: 16, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +70b-fp8-mi300x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-FP8-KV + runner: mi300x + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +70b-fp8-mi325x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-FP8-KV + runner: mi325x + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, 
conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 32, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 64, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 32 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +70b-fp8-mi355x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-FP8-KV + runner: mi355x + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp4-b200-sgl: + image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 + model: nvidia/DeepSeek-R1-0528-FP4-V2 + runner: b200 + precision: fp4 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + 
+dsr1-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/DeepSeek-R1-0528-FP4-V2 + runner: b200-trt + precision: fp4 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + # If TP=4, + # If CONC > 32, then EP=4 + # If CONC >= 256, DP_ATTN=true + - { tp: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } + # If TP=8, + # If CONC > 8, then EP=8 + # If CONC >= 256, DP_ATTN=true + - { tp: 8, conc-start: 4, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 16, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } + - isl: 1024 + osl: 8192 + bmk-space: + # If TP=4, + # If CONC > 32, then EP=4 + # If CONC >= 256, DP_ATTN=true + - { tp: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } + # If TP=8, + # If CONC > 16, then EP=8 + # If CONC >= 256, DP_ATTN=true + - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } + - isl: 8192 + osl: 1024 + bmk-space: + # If TP=4, + # If CONC > 32, then EP=4 and DP_ATTN=true + - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 } + # If TP=8, + # If CONC > 32, then EP=8 and DP_ATTN=true + - { tp: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } + +dsr1-fp4-mi355x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 + model: amd/DeepSeek-R1-0528-MXFP4-Preview + runner: mi355x + precision: fp4 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, 
conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-b200-sgl: + image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 + model: deepseek-ai/DeepSeek-R1-0528 + runner: b200 + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: deepseek-ai/DeepSeek-R1-0528 + runner: b200-trt + precision: fp8 + framework: trt + seq-len-configs: + # For all sequence lengths, EP=TP + - isl: 1024 + osl: 1024 + bmk-space: + # If CONC > 32, then DP_ATTN=true + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + # If CONC > 64, then DP_ATTN=true + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + # If CONC > 64, then DP_ATTN=true + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-h200-sgl: + image: lmsysorg/sglang:v0.5.2rc2-cu126 + model: deepseek-ai/DeepSeek-R1-0528 + runner: h200 + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-h200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: deepseek-ai/DeepSeek-R1-0528 + runner: h200-trt + precision: fp8 + framework: trt + # For all sequence lengths, EP=TP + seq-len-configs: + - isl: 1024 + osl: 1024 + # If CONC > 64, then DP_ATTN=true + bmk-space: + - { tp: 8, ep: 8, conc-start: 
4, conc-end: 64 } + - isl: 1024 + osl: 8192 + # If CONC > 64, then DP_ATTN=true + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + # If CONC > 32, then DP_ATTN=true + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } + +dsr1-fp8-mi300x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 + model: deepseek-ai/DeepSeek-R1-0528 + runner: mi300x + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-mi325x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 + model: deepseek-ai/DeepSeek-R1-0528 + runner: mi325x + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-mi355x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 + model: deepseek-ai/DeepSeek-R1-0528 + runner: mi355x + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +gptoss-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1 + model: openai/gpt-oss-120b + runner: b200-trt + precision: fp4 + framework: trt + # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true + seq-len-configs: + - isl: 1024 + osl: 1024 + 
bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + +gptoss-fp4-b200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: openai/gpt-oss-120b + runner: b200 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +gptoss-fp4-h100-vllm: + image: vllm/vllm-openai:v0.10.2 + model: openai/gpt-oss-120b + runner: h100 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + 
- { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + +gptoss-fp4-h200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: openai/gpt-oss-120b + runner: h200-trt + precision: fp4 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +gptoss-fp4-h200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: openai/gpt-oss-120b + runner: h200 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 16 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, 
conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + +gptoss-fp4-mi300x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: openai/gpt-oss-120b + runner: mi300x + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + +gptoss-fp4-mi325x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: openai/gpt-oss-120b + runner: mi325x + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 64, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 8 } + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +gptoss-fp4-mi355x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: openai/gpt-oss-120b + runner: mi355x + 
precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 8, conc-start: 4, conc-end: 16 } diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index d13584078..ababfa150 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -13,19 +13,18 @@ # CONC # RESULT_FILENAME # PORT_OFFSET -# EP_SIZE # DP_ATTENTION +# EP_SIZE echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -# Default -MOE_BACKEND="TRTLLM" - -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" hf download $MODEL -# ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= +# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= +MOE_BACKEND="TRTLLM" + if [[ "$TP" == "4" ]]; then if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then if [[ $CONC -ge 256 ]]; then diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 6bc8c9fa7..509cca7ba 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -13,17 +13,20 @@ # CONC # RESULT_FILENAME # PORT_OFFSET -# EP_SIZE # DP_ATTENTION +# EP_SIZE echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -MOE_BACKEND="DEEPGEMM" - -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, 
MOE_BACKEND=$MOE_BACKEND" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" hf download $MODEL +# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= +MOE_BACKEND="DEEPGEMM" + +echo "MOE_BACKEND set to '$MOE_BACKEND'" + SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) EXTRA_CONFIG_FILE="dsr1-fp8.yml" diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 5dfdf8617..174d67b53 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -13,17 +13,20 @@ # CONC # RESULT_FILENAME # PORT_OFFSET -# EP_SIZE # DP_ATTENTION +# EP_SIZE echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -MOE_BACKEND="CUTLASS" - -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND: $MOE_BACKEND" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" hf download $MODEL +# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= +MOE_BACKEND="CUTLASS" + +echo "MOE_BACKEND set to '$MOE_BACKEND'" + SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) EXTRA_CONFIG_FILE="dsr1-fp8.yml" diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 6b2f251dd..349930dfb 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -13,31 +13,24 @@ # CONC # RESULT_FILENAME # PORT_OFFSET -# EP_SIZE # DP_ATTENTION +# EP_SIZE # GPTOSS TRTLLM Deployment Guide: # https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -MOE_BACKEND="TRTLLM" - -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, DP_ATTENTION: $DP_ATTENTION, MOE_BACKEND: $MOE_BACKEND" +echo "TP: $TP, 
CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= +MOE_BACKEND="TRTLLM" -# Higher concurrencies: Concurrency >= 256 -# MoE Backend = CUTLASS -# Use DP attention with expert parallel MoE -if [[ $CONC -ge 256 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true -fi +echo "MOE_BACKEND set to '$MOE_BACKEND'" EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 diff --git a/utils/get_configs.py b/utils/get_configs.py index 7aec991b0..24c6ea8a3 100644 --- a/utils/get_configs.py +++ b/utils/get_configs.py @@ -1,4 +1,5 @@ import json +import yaml import sys seq_len_stoi = { @@ -23,7 +24,7 @@ def main(): try: with open(config_file, 'r') as f: - config_data = json.load(f) + config_data = yaml.safe_load(f) assert isinstance(config_data, dict) except FileNotFoundError: raise ValueError(f"Input file '{config_file}' does not exist.") From 098748283eca956be8d7255a42f006fbff31475a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 12:14:38 -0500 Subject: [PATCH 019/149] updating the benchmark files with logic --- .github/workflows/1k1k-sweep.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 2e6c3cffa..9b1c10aae 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -15,7 +15,7 @@ jobs: - id: get-70b-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k 70b) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.yaml 1k1k 70b) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-dsr1-configs: @@ -28,7 +28,7 @@ jobs: - id: get-dsr1-configs run: | - CONFIG_JSON=$(python3 
${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.yaml 1k1k dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -41,7 +41,7 @@ jobs: - id: get-gptoss-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.yaml 1k1k gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-70b: From d9fd1910668bec01a9632588e4c74caa6df77fb7 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 12:21:36 -0500 Subject: [PATCH 020/149] updating the benchmark files with logic --- .github/configs/master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/master.yaml b/.github/configs/master.yaml index e83df34c0..e085deb3f 100644 --- a/.github/configs/master.yaml +++ b/.github/configs/master.yaml @@ -559,7 +559,7 @@ dsr1-fp8-mi355x-sgl: gptoss-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1 model: openai/gpt-oss-120b - runner: b200-trt + runner: b200-nvs precision: fp4 framework: trt # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true From 78f6b8d2d0e20610fba59b43a5129606cbd1111e Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 12:37:08 -0500 Subject: [PATCH 021/149] updating the benchmark files with logic --- .github/configs/master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/master.yaml b/.github/configs/master.yaml index e085deb3f..b5743fe51 100644 --- a/.github/configs/master.yaml +++ b/.github/configs/master.yaml @@ -642,7 +642,7 @@ gptoss-fp4-h100-vllm: - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } gptoss-fp4-h200-trt: - image: 
nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev model: openai/gpt-oss-120b runner: h200-trt precision: fp4 From d808413f42ebd9c384901f6fea46e0cfa59d797b Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 13:36:08 -0500 Subject: [PATCH 022/149] updating the benchmark files with logic --- .github/workflows/1k1k-sweep.yml | 28 +++++++++++++++++++++++----- utils/summarize.py | 8 +++++--- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 9b1c10aae..129bad2af 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -110,8 +110,26 @@ jobs: tp: ${{ matrix.config.tp }} conc: ${{ matrix.config.conc }} -# collect-results: -# needs: [benchmark-70b, benchmark-dsr1, benchmark-gptoss] -# uses: ./.github/workflows/collect-results.yml -# with: -# exp-name: 1k1k \ No newline at end of file + collect-70b-results: + needs: benchmark-70b + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: '70b_1k1k' + + collect-dsr1-results: + needs: benchmark-dsr1 + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'dsr1_1k1k' + + collect-gptoss-results: + needs: benchmark-gptoss + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'gptoss_1k1k' \ No newline at end of file diff --git a/utils/summarize.py b/utils/summarize.py index 1f78caf9c..de8863c78 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -9,11 +9,11 @@ with open(result_path) as f: result = json.load(f) results.append(result) -results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['conc'])) +results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['ep'], r['conc'])) 
summary_header = f'''\ -| Hardware | Framework | Precision | TP | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | -| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ +| Hardware | Framework | Precision | TP | EP | Conc | DP Attention | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | +| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' print(summary_header) @@ -25,7 +25,9 @@ f"| {framework.upper()} " f"| {precision.upper()} " f"| {result['tp']} " + f"| {result['ep']} " f"| {result['conc']} " + f"| {result['dp_attention']} " f"| {(result['median_ttft'] * 1000):.4f} " f"| {(result['median_tpot'] * 1000):.4f} " f"| {result['median_e2el']:.4f} " From bc24be4bc587f72bca15d50c1446e50016cbc433 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:32:00 -0500 Subject: [PATCH 023/149] updating the benchmark files with logic --- .github/workflows/1k1k-sweep.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 129bad2af..3341b1b50 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -64,6 +64,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-dsr1: @@ -86,6 +88,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-gptoss: @@ -108,6 +112,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} collect-70b-results: From 7479f743f52a962706b10f9d51d8329afc5e0904 Mon Sep 17 00:00:00 2001 
From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 024/149] testing concurrency From 93fba3b8dbc302e8443aac0f6b0c7a25296773e9 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:54:00 -0500 Subject: [PATCH 025/149] updating the benchmark files with logic --- .github/workflows/1k1k-sweep.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 3341b1b50..313470c31 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -64,8 +64,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} conc: ${{ matrix.config.conc }} benchmark-dsr1: @@ -88,8 +88,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} conc: ${{ matrix.config.conc }} benchmark-gptoss: @@ -112,8 +112,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} conc: ${{ matrix.config.conc }} collect-70b-results: From d021eb3627b71f771d51efde5f723545898db81e Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 09:06:50 -0500 Subject: [PATCH 026/149] updating the benchmark files with logic --- .github/configs/amd-master.yaml | 280 +++++++++++++++ .github/configs/nvidia-master.yaml | 503 +++++++++++++++++++++++++++ .github/workflows/1k1k-sweep.yml | 252 +++++++------- 
.github/workflows/benchmark-tmpl.yml | 36 +- utils/get_configs.py | 73 ++-- 5 files changed, 990 insertions(+), 154 deletions(-) create mode 100644 .github/configs/amd-master.yaml create mode 100644 .github/configs/nvidia-master.yaml diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml new file mode 100644 index 000000000..2465ee5b6 --- /dev/null +++ b/.github/configs/amd-master.yaml @@ -0,0 +1,280 @@ +70b-fp4-mi355x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-MXFP4-Preview + runner: mi355x + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +70b-fp8-mi300x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-FP8-KV + runner: mi300x + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 
64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +70b-fp8-mi325x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-FP8-KV + runner: mi325x + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 32, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 64, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 32 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +70b-fp8-mi355x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-FP8-KV + runner: mi355x + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp4-mi355x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 + model: amd/DeepSeek-R1-0528-MXFP4-Preview + runner: mi355x + 
precision: fp4 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-mi300x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 + model: deepseek-ai/DeepSeek-R1-0528 + runner: mi300x + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-mi325x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 + model: deepseek-ai/DeepSeek-R1-0528 + runner: mi325x + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-mi355x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 + model: deepseek-ai/DeepSeek-R1-0528 + runner: mi355x + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +gptoss-fp4-mi300x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: openai/gpt-oss-120b + runner: mi300x + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, 
conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + +gptoss-fp4-mi325x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: openai/gpt-oss-120b + runner: mi325x + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 64, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 8 } + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +gptoss-fp4-mi355x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: openai/gpt-oss-120b + runner: mi355x + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 16 } + 
- { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 8, conc-start: 4, conc-end: 16 } diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml new file mode 100644 index 000000000..9ac3fbcf3 --- /dev/null +++ b/.github/configs/nvidia-master.yaml @@ -0,0 +1,503 @@ +70b-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/Llama-3.3-70B-Instruct-FP4 + runner: b200-trt + precision: fp4 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 16, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 128 } + - { tp: 2, conc-start: 16, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +70b-fp4-b200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP4 + runner: b200 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 64 } + - { tp: 2, conc-start: 16, conc-end: 64 } + - { tp: 4, 
conc-start: 4, conc-end: 32 } + - { tp: 8, conc-start: 4, conc-end: 8 } + +70b-fp8-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: b200-trt + precision: fp8 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 16, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 128 } + - { tp: 2, conc-start: 16, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +70b-fp8-b200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: b200 + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 16, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 16, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } + +70b-fp8-h100-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: h100 + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, 
conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +70b-fp8-h200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: h200-trt + precision: fp8 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + +70b-fp8-h200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: h200 + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 64 } + - { tp: 2, conc-start: 16, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 
4, conc-end: 64 } + +dsr1-fp4-b200-sgl: + image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 + model: nvidia/DeepSeek-R1-0528-FP4-V2 + runner: b200 + precision: fp4 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + +dsr1-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/DeepSeek-R1-0528-FP4-V2 + runner: b200-trt + precision: fp4 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + # If TP=4, + # If CONC > 32, then EP=4 + # If CONC >= 256, DP_ATTN=true + - { tp: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } + # If TP=8, + # If CONC > 8, then EP=8 + # If CONC >= 256, DP_ATTN=true + - { tp: 8, conc-start: 4, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 16, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } + - isl: 1024 + osl: 8192 + bmk-space: + # If TP=4, + # If CONC > 32, then EP=4 + # If CONC >= 256, DP_ATTN=true + - { tp: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } + # If TP=8, + # If CONC > 16, then EP=8 + # If CONC >= 256, DP_ATTN=true + - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } + - isl: 8192 + osl: 1024 + bmk-space: + # If TP=4, + # If CONC > 32, then EP=4 and DP_ATTN=true + - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, 
ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 } + # If TP=8, + # If CONC > 32, then EP=8 and DP_ATTN=true + - { tp: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } + +dsr1-fp8-b200-sgl: + image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 + model: deepseek-ai/DeepSeek-R1-0528 + runner: b200 + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: deepseek-ai/DeepSeek-R1-0528 + runner: b200-trt + precision: fp8 + framework: trt + seq-len-configs: + # For all sequence lengths, EP=TP + - isl: 1024 + osl: 1024 + bmk-space: + # If CONC > 32, then DP_ATTN=true + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + # If CONC > 64, then DP_ATTN=true + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + # If CONC > 64, then DP_ATTN=true + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-h200-sgl: + image: lmsysorg/sglang:v0.5.2rc2-cu126 + model: deepseek-ai/DeepSeek-R1-0528 + runner: h200 + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-h200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: deepseek-ai/DeepSeek-R1-0528 + runner: h200-trt + precision: fp8 + framework: trt + # For all sequence lengths, EP=TP + 
seq-len-configs: + - isl: 1024 + osl: 1024 + # If CONC > 64, then DP_ATTN=true + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + # If CONC > 64, then DP_ATTN=true + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + # If CONC > 32, then DP_ATTN=true + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } + +gptoss-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1 + model: openai/gpt-oss-120b + runner: b200-nvs + precision: fp4 + framework: trt + # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + +gptoss-fp4-b200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: openai/gpt-oss-120b + runner: b200 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, 
conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +gptoss-fp4-h100-vllm: + image: vllm/vllm-openai:v0.10.2 + model: openai/gpt-oss-120b + runner: h100 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + +gptoss-fp4-h200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev + model: openai/gpt-oss-120b + runner: h200-trt + precision: fp4 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +gptoss-fp4-h200-vllm: + image: vllm/vllm-openai:v0.10.2 
+ model: openai/gpt-oss-120b + runner: h200 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 16 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 313470c31..04b0b9b86 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,141 +1,141 @@ -name: '1K/1K Sweep' +name: "1K/1K Sweep" on: - pull_request: - workflow_dispatch: + pull_request: + workflow_dispatch: jobs: - get-70b-configs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 + get-70b-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 - - id: get-70b-configs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.yaml 1k1k 70b) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + - id: get-70b-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k 
--model-prefix 70b) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - get-dsr1-configs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 + get-dsr1-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 - - id: get-dsr1-configs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.yaml 1k1k dsr1) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + - id: get-dsr1-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - get-gptoss-configs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 + get-gptoss-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 - - id: get-gptoss-configs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.yaml 1k1k gptoss) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + - id: get-gptoss-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - benchmark-70b: - needs: 
get-70b-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: 70b 1k1k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "70b_1k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} + benchmark-70b: + needs: get-70b-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: 70b 1k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "70b_1k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} - benchmark-dsr1: - needs: get-dsr1-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k1k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "dsr1_1k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} + benchmark-dsr1: + 
needs: get-dsr1-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: dsr1 1k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_1k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} - benchmark-gptoss: - needs: get-gptoss-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k1k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "gptoss_1k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} + benchmark-gptoss: + needs: get-gptoss-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: gptoss 1k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "gptoss_1k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ 
matrix.config.conc }} - collect-70b-results: - needs: benchmark-70b - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_1k1k' + collect-70b-results: + needs: benchmark-70b + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "70b_1k1k" - collect-dsr1-results: - needs: benchmark-dsr1 - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'dsr1_1k1k' + collect-dsr1-results: + needs: benchmark-dsr1 + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_1k1k" - collect-gptoss-results: - needs: benchmark-gptoss - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'gptoss_1k1k' \ No newline at end of file + collect-gptoss-results: + needs: benchmark-gptoss + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_1k1k" diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 6cfb692fe..e4ec98314 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -74,13 +74,35 @@ jobs: - name: Resource cleanup run: | if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then - echo "[Docker] Cleaning up resources ..." 
- docker ps -aq | xargs -r docker rm -f - docker network prune -f - while [ -n "$(docker ps -aq)" ]; do - docker ps -a - sleep 5 - done + host=$(hostname) + if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then + echo "[INFO] Running container-by-container cleanup on $host" + for cid in $(docker ps -aq); do + echo "[INFO] Cleaning container $cid" + # Try graceful first + docker stop -t 90 "$cid" || true + # Wait until it's really dead + docker wait "$cid" >/dev/null 2>&1 || true + # Force remove if anything lingers + docker rm -f "$cid" >/dev/null 2>&1 || true + done + # Give a moment for GPU processes to fully terminate + sleep 2 + # Verify GPUs are now idle + if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then + echo "[WARN] After stop, GPU still busy:" + nvidia-smi + # Last resort if driver allows and GPUs appear idle otherwise: + # nvidia-smi --gpu-reset -i 0,1,2,3,4,5,6,7 2>/dev/null || true + fi + else + echo "[Docker] Cleaning up resources ..." + docker ps -aq | xargs -r docker rm -f + docker network prune -f + while [ -n "$(docker ps -aq)" ]; do + docker ps -a + sleep 5 + done fi if command -v squeue >/dev/null 2>&1; then echo "[Slurm] Cleaning up resources ..." 
diff --git a/utils/get_configs.py b/utils/get_configs.py index 24c6ea8a3..01e13f313 100644 --- a/utils/get_configs.py +++ b/utils/get_configs.py @@ -1,6 +1,7 @@ import json import yaml import sys +import argparse seq_len_stoi = { "1k1k": (1024, 1024), @@ -9,30 +10,60 @@ } def main(): - if len(sys.argv) < 4: - print(f"Usage: python3 {sys.argv[0]} {{config-file}} {{isl-osl}} {{model-prefix}} [step-size]") - exit(1) - - config_file = sys.argv[1] - seq_len = sys.argv[2] - model_prefix = sys.argv[3] - step_size = int(sys.argv[4]) if len(sys.argv) > 4 else 2 + parser = argparse.ArgumentParser( + description='Generate benchmark matrix from configuration files' + ) + parser.add_argument( + '--config-files', + nargs='+', + required=True, + help='One or more configuration files (YAML format)' + ) + parser.add_argument( + '--seq-lens', + choices=list(seq_len_stoi.keys()), + required=True, + help=f"Sequence length configuration: {', '.join(seq_len_stoi.keys())}" + ) + parser.add_argument( + '--model-prefix', + required=True, + help='Model prefix to filter configurations' + ) + parser.add_argument( + '--step-size', + type=int, + default=2, + help='Step size for concurrency values (default: 2)' + ) + + args = parser.parse_args() - isl, osl = seq_len_stoi.get(seq_len) or (None, None) - if not (isl or osl): - raise ValueError(f"Input 'isl-osl' must be one of '{', '.join(seq_len_stoi.keys())}'.") + isl, osl = seq_len_stoi[args.seq_lens] - try: - with open(config_file, 'r') as f: - config_data = yaml.safe_load(f) - assert isinstance(config_data, dict) - except FileNotFoundError: - raise ValueError(f"Input file '{config_file}' does not exist.") + all_config_data = {} + for config_file in args.config_files: + try: + with open(config_file, 'r') as f: + config_data = yaml.safe_load(f) + assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" + + # Check for duplicate keys, shouldn't really be an issue but with NVIDIA and AMD + # separate configs 
this will help against any possible confusion + duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) + if duplicate_keys: + raise ValueError( + f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" + ) + + all_config_data.update(config_data) + except FileNotFoundError: + raise ValueError(f"Input file '{config_file}' does not exist.") matrix_values = [] - for key, val in config_data.items(): - # Filter by model prefix - if not key.startswith(model_prefix): + for key, val in all_config_data.items(): + # Filter by model prefix i.e., + if not key.startswith(args.model_prefix): continue seq_len_configs = val.get('seq-len-configs') @@ -95,7 +126,7 @@ def main(): if conc == conc_end: break - conc *= step_size + conc *= args.step_size if conc > conc_end: conc = conc_end From 09ebb8a8887b18e2fef98edbbbbf09e3c4743763 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 09:08:08 -0500 Subject: [PATCH 027/149] updating the benchmark files with logic --- .github/workflows/benchmark-tmpl.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index e4ec98314..4f8468a82 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -75,19 +75,26 @@ jobs: run: | if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then host=$(hostname) + if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then echo "[INFO] Running container-by-container cleanup on $host" + for cid in $(docker ps -aq); do echo "[INFO] Cleaning container $cid" + # Try graceful first docker stop -t 90 "$cid" || true + # Wait until it's really dead docker wait "$cid" >/dev/null 2>&1 || true + # Force remove if anything lingers docker rm -f "$cid" >/dev/null 2>&1 || true done + # Give a moment for GPU processes to fully terminate sleep 2 + # Verify GPUs are now idle if nvidia-smi 
--query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then echo "[WARN] After stop, GPU still busy:" @@ -103,6 +110,7 @@ jobs: docker ps -a sleep 5 done + fi fi if command -v squeue >/dev/null 2>&1; then echo "[Slurm] Cleaning up resources ..." From 6c61ba9ff059f4ad56ce67e3bdaad9ebfe3aafa8 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 09:11:04 -0500 Subject: [PATCH 028/149] updating the benchmark files with logic --- .github/configs/master.yaml | 784 ------------------------------- .github/workflows/1k1k-sweep.yml | 120 ++--- 2 files changed, 60 insertions(+), 844 deletions(-) delete mode 100644 .github/configs/master.yaml diff --git a/.github/configs/master.yaml b/.github/configs/master.yaml deleted file mode 100644 index b5743fe51..000000000 --- a/.github/configs/master.yaml +++ /dev/null @@ -1,784 +0,0 @@ -70b-fp4-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: nvidia/Llama-3.3-70B-Instruct-FP4 - runner: b200-trt - precision: fp4 - framework: trt - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 16, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 128 } - - { tp: 2, conc-start: 16, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - -70b-fp4-b200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP4 - runner: b200 - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { 
tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 16, conc-end: 64 } - - { tp: 2, conc-start: 16, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 32 } - - { tp: 8, conc-start: 4, conc-end: 8 } - -70b-fp4-mi355x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-MXFP4-Preview - runner: mi355x - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - -70b-fp8-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: b200-trt - precision: fp8 - framework: trt - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 16, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 
8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 128 } - - { tp: 2, conc-start: 16, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 16 } - -70b-fp8-b200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: b200 - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 16, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 16, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } - -70b-fp8-h100-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: h100 - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -70b-fp8-h200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: h200-trt - precision: fp8 - framework: trt - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, 
conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 16, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - -70b-fp8-h200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: h200 - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 16, conc-end: 64 } - - { tp: 2, conc-start: 16, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -70b-fp8-mi300x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-FP8-KV - runner: mi300x - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 
32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -70b-fp8-mi325x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-FP8-KV - runner: mi325x - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 32, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 64, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 16, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 32 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -70b-fp8-mi355x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-FP8-KV - runner: mi355x - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp4-b200-sgl: - image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 - model: nvidia/DeepSeek-R1-0528-FP4-V2 - runner: b200 - precision: fp4 - 
framework: sglang - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } - -dsr1-fp4-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: nvidia/DeepSeek-R1-0528-FP4-V2 - runner: b200-trt - precision: fp4 - framework: trt - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - # If TP=4, - # If CONC > 32, then EP=4 - # If CONC >= 256, DP_ATTN=true - - { tp: 4, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } - # If TP=8, - # If CONC > 8, then EP=8 - # If CONC >= 256, DP_ATTN=true - - { tp: 8, conc-start: 4, conc-end: 8 } - - { tp: 8, ep: 8, conc-start: 16, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } - - isl: 1024 - osl: 8192 - bmk-space: - # If TP=4, - # If CONC > 32, then EP=4 - # If CONC >= 256, DP_ATTN=true - - { tp: 4, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } - # If TP=8, - # If CONC > 16, then EP=8 - # If CONC >= 256, DP_ATTN=true - - { tp: 8, conc-start: 4, conc-end: 16 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } - - isl: 8192 - osl: 1024 - bmk-space: - # If TP=4, - # If CONC > 32, then EP=4 and DP_ATTN=true - - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 } - # If TP=8, - # If CONC > 32, then EP=8 and DP_ATTN=true - - { tp: 8, conc-start: 4, conc-end: 32 } - - { 
tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } - -dsr1-fp4-mi355x-sgl: - image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 - model: amd/DeepSeek-R1-0528-MXFP4-Preview - runner: mi355x - precision: fp4 - framework: sglang - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp8-b200-sgl: - image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 - model: deepseek-ai/DeepSeek-R1-0528 - runner: b200 - precision: fp8 - framework: sglang - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp8-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: deepseek-ai/DeepSeek-R1-0528 - runner: b200-trt - precision: fp8 - framework: trt - seq-len-configs: - # For all sequence lengths, EP=TP - - isl: 1024 - osl: 1024 - bmk-space: - # If CONC > 32, then DP_ATTN=true - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - # If CONC > 64, then DP_ATTN=true - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - # If CONC > 64, then DP_ATTN=true - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp8-h200-sgl: - image: lmsysorg/sglang:v0.5.2rc2-cu126 - model: deepseek-ai/DeepSeek-R1-0528 - runner: h200 - precision: fp8 - framework: sglang - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 8, ep: 
8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp8-h200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: deepseek-ai/DeepSeek-R1-0528 - runner: h200-trt - precision: fp8 - framework: trt - # For all sequence lengths, EP=TP - seq-len-configs: - - isl: 1024 - osl: 1024 - # If CONC > 64, then DP_ATTN=true - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - # If CONC > 64, then DP_ATTN=true - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - # If CONC > 32, then DP_ATTN=true - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } - -dsr1-fp8-mi300x-sgl: - image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 - model: deepseek-ai/DeepSeek-R1-0528 - runner: mi300x - precision: fp8 - framework: sglang - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp8-mi325x-sgl: - image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 - model: deepseek-ai/DeepSeek-R1-0528 - runner: mi325x - precision: fp8 - framework: sglang - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp8-mi355x-sgl: - image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 - model: deepseek-ai/DeepSeek-R1-0528 - runner: mi355x - precision: fp8 - framework: sglang - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 
} - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -gptoss-fp4-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1 - model: openai/gpt-oss-120b - runner: b200-nvs - precision: fp4 - framework: trt - # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } - -gptoss-fp4-b200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: openai/gpt-oss-120b - runner: b200 - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 
4, conc-end: 64 } - -gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.10.2 - model: openai/gpt-oss-120b - runner: h100 - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - -gptoss-fp4-h200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev - model: openai/gpt-oss-120b - runner: h200-trt - precision: fp4 - framework: trt - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, ep: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - -gptoss-fp4-h200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: openai/gpt-oss-120b - runner: h200 - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, 
ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 16 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - -gptoss-fp4-mi300x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: openai/gpt-oss-120b - runner: mi300x - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } - -gptoss-fp4-mi325x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: openai/gpt-oss-120b - runner: mi325x - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } 
- - { tp: 4, conc-start: 64, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 8 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 16 } - -gptoss-fp4-mi355x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: openai/gpt-oss-120b - runner: mi355x - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 8, conc-start: 4, conc-end: 16 } diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 04b0b9b86..2d00fa924 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -44,53 +44,53 @@ jobs: CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - benchmark-70b: - needs: get-70b-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: 70b 1k1k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "70b_1k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ 
matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} + # benchmark-70b: + # needs: get-70b-configs + # uses: ./.github/workflows/benchmark-tmpl.yml + # name: 70b 1k1k + # strategy: + # fail-fast: false + # matrix: + # config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} + # secrets: inherit + # with: + # exp-name: "70b_1k1k" + # isl: 1024 + # osl: 1024 + # max-model-len: 2048 + # runner: ${{ matrix.config.runner }} + # image: ${{ matrix.config.image }} + # model: ${{ matrix.config.model }} + # framework: ${{ matrix.config.framework }} + # precision: ${{ matrix.config.precision }} + # tp: ${{ matrix.config.tp }} + # ep: ${{ matrix.config.ep || 1 }} + # dp-attn: ${{ matrix.config.dp-attn || false }} + # conc: ${{ matrix.config.conc }} - benchmark-dsr1: - needs: get-dsr1-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k1k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "dsr1_1k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} + # benchmark-dsr1: + # needs: get-dsr1-configs + # uses: ./.github/workflows/benchmark-tmpl.yml + # name: dsr1 1k1k + # strategy: + # fail-fast: false + # matrix: + # config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + # secrets: inherit + # with: + # exp-name: "dsr1_1k1k" + # isl: 1024 + # osl: 1024 + # max-model-len: 2048 + # runner: ${{ matrix.config.runner }} + # image: ${{ matrix.config.image }} + # model: 
${{ matrix.config.model }} + # framework: ${{ matrix.config.framework }} + # precision: ${{ matrix.config.precision }} + # tp: ${{ matrix.config.tp }} + # ep: ${{ matrix.config.ep || 1 }} + # dp-attn: ${{ matrix.config.dp-attn || false }} + # conc: ${{ matrix.config.conc }} benchmark-gptoss: needs: get-gptoss-configs @@ -116,21 +116,21 @@ jobs: dp-attn: ${{ matrix.config.dp-attn || false }} conc: ${{ matrix.config.conc }} - collect-70b-results: - needs: benchmark-70b - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "70b_1k1k" + # collect-70b-results: + # needs: benchmark-70b + # if: ${{ always() }} + # uses: ./.github/workflows/collect-results.yml + # secrets: inherit + # with: + # exp-name: "70b_1k1k" - collect-dsr1-results: - needs: benchmark-dsr1 - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_1k1k" + # collect-dsr1-results: + # needs: benchmark-dsr1 + # if: ${{ always() }} + # uses: ./.github/workflows/collect-results.yml + # secrets: inherit + # with: + # exp-name: "dsr1_1k1k" collect-gptoss-results: needs: benchmark-gptoss From f7d83402720d991a283bca6edc14feb55f8272fa Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 11:33:05 -0500 Subject: [PATCH 029/149] updating the benchmark files with logic --- .github/configs/amd-master.yaml | 32 +++---- .github/configs/nvidia-master.yaml | 114 ++++++++++++------------ benchmarks/gptoss_fp4_h200_trt_slurm.sh | 7 +- 3 files changed, 77 insertions(+), 76 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 2465ee5b6..a501ead63 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -205,24 +205,24 @@ gptoss-fp4-mi300x-vllm: - isl: 1024 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } 
- - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } gptoss-fp4-mi325x-vllm: image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 @@ -234,10 +234,10 @@ gptoss-fp4-mi325x-vllm: - isl: 1024 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 bmk-space: diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9ac3fbcf3..5c006dc91 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -207,18 +207,18 @@ dsr1-fp4-b200-sgl: - isl: 1024 osl: 1024 bmk-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - { 
tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 16 } dsr1-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 @@ -279,15 +279,15 @@ dsr1-fp8-b200-sgl: - isl: 1024 osl: 1024 bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 @@ -324,15 +324,15 @@ dsr1-fp8-h200-sgl: - isl: 1024 osl: 1024 bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-h200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 @@ -370,24 +370,24 @@ gptoss-fp4-b200-trt: - isl: 1024 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 
} + - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } gptoss-fp4-b200-vllm: image: vllm/vllm-openai:v0.10.2 @@ -399,24 +399,24 @@ gptoss-fp4-b200-vllm: - isl: 1024 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - 
{ tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } gptoss-fp4-h100-vllm: image: vllm/vllm-openai:v0.10.2 @@ -428,21 +428,21 @@ gptoss-fp4-h100-vllm: - isl: 1024 osl: 1024 bmk-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } gptoss-fp4-h200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev @@ -483,21 +483,21 @@ gptoss-fp4-h200-vllm: - isl: 1024 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 16 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 1, conc-start: 4, conc-end: 
16 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index c381f7c64..969d65310 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -13,6 +13,8 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# DP_ATTENTION +# EP_SIZE echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -30,7 +32,7 @@ cat > gptoss-config.yml << EOF cuda_graph_config: enable_padding: true max_batch_size: $CONC -enable_attention_dp: false +enable_attention_dp: $DP_ATTENTION kv_cache_config: dtype: auto enable_block_reuse: false @@ -42,9 +44,8 @@ print_iter_log: true stream_interval: 20 EOF - #mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --max_batch_size $CONC --max_num_tokens 20000 --backend pytorch --extra_llm_api_options gptoss-config.yml --ep_size=$TP --trust_remote_code --gpus_per_node 8 --host 0.0.0.0 --port $PORT --tp_size=$TP --pp_size=1 > $SERVER_LOG 2>&1 & +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --max_batch_size $CONC --max_num_tokens 20000 --backend pytorch --extra_llm_api_options gptoss-config.yml --ep_size=$EP_SIZE --trust_remote_code --gpus_per_node 8 --host 
0.0.0.0 --port $PORT --tp_size=$TP --pp_size=1 > $SERVER_LOG 2>&1 & set +x From 869572a2b5e4ccadcefde53558bf8bd34afc9b2f Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 12:30:09 -0500 Subject: [PATCH 030/149] adding pytests --- .github/workflows/1k1k-sweep.yml | 2 +- .../get_full_sweep_configs.cpython-313.pyc | Bin 0 -> 5046 bytes ...sweep_configs.cpython-313-pytest-8.4.2.pyc | Bin 0 -> 55816 bytes .../get_full_sweep_configs.py} | 0 .../test_get_full_sweep_configs.py | 842 ++++++++++++++++++ 5 files changed, 843 insertions(+), 1 deletion(-) create mode 100644 utils/matrix-logic/__pycache__/get_full_sweep_configs.cpython-313.pyc create mode 100644 utils/matrix-logic/__pycache__/test_get_full_sweep_configs.cpython-313-pytest-8.4.2.pyc rename utils/{get_configs.py => matrix-logic/get_full_sweep_configs.py} (100%) create mode 100644 utils/matrix-logic/test_get_full_sweep_configs.py diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 2d00fa924..ee1c8ddd2 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -15,7 +15,7 @@ jobs: - id: get-70b-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix 70b) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix 70b) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-dsr1-configs: diff --git a/utils/matrix-logic/__pycache__/get_full_sweep_configs.cpython-313.pyc b/utils/matrix-logic/__pycache__/get_full_sweep_configs.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b29a85d117b2da207801b8a1143e67830ad9f45b GIT binary patch literal 5046 
zcmb7ITWs6b89ovz>wcH4i)DwFmB_LqyQ$qaPMdaVF3!zyD~4%Or4ST}c2rB0LP|}n z4PEvyK-~t^K|93ROZ@1f1DeM^^ua)}p%{jhno@{igTO%d5@5ibC0McbVfzmyN=_OT z9YgT^|2g0J@Bh(p9$Kv?1kYzb{qM!2E`+|pjQWsm;$Z@a-y#~(niGg`jT&B~@LFE0 zj08{6TAH9qS~o%PB;6N8yzYD-t>1SRp|k;^^SUs4t}T#4i<(7E%y1v`r()Xs2vqR| zFfpw%+((o`V+&gIgiP*P+Oveed*Qnm=nO1&9t%)dz(ZM&ZB2F-650;gA;RDZt)0q&xtttVktsqUzH{hX#XN9$-KZAw_X2$lN(w@nC?giRXk^5ES){A%ndLm z$XsZ9=6XmO>`1ZpKn!&dwjPKb9fZ9Hg6bd~JrKhkgp)>R?=g19!8oUNFG4-Et1Fn7z^~s6jY6yaX&RqBdTOZi_cgks?aAr(2)+B=%69`jGJ>&XkLqJfN^!AjWhIb zSam&B^G;P3=V3gZJ*^Dw`o1@e2R!)`c;aQePn8$#$m{Qc80{c@JrJ=D!ruci)e=liEWwM{S@DP+yFDc`1kE=XW$)(nw7D&^2}%%0^|(P@d+arVgeW7 zXk-FRh#tdj$^;cY2>iIh8xxT4Cm$xHT16p66aGEL`S};IGoCDyFP58 z8GqMKW%p^2@aIQn^O_=1)~ zJD|2GG^?Ha4x#dMoATf^eF8>u)ggy^ed-f0J5Pm7N|^-TgWX|0Ff(l)Iu)L3Nejr&ay&=&z10G)fSJp zP;q5h){LL7#^*VfD)LmJ$g@wAL5-d}Fn5w#DDt2z79(UGm*y7**}$`xDmk9b%H%~h zU#gnoae=)Q&$FCR-9f_|7|Bw=E|xDoE_WYQ)$XQ1l0I{>n9HyNk8d$Nz8GK&3&kv( zkC%9MA$O&^a}IM{prP9~UUV^{Feu42G21Qe4PY+57l+wHl{^r!bG*zGC0m@ez_VwR4L_i8JwY+s>a*9hu7sLXKOcqEDY+ECQe(1!om4~l%@G{)u7NUmzA|_sjAPG;^}gktNx^; zpj?XXrlQ>)z=3V83D~Bn?HJKj6f^SpHpYALryqat^^31Vz(*#r@Ys-S zfGRJgd4ZK}2l&NGf#u4tVV0LI>1;LytIC{A<{?3{1zb!)v8RM`F(;Ea$ujwNF~`Y7 zjw{P1MH1cwjnjq-vISH@%cfe*NES;hCzH$RLS8ls=>;~GFQ&7y3DU(ubfBAL5*L_E z2rNiEgDdwi&lh>QPbl(bXvDrsDGLQBJC8u~pDLD*;3{iDbh$EH5M+}QX5n(Kd{MS2 zC6Q{imP{;y(7tpDY{-IoXqXgk7+F{1u><7W0@REyTPc);*r42}_`}O2_NW^{*`Chl zQ*E!nl(cN#=Ax`w=?b)2PIO(_#FDHFiRxcZgowxkI?lPSB^noHp= zEjR=$e@Vvmno{ajCV*@uTd6(B8PuMv)LshQ=R>#-p!ia__;`H9mUWPRep$dY^+S(* zY?6T*5hmemg!a#6CXaCoEIw=4xdUe>E9G2Xm~6Qj&leYA9pau^s6g0+%Pa)F?cqde zStbi`s_?IYN_@BpQ}9>eRdm3pHns*JqA5 zW==LH=fsIqqIJH9Aitt!dP}dUAwi94n;P$3t3z@)AGgDXEUue8KCNuU%fPHhg2XqmKjo1J@U?Ew1Jo{iC(n zO}B4NyLSARLEJIZaL<5q);BehDI z91KhF4v*n?C?X9E;WrqPLevAdDd1YsZ3U61|Jut^II?nhHFE8QnQ<4Z6<1|K*>mjxdG4+9^@AABldBnNkhRO%!2 z5O~DTE3+@b4@)J_zRaHi7GGtBm*E5EXf$6Vv+|n0Ms_&1tO3d5hxdV=bZDI}|iKIm8&$50jS=Ntj(XyUltCbxrM1mA-3ZNfI z*`nLVNt&2W(ne0>6Q*%nrA@b@yQj6Y?WvP=&)V5`ojdp5xpU3qaanME{^g&aXc@Cuen=7R 
zva5k>2doy$ix$};Tko?dR`$2(UR$y#V7b?>I0F2RQ*p98#Y!>zD=8BDTcVV(zb?gv zzeV?z-s@J}tc>kG&%I?z*}Y!H8?Zd;JQC0ztpAnm^_ER)RH;}bkHe6sa*;evLmuBE zd5R5rs${9&qEy#fS}k%(y+w9yD&nu9_*q&h(%d4ghNXFsRwmMFS(+DV@{e(P%uJ98bgs(cirpmnn6 zZY3HSj|L)v@#x91gc4Cs2ZmxwbZ|U@&~QQt3?|~kv5~-VY&4qenL(MXG=6d{Jcg$x zsm?%lpf{9^D#_j_Ct{yE|P7sU3SP$xfuT>xdi`QJvO;icJokgk?cXdST2*ja=Dsjr8L~B z=y9&MqM0fuyN*Q1CzLpv=~QGiHWV0m_~71yyS27Sjw=&`xF7AgUcP1h@$TWVjorul z*01j#SH`dR#ZL_?J+VYv{8Vfx7Qx5IlLYR8GS0DJaOF%EbImr zcB{Rr6r&CfN-0GBnDw!y4D(M_yIWa}JM_ThC;+-zvXnH;S09|O-a79O%-1#Hhwg6- z%r`W@)f{-m_PQ@!d(S*VMX%e^wR^8PY_0yci;F8NuT<)})O)rfy640_UwNSHZBNZA z*A-ix+j+%O=Wr_YVma94Mgqg8bqFkiO)j|sh}sOnJiVdp6(~jsYQJ z@xk$M-}=5y>+k5>5>^H`OtwT$4)rdIIa|B$dpPUl48P^g|j zHGu{KEdW5hmQuoc*H5+@d==Gv>?cG=OzOhNv zH{JmHrh@vW!mMuyMOfd^LrnXI#yX933IAxcdwT3FOye9Qy~ojO!}zAYt5kcis!@u9%bSa)Q6JPwK&9UV&cOjZmfl?^##UMa&{O*W~6 zcrVDC9F0EFePl!T`b~XXY4~DhRL+Xr@p%0oL}c7pf2q+7& zN)6qv1$gMP1%o~4V7HX@NV#6DzMP(Enf`0wUmB(_4QpQ-=AWkYqe^PvWCVbBWg{Dm zn@k6z^))43y9a~uob|QJH!&~|vG6NHZ?>h5eI#8QzEW(b5h!5<+FaDj6;0@6AN8_N z>t*Uy?V_2A_3|_VFItXTyD>=Pwqy8eeuZqZRht%zEOGlWb$Yc}ju7d}yIL(r+qK+a z9g66m9zRCt&90Dbllp{qON(Wlg=i98Lr5Ji3Rs#g0sI@N+hJ?A3|oV?$@T-$(Xpr! 
z7*7NSwdwSib|1Q58}c-F_GGP34Fv63dtxja&)QE%PL5{nLlY;*vNlllpaZii&3c&b zvJ&NwgO+AX)!rG3j7LO-X5OrIIO)JE1?b}Bw30tsn>VQrWf!u9@jrPdz_XUOD(lmg z9nao(sl+q&#B~4plIF7;E=lgsm3+G7%-Zu(%R4rUEAXSzvZ-e>r41>2!#fVdFO)Q2 zXA|!WHFw&+>DpaVJc!D~rM8si+CxZs(efx`?UW+fs@UWr#V*^x06B;O17imk3v3;j zC@^(klfcpud*mY4NGX+FfNmKqk>ZivfMv2r@s5-Qz1i}Iz%U#Fd$Vr>)DDC3@(3O_ z*+R*I1}w75)=<$Vx`!+72-%LP zVV=Sv=Osc8m79=yiw(FVo};muai;;FoW~@sCap~3DGjUkTZh(Kz#hu3WAujXIzn?# z-qk8tSnA9nP>N$jX4rLv$VlGRYB@&Zhg}*zYtM{;uoy}c>B+myN= zqj`W`hSD?@sBKTUA#(!NG}%YtxsL20yq`_j(sx7+#sXy;Iw+(KfSxL58-d^A+d*1r0R zB~ieNrJSu;I*-0{BP*6=*EagdJUD=F;ZT8(^tkm?OnCED`;gbN@2ye_Jb*Vk=Lms zty=DudqUOfNPF*6w<^f3dnY$C0rl}nGB&8H0w$CQ$*;L$X~Pp6o(MowIzg)P07U*H zP^tG^FXeip9#Ropmy)NGJrQN()b*0&pb{G!hl2ciS+}ZK?Pgl}>#pu@RpFY<+NoB` z!SuAp3d5!X}rEEl7yaqJkYUv?DWQ9()ElQ2<%`<>x0SFl(He1T*ngng2|xk zSF*RW> zvR``|0t=sphXi);ci0?EW^HJR8N2czYEg~=P_6w0HeW9@e8E=AI(B-iLgQ}6nk|aO z@wiI$3j$CBs87y1qVaL%^h^<}O*w|_ITdZ<6>U=5=!rC9l}-Zf1R4ntW#IZZI}|d5 zOjT!mmiqE&Q|%$749R&2K1_KOiCQr3_M7)KrK+vWT^P`^SI z2YpPjlPyk0$DxT-6q3R!hX_0XkagcRb~=0<#u!izC$rAc#9(AJnROwR8D2z`Y;8Ds zA~7*K6jq>543CT^jz>nrOyg7%PDaRhV@%1G?Pll#biGGW5=zz=hLQz#9O3BGW22Ec zLuBQb=z$esJYkeoHkdd$7Ez*E7xZ95_@1buWbO2ES;uH1GK4NeV%7nbS8Ock&)SpG z(P3ua!8HHb(vyk!6VcO9aSfixItL?TK!2h5Vk)fxdSEGiA;*P&9^tRH$#|#F7|FS_ z(O71SF>;3jnPyJ?>y5W_3U1CyctJAz7*d|dE49H`sy|5rd#0*e0J!eX&pL(?3Qd1U z!#LYT1jMNL>M_swIOgR`ch^pLQYd<*#1VA;#NsF^{Yjan(*IljDd$g|mh$@P&abv-yj>}&>!P=7PRjYE zrLNCTinQDqPrfL1O?Mh|7-AyFSKEy_wAclyD`ob?OWw*en?LhRN@|{tFp%~(&q?Yp zBQ;MwBVs&oL2CYNQlzDGVm$ex)O=>MF^3jIQLQ@EJRK2vL;%TXCV%79J@dZ$l)G&@ zn)bELxz*pSyY0*t9y_lG#ciZbM@1~16BEf9ciSzkx+&%EI@^)kdq%$+=T zUJr`fNITmhV(FZiNY1#s=6$!Y>V}lNeR?A8YoBwgzgc(tnXNo_UJr`fNSmGzv2;#M zBxl_1^S;`YyJh;G7k19MTljC*-Ew9V51!Y9;x^Kz?-8+dPD~_c+%31V=9-i{Fum!8 z9dqsg|INAsXEyTSc|9m@BW)U=q-IX%#6)t&9hmpkrQEI4p%-?|xm)>f*4>KX!D8q2 zpty~+>5zz}b7CSn<8DQ3Ojp0Kose%M1Nj)ZUB^H*2%I{LZ_*K!hZ(BftjaFByWXzB zc)rqTsj8pzwFAR4zRoG>lE40okAMF0v$cQJ_?5=Fmi}~0f2L(q#=m*0blzJ(J)HJ} 
zh^fDf)IRmh^stEYzy(SbY3ZC8PrfL%8%iOEAtnlerWh;71QcCL2F@4LBXwlW`$FilDUdr_C^ip`MGQ%^> za|^?PWpfHuwCq(C%%w1ztq7Hxh$!@TG&ff;6H!z`!7O26J`*bTSswNzi#Ax2MLx>` zU|8ZGWFH}4tmqJ|M`_6(FN=HRDj__Z(prcqpY^Ak<5+b_0*?7}l+St@^LEme&owu_ zpM@G(`-+14S(HLQ*T}U(`3yz0sq$I>Li0W`Q$C|l#(pL@+-_$YfVaAxM=EJQM6xLE z8MXu)Cj-R)b78?rEW2Z4qfw(xVX{Zri|V1CQQ1) z0G7+vomh(Y7PdIw$(GWzgr()4LQ62^mkB&Z;3EJ7e6`F>TFKy0i2!7axsiBM8Kk5k z0#O2M0A}255v#;0!A^kH{Un7>5r9C}qKpwx2qX!N6PN%9Rw}r&u+9x*ahvtX96&LsBgydD%R5z?j~5i#PM^q5En z-!$*7NlC5KThiXvIZ6G^O082*^VoSkC~hNddW(pqb7CSHncltXu=utnVWwyK>vd)K zb=b|VO@D+8AMw6uS#Jpy-D3^e?y;Wq=CEYehrEI%b1cG=v3)Fs+n$EJnA~7h`?y4w zS{iF)&Z;H!iSzft{E>yHK24ZTo?3l^)mnx%TgWV}Mw`*aZ29!krYz9H(jda6h6&iT zl+FSzn^Ukqw2OmTZ?v2)(R?HtUpOuFKRU~w%8kTqgTt1{;`2AuDI!aB^U$kd$hMKsIL~3MqgFH zI&>u%MQpL&7G-)HXd$$^xy`U%${Z&|DZEvs;Th(+g*D}}IZL&+5_XqhOksEFQQ6|M z72uO#>9jgIKE++YslP3VI{25hOgG2On%kJEVVL$e&*uI%X0nDc!^{y+uy%3-V^_yd zM8*S5`xM~XC)zs-ML;Y*5}@6INwBaA?JylZ-81Qd!r*u;9xa%E&DqmRb8kQXUnBNy zr?m~MLfYAi6=OTpsn}zzPqYsenaMh|sZ9GM*-C3nn%GWRUUUR3hdTLMbEIgCjkfye zEc8IIh-+Ky&{rgb*lURmob(!5ul7kQLzpBPOCazp7xkBHABo1HK~Z*QcP+vq(gw>g zZ#rUKdDHc5=X#u-J)(lIq26Q=;AfY?Ii@NPf5-p2Xa0*fsU zu+IrQbySV|4kTvUO6ocg5~+FKVbl4{gsbWXn??H2s}&RV+1}< zz>M$z9o$F~{mITy_>WDMVtZss)k?KCz6Y&;~OP;yo}{DITMEZtPYW(qFJQm!tEJ`l>J^HMXH?b6WHE@HchYFsPAih2pzu6@a^ zYTCy4<+iFhLJmmVoXMhIE^Wgmptw^m2|4e?l5_|tdkC`Tkn?cSA-oNfycOqcZxb@g z+fv^GWYfe%=nU1PFyCOT4W&pZk#~V97Nx-6{=!Oq4@?33JRuL4_F{WcRpQ@5 z-fa*W85XLTFAS|=x=024%_Y45TVF02%9yvHV7`~lX^;gAscWrVC#0^W24g+Fx6zCF z7y3;sA2comEI&!YvE6FybB8{y8mQ%t(P}Y9QK2?QA@$YAES9ir%!1ixW40n^%nC{G zhjh#~7#4M=OH7S3Bwlc+kTEM-bVXxUl){*8SQhJqsAAciLM(C1H8mC33TQpBSsk?{ z#6M%yHnLHBV8x?$6&tlpM}NUbt=ydWF{faOKfyLzP%n(LK)f>K3RN;%_A`McV2eXu zEj9#+w4ra!DO=0Y&uuJ{TaW&aP~}FOwguNv>b=r7UGO1k)arBH@KLn_Dp;k}rTx{ZAn63-FkOzj;+6ZYhwfM*Mbs!{K1Hm~p@`Mmb3ddMvGb-= zMA4!eb{C}66s15BJD0`k4yv8@gz2xPPVm+-f#61b@+I z1_Nii=EFTlf|9?;DY?*kJBP6X4UqmjClp*edABZF)Vvg#`(p~g`WVLFgQF8e(V@Uo zC!%qZLMCAX4$FG(PNxu6`5$1?U@n!IzFav&QH*FVV)nbiP7- 
zm~xXVryxw>x9y1C8*4|^;dmlEG^W67i7IUyp%6Le zP)H{QLF6A(m_V>dw}4ll0`qpBhCo0~+XBme8#jlKq7>QDZ<#b2*WbZy;_t{-(c$U~tsW*DItH!&P{to!+9=yhX7#qyvSxi|)$W)5XLrA{@wF#k-SK;o8GlNuPp#Sw zkzC5xk@0q>q&4R%7)X29%t`7mBdtk!VU4Y)AbdeuLoz@$Eu9nN$rn+~*I~>eq9`nC zMD6D)L@p6P^7pE{$UM4s+W*3il)Lln9tP6B&N;XGn{{`df$b{A&g;Ppo7B!;wbGt<@5kT^# z>NUJgs$RGgX5(jD7|6FBhZQzil9TPY8iY0YS(qj3CgwcMjKr%`?(XSRUs^?fR;7L2 z_~sPSnu@27LOA@si>D!kP7KRT&>*|ELG#4KUl3X|aaV;Z?#dHU=Lop5 zvGiHd@=BD# z@~V8<<&`c*s!+w8rSPNWyJ-f$RSgG}(`bJ&M_q@v%Nq-wx7_z6Yu7F#YhSR~a>1n3^rDHsoTv$`)k&XH=W~K7fINxv^bkn6}%38UG!>-#Pu@xuKc;uRJ+( z{}-S9O6OE*%74eDs`{B{G7WdW{LHzQ>EV|S&ODQDxHD6=eX3-BRr6Wj^ud(WF?}#i zQ-=DRl{%(Qix>}F$Vp4u1y_l$N{ zH%HJeDA*0tJ)=FExnNVLdp<$;$fE0JZFKN6y5~2gdo`1OM)w9M;KTi-PWB3>c-Kp@ z9~`5;n?`-sx{><_UW@`NuK}|;ZTSX;bQ-68lj8pgAh`&Isb-5OopA71&UVuKFI+C= z=ldDe{NDgpMqlRLb<_P11Z&Io2BkqFD%vH8 z%vGeCy0umv*!RK&R&_iM7xg(Vkzsyn*yCjnnc6*ZclRMX8E48qxoR1cL5E?FmqRc~ zPFYHvQXQ!XBjN)0cxl?%5nVeguWVNhIdPS=hPxpLxO!<*B4BEGj9oT2osknQx*~!m zN`aB9HME53x9A45%jOhT1ah6a$E!rHmm5MQs!qqX0-cV~BxyJUiEtzII8Kni=U~ph zil!pDiD0}`E=l}t$jwZnhqaa%0uN<)V{?62HQb+A<>sS*spFENjCp$qo%^ym#XdDT zpiTuIxkYXj#=RvgXb&;110K1ZIay$lc?kg0Co(C zr+&V!gMRg0^Q&6tSGB<&EzmyS-1b&m`zw21-;-|KkCP2L>v8Htwf~AuYOln}290Gn z*`U$|UkmM)njkrAc+;QpA6~gw@JAmsuL$$r>NAn&Pfz>P-atwsp&x-X$sg6qxB@(*I&}xtDn_FVFE_ z{$brkBU)6OZkd8!Z$a~(7Nx*=7*}`Zxy6*XECvZ?LF}T*_g)SI4t^QxGT)VD=Di&F z#J!heuY!NFpU6JnT00aQX6IEw#FXodE7xh4y4%)lawM-?Tc6N3+2(9gAO@clVAsen z-LORgMrDZ#*}_;LE3rL0tfC(z%w(33RW4$8ZFE=Nkg!TsncCTg1P6Ce#in%Ju_)xG zZh)QC1HsixltOZ(BxDwt8rgKH{4T=TJI!Z3(G+1c1)U?7G}(8sgXTbbRle=u;I}$o zd+&4dv73C8_aNI8W25F<+`MF&B|wB~Q#XQ<}dg zsZbHV%dmDaBS;^or{&4&X$h+@QuqR;ju9SMd9d%s;zUs2>>6{@W0vK727 zmAVZ8v8r!O!Dn3M^S^?9Nc=Y|HB6lnF&?;(lLjwyS`OTpuFT7fDC)R9ywO8+eJF-E z+akxUuuO-yahqr%@@+WxNMQFQoR5-H>|rz`7hju=Yh|8pe-|KWHt1F@7K$gHi0%WVhIF%M| zLkG#IN{JnZchDp%f;Jw>j%aU- zxOU0bUL0Wfpg8<6U|H_yLw(LC|892_6lU{1a1_y=`(PX@9JirES4_iP5iR@LwXEuMkA3>F zm$s$4cVs*}FS*MHtM> zexb?9nBr_pIvH~j&k^FJOLV-Q{^=bL#G}P$-o;NpG^m3_4LDsDI&HZ$N0lUL#m!y) 
z8P@S-T2B41#xOEeB@I2h45*s-#v}H{QFY)1N7co1jM!CVcB^i;49Lt5^MsXaEY6O* z?U{6IA#&2K&?`;WsY%-cl(vm{S(>D_1&$4vi45}aI#{A$3za47cuK)>mfR~AA&~N~ z1pX(1Ep)$xUPVXd8Al#6D?2FbPJsOLK`R(yt1lU5=a056AXgltjhV&>m7hm_KxKc& zxw165zr5tBIJ0&-dEV1@_Q`p-=X3j?-=C`LJ?~ymDBNzc)_rO4?AlCI&myje)v>Vg z_jZ11=LOFiF&NgSJZs*mMvWIdZP(wavXrgG-Rnv0XtaF3a*xyYMqlZk8r!#=u02)7 z->Px#=~!f#+*|N4S&oW=>e?Q(h77~RWZ2MzmUpScM$lVmXz@;TnGPEo;OsI#2vk2x zf(|xTY598{AdeM1K*}bWb2Afak0C;oIf5hP**UE^O}=NcgnY$wf=&Ykt}<-Ml(k0S z7XsM|86A^#Ct(c1h7_IeYOG(mOm*_n!-j~m6$z#TXq;}?30wonA0Jh0d|)EN8N4`B zn=eJVDT4{o&&&5LjgrrzRz6Bf)BgV4Q39O)+`&&Dob|6cxBI-i=RJ*+ov97?XFP|* zFd49>Jcn||$$<5O=a4#3>i5`eZ}gV#sj_{`=Gs$H{H-e2p4QxfLURq9Ps-Uqks){F zPGDsx5Wbks$ByHqWffH&hE%)mUKslo8Tk#on5Awy5V1-G;2CJ7KQ$X>Xeg+oy? zO2~0LOTEUfBg7u%T{x>&%roS7NgWXMRqQg%7=9W5^f_+Czr=jtSLcJ8$vcQn8qs}d zVr(=v7{R&2noJzt-i zuTRU@H|gfsxPgr#E!J1aK9>7X?_}d%@xHtw?HCGrdp3Xuf=+`FM4xkhjN+VYD$bi| zvc>8)JZs*73HdxCMIE5}#|T_=?}+StiR*{FaKgv%6jyczu7S{QfD&rSeuo%)qm$ zPV=Kn&s^Z*?#p=Y7bEhJb}qpfuJ7TX(JTzee3<4*8#v^>f;_vvvg zFzk^G<7+X)9>#;gum=+Zji(%Ltng8UF}TDis>JxLG>oFkB}P%DI*NP?Ck^$?64i_W z+!X|WobTr2MtKv>qWlek?+_qar9#sjSp8NCw-IP3&_SS+zzl(V2#EDXp8Wz}N7NnS zj|z0sD3LZ3@-MHvQ<7?4U-rjLI%@?)Q(eOVH^}7fG3&sB$4gd3Ajbp#!0-7nMgln`2WOSTU@myrq z3ak6(mPO{g{G`xH{x(Nui4pCNw^UP_R;(hHHKievzhYCG#Zdj;jORW$z@ZmcO+ z+#oFnFSP_MuL8$ZWuWC%OVIKvm6lhlW@-imKYsFw?&Mfx5PNS6rQ!^|v3a$T(3UB$ zTPRd00{b)s?h?;dN%!K&%g@=-C@ac^**kLu?-Ddv~C^}b#jC?n12bJecNFt zdoL&1dwB>+y^J0Q-a9?sm+|ZuNO}NBy8ohQze(YSFQ^{8q1z0&2WwzCb#1jgPJ1okI@Hr^(d3>yEU7$;UJ;d;SJOMAbES7W4#sVzYXml?%Mj^Xz+Mbsq~{6&tqZ$d;|mFv5TI^t}7K7n5trv#>ja9a5Lt98UIIP>eggl@?h>G}!}SvXLLmYX)A zU9BwxfWiTw72tr?54gS=V2`!Y;rr?KCWnvLMKuiKzqn8vyRL1=pBvRhD5WE=U}@IF zx^`$JdUph(@7Ox@++;n&{p1A1iUDkm4@7Z0Fi7`v^$ubxOW}>kU3uQF0=O<0OZdeW zAKXoO!F~sCTUzLL`M2ECf^VjLDmH#15Is3IewqlLCaCC{bg{HTB?^UTUV}#K2-34C zVu55hi{UeLV~W7@0H%ssF1Tjzr%C0SbMTxCt&JIaiDZ2R6+#?W;hS=D|ElZWbY}by zENy3<%N+7E@xY1RVsi*pG=>+pScu=G__e$65qYa8ZMH7Pd~E|`dF$qD;`-U83D4p= zdw@)~Q&&@wMYdBHLyYKu(3*HIB(eP+Us|o!UeI>x8g}e*?a`+Vc2#vttBy<@cT4Ic 
zE#`ZLbnltkyA|ZEO6-<_T~(bfhFrv7nlN*g>s7ldm)syXhFt32n)m88auw{Vrbkxk z+X{nCldX*YvvEI)gJePb!S_tS0e5`xbb#%L=uy6nw#k>atfjPd1oRCf%mqy~rLBm+ z`CBS63;;PjRdJd6LJ1J)CQwhHiogbdm<5Q>A1VwyvJQIdWWMPZ+o6c9*trVbd_wao z)!d9(&2;lLjW@QwRRLe-@|}&W!HDRnTS2m1W2Q~8obcplqH&un!sC|TBC`ASjp@b% z3pu8nwnYTzR94i?Y|hm5y}bEsWV-XEl9|ovn!ZeV|CA$JyDKH_O4aU4dw0!A>Tg!s zH8mk(Ja8c=Ev?6J1I6=lBZ^Ai#SX^Wes(y@8{V?L4U6n^ycd*i5gG?ovbw{OD4oY- z30+nIN8)AYliB$$N4Yv@3B7U}=U;}#)!ZWjD==L}v7dDyHX_MjHDYp|C2lM5XEdwo_}_S?i>z-4oa4G40J+6rZI ztK1eUQz^Z71-k}?uVGDRj9rvyg}56n@f#ye25R3v$rg9q;Eu=^?G~g3E!WD9C7y~Z z`VaFLa2G1EQHvkfYya`w(&4) z*l=PP_HEHIzN7G-+(}~b?1MC2JW$YdvFG)>|6fiQ-SDK9s_izq(wddJX}ZvTYatA0 zScWus($Zq&C95_kD=&#G^WLg6N1i`TXFK1`fboFmS)9lWcOT98!CSr-q{Er-7H2Yx z0CJFD4}I?07m#wc;}ip57=OKu%{|8uLoTHJ_m_8n$^C zJdTUq09u`(DCA?;3X0NaXc6ws8G9OTw+L@U?lcNG;{I^A$cPm5OfnN`?9#w-KR6#I z0QW+4kPmItU^~ayVLw=HXlMN}i#G^K*>ZvbM)FN)`hb!`b8A_f5LV`R0t-`F*elTd zZ}7I1q$pF*PVoC|QScn4K1<-^1U^RKlK_k}{Tq`m!flKPvK`cvLE9zAaeXDGMGB?3sk^g*r< zpTK0BIX0?f&&kN_OCY#QvG~kVp@f)yrswi7w%;a~Gv>d-gzc{^fbEA%6L{;f!`cT9 zJLI~;*?ygf%JsSXzBN**{f(1tR0A0P1ujsSP(u)2a}z!+`ssLzRY#2uP?~hZK64?3 zCXBfgjmr)t(4RtsBJW!CoIJf0F=Ba_u@r zC1{0*l`Cp%Q3`F{`2lHbFY~VXYwxM8 z|HaVOT{pe0#Zf>jYHLvnZQb?3XzP(yRX|WGoU7-Af~z&BL!=tg&ZiCtql$p)ezIUu zfkCFrL_cF6OOt*x8ZEn?>_Z4 zr|Jfw!bj&w`jCS=C1o;oA^afEquepkLSeU`%8(eHGHqZIh$M5|NAZn39$aw1;$Kn$ zTQWLs;FXv3Od9kqBTw(69%StFXXuTl2$1%UxtdV=>F(Dlx)GryyjJFj*THjhhMS8~ z7H?ZqG0NiEe}me624H!qJ9o@9FI+THX!q-V>BfCmitXgANh0GN7a8wVkR;y6jCWQS zZoFgW!yS_N>V}v4(@k4mb!M8jWE!?)shJ7!eI@j{r z#;@1BzWeKqUp_d!n|WD1m2TLVsoFnPLjG58R<5_;v9)sg0G;N3ApbOXh;ZmMck#P` zW8L*M9O16TaAW#_$R+~FK@MSWY4uHKi>CLdq_*k(SiH?i>TgzRqj%F25WbL;meymq zF}*)8Goq;D-K!o;<9;g~&hDfC$#x8lzgpuS*d_hdwi3X<_PHOhOMl%~LU6nLffDJz zx=R4f&y~LyimN4@VK|mN8j?17GE>}TRz^4J z3tuVLSXxhF4Obp72}w8>$$sG5q_jCkQ$M>x?qjW5STpNO{Db!Xa3;DI>pD6?ekDDK zDb?_nragEpAfAd@iZ1P$?40{$dU9cDv>ohhSM-+PftGvAkXR>)w*&qfN}1!CD22Cz zBxFTzRY=?EJ$55)x1coMvTE6H5~~=wT6IfWdhEU8q(}Bc#^XNH!7;!HHELw=`&V+T 
zOUn@wPHl8tWq$w2J?zcK)K2ZG+ISMUfZkuO-Ht6Njn=MRu9W_L%jVSkrA|Ep@IChn zq&(O@$y`Embsmni8=Dv>!x851B5<$#z(4@r zYPe%I_+N=WJr*4#uWbCVwL)Asv2B;!%HwemG35yY0|Z71oFwoNz<_c;1rHH8PT*Yv zhXHbAK!o5-2DAqOgA52Fr?}O)(@Ix{Yy71u8-iAXv;rd*CL6L2K;Ot7N^Un0ehGiF z)+ZK$QLHh zwtQ8ds|luRf?wONN{3c8=R3*dTU|XUYij<|MmIVL&`i&E!v?Did%4L}L3xV6(*#Zv zm?ZEFfhP%^BCw=Lh&K%Dh%Zy-uK)xq3}Pahj=4L|U!%}R39!EXG=*4KGg;ARDf%%2 ztZSd8&~pSnPT-RO>?ju16jT4hMMjm(@+POPvjG!`LvNd%C7E9-4Y`#MtY$5J~UNCyw69(*(%e2k*O z$I`yX=G^LU7AZK7kwwoN!WouRWD^0T(>veFx@%w9m2y9l+ID}s^U#|o(w&b`tn-ny z?~ysT`kQq>a%LBgo!5gImQQ37fs5`(u6lvRe4i9aod$w#&48bsVGy8ch8GWvCPrd|xjVZcDCDLX zdr`HiDF$=hb@O+W$k@xd+#y$aN^_@B?4$E=NM8-j^HAYKwWF%=g}5qCyGm+r*y6g5 z2@30TEUp_qfEL#c1`(_2@>Uls%8f#Jw@Pj@2=BlY@yk$`sUXx;c-M@@=k!Rk!Ol3K z+I|@^aZE{!;GpFsaG1MQ7e>Z(Sl_qt{6cn^1SlW9?#fk!%C}G~YwhDZO&s|uHl}xE z)jv$tzea#<3i$?wh6qFf2KcF00gP?tMHV|v3k<~)J4aThq}8d~)oCxz!&QH?(rPmF)e{iDkdv0y zW4M9hdASiqC4A?|H7EMwjf%3ps}|WgvM4QM4sdS%&Ji*tdyllNb{Se$cf+)-p3$;~ z99q`+0iX#F=x(E4pyaTQF^`fuVvNjFB&)~2MjsoJ$^@7g&@{mn{iiJ<8T z2w%uaOY1S*K=HiXh@uis(3HKzJ)%*>CD^j}dlTHPCS};1RY#;QE@9)qbf|=BarG0F zA!#s>0heeG5x`c8dUld%gYhI$Lm6thyvwLvT{frC-Ux=NR3?T=KiyO4rww%pErb4O zJXPr03=(@VOmS?d7@NWOPlbx_M1d8aARS=e-*WUhgT_MAo@=I2H-5j|dQ1@4BSIUc zm9|(+^xb;h=4_#`z|{0Ed7Gondqk9i&Iy~}ROZwpx62*k_`I@Xg}1fP$>g^6*r#)t z)6U6*Z1u!8CJ?)&ZGwx8J{5tt##nrCbYcjy%jhYbIuRRYo8y#dWbg#ij-L(~n6rXu z{bj=U%Y6Xvg?UwWgGMlFuLLQ?==um%dV)ZV0PT4jP{I_95MV4Utx9zk7OUo6ln^Ii zCqOn}23cS3R*5{(-?HXk`d+%{sM==7hORHnuPY+nz9Q>3JmN8hLm5xuuM#p3EB*WBfU6^}#?};RnQH5kCJxY|h z&CZsHB=}C{X&mkUmEMmDz_%cjmB!&MguInzixSCf@mOU1#AxieLbG$$KN1}e4^PlW zpr@iZIvpqqHz`RzkZ71^i-%)Rk54F3h2}1WD5BCsfYcFu 0 + runners = {entry['runner'] for entry in result} + assert 'b200-trt' in runners + assert 'mi355x' in runners + + def test_model_prefix_filtering(self, temp_config_dir, valid_nvidia_config, config_with_optional_fields, monkeypatch): + """Test that model prefix filtering works correctly.""" + combined_config = {**valid_nvidia_config, **config_with_optional_fields} + config_file = 
create_config_file(temp_config_dir, "combined.yaml", combined_config) + + # Filter for 70b only + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + result = main() + + # Should only have 70b configs + assert all('70b' in list(combined_config.keys())[0] for entry in result) + assert len(result) == 3 # Only from 70b config + + # Filter for dsr1 only + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'dsr1' + ]) + + result = main() + + # Should only have dsr1 configs + # 3 bmk-space entries: [4,8,16,32] + [64,128] + [256] = 4+2+1 = 7 entries + assert len(result) == 7 + + def test_optional_fields_ep_and_dp_attn(self, temp_config_dir, config_with_optional_fields, monkeypatch): + """Test that optional ep and dp-attn fields are included when present.""" + config_file = create_config_file(temp_config_dir, "config.yaml", config_with_optional_fields) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'dsr1' + ]) + + result = main() + + # Check entries without optional fields + entries_without_ep = [e for e in result if 'ep' not in e] + assert len(entries_without_ep) > 0 + for entry in entries_without_ep: + assert entry['conc'] <= 32 + + # Check entries with ep but without dp-attn + entries_with_ep_no_dp = [e for e in result if 'ep' in e and 'dp-attn' not in e] + assert len(entries_with_ep_no_dp) > 0 + for entry in entries_with_ep_no_dp: + assert entry['ep'] == 4 + assert 64 <= entry['conc'] <= 128 + + # Check entries with both ep and dp-attn + entries_with_both = [e for e in result if 'ep' in e and 'dp-attn' in e] + assert len(entries_with_both) > 0 + for entry in entries_with_both: + assert entry['ep'] == 4 + assert entry['dp-attn'] is True + assert entry['conc'] == 256 + + def test_step_size_default(self, temp_config_dir, 
valid_nvidia_config, monkeypatch): + """Test default step size of 2.""" + config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + result = main() + + # For tp=2, conc-start=64, conc-end=128 with step=2 + # Should generate: 64, 128 + tp2_entries = [e for e in result if e['tp'] == 2] + tp2_concs = sorted([e['conc'] for e in tp2_entries]) + assert tp2_concs == [64, 128] + + def test_step_size_custom(self, temp_config_dir, valid_nvidia_config, monkeypatch): + """Test custom step size.""" + config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b', + '--step-size', '4' + ]) + + result = main() + + # For tp=2, conc-start=64, conc-end=128 with step=4 + # Should generate: 64, 128 (64*4=256 > 128, so stop at 128) + tp2_entries = [e for e in result if e['tp'] == 2] + tp2_concs = sorted([e['conc'] for e in tp2_entries]) + assert tp2_concs == [64, 128] + + def test_conc_range_single_value(self, temp_config_dir, monkeypatch): + """Test when conc-start equals conc-end.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'test' + ]) + + result = main() + + assert len(result) == 1 + assert result[0]['conc'] == 64 + + def test_different_seq_lens(self, temp_config_dir, valid_nvidia_config, monkeypatch): + 
"""Test with different sequence length configurations.""" + config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) + + # Test 1k8k + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k8k', + '--model-prefix', '70b' + ]) + + result = main() + + # Should match 1k8k config + assert all(e['isl'] == 1024 and e['osl'] == 8192 for e in result) + assert len(result) > 0 + + def test_no_matching_seq_lens(self, temp_config_dir, valid_nvidia_config, monkeypatch): + """Test when no configs match the requested sequence lengths.""" + config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '8k1k', # Not in the config + '--model-prefix', '70b' + ]) + + result = main() + + # Should return empty list + assert result == [] + + def test_no_matching_model_prefix(self, temp_config_dir, valid_nvidia_config, monkeypatch): + """Test when no configs match the model prefix.""" + config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'nonexistent' + ]) + + result = main() + + # Should return empty list + assert result == [] + + +class TestErrorHandling: + """Test suite for error handling.""" + + def test_missing_config_file(self, temp_config_dir, monkeypatch): + """Test error when config file doesn't exist.""" + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', '/nonexistent/file.yaml', + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(ValueError, match="does not exist"): + main() + + def test_invalid_yaml(self, temp_config_dir, monkeypatch): + """Test error when YAML is invalid.""" + config_path = temp_config_dir / "invalid.yaml" + with open(config_path, 'w') as f: + 
f.write("invalid: yaml: content: [") + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', str(config_path), + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(yaml.YAMLError): + main() + + def test_non_dict_config(self, temp_config_dir, monkeypatch): + """Test error when config is not a dictionary.""" + config_path = temp_config_dir / "list.yaml" + with open(config_path, 'w') as f: + yaml.dump(["not", "a", "dict"], f) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', str(config_path), + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(AssertionError, match="must contain a dictionary"): + main() + + def test_duplicate_keys(self, temp_config_dir, monkeypatch): + """Test error when duplicate keys exist across config files.""" + config1 = { + "70b-fp4-b200-trt": { + "image": "image1", + "model": "model1", + "runner": "runner1", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [] + } + } + config2 = { + "70b-fp4-b200-trt": { # Same key + "image": "image2", + "model": "model2", + "runner": "runner2", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [] + } + } + + file1 = create_config_file(temp_config_dir, "config1.yaml", config1) + file2 = create_config_file(temp_config_dir, "config2.yaml", config2) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', file1, file2, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(ValueError, match="Duplicate configuration keys"): + main() + + def test_missing_seq_len_configs(self, temp_config_dir, monkeypatch): + """Test error when seq-len-configs is missing.""" + config = { + "70b-fp4-b200-trt": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp4", + "framework": "trt", + # Missing seq-len-configs + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', 
[ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(AssertionError, match="Missing 'seq-len-configs'"): + main() + + def test_missing_required_fields(self, temp_config_dir, monkeypatch): + """Test error when required fields are missing.""" + # Missing 'model' field + config = { + "70b-fp4-b200-trt": { + "image": "test-image", + # Missing model + "runner": "test-runner", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64} + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(AssertionError, match="Missing required fields"): + main() + + def test_missing_bmk_space(self, temp_config_dir, monkeypatch): + """Test error when bmk-space is missing.""" + config = { + "70b-fp4-b200-trt": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + # Missing bmk-space + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(AssertionError, match="Missing 'bmk-space'"): + main() + + def test_missing_bmk_space_fields(self, temp_config_dir, monkeypatch): + """Test error when tp, conc-start, or conc-end is missing.""" + config = { + "70b-fp4-b200-trt": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 64} # 
Missing conc-end + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(AssertionError, match="Missing 'tp', 'conc-start', or 'conc-end'"): + main() + + +class TestEdgeCases: + """Test suite for edge cases.""" + + def test_empty_config(self, temp_config_dir, monkeypatch): + """Test with empty config file.""" + config = {} + config_file = create_config_file(temp_config_dir, "empty.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + result = main() + + # Should return empty list + assert result == [] + + def test_large_conc_range(self, temp_config_dir, monkeypatch): + """Test with large concurrency range.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 4, "conc-end": 1024}, + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'test' + ]) + + result = main() + + # With step=2: 4, 8, 16, 32, 64, 128, 256, 512, 1024 + concs = sorted([e['conc'] for e in result]) + assert concs == [4, 8, 16, 32, 64, 128, 256, 512, 1024] + + def test_conc_end_not_power_of_step(self, temp_config_dir, monkeypatch): + """Test when conc-end is not a power of step size.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, 
"conc-start": 10, "conc-end": 100}, + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'test' + ]) + + result = main() + + # With step=2: 10, 20, 40, 80, 100 (last value is conc-end) + concs = sorted([e['conc'] for e in result]) + assert concs == [10, 20, 40, 80, 100] + assert concs[-1] == 100 # Should always include conc-end + + def test_all_seq_lens_in_stoi(self): + """Test that all defined seq_lens work correctly.""" + assert seq_len_stoi["1k1k"] == (1024, 1024) + assert seq_len_stoi["1k8k"] == (1024, 8192) + assert seq_len_stoi["8k1k"] == (8192, 1024) + + def test_multiple_bmk_space_entries(self, temp_config_dir, monkeypatch): + """Test with multiple bmk-space entries.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 16, "conc-end": 32}, + {"tp": 4, "conc-start": 8, "conc-end": 16}, + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'test' + ]) + + result = main() + + # Verify all tp values are present + tp_values = sorted(set(e['tp'] for e in result)) + assert tp_values == [1, 2, 4] + + # Verify correct conc ranges for each tp + tp1_concs = sorted([e['conc'] for e in result if e['tp'] == 1]) + tp2_concs = sorted([e['conc'] for e in result if e['tp'] == 2]) + tp4_concs = sorted([e['conc'] for e in result if e['tp'] == 4]) + + assert tp1_concs == [32, 64] + assert tp2_concs == [16, 32] + assert tp4_concs == [8, 16] + + def test_output_format(self, temp_config_dir, 
valid_nvidia_config, monkeypatch, capsys): + """Test that output is valid JSON and matches expected format.""" + config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + result = main() + + # Capture stdout + captured = capsys.readouterr() + + # Verify it's valid JSON + json_output = json.loads(captured.out.strip()) + + # Verify it matches the result + assert json_output == result + + # Verify each entry has the correct structure + for entry in json_output: + assert isinstance(entry, dict) + assert all(isinstance(k, str) for k in entry.keys()) + assert entry['image'] == valid_nvidia_config['70b-fp4-b200-trt']['image'] + assert entry['model'] == valid_nvidia_config['70b-fp4-b200-trt']['model'] + assert entry['precision'] == valid_nvidia_config['70b-fp4-b200-trt']['precision'] + assert entry['framework'] == valid_nvidia_config['70b-fp4-b200-trt']['framework'] + assert entry['runner'] == valid_nvidia_config['70b-fp4-b200-trt']['runner'] + + +class TestConcurrencyGeneration: + """Test suite specifically for concurrency value generation logic.""" + + def test_conc_progression_step_2(self, temp_config_dir, monkeypatch): + """Test concurrency progression with step size 2.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 1, "conc-end": 16}, + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'test', + '--step-size', '2' + ]) + + result = main() + + # Should multiply by 2 each time: 1, 2, 4, 8, 16 + concs = sorted([e['conc'] for 
e in result]) + assert concs == [1, 2, 4, 8, 16] + + def test_conc_progression_step_3(self, temp_config_dir, monkeypatch): + """Test concurrency progression with step size 3.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 2, "conc-end": 100}, + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'test', + '--step-size', '3' + ]) + + result = main() + + # Should multiply by 3 each time: 2, 6, 18, 54, 100 + concs = sorted([e['conc'] for e in result]) + assert concs == [2, 6, 18, 54, 100] + + def test_conc_exact_end_value(self, temp_config_dir, monkeypatch): + """Test that conc-end is always included even if not reached by progression.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 5, "conc-end": 50}, + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'test', + '--step-size', '2' + ]) + + result = main() + + concs = sorted([e['conc'] for e in result]) + # 5, 10, 20, 40, 50 (40*2=80 > 50, so we include 50) + assert concs[-1] == 50 + assert 50 in concs From fdb94fab8a835d5603a0629a038fd15da101701c Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 12:34:06 -0500 Subject: [PATCH 031/149] adding other isl osl --- .github/workflows/1k1k-sweep.yml | 128 ++++++++++++++------------- 
.github/workflows/1k8k-sweep.yml | 145 +++++++++++++++++++++++++++++++ .github/workflows/8k1k-sweep.yml | 145 +++++++++++++++++++++++++++++++ 3 files changed, 356 insertions(+), 62 deletions(-) create mode 100644 .github/workflows/1k8k-sweep.yml create mode 100644 .github/workflows/8k1k-sweep.yml diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index ee1c8ddd2..768d278f5 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,5 +1,9 @@ name: "1K/1K Sweep" +concurrency: + group: benchmark-lock-1k1k + cancel-in-progress: false + on: pull_request: workflow_dispatch: @@ -28,7 +32,7 @@ jobs: - id: get-dsr1-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -41,56 +45,56 @@ jobs: - id: get-gptoss-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - # benchmark-70b: - # needs: get-70b-configs - # uses: ./.github/workflows/benchmark-tmpl.yml - # name: 70b 1k1k - # strategy: - # fail-fast: false - # matrix: - # 
config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} - # secrets: inherit - # with: - # exp-name: "70b_1k1k" - # isl: 1024 - # osl: 1024 - # max-model-len: 2048 - # runner: ${{ matrix.config.runner }} - # image: ${{ matrix.config.image }} - # model: ${{ matrix.config.model }} - # framework: ${{ matrix.config.framework }} - # precision: ${{ matrix.config.precision }} - # tp: ${{ matrix.config.tp }} - # ep: ${{ matrix.config.ep || 1 }} - # dp-attn: ${{ matrix.config.dp-attn || false }} - # conc: ${{ matrix.config.conc }} + benchmark-70b: + needs: get-70b-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: 70b 1k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "70b_1k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} - # benchmark-dsr1: - # needs: get-dsr1-configs - # uses: ./.github/workflows/benchmark-tmpl.yml - # name: dsr1 1k1k - # strategy: - # fail-fast: false - # matrix: - # config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} - # secrets: inherit - # with: - # exp-name: "dsr1_1k1k" - # isl: 1024 - # osl: 1024 - # max-model-len: 2048 - # runner: ${{ matrix.config.runner }} - # image: ${{ matrix.config.image }} - # model: ${{ matrix.config.model }} - # framework: ${{ matrix.config.framework }} - # precision: ${{ matrix.config.precision }} - # tp: ${{ matrix.config.tp }} - # ep: ${{ matrix.config.ep || 1 }} - # dp-attn: ${{ matrix.config.dp-attn || false }} - # conc: ${{ matrix.config.conc }} + benchmark-dsr1: + needs: get-dsr1-configs + uses: 
./.github/workflows/benchmark-tmpl.yml + name: dsr1 1k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_1k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} benchmark-gptoss: needs: get-gptoss-configs @@ -116,21 +120,21 @@ jobs: dp-attn: ${{ matrix.config.dp-attn || false }} conc: ${{ matrix.config.conc }} - # collect-70b-results: - # needs: benchmark-70b - # if: ${{ always() }} - # uses: ./.github/workflows/collect-results.yml - # secrets: inherit - # with: - # exp-name: "70b_1k1k" + collect-70b-results: + needs: benchmark-70b + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "70b_1k1k" - # collect-dsr1-results: - # needs: benchmark-dsr1 - # if: ${{ always() }} - # uses: ./.github/workflows/collect-results.yml - # secrets: inherit - # with: - # exp-name: "dsr1_1k1k" + collect-dsr1-results: + needs: benchmark-dsr1 + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_1k1k" collect-gptoss-results: needs: benchmark-gptoss diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml new file mode 100644 index 000000000..da747e3ed --- /dev/null +++ b/.github/workflows/1k8k-sweep.yml @@ -0,0 +1,145 @@ +name: "1K/8K Sweep" + +concurrency: + group: benchmark-lock-1k8k + cancel-in-progress: false + +on: + # pull_request: + workflow_dispatch: + +jobs: + get-70b-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} 
+ steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-70b-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix 70b) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + get-dsr1-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-dsr1-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + get-gptoss-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-gptoss-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + benchmark-70b: + needs: get-70b-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: 70b 1k8k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "70b_1k8k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ 
matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + benchmark-dsr1: + needs: get-dsr1-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: dsr1 1k8k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_1k8k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + benchmark-gptoss: + needs: get-gptoss-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: gptoss 1k8k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "gptoss_1k8k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + collect-70b-results: + needs: benchmark-70b + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "70b_1k8k" + + collect-dsr1-results: + needs: benchmark-dsr1 + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_1k8k" + + collect-gptoss-results: + needs: benchmark-gptoss + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + 
secrets: inherit + with: + exp-name: "gptoss_1k8k" diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml new file mode 100644 index 000000000..d5ffc3f43 --- /dev/null +++ b/.github/workflows/8k1k-sweep.yml @@ -0,0 +1,145 @@ +name: "8K/1K Sweep" + +concurrency: + group: benchmark-lock-8k1k + cancel-in-progress: false + +on: + # pull_request: + workflow_dispatch: + +jobs: + get-70b-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-70b-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix 70b) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + get-dsr1-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-dsr1-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + get-gptoss-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-gptoss-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) + echo 
"search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + benchmark-70b: + needs: get-70b-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: 70b 8k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "70b_8k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + benchmark-dsr1: + needs: get-dsr1-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: dsr1 8k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_8k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + benchmark-gptoss: + needs: get-gptoss-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: gptoss 8k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "gptoss_8k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + 
dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + collect-70b-results: + needs: benchmark-70b + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "70b_8k1k" + + collect-dsr1-results: + needs: benchmark-dsr1 + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_8k1k" + + collect-gptoss-results: + needs: benchmark-gptoss + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_8k1k" From d339b8f44eb2ef77df349f709a906e3d70bd523a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 14:42:41 -0500 Subject: [PATCH 032/149] adding more workflows --- .github/workflows/1k1k-sweep.yml | 6 +- .github/workflows/1k8k-sweep.yml | 5 + .github/workflows/70b-tmpl.yml | 230 ---------------- .github/workflows/8k1k-sweep.yml | 2 + .github/workflows/dsr1-tmpl.yml | 265 ------------------- .github/workflows/gptoss-tmpl.yml | 176 ------------ .github/workflows/test.yml | 147 ++++++++++ utils/matrix-logic/get_test_sweep_configs.py | 151 +++++++++++ 8 files changed, 309 insertions(+), 673 deletions(-) delete mode 100644 .github/workflows/70b-tmpl.yml delete mode 100644 .github/workflows/dsr1-tmpl.yml delete mode 100644 .github/workflows/gptoss-tmpl.yml create mode 100644 .github/workflows/test.yml create mode 100644 utils/matrix-logic/get_test_sweep_configs.py diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 768d278f5..58ee3131c 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,12 +1,14 @@ name: "1K/1K Sweep" concurrency: - group: benchmark-lock-1k1k - cancel-in-progress: false + group: benchmark-lock-1k1k + cancel-in-progress: false on: pull_request: workflow_dispatch: +# schedule: +# - cron: '0 23 * * *' jobs: get-70b-configs: diff --git a/.github/workflows/1k8k-sweep.yml 
b/.github/workflows/1k8k-sweep.yml index da747e3ed..5a89e54b2 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -4,6 +4,11 @@ concurrency: group: benchmark-lock-1k8k cancel-in-progress: false +on: + workflow_dispatch: + schedule: + - cron: '0 23 * * *' + on: # pull_request: workflow_dispatch: diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml deleted file mode 100644 index 3d1dd5051..000000000 --- a/.github/workflows/70b-tmpl.yml +++ /dev/null @@ -1,230 +0,0 @@ -name: Template - LLaMA 70B - -on: - workflow_call: - inputs: - exp-name: - required: true - type: string - isl: - required: true - type: string - osl: - required: true - type: string - max-model-len: - required: true - type: string - random-range-ratio: - required: true - type: string - - use_h100: - type: boolean - required: true - use_h200: - type: boolean - required: true - use_b200: - type: boolean - required: true - use_mi300x: - type: boolean - required: true - use_mi325x: - type: boolean - required: true - use_mi355x: - type: boolean - required: true - -jobs: - bmk-h100-fp8: - if: ${{ inputs.use_h100 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h100 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[2, 4, 8]' - - bmk-h200-fp8: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ 
inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - - bmk-h200-trt-fp8: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'trt' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - conc-list: '[4, 8, 16, 32, 64, 128]' # H200 can achieve TPS/User >= 30 with larger concurrency till 128 - - bmk-b200-fp8: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - - bmk-b200-trt-fp8: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'trt' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 256 - - bmk-mi300x-fp8: - if: ${{ inputs.use_mi300x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi300x - image: 
'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - - bmk-mi325x-fp8: - if: ${{ inputs.use_mi325x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi325x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - - bmk-mi355x-fp8: - if: ${{ inputs.use_mi355x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi355x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - - bmk-b200-fp4: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP4' - framework: 'vllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - - bmk-b200-trt-fp4: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - 
secrets: inherit - with: - runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP4' - framework: 'trt' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 128 - - bmk-mi355x-fp4: - if: ${{ inputs.use_mi355x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi355x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview' - framework: 'vllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index d5ffc3f43..9dc28c52b 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -7,6 +7,8 @@ concurrency: on: # pull_request: workflow_dispatch: +# schedule: +# - cron: '0 23 * * *' jobs: get-70b-configs: diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml deleted file mode 100644 index 3a48710f2..000000000 --- a/.github/workflows/dsr1-tmpl.yml +++ /dev/null @@ -1,265 +0,0 @@ -name: Template - DeepSeek R1 - -on: - workflow_call: - inputs: - exp-name: - required: true - type: string - isl: - required: true - type: string - osl: - required: true - type: string - max-model-len: - required: true - type: string - random-range-ratio: - required: true - type: string - - use_h200: - type: boolean - required: true - use_b200: - type: boolean - required: true - use_mi300x: - type: 
boolean - required: true - use_mi325x: - type: boolean - required: true - use_mi355x: - type: boolean - required: true - use_gb200: - type: boolean - required: false - default: false - -jobs: - bmk-h200-fp8: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h200 - image: 'lmsysorg/sglang:v0.5.2rc2-cu126' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' - - bmk-h200-trt-fp8: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'trt' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' - - bmk-b200-fp8: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200 - image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' - - bmk-b200-trt-fp8: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'trt' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ 
inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' - - bmk-mi300x-fp8: - if: ${{ inputs.use_mi300x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi300x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' - - bmk-mi325x-fp8: - if: ${{ inputs.use_mi325x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi325x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' - - bmk-mi355x-fp8: - if: ${{ inputs.use_mi355x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi355x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' - - bmk-b200-fp4: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200 - image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200' - model: 'nvidia/DeepSeek-R1-0528-FP4' - framework: 'sglang' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len 
}} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[4,8]' - conc-list: '[4, 8, 16, 32, 64, 128]' # Custom concurrency values for this job - - bmk-b200-trt-fp4: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'nvidia/DeepSeek-R1-0528-FP4-v2' - framework: 'trt' - precision: fp4 - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[4, 8]' - conc-list: '[4, 8, 16, 32, 64, 128, 256]' # DPA4EP4 is already 30 tok/s/user and DPA8EP8 is already 35tok/s/user. 512 conc would be too much so we skipping it - - bmk-mi355x-fp4: - if: ${{ inputs.use_mi355x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi355x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915' - framework: 'sglang' - precision: 'fp4' - model: 'amd/DeepSeek-R1-0528-MXFP4-Preview' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - # These tensor parallelism settings are not necessary as they cannot fall on the Pareto frontier with this particular container - we remove them to save CI time. 
- tp-list: ${{ inputs.isl == 1024 && inputs.osl == 1024 && '[4, 8]' || '[8]' }} - - bmk-gb200-fp4-multinode-mtp-off: - if: ${{ inputs.use_gb200 && !(inputs.isl == '1024' && inputs.osl == '8192') }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - secrets: inherit - with: - runner: gb200 - image: 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' - model: 'deepseek-r1-fp4' - framework: 'dynamo-trtllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - mtp-mode: 'off' - - bmk-gb200-fp4-multinode-mtp-on: - if: ${{ inputs.use_gb200 && !(inputs.isl == '1024' && inputs.osl == '8192') }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - secrets: inherit - with: - runner: gb200 - image: 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' - model: 'deepseek-r1-fp4' - framework: 'dynamo-trtllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - mtp-mode: 'on' - - bmk-gb200-fp8-multinode: - if: ${{ inputs.use_gb200 && !(inputs.isl == '1024' && inputs.osl == '8192') }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - secrets: inherit - with: - runner: gb200 - image: 'nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'dynamo-sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - mtp-mode: 'off' diff --git a/.github/workflows/gptoss-tmpl.yml b/.github/workflows/gptoss-tmpl.yml deleted file mode 100644 index 8bb8d13a6..000000000 --- a/.github/workflows/gptoss-tmpl.yml +++ /dev/null @@ -1,176 +0,0 @@ -name: 
Template - gpt-oss - -on: - workflow_call: - inputs: - exp-name: - required: true - type: string - isl: - required: true - type: string - osl: - required: true - type: string - max-model-len: - required: true - type: string - random-range-ratio: - required: true - type: string - - use_h100: - type: boolean - required: true - use_h200: - type: boolean - required: true - use_b200: - type: boolean - required: true - use_mi300x: - type: boolean - required: true - use_mi325x: - type: boolean - required: true - use_mi355x: - type: boolean - required: true - -jobs: - bmk-h100: - if: ${{ inputs.use_h100 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: h100 - image: 'vllm/vllm-openai:v0.10.2' - model: 'openai/gpt-oss-120b' - tp-list: '[2, 4, 8]' - framework: 'vllm' - precision: 'fp4' - - bmk-h200: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: h200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' - framework: 'vllm' - precision: 'fp4' - - bmk-b200: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: b200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' - framework: 'vllm' - precision: 'fp4' - - bmk-b200-trt: - if: ${{ inputs.use_b200 }} - uses: 
./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: b200-nvs - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' - model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' - framework: 'trt' - precision: 'fp4' - - bmk-h200-trt: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: h200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' - model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' - framework: 'trt' - precision: 'fp4' - - bmk-mi300x: - if: ${{ inputs.use_mi300x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi300x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' - framework: 'vllm' - precision: 'fp4' - - bmk-mi325x: - if: ${{ inputs.use_mi325x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi325x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' - framework: 'vllm' - precision: 'fp4' - - bmk-mi355x: - if: ${{ inputs.use_mi355x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: 
inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi355x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'openai/gpt-oss-120b' - tp-list: '[1, 4, 8]' - framework: 'vllm' - precision: 'fp4' diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 000000000..0d92952da --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,147 @@ +name: Test - Full Sweep + +concurrency: + group: benchmark-lock + cancel-in-progress: false + +on: + pull_request: + workflow_dispatch: + inputs: + name: + description: "Name of benchmark from master configs" + required: true + type: string + default: 70b-fp4-mi355x-vllm + + run_1k1k: + description: "Run ISL/OSL 1k/1k" + type: boolean + required: true + run_1k8k: + description: "Run ISL/OSL 1k/8k" + type: boolean + required: true + run_8k1k: + description: "Run ISL/OSL 8k/1k" + type: boolean + required: true + + runner: + description: "Specific runner node to run on" + required: false + type: choice + options: + - "h100-cr_0" + - "h100-cr_1" + - "h100-cw_0" + - "h100-cw_1" + - "h200-cw_0" + - "h200-cw_1" + - "h200-nb_0" + - "h200-nb_1" + - "h200-nb_2" + - "h200-nb_3" + - "h200-nv_0" + - "h200-nv_1" + - "h200-nv_2" + - "h200-nv_3" + - "b200-nv_0" + - "b200-nv_1" + - "b200-nb_0" + - "b200-nb_1" + - "b200-nvd_0" + - "b200-nvd_1" + - "b200-nvd_2" + - "b200-nvd_3" + - "b200-tg_0" + - "mi300x-amd_0" + - "mi300x-amd_1" + - "mi300x-amd_2" + - "mi300x-amd_3" + - "mi300x-amd_4" + - "mi300x-cr_0" + - "mi300x-oci_0" + - "mi325x-amd_0" + - "mi325x-tw_0" + - "mi325x-tw_1" + - "mi325x-tw_2" + - "mi325x-tw_3" + - "mi355x-amd_0" + - "mi355x-amd_1" + - "mi355x-amd_2" + - "mi355x-amd_3" + +jobs: + get-jobs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} + steps: + 
- name: Checkout code + uses: actions/checkout@v4 + + - id: get-jobs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ + --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ + --key ${{ inputs.name }} \ + ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + test-sweep: + needs: get-jobs + uses: ./.github/workflows/benchmark-tmpl.yml + name: test sweep - ${{ inputs.name }} + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_1k1k" + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + calc-success-rate: + needs: test-sweep + if: ${{ always() }} + runs-on: ubuntu-latest + + env: + RESULTS_DIR: "results/" + STATS_FILENAME: "run_stats" + GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download results artifacts + uses: actions/download-artifact@v4 + with: + path: ${{ env.RESULTS_DIR }} + pattern: results_* + + - name: Install python dependencies + run: pip install PyGithub + + - name: Calculate success rate + run: python3 utils/calc_success_rate.py $STATS_FILENAME + + - uses: 
actions/upload-artifact@v4 + with: + name: "run-stats" + path: ${{ env.STATS_FILENAME }}.json diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py new file mode 100644 index 000000000..87ab0457b --- /dev/null +++ b/utils/matrix-logic/get_test_sweep_configs.py @@ -0,0 +1,151 @@ +import json +import yaml +import sys +import argparse + +seq_len_stoi = { + "1k1k": (1024, 1024), + "1k8k": (1024, 8192), + "8k1k": (8192, 1024) +} + +def main(): + parser = argparse.ArgumentParser( + description='Generate benchmark matrix from a specific configuration key' + ) + parser.add_argument( + '--config-files', + nargs='+', + required=True, + help='One or more configuration files (YAML format)' + ) + parser.add_argument( + '--key', + required=True, + help='Configuration key to use' + ) + parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." 
+ ) + parser.add_argument( + '--step-size', + type=int, + default=2, + help='Step size for concurrency values (default: 2)' + ) + + args = parser.parse_args() + + # Convert seq-lens to set of (isl, osl) tuples for filtering + seq_lens_filter = None + if args.seq_lens: + seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} + + # Load and merge all config files + all_config_data = {} + for config_file in args.config_files: + try: + with open(config_file, 'r') as f: + config_data = yaml.safe_load(f) + assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" + + # Check for duplicate keys + duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) + if duplicate_keys: + raise ValueError( + f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" + ) + + all_config_data.update(config_data) + except FileNotFoundError: + raise ValueError(f"Input file '{config_file}' does not exist.") + + # Check if the key exists + if args.key not in all_config_data: + available_keys = ', '.join(sorted(all_config_data.keys())) + raise ValueError( + f"Key '{args.key}' not found in configuration files. 
" + f"Available keys: {available_keys}" + ) + + val = all_config_data[args.key] + + # Validate required fields + seq_len_configs = val.get('seq-len-configs') + assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" + + image = val.get('image') + model = val.get('model') + precision = val.get('precision') + framework = val.get('framework') + runner = val.get('runner') + + assert None not in (image, model, precision, framework, runner), \ + f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" + + matrix_values = [] + + # Process each sequence length configuration + for seq_config in seq_len_configs: + isl = seq_config.get('isl') + osl = seq_config.get('osl') + + assert None not in (isl, osl), \ + f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" + + # Filter by sequence lengths if specified + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + + bmk_space = seq_config.get('bmk-space') + assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" + + for bmk in bmk_space: + tp = bmk.get('tp') + conc_start = bmk.get('conc-start') + conc_end = bmk.get('conc-end') + ep = bmk.get('ep') + dp_attn = bmk.get('dp-attn') + + assert None not in (tp, conc_start, conc_end), \ + f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" + + # Generate entries for each concurrency value in the range + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc, + 'max-model-len': isl + osl, + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + print(json.dumps(matrix_values)) + 
return matrix_values + +if __name__ == "__main__": + main() \ No newline at end of file From 2b284f9203c529b38c4d953312fe5803404cf68d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 14:43:37 -0500 Subject: [PATCH 033/149] adding more workflows --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0d92952da..8299f1623 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,4 +1,4 @@ -name: Test - Full Sweep +name: Test Sweep concurrency: group: benchmark-lock From 09e9c4974cc8f11f158d88ec9a57f56f4aace9d3 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 14:45:02 -0500 Subject: [PATCH 034/149] adding more workflows --- .github/workflows/test.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8299f1623..2c4e672cb 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,7 +5,9 @@ concurrency: cancel-in-progress: false on: - pull_request: + push: + branches: + - initial-refactor workflow_dispatch: inputs: name: From 15553b8dff2e3b851078e0fc7adb5e60f15e4c6d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 15:09:58 -0500 Subject: [PATCH 035/149] adding more workflows --- .github/workflows/test.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2c4e672cb..01a3bd5fa 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,9 +5,6 @@ concurrency: cancel-in-progress: false on: - push: - branches: - - initial-refactor workflow_dispatch: inputs: name: @@ -101,7 +98,7 @@ jobs: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit with: - exp-name: "dsr1_1k1k" + exp-name: "test" isl: ${{ matrix.config.isl }} osl: ${{ matrix.config.osl }} max-model-len: 
${{ matrix.config.max-model-len }} From 471b7c2be93bf1a8e503e95f3a38d40e44b1fd63 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 15:21:23 -0500 Subject: [PATCH 036/149] adding more workflows --- .github/workflows/test.yml | 2 +- utils/matrix-logic/get_test_sweep_configs.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 01a3bd5fa..ecc590503 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -98,7 +98,7 @@ jobs: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit with: - exp-name: "test" + exp-name: ${{ matrix.config.model-code }}_test isl: ${{ matrix.config.isl }} osl: ${{ matrix.config.osl }} max-model-len: ${{ matrix.config.max-model-len }} diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py index 87ab0457b..8c021cd93 100644 --- a/utils/matrix-logic/get_test_sweep_configs.py +++ b/utils/matrix-logic/get_test_sweep_configs.py @@ -72,6 +72,9 @@ def main(): f"Available keys: {available_keys}" ) + # Extract model code (everything before first hyphen) + model_code = args.key.split('-')[0] + val = all_config_data[args.key] # Validate required fields @@ -120,6 +123,7 @@ def main(): entry = { 'image': image, 'model': model, + 'model-code': model_code, 'precision': precision, 'framework': framework, 'runner': runner, From fca9c160773c4aed0e0f668682fedfa4ae3c36d1 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 18:01:04 -0500 Subject: [PATCH 037/149] adding more workflows --- .github/workflows/test.yml | 1 + utils/matrix-logic/get_test_sweep_configs.py | 49 ++++++++++++++++---- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ecc590503..ab70e8ccd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -84,6 +84,7 @@ jobs: run: | 
CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ + --test-mode \ --key ${{ inputs.name }} \ ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py index 8c021cd93..b4b1366e7 100644 --- a/utils/matrix-logic/get_test_sweep_configs.py +++ b/utils/matrix-logic/get_test_sweep_configs.py @@ -37,6 +37,11 @@ def main(): default=2, help='Step size for concurrency values (default: 2)' ) + parser.add_argument( + '--test-mode', + action='store_true', + help='Generate only the lowest concurrency value for each TP level' + ) args = parser.parse_args() @@ -117,9 +122,8 @@ def main(): assert None not in (tp, conc_start, conc_end), \ f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" - # Generate entries for each concurrency value in the range - conc = conc_start - while conc <= conc_end: + # In test mode, only use the lowest concurrency (conc_start) + if args.test_mode: entry = { 'image': image, 'model': model, @@ -130,7 +134,7 @@ def main(): 'isl': isl, 'osl': osl, 'tp': tp, - 'conc': conc, + 'conc': conc_start, 'max-model-len': isl + osl, } @@ -141,12 +145,37 @@ def main(): entry['dp-attn'] = dp_attn matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end + else: + # Generate entries for each concurrency value in the range + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'model-code': model_code, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 
'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc, + 'max-model-len': isl + osl, + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end print(json.dumps(matrix_values)) return matrix_values From 8ba4de923417f7441135ed5d81e8a97c06f74947 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 08:56:11 -0500 Subject: [PATCH 038/149] adding more workflows --- .github/workflows/test.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ab70e8ccd..1aac35921 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -72,6 +72,25 @@ on: - "mi355x-amd_3" jobs: + verify-compatible-runner: + runs-on: ubuntu-latest + if: ${{ inputs.runner != '' }} + steps: + - name: Verify runner compatible + shell: python + run: | + import re + + inputs_name_re = re.match(r'^[^-]+-[^-]+-([^-]+)', ${{ inputs.name }}) + if inputs_name_re: + config_gpu = inputs_name_re.group(1) + inputs_runner_re = re.match(r'^([^-]+)', ${{ inputs.runner }}) + if inputs_runner_re: + runner_gpu = inputs_runner_re.group(1) + + assert config_gpu == runner_gpu, f"Specified runner '${{ inputs.runner }})' is not compatible with config '${{ inputs.name }}'"" + + get-jobs: runs-on: ubuntu-latest outputs: From 8df3aa3384b609dbd26e6b58f88a91678b15c781 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 08:58:33 -0500 Subject: [PATCH 039/149] adding more workflows --- .github/workflows/1k1k-sweep.yml | 2 +- .github/workflows/test.yml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 58ee3131c..5a7ac4a25 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ 
-5,7 +5,7 @@ concurrency: cancel-in-progress: false on: - pull_request: + # pull_request: workflow_dispatch: # schedule: # - cron: '0 23 * * *' diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1aac35921..4756aae65 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -93,6 +93,8 @@ jobs: get-jobs: runs-on: ubuntu-latest + needs: verify-compatible-runner + if: ${{ always() && (needs.verify-compatible-runner.result == 'success' || needs.verify-compatible-runner.result == 'skipped') }} outputs: search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} steps: From 903f2f6bcfbc22bce869a4eab7ce2bd5f209df35 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 08:59:14 -0500 Subject: [PATCH 040/149] adding more workflows --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4756aae65..af764e56e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -88,7 +88,7 @@ jobs: if inputs_runner_re: runner_gpu = inputs_runner_re.group(1) - assert config_gpu == runner_gpu, f"Specified runner '${{ inputs.runner }})' is not compatible with config '${{ inputs.name }}'"" + assert config_gpu == runner_gpu, f"Specified runner '${{ inputs.runner }})' is not compatible with config '${{ inputs.name }}'" get-jobs: From 60465c823960ebba040a8d898d96bf81b742397d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 09:00:55 -0500 Subject: [PATCH 041/149] adding more workflows --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index af764e56e..27e97ef95 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -81,10 +81,10 @@ jobs: run: | import re - inputs_name_re = re.match(r'^[^-]+-[^-]+-([^-]+)', ${{ inputs.name }}) + inputs_name_re = 
re.match(r'^[^-]+-[^-]+-([^-]+)', '${{ inputs.name }}'') if inputs_name_re: config_gpu = inputs_name_re.group(1) - inputs_runner_re = re.match(r'^([^-]+)', ${{ inputs.runner }}) + inputs_runner_re = re.match(r'^([^-]+)', '${{ inputs.runner }}'') if inputs_runner_re: runner_gpu = inputs_runner_re.group(1) From ae2505ed974d6122f38592259116b0ee6670b258 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 09:01:30 -0500 Subject: [PATCH 042/149] adding more workflows --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 27e97ef95..a600e1bc5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -81,10 +81,10 @@ jobs: run: | import re - inputs_name_re = re.match(r'^[^-]+-[^-]+-([^-]+)', '${{ inputs.name }}'') + inputs_name_re = re.match(r'^[^-]+-[^-]+-([^-]+)', '${{ inputs.name }}') if inputs_name_re: config_gpu = inputs_name_re.group(1) - inputs_runner_re = re.match(r'^([^-]+)', '${{ inputs.runner }}'') + inputs_runner_re = re.match(r'^([^-]+)', '${{ inputs.runner }}') if inputs_runner_re: runner_gpu = inputs_runner_re.group(1) From 6fec99ef2b33a874aaa536de04c7ff6068a57bf8 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 09:41:20 -0500 Subject: [PATCH 043/149] adding more workflows --- .github/workflows/1k1k-sweep.yml | 6 ++-- .github/workflows/1k8k-sweep.yml | 6 ++-- .github/workflows/8k1k-sweep.yml | 6 ++-- .github/workflows/test.yml | 60 +++++++++++--------------------- 4 files changed, 30 insertions(+), 48 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 5a7ac4a25..80bcca43e 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -21,7 +21,7 @@ jobs: - id: get-70b-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files 
${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix 70b) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix 70b) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-dsr1-configs: @@ -34,7 +34,7 @@ jobs: - id: get-dsr1-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -47,7 +47,7 @@ jobs: - id: get-gptoss-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-70b: diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 5a89e54b2..604e9b9d3 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -24,7 +24,7 @@ jobs: - id: get-70b-configs run: 
| - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix 70b) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix 70b) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-dsr1-configs: @@ -37,7 +37,7 @@ jobs: - id: get-dsr1-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -50,7 +50,7 @@ jobs: - id: get-gptoss-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-70b: diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index 9dc28c52b..58c676b56 100644 --- 
a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -21,7 +21,7 @@ jobs: - id: get-70b-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix 70b) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix 70b) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-dsr1-configs: @@ -34,7 +34,7 @@ jobs: - id: get-dsr1-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -47,7 +47,7 @@ jobs: - id: get-gptoss-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-70b: diff --git 
a/.github/workflows/test.yml b/.github/workflows/test.yml index a600e1bc5..54bed54fe 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -7,24 +7,10 @@ concurrency: on: workflow_dispatch: inputs: - name: - description: "Name of benchmark from master configs" + generate-cli-command: + description: "Command passed to generate matrix script" required: true type: string - default: 70b-fp4-mi355x-vllm - - run_1k1k: - description: "Run ISL/OSL 1k/1k" - type: boolean - required: true - run_1k8k: - description: "Run ISL/OSL 1k/8k" - type: boolean - required: true - run_8k1k: - description: "Run ISL/OSL 8k/1k" - type: boolean - required: true runner: description: "Specific runner node to run on" @@ -72,29 +58,29 @@ on: - "mi355x-amd_3" jobs: - verify-compatible-runner: - runs-on: ubuntu-latest - if: ${{ inputs.runner != '' }} - steps: - - name: Verify runner compatible - shell: python - run: | - import re + # verify-compatible-runner: + # runs-on: ubuntu-latest + # if: ${{ inputs.runner != '' }} + # steps: + # - name: Verify runner compatible + # shell: python + # run: | + # import re - inputs_name_re = re.match(r'^[^-]+-[^-]+-([^-]+)', '${{ inputs.name }}') - if inputs_name_re: - config_gpu = inputs_name_re.group(1) - inputs_runner_re = re.match(r'^([^-]+)', '${{ inputs.runner }}') - if inputs_runner_re: - runner_gpu = inputs_runner_re.group(1) + # inputs_name_re = re.match(r'^[^-]+-[^-]+-([^-]+)', '${{ inputs.name }}') + # if inputs_name_re: + # config_gpu = inputs_name_re.group(1) + # inputs_runner_re = re.match(r'^([^-]+)', '${{ inputs.runner }}') + # if inputs_runner_re: + # runner_gpu = inputs_runner_re.group(1) - assert config_gpu == runner_gpu, f"Specified runner '${{ inputs.runner }})' is not compatible with config '${{ inputs.name }}'" + # assert config_gpu == runner_gpu, f"Specified runner '${{ inputs.runner }})' is not compatible with config '${{ inputs.name }}'" get-jobs: runs-on: ubuntu-latest - needs: verify-compatible-runner - if: ${{ 
always() && (needs.verify-compatible-runner.result == 'success' || needs.verify-compatible-runner.result == 'skipped') }} + # needs: verify-compatible-runner + # if: ${{ always() && (needs.verify-compatible-runner.result == 'success' || needs.verify-compatible-runner.result == 'skipped') }} outputs: search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} steps: @@ -103,17 +89,13 @@ jobs: - id: get-jobs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ - --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ - --test-mode \ - --key ${{ inputs.name }} \ - ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py test-config --test-mode ${{ inputs.generate-cli-command }}) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT test-sweep: needs: get-jobs uses: ./.github/workflows/benchmark-tmpl.yml - name: test sweep - ${{ inputs.name }} + name: test ${{ inputs.name }} strategy: fail-fast: false matrix: From 0226fc558dc412fd19f98de2364b8bfd16d81ef7 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 09:45:59 -0500 Subject: [PATCH 044/149] adding more workflows --- utils/matrix-logic/generate_sweep_configs.py | 331 +++++++++++++++++++ 1 file changed, 331 insertions(+) create mode 100644 utils/matrix-logic/generate_sweep_configs.py diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py new file mode 100644 index 000000000..43998fc5b --- /dev/null +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -0,0 +1,331 @@ +import json +import yaml +import argparse + +seq_len_stoi = { + "1k1k": (1024, 1024), + "1k8k": (1024, 8192), + "8k1k": (8192, 
1024) +} + +def generate_full_sweep(args, all_config_data): + """Generate full sweep configurations based on model prefix and sequence lengths.""" + isl, osl = seq_len_stoi[args.seq_lens] + + matrix_values = [] + for key, val in all_config_data.items(): + # Filter by model prefix + if not key.startswith(args.model_prefix): + continue + + seq_len_configs = val.get('seq-len-configs') + assert seq_len_configs, f"Missing 'seq-len-configs' for key '{key}'" + + image = val.get('image') + model = val.get('model') + precision = val.get('precision') + framework = val.get('framework') + runner = val.get('runner') + + assert None not in (image, model, precision, framework, runner), \ + f"Missing required fields for key '{key}'" + + # Check if this config has matching sequence lengths + matching_seq_config = None + for slq in seq_len_configs: + if slq.get('isl') == isl and slq.get('osl') == osl: + matching_seq_config = slq + break + + if not matching_seq_config: + continue # Skip this config if no matching sequence length + + bmk_space = matching_seq_config.get('bmk-space') + assert bmk_space, f"Missing 'bmk-space' in matching seq-len-config for key '{key}'" + + for bmk in bmk_space: + tp = bmk.get('tp') + conc_start = bmk.get('conc-start') + conc_end = bmk.get('conc-end') + ep = bmk.get('ep') + dp_attn = bmk.get('dp-attn') + + assert None not in (tp, conc_start, conc_end), \ + f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{key}'" + + # Generate entries for each concurrency value in the range + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + 
conc = conc_end + + return matrix_values + +def generate_test_config(args, all_config_data): + """Generate test configurations for a specific key.""" + # Check if the key exists + if args.key not in all_config_data: + available_keys = ', '.join(sorted(all_config_data.keys())) + raise ValueError( + f"Key '{args.key}' not found in configuration files. " + f"Available keys: {available_keys}" + ) + + # Extract model code (everything before first hyphen) + model_code = args.key.split('-')[0] + + val = all_config_data[args.key] + + # Validate required fields + seq_len_configs = val.get('seq-len-configs') + assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" + + image = val.get('image') + model = val.get('model') + precision = val.get('precision') + framework = val.get('framework') + runner = val.get('runner') + + assert None not in (image, model, precision, framework, runner), \ + f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" + + # Convert seq-lens to set of (isl, osl) tuples for filtering + seq_lens_filter = None + if args.seq_lens: + seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} + + matrix_values = [] + + # Process each sequence length configuration + for seq_config in seq_len_configs: + isl = seq_config.get('isl') + osl = seq_config.get('osl') + + assert None not in (isl, osl), \ + f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" + + # Filter by sequence lengths if specified + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + + bmk_space = seq_config.get('bmk-space') + assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" + + for bmk in bmk_space: + tp = bmk.get('tp') + conc_start = bmk.get('conc-start') + conc_end = bmk.get('conc-end') + ep = bmk.get('ep') + dp_attn = bmk.get('dp-attn') + + assert None not in (tp, conc_start, conc_end), \ + f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" + + 
# In test mode, only use the lowest concurrency (conc_start) + if args.test_mode: + entry = { + 'image': image, + 'model': model, + 'model-code': model_code, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc_start, + 'max-model-len': isl + osl, + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + else: + # Generate entries for each concurrency value in the range + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'model-code': model_code, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc, + 'max-model-len': isl + osl, + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + return matrix_values + +def load_config_files(config_files): + """Load and merge configuration files.""" + all_config_data = {} + for config_file in config_files: + try: + with open(config_file, 'r') as f: + config_data = yaml.safe_load(f) + assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" + + # Check for duplicate keys + duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) + if duplicate_keys: + raise ValueError( + f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" + ) + + all_config_data.update(config_data) + except FileNotFoundError: + raise ValueError(f"Input file '{config_file}' does not exist.") + + return all_config_data + +def main(): + # Create parent parser with common arguments + parent_parser = argparse.ArgumentParser(add_help=False) + 
parent_parser.add_argument( + '--config-files', + nargs='+', + required=True, + help='One or more configuration files (YAML format)' + ) + + # Create main parser + parser = argparse.ArgumentParser( + description='Generate benchmark configurations from YAML config files' + ) + + # Create subparsers for subcommands + subparsers = parser.add_subparsers( + dest='command', + required=True, + help='Available commands' + ) + + # Subcommand: full-sweep + full_sweep_parser = subparsers.add_parser( + 'full-sweep', + parents=[parent_parser], + add_help=False, + help='Generate full sweep configurations based on model prefix' + ) + full_sweep_parser.add_argument( + '--seq-lens', + choices=list(seq_len_stoi.keys()), + required=True, + help=f"Sequence length configuration: {', '.join(seq_len_stoi.keys())}" + ) + full_sweep_parser.add_argument( + '--model-prefix', + required=True, + help='Model prefix to filter configurations' + ) + full_sweep_parser.add_argument( + '--step-size', + type=int, + default=2, + help='Step size for concurrency values (default: 2)' + ) + full_sweep_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + + # Subcommand: test-config + test_config_parser = subparsers.add_parser( + 'test-config', + parents=[parent_parser], + add_help=False, + help='Generate test configurations for a specific key' + ) + test_config_parser.add_argument( + '--key', + required=True, + help='Configuration key to use' + ) + test_config_parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." 
+ ) + test_config_parser.add_argument( + '--step-size', + type=int, + default=2, + help='Step size for concurrency values (default: 2)' + ) + test_config_parser.add_argument( + '--test-mode', + action='store_true', + help='Generate only the lowest concurrency value for each TP level' + ) + test_config_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + + args = parser.parse_args() + + # Load configuration files + all_config_data = load_config_files(args.config_files) + + # Route to appropriate function based on subcommand + if args.command == 'full-sweep': + matrix_values = generate_full_sweep(args, all_config_data) + elif args.command == 'test-config': + matrix_values = generate_test_config(args, all_config_data) + else: + parser.error(f"Unknown command: {args.command}") + + print(json.dumps(matrix_values)) + return matrix_values + +if __name__ == "__main__": + main() From 3f7609d66fd1eb16e2c4f5f7ce9f59b3a7c80f16 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 10:11:19 -0500 Subject: [PATCH 045/149] adding more workflows --- utils/matrix-logic/generate_sweep_configs.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 43998fc5b..0d835bf0d 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -94,9 +94,16 @@ def generate_test_config(args, all_config_data): f"Available keys: {available_keys}" ) - # Extract model code (everything before first hyphen) + # Extract model code from config key model_code = args.key.split('-')[0] - + # Extract GPU from config key + config_gpu = args.key.split('-')[2] + runner_gpu = args.runner_node.split('-')[0] if args.runner_node else None + + # If user enters a runner not compatible with input GPU sku, error + if runner_gpu and config_gpu != runner_gpu: + raise 
ValueError(f"GPU '{config_gpu}' used in selected config '{args.key}' cannot run on selected runner node '{args.runner_node}'.") + val = all_config_data[args.key] # Validate required fields @@ -107,7 +114,8 @@ def generate_test_config(args, all_config_data): model = val.get('model') precision = val.get('precision') framework = val.get('framework') - runner = val.get('runner') + # Use default runner or specific runner node if input by user + runner = val.get('runner') if not args.runner_node else args.runner_node assert None not in (image, model, precision, framework, runner), \ f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" @@ -287,6 +295,11 @@ def main(): required=True, help='Configuration key to use' ) + test_config_parser.add_argument( + '--runner-node', + required=False, + help='Specific runner node to use' + ) test_config_parser.add_argument( '--seq-lens', nargs='+', From 395bbb067465fe460ce98c1da63ae77f2cbe2776 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 10:20:25 -0500 Subject: [PATCH 046/149] adding more workflows --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 54bed54fe..4cd314ef0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -95,7 +95,7 @@ jobs: test-sweep: needs: get-jobs uses: ./.github/workflows/benchmark-tmpl.yml - name: test ${{ inputs.name }} + name: ${{ inputs.generate-cli-command }} strategy: fail-fast: false matrix: From 8510c0aeadd8935a2b8f4abf5afdc249af472f7a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 10:28:51 -0500 Subject: [PATCH 047/149] adding more workflows --- .github/workflows/test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4cd314ef0..a21b118f1 100644 --- a/.github/workflows/test.yml +++ 
b/.github/workflows/test.yml @@ -1,8 +1,8 @@ name: Test Sweep -concurrency: - group: benchmark-lock - cancel-in-progress: false +# concurrency: +# group: benchmark-lock +# cancel-in-progress: false on: workflow_dispatch: @@ -102,7 +102,7 @@ jobs: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit with: - exp-name: ${{ matrix.config.model-code }}_test + exp-name: ${{ matrix.config.model-code }}_test_${{ matrix.config.isl }}_${{ matrix.config.osl }} isl: ${{ matrix.config.isl }} osl: ${{ matrix.config.osl }} max-model-len: ${{ matrix.config.max-model-len }} From f439163edb9ddf293f23b1bd430d47e7d6cc2d59 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 15:29:03 -0500 Subject: [PATCH 048/149] adding more workflows --- .github/configs/runners.yaml | 48 ++++ .github/workflows/test.yml | 70 +---- utils/matrix-logic/generate_sweep_configs.py | 266 ++++++++++++++----- 3 files changed, 250 insertions(+), 134 deletions(-) create mode 100644 .github/configs/runners.yaml diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml new file mode 100644 index 000000000..692cf74ad --- /dev/null +++ b/.github/configs/runners.yaml @@ -0,0 +1,48 @@ +h100: +- 'h100-cr_0' +- 'h100-cr_1' +- 'h100-cw_0' +- 'h100-cw_1' +h200: +- 'h200-cw_0' +- 'h200-cw_1' +- 'h200-nb_0' +- 'h200-nb_1' +- 'h200-nb_2' +- 'h200-nb_3' +- 'h200-nv_0' +- 'h200-nv_1' +- 'h200-nv_2' +- 'h200-nv_3' +b200-trt: +- 'b200-nv_0' +- 'b200-nv_1' +b200: +- 'b200-nb_0' +- 'b200-nb_1' +- 'b200-nvd_0' +- 'b200-nvd_1' +- 'b200-nvd_2' +- 'b200-nvd_3' +- 'b200-tg_0' +mi300x: +- 'mi300x-amd_0' +- 'mi300x-amd_1' +- 'mi300x-amd_2' +- 'mi300x-amd_3' +- 'mi300x-amd_4' +- 'mi300x-cr_0' +- 'mi300x-oci_0' +mi325x: +- 'mi325x-amd_0' +- 'mi325x-tw_0' +- 'mi325x-tw_1' +- 'mi325x-tw_2' +- 'mi325x-tw_3' +mi355x: +- 'mi355x-amd_0' +- 'mi355x-amd_1' +- 'mi355x-amd_2' +- 'mi355x-amd_3' +gb200: +- gb200-nv_0 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml 
index a21b118f1..e56fc9a82 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -12,75 +12,9 @@ on: required: true type: string - runner: - description: "Specific runner node to run on" - required: false - type: choice - options: - - "h100-cr_0" - - "h100-cr_1" - - "h100-cw_0" - - "h100-cw_1" - - "h200-cw_0" - - "h200-cw_1" - - "h200-nb_0" - - "h200-nb_1" - - "h200-nb_2" - - "h200-nb_3" - - "h200-nv_0" - - "h200-nv_1" - - "h200-nv_2" - - "h200-nv_3" - - "b200-nv_0" - - "b200-nv_1" - - "b200-nb_0" - - "b200-nb_1" - - "b200-nvd_0" - - "b200-nvd_1" - - "b200-nvd_2" - - "b200-nvd_3" - - "b200-tg_0" - - "mi300x-amd_0" - - "mi300x-amd_1" - - "mi300x-amd_2" - - "mi300x-amd_3" - - "mi300x-amd_4" - - "mi300x-cr_0" - - "mi300x-oci_0" - - "mi325x-amd_0" - - "mi325x-tw_0" - - "mi325x-tw_1" - - "mi325x-tw_2" - - "mi325x-tw_3" - - "mi355x-amd_0" - - "mi355x-amd_1" - - "mi355x-amd_2" - - "mi355x-amd_3" - jobs: - # verify-compatible-runner: - # runs-on: ubuntu-latest - # if: ${{ inputs.runner != '' }} - # steps: - # - name: Verify runner compatible - # shell: python - # run: | - # import re - - # inputs_name_re = re.match(r'^[^-]+-[^-]+-([^-]+)', '${{ inputs.name }}') - # if inputs_name_re: - # config_gpu = inputs_name_re.group(1) - # inputs_runner_re = re.match(r'^([^-]+)', '${{ inputs.runner }}') - # if inputs_runner_re: - # runner_gpu = inputs_runner_re.group(1) - - # assert config_gpu == runner_gpu, f"Specified runner '${{ inputs.runner }})' is not compatible with config '${{ inputs.name }}'" - - get-jobs: runs-on: ubuntu-latest - # needs: verify-compatible-runner - # if: ${{ always() && (needs.verify-compatible-runner.result == 'success' || needs.verify-compatible-runner.result == 'skipped') }} outputs: search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} steps: @@ -89,7 +23,7 @@ jobs: - id: get-jobs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py test-config --test-mode ${{ 
inputs.generate-cli-command }}) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py ${{ inputs.generate-cli-command }}) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT test-sweep: @@ -106,7 +40,7 @@ jobs: isl: ${{ matrix.config.isl }} osl: ${{ matrix.config.osl }} max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} + runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} framework: ${{ matrix.config.framework }} diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 0d835bf0d..408a7e353 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -8,8 +8,82 @@ "8k1k": (8192, 1024) } +def validate_master_configs_structure(all_config_data): + """Validate the structure of all master config entries. + + This validates that all required fields are present, have correct types, + and no extra fields exist. Should be called once after loading config files. 
+ """ + for key, val in all_config_data.items(): + # Check for required top-level fields and their types + required_fields = { + 'image': str, + 'model': str, + 'precision': str, + 'framework': str, + 'runner': str, + 'seq-len-configs': list + } + + for field, expected_type in required_fields.items(): + if field not in val or val[field] is None: + raise ValueError(f"Missing required field '{field}' for key '{key}'") + if not isinstance(val[field], expected_type): + raise ValueError(f"Field '{field}' must be {expected_type.__name__} for key '{key}', got {type(val[field]).__name__}") + + seq_len_configs = val['seq-len-configs'] + if len(seq_len_configs) == 0: + raise ValueError(f"'seq-len-configs' must be a non-empty list for key '{key}'") + + # Validate each seq-len-config + for i, seq_config in enumerate(seq_len_configs): + # Check isl + if 'isl' not in seq_config or seq_config['isl'] is None: + raise ValueError(f"Missing 'isl' in seq-len-config[{i}] for key '{key}'") + if not isinstance(seq_config['isl'], int): + raise ValueError(f"'isl' must be int in seq-len-config[{i}] for key '{key}'") + + # Check osl + if 'osl' not in seq_config or seq_config['osl'] is None: + raise ValueError(f"Missing 'osl' in seq-len-config[{i}] for key '{key}'") + if not isinstance(seq_config['osl'], int): + raise ValueError(f"'osl' must be int in seq-len-config[{i}] for key '{key}'") + + bmk_space = seq_config.get('bmk-space') + if not bmk_space or not isinstance(bmk_space, list) or len(bmk_space) == 0: + raise ValueError(f"Missing or invalid 'bmk-space' in seq-len-config[{i}] for key '{key}'") + + # Validate each benchmark in bmk-space + for j, bmk in enumerate(bmk_space): + # Define allowed fields + allowed_fields = {'tp', 'conc-start', 'conc-end', 'ep', 'dp-attn'} + required_bmk_fields = {'tp': int, 'conc-start': int, 'conc-end': int} + optional_bmk_fields = {'ep': int, 'dp-attn': bool} + + # Check for extra fields + extra_fields = set(bmk.keys()) - allowed_fields + if extra_fields: + 
raise ValueError(f"Extra fields {extra_fields} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + + # Validate required fields + for field, expected_type in required_bmk_fields.items(): + if field not in bmk or bmk[field] is None: + raise ValueError(f"Missing '{field}' in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + if not isinstance(bmk[field], expected_type): + raise ValueError(f"'{field}' must be {expected_type.__name__} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + + # Validate optional fields if they exist + for field, expected_type in optional_bmk_fields.items(): + if field in bmk and bmk[field] is not None: + if not isinstance(bmk[field], expected_type): + raise ValueError(f"'{field}' must be {expected_type.__name__} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + + def generate_full_sweep(args, all_config_data): - """Generate full sweep configurations based on model prefix and sequence lengths.""" + """Generate full sweep configurations based on model prefix and sequence lengths. + + Assumes all_config_data has been validated by validate_config_structure(). 
+ """ isl, osl = seq_len_stoi[args.seq_lens] matrix_values = [] @@ -18,41 +92,32 @@ def generate_full_sweep(args, all_config_data): if not key.startswith(args.model_prefix): continue - seq_len_configs = val.get('seq-len-configs') - assert seq_len_configs, f"Missing 'seq-len-configs' for key '{key}'" - - image = val.get('image') - model = val.get('model') - precision = val.get('precision') - framework = val.get('framework') - runner = val.get('runner') - - assert None not in (image, model, precision, framework, runner), \ - f"Missing required fields for key '{key}'" + seq_len_configs = val['seq-len-configs'] + image = val['image'] + model = val['model'] + precision = val['precision'] + framework = val['framework'] + runner = val['runner'] # Check if this config has matching sequence lengths matching_seq_config = None for slq in seq_len_configs: - if slq.get('isl') == isl and slq.get('osl') == osl: + if slq['isl'] == isl and slq['osl'] == osl: matching_seq_config = slq break if not matching_seq_config: continue # Skip this config if no matching sequence length - bmk_space = matching_seq_config.get('bmk-space') - assert bmk_space, f"Missing 'bmk-space' in matching seq-len-config for key '{key}'" + bmk_space = matching_seq_config['bmk-space'] for bmk in bmk_space: - tp = bmk.get('tp') - conc_start = bmk.get('conc-start') - conc_end = bmk.get('conc-end') + tp = bmk['tp'] + conc_start = bmk['conc-start'] + conc_end = bmk['conc-end'] ep = bmk.get('ep') dp_attn = bmk.get('dp-attn') - assert None not in (tp, conc_start, conc_end), \ - f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{key}'" - # Generate entries for each concurrency value in the range conc = conc_start while conc <= conc_end: @@ -84,41 +149,24 @@ def generate_full_sweep(args, all_config_data): return matrix_values + def generate_test_config(args, all_config_data): - """Generate test configurations for a specific key.""" - # Check if the key exists - if args.key not in all_config_data: - 
available_keys = ', '.join(sorted(all_config_data.keys())) - raise ValueError( - f"Key '{args.key}' not found in configuration files. " - f"Available keys: {available_keys}" - ) + """Generate test configurations for a specific key. + Assumes all_config_data has been validated by validate_config_structure(). + """ # Extract model code from config key model_code = args.key.split('-')[0] - # Extract GPU from config key - config_gpu = args.key.split('-')[2] - runner_gpu = args.runner_node.split('-')[0] if args.runner_node else None - - # If user enters a runner not compatible with input GPU sku, error - if runner_gpu and config_gpu != runner_gpu: - raise ValueError(f"GPU '{config_gpu}' used in selected config '{args.key}' cannot run on selected runner node '{args.runner_node}'.") - - val = all_config_data[args.key] - # Validate required fields - seq_len_configs = val.get('seq-len-configs') - assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" + val = all_config_data[args.key] - image = val.get('image') - model = val.get('model') - precision = val.get('precision') - framework = val.get('framework') + seq_len_configs = val['seq-len-configs'] + image = val['image'] + model = val['model'] + precision = val['precision'] + framework = val['framework'] # Use default runner or specific runner node if input by user - runner = val.get('runner') if not args.runner_node else args.runner_node - - assert None not in (image, model, precision, framework, runner), \ - f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" + runner = val['runner'] if not args.runner_node else args.runner_node # Convert seq-lens to set of (isl, osl) tuples for filtering seq_lens_filter = None @@ -129,29 +177,22 @@ def generate_test_config(args, all_config_data): # Process each sequence length configuration for seq_config in seq_len_configs: - isl = seq_config.get('isl') - osl = seq_config.get('osl') - - assert None not in (isl, osl), \ - 
f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" + isl = seq_config['isl'] + osl = seq_config['osl'] # Filter by sequence lengths if specified if seq_lens_filter and (isl, osl) not in seq_lens_filter: continue - bmk_space = seq_config.get('bmk-space') - assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" + bmk_space = seq_config['bmk-space'] for bmk in bmk_space: - tp = bmk.get('tp') - conc_start = bmk.get('conc-start') - conc_end = bmk.get('conc-end') + tp = bmk['tp'] + conc_start = bmk['conc-start'] + conc_end = bmk['conc-end'] ep = bmk.get('ep') dp_attn = bmk.get('dp-attn') - assert None not in (tp, conc_start, conc_end), \ - f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" - # In test mode, only use the lowest concurrency (conc_start) if args.test_mode: entry = { @@ -209,6 +250,68 @@ def generate_test_config(args, all_config_data): return matrix_values + +def generate_runner_model_sweep_config(args, all_config_data): + """Generate runner-model sweep configurations. + + Assumes all_config_data has been validated by validate_config_structure(). + """ + with open(args.runner_config, 'r') as f: + runner_config = yaml.safe_load(f) + + runner_nodes = runner_config.get(args.runner_type) + + if not runner_nodes: + raise ValueError(f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. 
Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") + + matrix_values = [] + for key, val in all_config_data.items(): + # Only consider configs with specified runner + if val['runner'] != args.runner_type: + continue + + # Find 1k1k config + target_config = None + for config in val['seq-len-configs']: + if config['isl'] == 1024 and config['osl'] == 1024: + target_config = config + break + + highest_tp_bmk = max(target_config['bmk-space'], key=lambda x: x['tp']) + # Since we are just testing, pick the highest TP for this config and just test + # on that TP with the lowest concurrency available + highest_tp = highest_tp_bmk['tp'] + lowest_conc = highest_tp_bmk['conc-start'] + + ep = highest_tp_bmk.get('ep') + dp_attn = highest_tp_bmk.get('dp-attn') + + for node in runner_nodes: + entry = { + 'image': val['image'], + 'model': val['model'], + 'precision': val['precision'], + 'framework': val['framework'], + # Add one entry for each node under specified runner type + 'runner': node, + # Again, just use 1k1k since this is just meant to smoke test all runners + 'isl': 1024, + 'osl': 1024, + 'tp': highest_tp, + 'conc': lowest_conc + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + return matrix_values + + def load_config_files(config_files): """Load and merge configuration files.""" all_config_data = {} @@ -216,10 +319,13 @@ def load_config_files(config_files): try: with open(config_file, 'r') as f: config_data = yaml.safe_load(f) - assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" + assert isinstance( + config_data, dict), f"Config file '{config_file}' must contain a dictionary" - # Check for duplicate keys - duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) + # Check for duplicate keys, this is only in place to prevent against the very unlikely + # case 
where an entry in one config accidentally/purposefully tries to override an entry in another config + duplicate_keys = set(all_config_data.keys()) & set( + config_data.keys()) if duplicate_keys: raise ValueError( f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" @@ -231,6 +337,7 @@ def load_config_files(config_files): return all_config_data + def main(): # Create parent parser with common arguments parent_parser = argparse.ArgumentParser(add_help=False) @@ -324,21 +431,48 @@ def main(): help='Show this help message and exit' ) + # Subcommand: runner-model-sweep + test_config_parser = subparsers.add_parser( + 'runner-model-sweep', + parents=[parent_parser], + add_help=False, + help='Sweep across all runner nodes and all compatible models for a given runner' + ) + test_config_parser.add_argument( + '--runner-type', + required=True, + help='Runner type (e.g., h200-trt, h100)' + ) + test_config_parser.add_argument( + '--runner-config', + required=True, + help='Configuration file holding runner information' + ) + test_config_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + args = parser.parse_args() - # Load configuration files + # Load and validate configuration files all_config_data = load_config_files(args.config_files) + validate_master_configs_structure(all_config_data) # Route to appropriate function based on subcommand if args.command == 'full-sweep': matrix_values = generate_full_sweep(args, all_config_data) elif args.command == 'test-config': matrix_values = generate_test_config(args, all_config_data) + elif args.command == 'runner-model-sweep': + matrix_values = generate_runner_model_sweep_config(args, all_config_data) else: parser.error(f"Unknown command: {args.command}") print(json.dumps(matrix_values)) return matrix_values + if __name__ == "__main__": main() From 28665f238ac5668563ee79c6e36b182ac3b7b822 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 
28 Oct 2025 15:48:31 -0500 Subject: [PATCH 049/149] adding more workflows --- utils/matrix-logic/generate_sweep_configs.py | 57 +++++++++++++------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 408a7e353..a0d5676cb 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -8,6 +8,7 @@ "8k1k": (8192, 1024) } + def validate_master_configs_structure(all_config_data): """Validate the structure of all master config entries. @@ -27,56 +28,70 @@ def validate_master_configs_structure(all_config_data): for field, expected_type in required_fields.items(): if field not in val or val[field] is None: - raise ValueError(f"Missing required field '{field}' for key '{key}'") + raise ValueError( + f"Missing required field '{field}' for key '{key}'") if not isinstance(val[field], expected_type): - raise ValueError(f"Field '{field}' must be {expected_type.__name__} for key '{key}', got {type(val[field]).__name__}") + raise ValueError( + f"Field '{field}' must be {expected_type.__name__} for key '{key}', got {type(val[field]).__name__}") seq_len_configs = val['seq-len-configs'] if len(seq_len_configs) == 0: - raise ValueError(f"'seq-len-configs' must be a non-empty list for key '{key}'") + raise ValueError( + f"'seq-len-configs' must be a non-empty list for key '{key}'") # Validate each seq-len-config for i, seq_config in enumerate(seq_len_configs): # Check isl if 'isl' not in seq_config or seq_config['isl'] is None: - raise ValueError(f"Missing 'isl' in seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"Missing 'isl' in seq-len-config[{i}] for key '{key}'") if not isinstance(seq_config['isl'], int): - raise ValueError(f"'isl' must be int in seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"'isl' must be int in seq-len-config[{i}] for key '{key}'") # Check osl if 'osl' not in seq_config or 
seq_config['osl'] is None: - raise ValueError(f"Missing 'osl' in seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"Missing 'osl' in seq-len-config[{i}] for key '{key}'") if not isinstance(seq_config['osl'], int): - raise ValueError(f"'osl' must be int in seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"'osl' must be int in seq-len-config[{i}] for key '{key}'") bmk_space = seq_config.get('bmk-space') if not bmk_space or not isinstance(bmk_space, list) or len(bmk_space) == 0: - raise ValueError(f"Missing or invalid 'bmk-space' in seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"Missing or invalid 'bmk-space' in seq-len-config[{i}] for key '{key}'") # Validate each benchmark in bmk-space for j, bmk in enumerate(bmk_space): # Define allowed fields - allowed_fields = {'tp', 'conc-start', 'conc-end', 'ep', 'dp-attn'} - required_bmk_fields = {'tp': int, 'conc-start': int, 'conc-end': int} + allowed_fields = {'tp', 'conc-start', + 'conc-end', 'ep', 'dp-attn'} + required_bmk_fields = {'tp': int, + 'conc-start': int, 'conc-end': int} optional_bmk_fields = {'ep': int, 'dp-attn': bool} # Check for extra fields extra_fields = set(bmk.keys()) - allowed_fields if extra_fields: - raise ValueError(f"Extra fields {extra_fields} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"Extra fields {extra_fields} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") # Validate required fields for field, expected_type in required_bmk_fields.items(): if field not in bmk or bmk[field] is None: - raise ValueError(f"Missing '{field}' in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"Missing '{field}' in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") if not isinstance(bmk[field], expected_type): - raise ValueError(f"'{field}' must be {expected_type.__name__} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"'{field}' must be {expected_type.__name__} 
in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") # Validate optional fields if they exist for field, expected_type in optional_bmk_fields.items(): if field in bmk and bmk[field] is not None: if not isinstance(bmk[field], expected_type): - raise ValueError(f"'{field}' must be {expected_type.__name__} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"'{field}' must be {expected_type.__name__} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") def generate_full_sweep(args, all_config_data): @@ -98,6 +113,9 @@ def generate_full_sweep(args, all_config_data): precision = val['precision'] framework = val['framework'] runner = val['runner'] + # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name + # so that it can be bubbled down to bash script benchmarks... this is probably a FIXME + model_code = key.split('-')[0] # Check if this config has matching sequence lengths matching_seq_config = None @@ -130,7 +148,8 @@ def generate_full_sweep(args, all_config_data): 'isl': isl, 'osl': osl, 'tp': tp, - 'conc': conc + 'conc': conc, + 'model_code': model_code, } # Add optional fields if they exist @@ -260,9 +279,10 @@ def generate_runner_model_sweep_config(args, all_config_data): runner_config = yaml.safe_load(f) runner_nodes = runner_config.get(args.runner_type) - + if not runner_nodes: - raise ValueError(f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") + raise ValueError( + f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. 
Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") matrix_values = [] for key, val in all_config_data.items(): @@ -466,7 +486,8 @@ def main(): elif args.command == 'test-config': matrix_values = generate_test_config(args, all_config_data) elif args.command == 'runner-model-sweep': - matrix_values = generate_runner_model_sweep_config(args, all_config_data) + matrix_values = generate_runner_model_sweep_config( + args, all_config_data) else: parser.error(f"Unknown command: {args.command}") From 99aec702d2305cdda8ca7e9ed8b88c5b3bd5ffc1 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 15:51:43 -0500 Subject: [PATCH 050/149] adding more workflows --- .github/workflows/1k8k-sweep.yml | 13 +++++-------- utils/matrix-logic/generate_sweep_configs.py | 7 ++++++- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 604e9b9d3..cced99997 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -4,14 +4,11 @@ concurrency: group: benchmark-lock-1k8k cancel-in-progress: false -on: - workflow_dispatch: - schedule: - - cron: '0 23 * * *' - on: # pull_request: workflow_dispatch: +# schedule: +# - cron: '0 23 * * *' jobs: get-70b-configs: @@ -24,7 +21,7 @@ jobs: - id: get-70b-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix 70b) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix 70b) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-dsr1-configs: @@ -37,7 +34,7 @@ jobs: - id: get-dsr1-configs run: | - 
CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -50,7 +47,7 @@ jobs: - id: get-gptoss-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-70b: diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index a0d5676cb..8f11d79df 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -289,6 +289,10 @@ def generate_runner_model_sweep_config(args, all_config_data): # Only consider configs with specified runner if val['runner'] != args.runner_type: continue + + # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name + # so that it can be bubbled down to bash script benchmarks... 
this is probably a FIXME + model_code = key.split('-')[0] # Find 1k1k config target_config = None @@ -318,7 +322,8 @@ def generate_runner_model_sweep_config(args, all_config_data): 'isl': 1024, 'osl': 1024, 'tp': highest_tp, - 'conc': lowest_conc + 'conc': lowest_conc, + 'model-code': model_code, } # Add optional fields if they exist From 3ea4aa2731bbd23b89848c28c46607f6b4ad7b8f Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 15:56:53 -0500 Subject: [PATCH 051/149] adding more workflows --- utils/matrix-logic/generate_sweep_configs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 8f11d79df..6dfdc2bf9 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -149,7 +149,8 @@ def generate_full_sweep(args, all_config_data): 'osl': osl, 'tp': tp, 'conc': conc, - 'model_code': model_code, + 'model-code': model_code, + 'max-model-len': isl + osl, } # Add optional fields if they exist @@ -324,6 +325,7 @@ def generate_runner_model_sweep_config(args, all_config_data): 'tp': highest_tp, 'conc': lowest_conc, 'model-code': model_code, + 'max-model-len': 2048, } # Add optional fields if they exist From 60906564838628da9b560560e41d5b825404b022 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 08:10:39 -0500 Subject: [PATCH 052/149] adding more workflows --- .github/configs/runners.yaml | 14 ++ utils/matrix-logic/generate_sweep_configs.py | 128 ++++++++++++++++++- 2 files changed, 139 insertions(+), 3 deletions(-) diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 692cf74ad..692ade8dd 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -14,9 +14,23 @@ h200: - 'h200-nv_1' - 'h200-nv_2' - 'h200-nv_3' +h200-trt: +- 'h200-cw_0' +- 'h200-cw_1' +- 'h200-nb_0' +- 'h200-nb_1' +- 'h200-nb_2' +- 'h200-nb_3' +- 'h200-nv_0' 
+- 'h200-nv_1' +- 'h200-nv_2' +- 'h200-nv_3' b200-trt: - 'b200-nv_0' - 'b200-nv_1' +b200-nvs: +- 'b200-nv_0' +- 'b200-nv_1' b200: - 'b200-nb_0' - 'b200-nb_1' diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 6dfdc2bf9..10dc07c82 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -276,8 +276,11 @@ def generate_runner_model_sweep_config(args, all_config_data): Assumes all_config_data has been validated by validate_config_structure(). """ - with open(args.runner_config, 'r') as f: - runner_config = yaml.safe_load(f) + try: + with open(args.runner_config, 'r') as f: + runner_config = yaml.safe_load(f) + except FileNotFoundError as e: + raise ValueError(f"Runner config file '{args.runner_config}' does not exist.") runner_nodes = runner_config.get(args.runner_type) @@ -290,10 +293,85 @@ def generate_runner_model_sweep_config(args, all_config_data): # Only consider configs with specified runner if val['runner'] != args.runner_type: continue - + + # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name + # so that it can be bubbled down to bash script benchmarks... 
this is probably a FIXME + model_code = key.split('-')[0] + + # Find 1k1k config + target_config = None + for config in val['seq-len-configs']: + if config['isl'] == 1024 and config['osl'] == 1024: + target_config = config + break + + highest_tp_bmk = max(target_config['bmk-space'], key=lambda x: x['tp']) + # Since we are just testing, pick the highest TP for this config and just test + # on that TP with the lowest concurrency available + highest_tp = highest_tp_bmk['tp'] + lowest_conc = highest_tp_bmk['conc-start'] + + ep = highest_tp_bmk.get('ep') + dp_attn = highest_tp_bmk.get('dp-attn') + + for node in runner_nodes: + entry = { + 'image': val['image'], + 'model': val['model'], + 'precision': val['precision'], + 'framework': val['framework'], + # Add one entry for each node under specified runner type + 'runner': node, + # Again, just use 1k1k since this is just meant to smoke test all runners + 'isl': 1024, + 'osl': 1024, + 'tp': highest_tp, + 'conc': lowest_conc, + 'model-code': model_code, + 'max-model-len': 2048, + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + return matrix_values + + +def generate_runner_sweep_config(args, all_config_data): + """Generate runner sweep configurations. + + Assumes all_config_data has been validated by validate_config_structure(). 
+ """ + try: + with open(args.runner_config, 'r') as f: + runner_config = yaml.safe_load(f) + except FileNotFoundError as e: + raise ValueError(f"Runner config file '{args.runner_config}' does not exist.") + + + matrix_values = [] + for key, val in all_config_data.items(): + # Only consider configs with specified runner + if not key.startswith(args.model_prefix): + continue + + # Optionally filter by precision and framework + if (args.precision and val['precision'] != args.precision) or (args.framework and val['framework'] != args.framework): + continue + # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name # so that it can be bubbled down to bash script benchmarks... this is probably a FIXME model_code = key.split('-')[0] + + runner_nodes = runner_config.get(val['runner']) + if not runner_nodes: + raise ValueError( + f"Runner '{val['runner']}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") # Find 1k1k config target_config = None @@ -336,6 +414,14 @@ def generate_runner_model_sweep_config(args, all_config_data): matrix_values.append(entry) + if len(matrix_values) == 0: + error_msg = f"No configs found matching model prefix '{args.model_prefix}'" + if args.precision: + error_msg += f", precision '{args.precision}'" + if args.framework: + error_msg += f", framework '{args.framework}'" + raise ValueError(error_msg + ".") + return matrix_values @@ -481,6 +567,39 @@ def main(): help='Show this help message and exit' ) + # Subcommand: runner-sweep + test_config_parser = subparsers.add_parser( + 'runner-sweep', + parents=[parent_parser], + add_help=False, + help='For a given model, run configurations on all compatible runners' + ) + test_config_parser.add_argument( + '--model-prefix', + required=True, + help='Model prefix (e.g., 70b)' + ) + test_config_parser.add_argument( + '--precision', + required=False, + help='Precision to filter by (e.g., fp4) (optional)' 
+ ) + test_config_parser.add_argument( + '--framework', + required=False, + help='Framework to filter by (e.g., trt) (optional)' + ) + test_config_parser.add_argument( + '--runner-config', + required=True, + help='Configuration file holding runner information' + ) + test_config_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + args = parser.parse_args() # Load and validate configuration files @@ -495,6 +614,9 @@ def main(): elif args.command == 'runner-model-sweep': matrix_values = generate_runner_model_sweep_config( args, all_config_data) + elif args.command == 'runner-sweep': + matrix_values = generate_runner_sweep_config( + args, all_config_data) else: parser.error(f"Unknown command: {args.command}") From 9b570de0eaba5333fbdd64944ce11556b3969ab9 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 10:41:24 -0500 Subject: [PATCH 053/149] adding script --- .github/workflows/1k1k-sweep.yml | 12 ++++++------ utils/matrix-logic/generate_sweep_configs.py | 2 ++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 80bcca43e..dd8ae9f9c 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -70,8 +70,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-dsr1: @@ -94,8 +94,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-gptoss: @@ -118,8 +118,8 @@ jobs: framework: 
${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} collect-70b-results: diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 10dc07c82..1c3472eb8 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -151,6 +151,8 @@ def generate_full_sweep(args, all_config_data): 'conc': conc, 'model-code': model_code, 'max-model-len': isl + osl, + 'ep': 1, # Default + 'dp-attn': False, # Default } # Add optional fields if they exist From 2bb9dfaa17f4aaac2c63e60f05d84c363fd134e8 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 10:42:46 -0500 Subject: [PATCH 054/149] removing extraneous files --- utils/matrix-logic/get_full_sweep_configs.py | 137 --- utils/matrix-logic/get_test_sweep_configs.py | 184 ---- .../test_get_full_sweep_configs.py | 842 ------------------ 3 files changed, 1163 deletions(-) delete mode 100644 utils/matrix-logic/get_full_sweep_configs.py delete mode 100644 utils/matrix-logic/get_test_sweep_configs.py delete mode 100644 utils/matrix-logic/test_get_full_sweep_configs.py diff --git a/utils/matrix-logic/get_full_sweep_configs.py b/utils/matrix-logic/get_full_sweep_configs.py deleted file mode 100644 index 01e13f313..000000000 --- a/utils/matrix-logic/get_full_sweep_configs.py +++ /dev/null @@ -1,137 +0,0 @@ -import json -import yaml -import sys -import argparse - -seq_len_stoi = { - "1k1k": (1024, 1024), - "1k8k": (1024, 8192), - "8k1k": (8192, 1024) -} - -def main(): - parser = argparse.ArgumentParser( - description='Generate benchmark matrix from configuration files' - ) - parser.add_argument( - '--config-files', - nargs='+', - required=True, - help='One or more configuration files (YAML 
format)' - ) - parser.add_argument( - '--seq-lens', - choices=list(seq_len_stoi.keys()), - required=True, - help=f"Sequence length configuration: {', '.join(seq_len_stoi.keys())}" - ) - parser.add_argument( - '--model-prefix', - required=True, - help='Model prefix to filter configurations' - ) - parser.add_argument( - '--step-size', - type=int, - default=2, - help='Step size for concurrency values (default: 2)' - ) - - args = parser.parse_args() - - isl, osl = seq_len_stoi[args.seq_lens] - - all_config_data = {} - for config_file in args.config_files: - try: - with open(config_file, 'r') as f: - config_data = yaml.safe_load(f) - assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" - - # Check for duplicate keys, shouldn't really be an issue but with NVIDIA and AMD - # separate configs this will help against any possible confusion - duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) - if duplicate_keys: - raise ValueError( - f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" - ) - - all_config_data.update(config_data) - except FileNotFoundError: - raise ValueError(f"Input file '{config_file}' does not exist.") - - matrix_values = [] - for key, val in all_config_data.items(): - # Filter by model prefix i.e., - if not key.startswith(args.model_prefix): - continue - - seq_len_configs = val.get('seq-len-configs') - assert seq_len_configs, f"Missing 'seq-len-configs' for key '{key}'" - - image = val.get('image') - model = val.get('model') - precision = val.get('precision') - framework = val.get('framework') - runner = val.get('runner') - - assert None not in (image, model, precision, framework, runner), \ - f"Missing required fields for key '{key}'" - - # Check if this config has matching sequence lengths - matching_seq_config = None - for slq in seq_len_configs: - if slq.get('isl') == isl and slq.get('osl') == osl: - matching_seq_config = slq - break - - if not 
matching_seq_config: - continue # Skip this config if no matching sequence length, this is possible - - bmk_space = matching_seq_config.get('bmk-space') - assert bmk_space, f"Missing 'bmk-space' in matching seq-len-config for key '{key}'" - - for bmk in bmk_space: - tp = bmk.get('tp') - conc_start = bmk.get('conc-start') - conc_end = bmk.get('conc-end') - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') - - assert None not in (tp, conc_start, conc_end), \ - f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{key}'" - - # Generate entries for each concurrency value in the range - conc = conc_start - while conc <= conc_end: - entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc - } - - # Add optional fields if they exist - if ep is not None: - entry['ep'] = ep - if dp_attn is not None: - entry['dp-attn'] = dp_attn - - matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - print(json.dumps(matrix_values)) - return matrix_values - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py deleted file mode 100644 index b4b1366e7..000000000 --- a/utils/matrix-logic/get_test_sweep_configs.py +++ /dev/null @@ -1,184 +0,0 @@ -import json -import yaml -import sys -import argparse - -seq_len_stoi = { - "1k1k": (1024, 1024), - "1k8k": (1024, 8192), - "8k1k": (8192, 1024) -} - -def main(): - parser = argparse.ArgumentParser( - description='Generate benchmark matrix from a specific configuration key' - ) - parser.add_argument( - '--config-files', - nargs='+', - required=True, - help='One or more configuration files (YAML format)' - ) - parser.add_argument( - '--key', - required=True, - help='Configuration key to use' - ) - parser.add_argument( - '--seq-lens', - 
nargs='+', - choices=list(seq_len_stoi.keys()), - required=False, - help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." - ) - parser.add_argument( - '--step-size', - type=int, - default=2, - help='Step size for concurrency values (default: 2)' - ) - parser.add_argument( - '--test-mode', - action='store_true', - help='Generate only the lowest concurrency value for each TP level' - ) - - args = parser.parse_args() - - # Convert seq-lens to set of (isl, osl) tuples for filtering - seq_lens_filter = None - if args.seq_lens: - seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} - - # Load and merge all config files - all_config_data = {} - for config_file in args.config_files: - try: - with open(config_file, 'r') as f: - config_data = yaml.safe_load(f) - assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" - - # Check for duplicate keys - duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) - if duplicate_keys: - raise ValueError( - f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" - ) - - all_config_data.update(config_data) - except FileNotFoundError: - raise ValueError(f"Input file '{config_file}' does not exist.") - - # Check if the key exists - if args.key not in all_config_data: - available_keys = ', '.join(sorted(all_config_data.keys())) - raise ValueError( - f"Key '{args.key}' not found in configuration files. 
" - f"Available keys: {available_keys}" - ) - - # Extract model code (everything before first hyphen) - model_code = args.key.split('-')[0] - - val = all_config_data[args.key] - - # Validate required fields - seq_len_configs = val.get('seq-len-configs') - assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" - - image = val.get('image') - model = val.get('model') - precision = val.get('precision') - framework = val.get('framework') - runner = val.get('runner') - - assert None not in (image, model, precision, framework, runner), \ - f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" - - matrix_values = [] - - # Process each sequence length configuration - for seq_config in seq_len_configs: - isl = seq_config.get('isl') - osl = seq_config.get('osl') - - assert None not in (isl, osl), \ - f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" - - # Filter by sequence lengths if specified - if seq_lens_filter and (isl, osl) not in seq_lens_filter: - continue - - bmk_space = seq_config.get('bmk-space') - assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" - - for bmk in bmk_space: - tp = bmk.get('tp') - conc_start = bmk.get('conc-start') - conc_end = bmk.get('conc-end') - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') - - assert None not in (tp, conc_start, conc_end), \ - f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" - - # In test mode, only use the lowest concurrency (conc_start) - if args.test_mode: - entry = { - 'image': image, - 'model': model, - 'model-code': model_code, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc_start, - 'max-model-len': isl + osl, - } - - # Add optional fields if they exist - if ep is not None: - entry['ep'] = ep - if dp_attn is not None: - entry['dp-attn'] = dp_attn - - matrix_values.append(entry) - else: - # Generate entries for 
each concurrency value in the range - conc = conc_start - while conc <= conc_end: - entry = { - 'image': image, - 'model': model, - 'model-code': model_code, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc, - 'max-model-len': isl + osl, - } - - # Add optional fields if they exist - if ep is not None: - entry['ep'] = ep - if dp_attn is not None: - entry['dp-attn'] = dp_attn - - matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - print(json.dumps(matrix_values)) - return matrix_values - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/utils/matrix-logic/test_get_full_sweep_configs.py b/utils/matrix-logic/test_get_full_sweep_configs.py deleted file mode 100644 index beee33aeb..000000000 --- a/utils/matrix-logic/test_get_full_sweep_configs.py +++ /dev/null @@ -1,842 +0,0 @@ -import pytest -import json -import yaml -import tempfile -import os -from pathlib import Path -from get_full_sweep_configs import main, seq_len_stoi - - -@pytest.fixture -def temp_config_dir(tmp_path): - """Create a temporary directory for config files.""" - return tmp_path - - -@pytest.fixture -def valid_nvidia_config(): - """Return a valid NVIDIA config structure.""" - return { - "70b-fp4-b200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP4", - "runner": "b200-trt", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 128, "conc-end": 128}, - {"tp": 2, "conc-start": 64, "conc-end": 128}, - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 4, "conc-start": 16, "conc-end": 128}, - ] - } - ] - } - } - - -@pytest.fixture -def valid_amd_config(): - """Return a valid AMD config structure.""" - return { - "70b-fp8-mi355x-vllm": { - "image": 
"rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", - "runner": "mi355x", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - ] - } - ] - } - } - - -@pytest.fixture -def config_with_optional_fields(): - """Return a config with optional ep and dp-attn fields.""" - return { - "dsr1-fp4-b200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "nvidia/DeepSeek-R1-0528-FP4-V2", - "runner": "b200-trt", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 4, "conc-start": 4, "conc-end": 32}, - {"tp": 4, "ep": 4, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "ep": 4, "dp-attn": True, "conc-start": 256, "conc-end": 256}, - ] - } - ] - } - } - - -def create_config_file(temp_dir, filename, config_data): - """Helper to create a YAML config file.""" - config_path = temp_dir / filename - with open(config_path, 'w') as f: - yaml.dump(config_data, f) - return str(config_path) - - -class TestMainFunction: - """Test suite for the main function.""" - - def test_basic_config_1k1k(self, temp_config_dir, valid_nvidia_config, monkeypatch, capsys): - """Test basic configuration with 1k1k sequence lengths.""" - config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - result = main() - - # Verify output structure - assert isinstance(result, list) - assert len(result) == 3 # 1 config with 128 + 2 configs (64, 128) - - # Verify all results have required fields - for entry in result: - assert 'image' in entry - assert 'model' in entry - assert 'precision' in entry - assert 'framework' in entry - assert 'runner' in entry - assert 'isl' in 
entry - assert 'osl' in entry - assert 'tp' in entry - assert 'conc' in entry - assert entry['isl'] == 1024 - assert entry['osl'] == 1024 - - # Verify JSON output to stdout - captured = capsys.readouterr() - json_output = json.loads(captured.out.strip()) - assert json_output == result - - def test_multiple_config_files(self, temp_config_dir, valid_nvidia_config, valid_amd_config, monkeypatch): - """Test with multiple config files.""" - nvidia_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - amd_file = create_config_file(temp_config_dir, "amd.yaml", valid_amd_config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', nvidia_file, amd_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - result = main() - - # Should have entries from both configs - assert len(result) > 0 - runners = {entry['runner'] for entry in result} - assert 'b200-trt' in runners - assert 'mi355x' in runners - - def test_model_prefix_filtering(self, temp_config_dir, valid_nvidia_config, config_with_optional_fields, monkeypatch): - """Test that model prefix filtering works correctly.""" - combined_config = {**valid_nvidia_config, **config_with_optional_fields} - config_file = create_config_file(temp_config_dir, "combined.yaml", combined_config) - - # Filter for 70b only - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - result = main() - - # Should only have 70b configs - assert all('70b' in list(combined_config.keys())[0] for entry in result) - assert len(result) == 3 # Only from 70b config - - # Filter for dsr1 only - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'dsr1' - ]) - - result = main() - - # Should only have dsr1 configs - # 3 bmk-space entries: [4,8,16,32] + [64,128] + [256] = 4+2+1 = 7 entries - assert len(result) == 7 - - def 
test_optional_fields_ep_and_dp_attn(self, temp_config_dir, config_with_optional_fields, monkeypatch): - """Test that optional ep and dp-attn fields are included when present.""" - config_file = create_config_file(temp_config_dir, "config.yaml", config_with_optional_fields) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'dsr1' - ]) - - result = main() - - # Check entries without optional fields - entries_without_ep = [e for e in result if 'ep' not in e] - assert len(entries_without_ep) > 0 - for entry in entries_without_ep: - assert entry['conc'] <= 32 - - # Check entries with ep but without dp-attn - entries_with_ep_no_dp = [e for e in result if 'ep' in e and 'dp-attn' not in e] - assert len(entries_with_ep_no_dp) > 0 - for entry in entries_with_ep_no_dp: - assert entry['ep'] == 4 - assert 64 <= entry['conc'] <= 128 - - # Check entries with both ep and dp-attn - entries_with_both = [e for e in result if 'ep' in e and 'dp-attn' in e] - assert len(entries_with_both) > 0 - for entry in entries_with_both: - assert entry['ep'] == 4 - assert entry['dp-attn'] is True - assert entry['conc'] == 256 - - def test_step_size_default(self, temp_config_dir, valid_nvidia_config, monkeypatch): - """Test default step size of 2.""" - config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - result = main() - - # For tp=2, conc-start=64, conc-end=128 with step=2 - # Should generate: 64, 128 - tp2_entries = [e for e in result if e['tp'] == 2] - tp2_concs = sorted([e['conc'] for e in tp2_entries]) - assert tp2_concs == [64, 128] - - def test_step_size_custom(self, temp_config_dir, valid_nvidia_config, monkeypatch): - """Test custom step size.""" - config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - - 
monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b', - '--step-size', '4' - ]) - - result = main() - - # For tp=2, conc-start=64, conc-end=128 with step=4 - # Should generate: 64, 128 (64*4=256 > 128, so stop at 128) - tp2_entries = [e for e in result if e['tp'] == 2] - tp2_concs = sorted([e['conc'] for e in tp2_entries]) - assert tp2_concs == [64, 128] - - def test_conc_range_single_value(self, temp_config_dir, monkeypatch): - """Test when conc-start equals conc-end.""" - config = { - "test-config": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'test' - ]) - - result = main() - - assert len(result) == 1 - assert result[0]['conc'] == 64 - - def test_different_seq_lens(self, temp_config_dir, valid_nvidia_config, monkeypatch): - """Test with different sequence length configurations.""" - config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - - # Test 1k8k - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k8k', - '--model-prefix', '70b' - ]) - - result = main() - - # Should match 1k8k config - assert all(e['isl'] == 1024 and e['osl'] == 8192 for e in result) - assert len(result) > 0 - - def test_no_matching_seq_lens(self, temp_config_dir, valid_nvidia_config, monkeypatch): - """Test when no configs match the requested sequence lengths.""" - config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - 
'--config-files', config_file, - '--seq-lens', '8k1k', # Not in the config - '--model-prefix', '70b' - ]) - - result = main() - - # Should return empty list - assert result == [] - - def test_no_matching_model_prefix(self, temp_config_dir, valid_nvidia_config, monkeypatch): - """Test when no configs match the model prefix.""" - config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'nonexistent' - ]) - - result = main() - - # Should return empty list - assert result == [] - - -class TestErrorHandling: - """Test suite for error handling.""" - - def test_missing_config_file(self, temp_config_dir, monkeypatch): - """Test error when config file doesn't exist.""" - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', '/nonexistent/file.yaml', - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(ValueError, match="does not exist"): - main() - - def test_invalid_yaml(self, temp_config_dir, monkeypatch): - """Test error when YAML is invalid.""" - config_path = temp_config_dir / "invalid.yaml" - with open(config_path, 'w') as f: - f.write("invalid: yaml: content: [") - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', str(config_path), - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(yaml.YAMLError): - main() - - def test_non_dict_config(self, temp_config_dir, monkeypatch): - """Test error when config is not a dictionary.""" - config_path = temp_config_dir / "list.yaml" - with open(config_path, 'w') as f: - yaml.dump(["not", "a", "dict"], f) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', str(config_path), - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(AssertionError, match="must contain a dictionary"): - main() - - def test_duplicate_keys(self, temp_config_dir, 
monkeypatch): - """Test error when duplicate keys exist across config files.""" - config1 = { - "70b-fp4-b200-trt": { - "image": "image1", - "model": "model1", - "runner": "runner1", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [] - } - } - config2 = { - "70b-fp4-b200-trt": { # Same key - "image": "image2", - "model": "model2", - "runner": "runner2", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [] - } - } - - file1 = create_config_file(temp_config_dir, "config1.yaml", config1) - file2 = create_config_file(temp_config_dir, "config2.yaml", config2) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', file1, file2, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(ValueError, match="Duplicate configuration keys"): - main() - - def test_missing_seq_len_configs(self, temp_config_dir, monkeypatch): - """Test error when seq-len-configs is missing.""" - config = { - "70b-fp4-b200-trt": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp4", - "framework": "trt", - # Missing seq-len-configs - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(AssertionError, match="Missing 'seq-len-configs'"): - main() - - def test_missing_required_fields(self, temp_config_dir, monkeypatch): - """Test error when required fields are missing.""" - # Missing 'model' field - config = { - "70b-fp4-b200-trt": { - "image": "test-image", - # Missing model - "runner": "test-runner", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64} - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 
'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(AssertionError, match="Missing required fields"): - main() - - def test_missing_bmk_space(self, temp_config_dir, monkeypatch): - """Test error when bmk-space is missing.""" - config = { - "70b-fp4-b200-trt": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - # Missing bmk-space - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(AssertionError, match="Missing 'bmk-space'"): - main() - - def test_missing_bmk_space_fields(self, temp_config_dir, monkeypatch): - """Test error when tp, conc-start, or conc-end is missing.""" - config = { - "70b-fp4-b200-trt": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 64} # Missing conc-end - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(AssertionError, match="Missing 'tp', 'conc-start', or 'conc-end'"): - main() - - -class TestEdgeCases: - """Test suite for edge cases.""" - - def test_empty_config(self, temp_config_dir, monkeypatch): - """Test with empty config file.""" - config = {} - config_file = create_config_file(temp_config_dir, "empty.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - 
result = main() - - # Should return empty list - assert result == [] - - def test_large_conc_range(self, temp_config_dir, monkeypatch): - """Test with large concurrency range.""" - config = { - "test-config": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 4, "conc-end": 1024}, - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'test' - ]) - - result = main() - - # With step=2: 4, 8, 16, 32, 64, 128, 256, 512, 1024 - concs = sorted([e['conc'] for e in result]) - assert concs == [4, 8, 16, 32, 64, 128, 256, 512, 1024] - - def test_conc_end_not_power_of_step(self, temp_config_dir, monkeypatch): - """Test when conc-end is not a power of step size.""" - config = { - "test-config": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 10, "conc-end": 100}, - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'test' - ]) - - result = main() - - # With step=2: 10, 20, 40, 80, 100 (last value is conc-end) - concs = sorted([e['conc'] for e in result]) - assert concs == [10, 20, 40, 80, 100] - assert concs[-1] == 100 # Should always include conc-end - - def test_all_seq_lens_in_stoi(self): - """Test that all defined seq_lens work correctly.""" - assert seq_len_stoi["1k1k"] == (1024, 1024) - assert seq_len_stoi["1k8k"] == (1024, 8192) - assert seq_len_stoi["8k1k"] == (8192, 1024) - - def 
test_multiple_bmk_space_entries(self, temp_config_dir, monkeypatch): - """Test with multiple bmk-space entries.""" - config = { - "test-config": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 16, "conc-end": 32}, - {"tp": 4, "conc-start": 8, "conc-end": 16}, - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'test' - ]) - - result = main() - - # Verify all tp values are present - tp_values = sorted(set(e['tp'] for e in result)) - assert tp_values == [1, 2, 4] - - # Verify correct conc ranges for each tp - tp1_concs = sorted([e['conc'] for e in result if e['tp'] == 1]) - tp2_concs = sorted([e['conc'] for e in result if e['tp'] == 2]) - tp4_concs = sorted([e['conc'] for e in result if e['tp'] == 4]) - - assert tp1_concs == [32, 64] - assert tp2_concs == [16, 32] - assert tp4_concs == [8, 16] - - def test_output_format(self, temp_config_dir, valid_nvidia_config, monkeypatch, capsys): - """Test that output is valid JSON and matches expected format.""" - config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - result = main() - - # Capture stdout - captured = capsys.readouterr() - - # Verify it's valid JSON - json_output = json.loads(captured.out.strip()) - - # Verify it matches the result - assert json_output == result - - # Verify each entry has the correct structure - for entry in json_output: - assert isinstance(entry, dict) - assert all(isinstance(k, str) for k in entry.keys()) - assert entry['image'] == 
valid_nvidia_config['70b-fp4-b200-trt']['image'] - assert entry['model'] == valid_nvidia_config['70b-fp4-b200-trt']['model'] - assert entry['precision'] == valid_nvidia_config['70b-fp4-b200-trt']['precision'] - assert entry['framework'] == valid_nvidia_config['70b-fp4-b200-trt']['framework'] - assert entry['runner'] == valid_nvidia_config['70b-fp4-b200-trt']['runner'] - - -class TestConcurrencyGeneration: - """Test suite specifically for concurrency value generation logic.""" - - def test_conc_progression_step_2(self, temp_config_dir, monkeypatch): - """Test concurrency progression with step size 2.""" - config = { - "test-config": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 1, "conc-end": 16}, - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'test', - '--step-size', '2' - ]) - - result = main() - - # Should multiply by 2 each time: 1, 2, 4, 8, 16 - concs = sorted([e['conc'] for e in result]) - assert concs == [1, 2, 4, 8, 16] - - def test_conc_progression_step_3(self, temp_config_dir, monkeypatch): - """Test concurrency progression with step size 3.""" - config = { - "test-config": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 2, "conc-end": 100}, - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'test', - '--step-size', '3' - ]) - - result = main() - - # Should 
multiply by 3 each time: 2, 6, 18, 54, 100 - concs = sorted([e['conc'] for e in result]) - assert concs == [2, 6, 18, 54, 100] - - def test_conc_exact_end_value(self, temp_config_dir, monkeypatch): - """Test that conc-end is always included even if not reached by progression.""" - config = { - "test-config": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 5, "conc-end": 50}, - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'test', - '--step-size', '2' - ]) - - result = main() - - concs = sorted([e['conc'] for e in result]) - # 5, 10, 20, 40, 50 (40*2=80 > 50, so we include 50) - assert concs[-1] == 50 - assert 50 in concs From 34ba318804a5aada119a4609c7cd6e132e8843fa Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 10:43:26 -0500 Subject: [PATCH 055/149] removing extraneous files --- .../get_full_sweep_configs.cpython-313.pyc | Bin 5046 -> 0 bytes ...l_sweep_configs.cpython-313-pytest-8.4.2.pyc | Bin 55816 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 utils/matrix-logic/__pycache__/get_full_sweep_configs.cpython-313.pyc delete mode 100644 utils/matrix-logic/__pycache__/test_get_full_sweep_configs.cpython-313-pytest-8.4.2.pyc diff --git a/utils/matrix-logic/__pycache__/get_full_sweep_configs.cpython-313.pyc b/utils/matrix-logic/__pycache__/get_full_sweep_configs.cpython-313.pyc deleted file mode 100644 index b29a85d117b2da207801b8a1143e67830ad9f45b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5046 zcmb7ITWs6b89ovz>wcH4i)DwFmB_LqyQ$qaPMdaVF3!zyD~4%Or4ST}c2rB0LP|}n 
z4PEvyK-~t^K|93ROZ@1f1DeM^^ua)}p%{jhno@{igTO%d5@5ibC0McbVfzmyN=_OT z9YgT^|2g0J@Bh(p9$Kv?1kYzb{qM!2E`+|pjQWsm;$Z@a-y#~(niGg`jT&B~@LFE0 zj08{6TAH9qS~o%PB;6N8yzYD-t>1SRp|k;^^SUs4t}T#4i<(7E%y1v`r()Xs2vqR| zFfpw%+((o`V+&gIgiP*P+Oveed*Qnm=nO1&9t%)dz(ZM&ZB2F-650;gA;RDZt)0q&xtttVktsqUzH{hX#XN9$-KZAw_X2$lN(w@nC?giRXk^5ES){A%ndLm z$XsZ9=6XmO>`1ZpKn!&dwjPKb9fZ9Hg6bd~JrKhkgp)>R?=g19!8oUNFG4-Et1Fn7z^~s6jY6yaX&RqBdTOZi_cgks?aAr(2)+B=%69`jGJ>&XkLqJfN^!AjWhIb zSam&B^G;P3=V3gZJ*^Dw`o1@e2R!)`c;aQePn8$#$m{Qc80{c@JrJ=D!ruci)e=liEWwM{S@DP+yFDc`1kE=XW$)(nw7D&^2}%%0^|(P@d+arVgeW7 zXk-FRh#tdj$^;cY2>iIh8xxT4Cm$xHT16p66aGEL`S};IGoCDyFP58 z8GqMKW%p^2@aIQn^O_=1)~ zJD|2GG^?Ha4x#dMoATf^eF8>u)ggy^ed-f0J5Pm7N|^-TgWX|0Ff(l)Iu)L3Nejr&ay&=&z10G)fSJp zP;q5h){LL7#^*VfD)LmJ$g@wAL5-d}Fn5w#DDt2z79(UGm*y7**}$`xDmk9b%H%~h zU#gnoae=)Q&$FCR-9f_|7|Bw=E|xDoE_WYQ)$XQ1l0I{>n9HyNk8d$Nz8GK&3&kv( zkC%9MA$O&^a}IM{prP9~UUV^{Feu42G21Qe4PY+57l+wHl{^r!bG*zGC0m@ez_VwR4L_i8JwY+s>a*9hu7sLXKOcqEDY+ECQe(1!om4~l%@G{)u7NUmzA|_sjAPG;^}gktNx^; zpj?XXrlQ>)z=3V83D~Bn?HJKj6f^SpHpYALryqat^^31Vz(*#r@Ys-S zfGRJgd4ZK}2l&NGf#u4tVV0LI>1;LytIC{A<{?3{1zb!)v8RM`F(;Ea$ujwNF~`Y7 zjw{P1MH1cwjnjq-vISH@%cfe*NES;hCzH$RLS8ls=>;~GFQ&7y3DU(ubfBAL5*L_E z2rNiEgDdwi&lh>QPbl(bXvDrsDGLQBJC8u~pDLD*;3{iDbh$EH5M+}QX5n(Kd{MS2 zC6Q{imP{;y(7tpDY{-IoXqXgk7+F{1u><7W0@REyTPc);*r42}_`}O2_NW^{*`Chl zQ*E!nl(cN#=Ax`w=?b)2PIO(_#FDHFiRxcZgowxkI?lPSB^noHp= zEjR=$e@Vvmno{ajCV*@uTd6(B8PuMv)LshQ=R>#-p!ia__;`H9mUWPRep$dY^+S(* zY?6T*5hmemg!a#6CXaCoEIw=4xdUe>E9G2Xm~6Qj&leYA9pau^s6g0+%Pa)F?cqde zStbi`s_?IYN_@BpQ}9>eRdm3pHns*JqA5 zW==LH=fsIqqIJH9Aitt!dP}dUAwi94n;P$3t3z@)AGgDXEUue8KCNuU%fPHhg2XqmKjo1J@U?Ew1Jo{iC(n zO}B4NyLSARLEJIZaL<5q);BehDI z91KhF4v*n?C?X9E;WrqPLevAdDd1YsZ3U61|Jut^II?nhHFE8QnQ<4Z6<1|K*>mjxdG4+9^@AABldBnNkhRO%!2 z5O~DTE3+@b4@)J_zRaHi7GGtBm*E5EXf$6Vv+|n0Ms_&1tO3d5hxdV=bZDI}|iKIm8&$50jS=Ntj(XyUltCbxrM1mA-3ZNfI z*`nLVNt&2W(ne0>6Q*%nrA@b@yQj6Y?WvP=&)V5`ojdp5xpU3qaanME{^g&aXc@Cuen=7R zva5k>2doy$ix$};Tko?dR`$2(UR$y#V7b?>I0F2RQ*p98#Y!>zD=8BDTcVV(zb?gv 
zzeV?z-s@J}tc>kG&%I?z*}Y!H8?Zd;JQC0ztpAnm^_ER)RH;}bkHe6sa*;evLmuBE zd5R5rs${9&qEy#fS}k%(y+w9yD&nu9_*q&h(%d4ghNXFsRwmMFS(+DV@{e(P%uJ98bgs(cirpmnn6 zZY3HSj|L)v@#x91gc4Cs2ZmxwbZ|U@&~QQt3?|~kv5~-VY&4qenL(MXG=6d{Jcg$x zsm?%lpf{9^D#_j_Ct{yE|P7sU3SP$xfuT>xdi`QJvO;icJokgk?cXdST2*ja=Dsjr8L~B z=y9&MqM0fuyN*Q1CzLpv=~QGiHWV0m_~71yyS27Sjw=&`xF7AgUcP1h@$TWVjorul z*01j#SH`dR#ZL_?J+VYv{8Vfx7Qx5IlLYR8GS0DJaOF%EbImr zcB{Rr6r&CfN-0GBnDw!y4D(M_yIWa}JM_ThC;+-zvXnH;S09|O-a79O%-1#Hhwg6- z%r`W@)f{-m_PQ@!d(S*VMX%e^wR^8PY_0yci;F8NuT<)})O)rfy640_UwNSHZBNZA z*A-ix+j+%O=Wr_YVma94Mgqg8bqFkiO)j|sh}sOnJiVdp6(~jsYQJ z@xk$M-}=5y>+k5>5>^H`OtwT$4)rdIIa|B$dpPUl48P^g|j zHGu{KEdW5hmQuoc*H5+@d==Gv>?cG=OzOhNv zH{JmHrh@vW!mMuyMOfd^LrnXI#yX933IAxcdwT3FOye9Qy~ojO!}zAYt5kcis!@u9%bSa)Q6JPwK&9UV&cOjZmfl?^##UMa&{O*W~6 zcrVDC9F0EFePl!T`b~XXY4~DhRL+Xr@p%0oL}c7pf2q+7& zN)6qv1$gMP1%o~4V7HX@NV#6DzMP(Enf`0wUmB(_4QpQ-=AWkYqe^PvWCVbBWg{Dm zn@k6z^))43y9a~uob|QJH!&~|vG6NHZ?>h5eI#8QzEW(b5h!5<+FaDj6;0@6AN8_N z>t*Uy?V_2A_3|_VFItXTyD>=Pwqy8eeuZqZRht%zEOGlWb$Yc}ju7d}yIL(r+qK+a z9g66m9zRCt&90Dbllp{qON(Wlg=i98Lr5Ji3Rs#g0sI@N+hJ?A3|oV?$@T-$(Xpr! 
z7*7NSwdwSib|1Q58}c-F_GGP34Fv63dtxja&)QE%PL5{nLlY;*vNlllpaZii&3c&b zvJ&NwgO+AX)!rG3j7LO-X5OrIIO)JE1?b}Bw30tsn>VQrWf!u9@jrPdz_XUOD(lmg z9nao(sl+q&#B~4plIF7;E=lgsm3+G7%-Zu(%R4rUEAXSzvZ-e>r41>2!#fVdFO)Q2 zXA|!WHFw&+>DpaVJc!D~rM8si+CxZs(efx`?UW+fs@UWr#V*^x06B;O17imk3v3;j zC@^(klfcpud*mY4NGX+FfNmKqk>ZivfMv2r@s5-Qz1i}Iz%U#Fd$Vr>)DDC3@(3O_ z*+R*I1}w75)=<$Vx`!+72-%LP zVV=Sv=Osc8m79=yiw(FVo};muai;;FoW~@sCap~3DGjUkTZh(Kz#hu3WAujXIzn?# z-qk8tSnA9nP>N$jX4rLv$VlGRYB@&Zhg}*zYtM{;uoy}c>B+myN= zqj`W`hSD?@sBKTUA#(!NG}%YtxsL20yq`_j(sx7+#sXy;Iw+(KfSxL58-d^A+d*1r0R zB~ieNrJSu;I*-0{BP*6=*EagdJUD=F;ZT8(^tkm?OnCED`;gbN@2ye_Jb*Vk=Lms zty=DudqUOfNPF*6w<^f3dnY$C0rl}nGB&8H0w$CQ$*;L$X~Pp6o(MowIzg)P07U*H zP^tG^FXeip9#Ropmy)NGJrQN()b*0&pb{G!hl2ciS+}ZK?Pgl}>#pu@RpFY<+NoB` z!SuAp3d5!X}rEEl7yaqJkYUv?DWQ9()ElQ2<%`<>x0SFl(He1T*ngng2|xk zSF*RW> zvR``|0t=sphXi);ci0?EW^HJR8N2czYEg~=P_6w0HeW9@e8E=AI(B-iLgQ}6nk|aO z@wiI$3j$CBs87y1qVaL%^h^<}O*w|_ITdZ<6>U=5=!rC9l}-Zf1R4ntW#IZZI}|d5 zOjT!mmiqE&Q|%$749R&2K1_KOiCQr3_M7)KrK+vWT^P`^SI z2YpPjlPyk0$DxT-6q3R!hX_0XkagcRb~=0<#u!izC$rAc#9(AJnROwR8D2z`Y;8Ds zA~7*K6jq>543CT^jz>nrOyg7%PDaRhV@%1G?Pll#biGGW5=zz=hLQz#9O3BGW22Ec zLuBQb=z$esJYkeoHkdd$7Ez*E7xZ95_@1buWbO2ES;uH1GK4NeV%7nbS8Ock&)SpG z(P3ua!8HHb(vyk!6VcO9aSfixItL?TK!2h5Vk)fxdSEGiA;*P&9^tRH$#|#F7|FS_ z(O71SF>;3jnPyJ?>y5W_3U1CyctJAz7*d|dE49H`sy|5rd#0*e0J!eX&pL(?3Qd1U z!#LYT1jMNL>M_swIOgR`ch^pLQYd<*#1VA;#NsF^{Yjan(*IljDd$g|mh$@P&abv-yj>}&>!P=7PRjYE zrLNCTinQDqPrfL1O?Mh|7-AyFSKEy_wAclyD`ob?OWw*en?LhRN@|{tFp%~(&q?Yp zBQ;MwBVs&oL2CYNQlzDGVm$ex)O=>MF^3jIQLQ@EJRK2vL;%TXCV%79J@dZ$l)G&@ zn)bELxz*pSyY0*t9y_lG#ciZbM@1~16BEf9ciSzkx+&%EI@^)kdq%$+=T zUJr`fNITmhV(FZiNY1#s=6$!Y>V}lNeR?A8YoBwgzgc(tnXNo_UJr`fNSmGzv2;#M zBxl_1^S;`YyJh;G7k19MTljC*-Ew9V51!Y9;x^Kz?-8+dPD~_c+%31V=9-i{Fum!8 z9dqsg|INAsXEyTSc|9m@BW)U=q-IX%#6)t&9hmpkrQEI4p%-?|xm)>f*4>KX!D8q2 zpty~+>5zz}b7CSn<8DQ3Ojp0Kose%M1Nj)ZUB^H*2%I{LZ_*K!hZ(BftjaFByWXzB zc)rqTsj8pzwFAR4zRoG>lE40okAMF0v$cQJ_?5=Fmi}~0f2L(q#=m*0blzJ(J)HJ} 
zh^fDf)IRmh^stEYzy(SbY3ZC8PrfL%8%iOEAtnlerWh;71QcCL2F@4LBXwlW`$FilDUdr_C^ip`MGQ%^> za|^?PWpfHuwCq(C%%w1ztq7Hxh$!@TG&ff;6H!z`!7O26J`*bTSswNzi#Ax2MLx>` zU|8ZGWFH}4tmqJ|M`_6(FN=HRDj__Z(prcqpY^Ak<5+b_0*?7}l+St@^LEme&owu_ zpM@G(`-+14S(HLQ*T}U(`3yz0sq$I>Li0W`Q$C|l#(pL@+-_$YfVaAxM=EJQM6xLE z8MXu)Cj-R)b78?rEW2Z4qfw(xVX{Zri|V1CQQ1) z0G7+vomh(Y7PdIw$(GWzgr()4LQ62^mkB&Z;3EJ7e6`F>TFKy0i2!7axsiBM8Kk5k z0#O2M0A}255v#;0!A^kH{Un7>5r9C}qKpwx2qX!N6PN%9Rw}r&u+9x*ahvtX96&LsBgydD%R5z?j~5i#PM^q5En z-!$*7NlC5KThiXvIZ6G^O082*^VoSkC~hNddW(pqb7CSHncltXu=utnVWwyK>vd)K zb=b|VO@D+8AMw6uS#Jpy-D3^e?y;Wq=CEYehrEI%b1cG=v3)Fs+n$EJnA~7h`?y4w zS{iF)&Z;H!iSzft{E>yHK24ZTo?3l^)mnx%TgWV}Mw`*aZ29!krYz9H(jda6h6&iT zl+FSzn^Ukqw2OmTZ?v2)(R?HtUpOuFKRU~w%8kTqgTt1{;`2AuDI!aB^U$kd$hMKsIL~3MqgFH zI&>u%MQpL&7G-)HXd$$^xy`U%${Z&|DZEvs;Th(+g*D}}IZL&+5_XqhOksEFQQ6|M z72uO#>9jgIKE++YslP3VI{25hOgG2On%kJEVVL$e&*uI%X0nDc!^{y+uy%3-V^_yd zM8*S5`xM~XC)zs-ML;Y*5}@6INwBaA?JylZ-81Qd!r*u;9xa%E&DqmRb8kQXUnBNy zr?m~MLfYAi6=OTpsn}zzPqYsenaMh|sZ9GM*-C3nn%GWRUUUR3hdTLMbEIgCjkfye zEc8IIh-+Ky&{rgb*lURmob(!5ul7kQLzpBPOCazp7xkBHABo1HK~Z*QcP+vq(gw>g zZ#rUKdDHc5=X#u-J)(lIq26Q=;AfY?Ii@NPf5-p2Xa0*fsU zu+IrQbySV|4kTvUO6ocg5~+FKVbl4{gsbWXn??H2s}&RV+1}< zz>M$z9o$F~{mITy_>WDMVtZss)k?KCz6Y&;~OP;yo}{DITMEZtPYW(qFJQm!tEJ`l>J^HMXH?b6WHE@HchYFsPAih2pzu6@a^ zYTCy4<+iFhLJmmVoXMhIE^Wgmptw^m2|4e?l5_|tdkC`Tkn?cSA-oNfycOqcZxb@g z+fv^GWYfe%=nU1PFyCOT4W&pZk#~V97Nx-6{=!Oq4@?33JRuL4_F{WcRpQ@5 z-fa*W85XLTFAS|=x=024%_Y45TVF02%9yvHV7`~lX^;gAscWrVC#0^W24g+Fx6zCF z7y3;sA2comEI&!YvE6FybB8{y8mQ%t(P}Y9QK2?QA@$YAES9ir%!1ixW40n^%nC{G zhjh#~7#4M=OH7S3Bwlc+kTEM-bVXxUl){*8SQhJqsAAciLM(C1H8mC33TQpBSsk?{ z#6M%yHnLHBV8x?$6&tlpM}NUbt=ydWF{faOKfyLzP%n(LK)f>K3RN;%_A`McV2eXu zEj9#+w4ra!DO=0Y&uuJ{TaW&aP~}FOwguNv>b=r7UGO1k)arBH@KLn_Dp;k}rTx{ZAn63-FkOzj;+6ZYhwfM*Mbs!{K1Hm~p@`Mmb3ddMvGb-= zMA4!eb{C}66s15BJD0`k4yv8@gz2xPPVm+-f#61b@+I z1_Nii=EFTlf|9?;DY?*kJBP6X4UqmjClp*edABZF)Vvg#`(p~g`WVLFgQF8e(V@Uo zC!%qZLMCAX4$FG(PNxu6`5$1?U@n!IzFav&QH*FVV)nbiP7- 
zm~xXVryxw>x9y1C8*4|^;dmlEG^W67i7IUyp%6Le zP)H{QLF6A(m_V>dw}4ll0`qpBhCo0~+XBme8#jlKq7>QDZ<#b2*WbZy;_t{-(c$U~tsW*DItH!&P{to!+9=yhX7#qyvSxi|)$W)5XLrA{@wF#k-SK;o8GlNuPp#Sw zkzC5xk@0q>q&4R%7)X29%t`7mBdtk!VU4Y)AbdeuLoz@$Eu9nN$rn+~*I~>eq9`nC zMD6D)L@p6P^7pE{$UM4s+W*3il)Lln9tP6B&N;XGn{{`df$b{A&g;Ppo7B!;wbGt<@5kT^# z>NUJgs$RGgX5(jD7|6FBhZQzil9TPY8iY0YS(qj3CgwcMjKr%`?(XSRUs^?fR;7L2 z_~sPSnu@27LOA@si>D!kP7KRT&>*|ELG#4KUl3X|aaV;Z?#dHU=Lop5 zvGiHd@=BD# z@~V8<<&`c*s!+w8rSPNWyJ-f$RSgG}(`bJ&M_q@v%Nq-wx7_z6Yu7F#YhSR~a>1n3^rDHsoTv$`)k&XH=W~K7fINxv^bkn6}%38UG!>-#Pu@xuKc;uRJ+( z{}-S9O6OE*%74eDs`{B{G7WdW{LHzQ>EV|S&ODQDxHD6=eX3-BRr6Wj^ud(WF?}#i zQ-=DRl{%(Qix>}F$Vp4u1y_l$N{ zH%HJeDA*0tJ)=FExnNVLdp<$;$fE0JZFKN6y5~2gdo`1OM)w9M;KTi-PWB3>c-Kp@ z9~`5;n?`-sx{><_UW@`NuK}|;ZTSX;bQ-68lj8pgAh`&Isb-5OopA71&UVuKFI+C= z=ldDe{NDgpMqlRLb<_P11Z&Io2BkqFD%vH8 z%vGeCy0umv*!RK&R&_iM7xg(Vkzsyn*yCjnnc6*ZclRMX8E48qxoR1cL5E?FmqRc~ zPFYHvQXQ!XBjN)0cxl?%5nVeguWVNhIdPS=hPxpLxO!<*B4BEGj9oT2osknQx*~!m zN`aB9HME53x9A45%jOhT1ah6a$E!rHmm5MQs!qqX0-cV~BxyJUiEtzII8Kni=U~ph zil!pDiD0}`E=l}t$jwZnhqaa%0uN<)V{?62HQb+A<>sS*spFENjCp$qo%^ym#XdDT zpiTuIxkYXj#=RvgXb&;110K1ZIay$lc?kg0Co(C zr+&V!gMRg0^Q&6tSGB<&EzmyS-1b&m`zw21-;-|KkCP2L>v8Htwf~AuYOln}290Gn z*`U$|UkmM)njkrAc+;QpA6~gw@JAmsuL$$r>NAn&Pfz>P-atwsp&x-X$sg6qxB@(*I&}xtDn_FVFE_ z{$brkBU)6OZkd8!Z$a~(7Nx*=7*}`Zxy6*XECvZ?LF}T*_g)SI4t^QxGT)VD=Di&F z#J!heuY!NFpU6JnT00aQX6IEw#FXodE7xh4y4%)lawM-?Tc6N3+2(9gAO@clVAsen z-LORgMrDZ#*}_;LE3rL0tfC(z%w(33RW4$8ZFE=Nkg!TsncCTg1P6Ce#in%Ju_)xG zZh)QC1HsixltOZ(BxDwt8rgKH{4T=TJI!Z3(G+1c1)U?7G}(8sgXTbbRle=u;I}$o zd+&4dv73C8_aNI8W25F<+`MF&B|wB~Q#XQ<}dg zsZbHV%dmDaBS;^or{&4&X$h+@QuqR;ju9SMd9d%s;zUs2>>6{@W0vK727 zmAVZ8v8r!O!Dn3M^S^?9Nc=Y|HB6lnF&?;(lLjwyS`OTpuFT7fDC)R9ywO8+eJF-E z+akxUuuO-yahqr%@@+WxNMQFQoR5-H>|rz`7hju=Yh|8pe-|KWHt1F@7K$gHi0%WVhIF%M| zLkG#IN{JnZchDp%f;Jw>j%aU- zxOU0bUL0Wfpg8<6U|H_yLw(LC|892_6lU{1a1_y=`(PX@9JirES4_iP5iR@LwXEuMkA3>F zm$s$4cVs*}FS*MHtM> zexb?9nBr_pIvH~j&k^FJOLV-Q{^=bL#G}P$-o;NpG^m3_4LDsDI&HZ$N0lUL#m!y) 
z8P@S-T2B41#xOEeB@I2h45*s-#v}H{QFY)1N7co1jM!CVcB^i;49Lt5^MsXaEY6O* z?U{6IA#&2K&?`;WsY%-cl(vm{S(>D_1&$4vi45}aI#{A$3za47cuK)>mfR~AA&~N~ z1pX(1Ep)$xUPVXd8Al#6D?2FbPJsOLK`R(yt1lU5=a056AXgltjhV&>m7hm_KxKc& zxw165zr5tBIJ0&-dEV1@_Q`p-=X3j?-=C`LJ?~ymDBNzc)_rO4?AlCI&myje)v>Vg z_jZ11=LOFiF&NgSJZs*mMvWIdZP(wavXrgG-Rnv0XtaF3a*xyYMqlZk8r!#=u02)7 z->Px#=~!f#+*|N4S&oW=>e?Q(h77~RWZ2MzmUpScM$lVmXz@;TnGPEo;OsI#2vk2x zf(|xTY598{AdeM1K*}bWb2Afak0C;oIf5hP**UE^O}=NcgnY$wf=&Ykt}<-Ml(k0S z7XsM|86A^#Ct(c1h7_IeYOG(mOm*_n!-j~m6$z#TXq;}?30wonA0Jh0d|)EN8N4`B zn=eJVDT4{o&&&5LjgrrzRz6Bf)BgV4Q39O)+`&&Dob|6cxBI-i=RJ*+ov97?XFP|* zFd49>Jcn||$$<5O=a4#3>i5`eZ}gV#sj_{`=Gs$H{H-e2p4QxfLURq9Ps-Uqks){F zPGDsx5Wbks$ByHqWffH&hE%)mUKslo8Tk#on5Awy5V1-G;2CJ7KQ$X>Xeg+oy? zO2~0LOTEUfBg7u%T{x>&%roS7NgWXMRqQg%7=9W5^f_+Czr=jtSLcJ8$vcQn8qs}d zVr(=v7{R&2noJzt-i zuTRU@H|gfsxPgr#E!J1aK9>7X?_}d%@xHtw?HCGrdp3Xuf=+`FM4xkhjN+VYD$bi| zvc>8)JZs*73HdxCMIE5}#|T_=?}+StiR*{FaKgv%6jyczu7S{QfD&rSeuo%)qm$ zPV=Kn&s^Z*?#p=Y7bEhJb}qpfuJ7TX(JTzee3<4*8#v^>f;_vvvg zFzk^G<7+X)9>#;gum=+Zji(%Ltng8UF}TDis>JxLG>oFkB}P%DI*NP?Ck^$?64i_W z+!X|WobTr2MtKv>qWlek?+_qar9#sjSp8NCw-IP3&_SS+zzl(V2#EDXp8Wz}N7NnS zj|z0sD3LZ3@-MHvQ<7?4U-rjLI%@?)Q(eOVH^}7fG3&sB$4gd3Ajbp#!0-7nMgln`2WOSTU@myrq z3ak6(mPO{g{G`xH{x(Nui4pCNw^UP_R;(hHHKievzhYCG#Zdj;jORW$z@ZmcO+ z+#oFnFSP_MuL8$ZWuWC%OVIKvm6lhlW@-imKYsFw?&Mfx5PNS6rQ!^|v3a$T(3UB$ zTPRd00{b)s?h?;dN%!K&%g@=-C@ac^**kLu?-Ddv~C^}b#jC?n12bJecNFt zdoL&1dwB>+y^J0Q-a9?sm+|ZuNO}NBy8ohQze(YSFQ^{8q1z0&2WwzCb#1jgPJ1okI@Hr^(d3>yEU7$;UJ;d;SJOMAbES7W4#sVzYXml?%Mj^Xz+Mbsq~{6&tqZ$d;|mFv5TI^t}7K7n5trv#>ja9a5Lt98UIIP>eggl@?h>G}!}SvXLLmYX)A zU9BwxfWiTw72tr?54gS=V2`!Y;rr?KCWnvLMKuiKzqn8vyRL1=pBvRhD5WE=U}@IF zx^`$JdUph(@7Ox@++;n&{p1A1iUDkm4@7Z0Fi7`v^$ubxOW}>kU3uQF0=O<0OZdeW zAKXoO!F~sCTUzLL`M2ECf^VjLDmH#15Is3IewqlLCaCC{bg{HTB?^UTUV}#K2-34C zVu55hi{UeLV~W7@0H%ssF1Tjzr%C0SbMTxCt&JIaiDZ2R6+#?W;hS=D|ElZWbY}by zENy3<%N+7E@xY1RVsi*pG=>+pScu=G__e$65qYa8ZMH7Pd~E|`dF$qD;`-U83D4p= zdw@)~Q&&@wMYdBHLyYKu(3*HIB(eP+Us|o!UeI>x8g}e*?a`+Vc2#vttBy<@cT4Ic 
zE#`ZLbnltkyA|ZEO6-<_T~(bfhFrv7nlN*g>s7ldm)syXhFt32n)m88auw{Vrbkxk z+X{nCldX*YvvEI)gJePb!S_tS0e5`xbb#%L=uy6nw#k>atfjPd1oRCf%mqy~rLBm+ z`CBS63;;PjRdJd6LJ1J)CQwhHiogbdm<5Q>A1VwyvJQIdWWMPZ+o6c9*trVbd_wao z)!d9(&2;lLjW@QwRRLe-@|}&W!HDRnTS2m1W2Q~8obcplqH&un!sC|TBC`ASjp@b% z3pu8nwnYTzR94i?Y|hm5y}bEsWV-XEl9|ovn!ZeV|CA$JyDKH_O4aU4dw0!A>Tg!s zH8mk(Ja8c=Ev?6J1I6=lBZ^Ai#SX^Wes(y@8{V?L4U6n^ycd*i5gG?ovbw{OD4oY- z30+nIN8)AYliB$$N4Yv@3B7U}=U;}#)!ZWjD==L}v7dDyHX_MjHDYp|C2lM5XEdwo_}_S?i>z-4oa4G40J+6rZI ztK1eUQz^Z71-k}?uVGDRj9rvyg}56n@f#ye25R3v$rg9q;Eu=^?G~g3E!WD9C7y~Z z`VaFLa2G1EQHvkfYya`w(&4) z*l=PP_HEHIzN7G-+(}~b?1MC2JW$YdvFG)>|6fiQ-SDK9s_izq(wddJX}ZvTYatA0 zScWus($Zq&C95_kD=&#G^WLg6N1i`TXFK1`fboFmS)9lWcOT98!CSr-q{Er-7H2Yx z0CJFD4}I?07m#wc;}ip57=OKu%{|8uLoTHJ_m_8n$^C zJdTUq09u`(DCA?;3X0NaXc6ws8G9OTw+L@U?lcNG;{I^A$cPm5OfnN`?9#w-KR6#I z0QW+4kPmItU^~ayVLw=HXlMN}i#G^K*>ZvbM)FN)`hb!`b8A_f5LV`R0t-`F*elTd zZ}7I1q$pF*PVoC|QScn4K1<-^1U^RKlK_k}{Tq`m!flKPvK`cvLE9zAaeXDGMGB?3sk^g*r< zpTK0BIX0?f&&kN_OCY#QvG~kVp@f)yrswi7w%;a~Gv>d-gzc{^fbEA%6L{;f!`cT9 zJLI~;*?ygf%JsSXzBN**{f(1tR0A0P1ujsSP(u)2a}z!+`ssLzRY#2uP?~hZK64?3 zCXBfgjmr)t(4RtsBJW!CoIJf0F=Ba_u@r zC1{0*l`Cp%Q3`F{`2lHbFY~VXYwxM8 z|HaVOT{pe0#Zf>jYHLvnZQb?3XzP(yRX|WGoU7-Af~z&BL!=tg&ZiCtql$p)ezIUu zfkCFrL_cF6OOt*x8ZEn?>_Z4 zr|Jfw!bj&w`jCS=C1o;oA^afEquepkLSeU`%8(eHGHqZIh$M5|NAZn39$aw1;$Kn$ zTQWLs;FXv3Od9kqBTw(69%StFXXuTl2$1%UxtdV=>F(Dlx)GryyjJFj*THjhhMS8~ z7H?ZqG0NiEe}me624H!qJ9o@9FI+THX!q-V>BfCmitXgANh0GN7a8wVkR;y6jCWQS zZoFgW!yS_N>V}v4(@k4mb!M8jWE!?)shJ7!eI@j{r z#;@1BzWeKqUp_d!n|WD1m2TLVsoFnPLjG58R<5_;v9)sg0G;N3ApbOXh;ZmMck#P` zW8L*M9O16TaAW#_$R+~FK@MSWY4uHKi>CLdq_*k(SiH?i>TgzRqj%F25WbL;meymq zF}*)8Goq;D-K!o;<9;g~&hDfC$#x8lzgpuS*d_hdwi3X<_PHOhOMl%~LU6nLffDJz zx=R4f&y~LyimN4@VK|mN8j?17GE>}TRz^4J z3tuVLSXxhF4Obp72}w8>$$sG5q_jCkQ$M>x?qjW5STpNO{Db!Xa3;DI>pD6?ekDDK zDb?_nragEpAfAd@iZ1P$?40{$dU9cDv>ohhSM-+PftGvAkXR>)w*&qfN}1!CD22Cz zBxFTzRY=?EJ$55)x1coMvTE6H5~~=wT6IfWdhEU8q(}Bc#^XNH!7;!HHELw=`&V+T 
zOUn@wPHl8tWq$w2J?zcK)K2ZG+ISMUfZkuO-Ht6Njn=MRu9W_L%jVSkrA|Ep@IChn zq&(O@$y`Embsmni8=Dv>!x851B5<$#z(4@r zYPe%I_+N=WJr*4#uWbCVwL)Asv2B;!%HwemG35yY0|Z71oFwoNz<_c;1rHH8PT*Yv zhXHbAK!o5-2DAqOgA52Fr?}O)(@Ix{Yy71u8-iAXv;rd*CL6L2K;Ot7N^Un0ehGiF z)+ZK$QLHh zwtQ8ds|luRf?wONN{3c8=R3*dTU|XUYij<|MmIVL&`i&E!v?Did%4L}L3xV6(*#Zv zm?ZEFfhP%^BCw=Lh&K%Dh%Zy-uK)xq3}Pahj=4L|U!%}R39!EXG=*4KGg;ARDf%%2 ztZSd8&~pSnPT-RO>?ju16jT4hMMjm(@+POPvjG!`LvNd%C7E9-4Y`#MtY$5J~UNCyw69(*(%e2k*O z$I`yX=G^LU7AZK7kwwoN!WouRWD^0T(>veFx@%w9m2y9l+ID}s^U#|o(w&b`tn-ny z?~ysT`kQq>a%LBgo!5gImQQ37fs5`(u6lvRe4i9aod$w#&48bsVGy8ch8GWvCPrd|xjVZcDCDLX zdr`HiDF$=hb@O+W$k@xd+#y$aN^_@B?4$E=NM8-j^HAYKwWF%=g}5qCyGm+r*y6g5 z2@30TEUp_qfEL#c1`(_2@>Uls%8f#Jw@Pj@2=BlY@yk$`sUXx;c-M@@=k!Rk!Ol3K z+I|@^aZE{!;GpFsaG1MQ7e>Z(Sl_qt{6cn^1SlW9?#fk!%C}G~YwhDZO&s|uHl}xE z)jv$tzea#<3i$?wh6qFf2KcF00gP?tMHV|v3k<~)J4aThq}8d~)oCxz!&QH?(rPmF)e{iDkdv0y zW4M9hdASiqC4A?|H7EMwjf%3ps}|WgvM4QM4sdS%&Ji*tdyllNb{Se$cf+)-p3$;~ z99q`+0iX#F=x(E4pyaTQF^`fuVvNjFB&)~2MjsoJ$^@7g&@{mn{iiJ<8T z2w%uaOY1S*K=HiXh@uis(3HKzJ)%*>CD^j}dlTHPCS};1RY#;QE@9)qbf|=BarG0F zA!#s>0heeG5x`c8dUld%gYhI$Lm6thyvwLvT{frC-Ux=NR3?T=KiyO4rww%pErb4O zJXPr03=(@VOmS?d7@NWOPlbx_M1d8aARS=e-*WUhgT_MAo@=I2H-5j|dQ1@4BSIUc zm9|(+^xb;h=4_#`z|{0Ed7Gondqk9i&Iy~}ROZwpx62*k_`I@Xg}1fP$>g^6*r#)t z)6U6*Z1u!8CJ?)&ZGwx8J{5tt##nrCbYcjy%jhYbIuRRYo8y#dWbg#ij-L(~n6rXu z{bj=U%Y6Xvg?UwWgGMlFuLLQ?==um%dV)ZV0PT4jP{I_95MV4Utx9zk7OUo6ln^Ii zCqOn}23cS3R*5{(-?HXk`d+%{sM==7hORHnuPY+nz9Q>3JmN8hLm5xuuM#p3EB*WBfU6^}#?};RnQH5kCJxY|h z&CZsHB=}C{X&mkUmEMmDz_%cjmB!&MguInzixSCf@mOU1#AxieLbG$$KN1}e4^PlW zpr@iZIvpqqHz`RzkZ71^i-%)Rk54F3h2}1WD5BCsfYcFu Date: Wed, 29 Oct 2025 11:02:04 -0500 Subject: [PATCH 056/149] removing plottingh --- .github/workflows/collect-results.yml | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index 14c499c0d..1afe9f049 100644 --- a/.github/workflows/collect-results.yml +++ 
b/.github/workflows/collect-results.yml @@ -35,16 +35,3 @@ jobs: with: name: results_${{ inputs.exp-name }} path: agg_${{ inputs.exp-name }}.json - - - name: Plot performance - run: | - pip install -q matplotlib - python3 utils/plot_perf.py results/ ${{ inputs.exp-name }} - - - name: Upload performance graphs - uses: actions/upload-artifact@v4 - with: - name: graphs_${{ inputs.exp-name }} - path: | - tput_vs_intvty_*_${{ inputs.exp-name }}.png - tput_vs_e2el_*_${{ inputs.exp-name }}.png From 7b2acaac58bcec2c958ec44879c3fb5135415816 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 13:40:06 -0500 Subject: [PATCH 057/149] removing plottingh --- .github/workflows/1k8k-sweep.yml | 12 +- .github/workflows/8k1k-sweep.yml | 12 +- .github/workflows/benchmark-tmpl.yml | 14 +- .github/workflows/test.yml | 4 +- utils/matrix-logic/generate_sweep_configs.py | 132 +++++++++++++++++-- 5 files changed, 142 insertions(+), 32 deletions(-) diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index cced99997..68fbac028 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -70,8 +70,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-dsr1: @@ -94,8 +94,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-gptoss: @@ -118,8 +118,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} 
- dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} collect-70b-results: diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index 58c676b56..7be91c4fb 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -70,8 +70,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-dsr1: @@ -94,8 +94,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-gptoss: @@ -118,8 +118,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} collect-70b-results: diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 4f8468a82..4fb327381 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -26,27 +26,25 @@ on: osl: required: true type: string - random-range-ratio: - required: false - type: string - default: '0.2' tp: required: true type: string ep: - required: false + required: true type: string - default: '1' dp-attn: - required: false + required: true type: boolean - default: false max-model-len: required: true type: string conc: required: true type: string + random-range-ratio: 
+ required: false + type: string + default: '0.2' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e56fc9a82..3d4fd2c5f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -46,8 +46,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} calc-success-rate: diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 1c3472eb8..7ae81789e 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -151,8 +151,8 @@ def generate_full_sweep(args, all_config_data): 'conc': conc, 'model-code': model_code, 'max-model-len': isl + osl, - 'ep': 1, # Default - 'dp-attn': False, # Default + 'ep': 1, # Default + 'dp-attn': False, # Default } # Add optional fields if they exist @@ -177,10 +177,26 @@ def generate_test_config(args, all_config_data): Assumes all_config_data has been validated by validate_config_structure(). """ + try: + with open(args.runner_config, 'r') as f: + runner_config = yaml.safe_load(f) + except FileNotFoundError as e: + raise ValueError( + f"Runner config file '{args.runner_config}' does not exist.") + # Extract model code from config key model_code = args.key.split('-')[0] - val = all_config_data[args.key] + val = all_config_data.get(args.key) + + if not val: + raise ValueError( + f"Specified key '{args.key}' does not exist in config files.") + + runner_nodes = runner_config.get(val['runner'], []) + if args.runner_node not in runner_nodes: + raise ValueError( + f"Runner node '{args.runner_node}' is not compatible with config '{args.key}' which runs on runner type '{val['runner']}'. 
Available runner nodes for this config are '{', '.join(runner_nodes)}'.") seq_len_configs = val['seq-len-configs'] image = val['image'] @@ -282,7 +298,8 @@ def generate_runner_model_sweep_config(args, all_config_data): with open(args.runner_config, 'r') as f: runner_config = yaml.safe_load(f) except FileNotFoundError as e: - raise ValueError(f"Runner config file '{args.runner_config}' does not exist.") + raise ValueError( + f"Runner config file '{args.runner_config}' does not exist.") runner_nodes = runner_config.get(args.runner_type) @@ -344,6 +361,46 @@ def generate_runner_model_sweep_config(args, all_config_data): return matrix_values +def generate_custom_test(args): + """Generate single 1k1k job for custom inputs. + """ + try: + with open(args.runner_config, 'r') as f: + runner_config = yaml.safe_load(f) + except FileNotFoundError as e: + raise ValueError( + f"Runner config file '{args.runner_config}' does not exist.") + + found_runner_label = False + for runner_type, runner_nodes in runner_config.items(): + if args.runner_label == runner_type or args.runner_label in runner_nodes: + found_runner_label = True + + if not found_runner_label: + raise ValueError(f"Unable to find specified runner label '{args.runner_label}'.") + + if not runner_nodes: + raise ValueError( + f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") + + return [ + { + 'image': args.image, + 'model': args.model, + 'precision': args.precision, + 'framework': args.framework, + 'runner': args.runner_label, + # Again, just use 1k1k since this is just meant to smoke test all runners + 'isl': 1024, + 'osl': 1024, + 'tp': 8, + 'conc': 4, + 'model-code': args.model, + 'max-model-len': 2048, + } + ] + + def generate_runner_sweep_config(args, all_config_data): """Generate runner sweep configurations. 
@@ -353,8 +410,8 @@ def generate_runner_sweep_config(args, all_config_data): with open(args.runner_config, 'r') as f: runner_config = yaml.safe_load(f) except FileNotFoundError as e: - raise ValueError(f"Runner config file '{args.runner_config}' does not exist.") - + raise ValueError( + f"Runner config file '{args.runner_config}' does not exist.") matrix_values = [] for key, val in all_config_data.items(): @@ -369,7 +426,7 @@ def generate_runner_sweep_config(args, all_config_data): # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name # so that it can be bubbled down to bash script benchmarks... this is probably a FIXME model_code = key.split('-')[0] - + runner_nodes = runner_config.get(val['runner']) if not runner_nodes: raise ValueError( @@ -510,7 +567,12 @@ def main(): 'test-config', parents=[parent_parser], add_help=False, - help='Generate test configurations for a specific key' + help='Given a config key, run that configuration as specified. Optionally specify --test-mode to only run one parallelism-concurrency pair for the config.' + ) + test_config_parser.add_argument( + '--runner-config', + required=True, + help='Configuration file holding runner information' ) test_config_parser.add_argument( '--key', @@ -551,7 +613,7 @@ def main(): 'runner-model-sweep', parents=[parent_parser], add_help=False, - help='Sweep across all runner nodes and all compatible models for a given runner' + help='Given a runner type, find all configurations matching the type, and run that configuration on all individual runner nodes for the specified runner type. This is meant to validate that all runner nodes work on all configurations for a runner type. For instance, to validate that all configs that specify an h200 runner successfully run across all h200 runner nodes.' 
) test_config_parser.add_argument( '--runner-type', @@ -574,7 +636,7 @@ def main(): 'runner-sweep', parents=[parent_parser], add_help=False, - help='For a given model, run configurations on all compatible runners' + help='Given a model (and optionally a precision and framework), find all configurations matching the inputs, and run those configurations across all compatible runner nodes. This is meant to validate all runner nodes that should run a particular model can. For instance, this should be used to validate that all runners nodes that should run gptoss-120b actually do so successfully.' ) test_config_parser.add_argument( '--model-prefix', @@ -602,6 +664,54 @@ def main(): help='Show this help message and exit' ) + # Subcommand: custom + test_config_parser = subparsers.add_parser( + 'custom', + parents=[parent_parser], + add_help=False, + help='Enter custom values' + ) + test_config_parser.add_argument( + '--runner-label', + required=True, + help='Label associated with runner on which to launch the corresponding job (e.g., h200, h200-nv_1, etc.)' + ) + test_config_parser.add_argument( + '--image', + required=True, + help='Image to run the benchmark (e.g., openai/gpt-oss-120b)' + ) + test_config_parser.add_argument( + '--model', + required=True, + help='Model to run (e.g., vllm/vllm-openai:latest)' + ) + test_config_parser.add_argument( + '--framework', + required=True, + help='Framework to run on (e.g., vllm, trt, sglang)' + ) + test_config_parser.add_argument( + '--precision', + required=True, + help='Precision to run (e.g., fp4, fp8)' + ) + test_config_parser.add_argument( + '--exp-name', + required=True, + help='Experiment name (e.g., 70b_test)' + ) + test_config_parser.add_argument( + '--runner-config', + required=True, + help='Configuration file holding runner information' + ) + test_config_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + args = parser.parse_args() # Load and validate configuration 
files @@ -619,6 +729,8 @@ def main(): elif args.command == 'runner-sweep': matrix_values = generate_runner_sweep_config( args, all_config_data) + elif args.command == 'custom': + matrix_values = generate_custom_test(args) else: parser.error(f"Unknown command: {args.command}") From ad18b5112b873d45cc893cef786c10fa1b7ea0f9 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 13:43:08 -0500 Subject: [PATCH 058/149] removing plottingh --- utils/matrix-logic/generate_sweep_configs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 7ae81789e..fd49b7b55 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -394,6 +394,8 @@ def generate_custom_test(args): 'isl': 1024, 'osl': 1024, 'tp': 8, + 'ep': 1, + 'dp-attn': False, 'conc': 4, 'model-code': args.model, 'max-model-len': 2048, From 165bde31694fc8a027bb06a31db02b91b744fc11 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 13:45:00 -0500 Subject: [PATCH 059/149] removing plottingh --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3d4fd2c5f..9536d6db1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -36,7 +36,7 @@ jobs: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit with: - exp-name: ${{ matrix.config.model-code }}_test_${{ matrix.config.isl }}_${{ matrix.config.osl }} + exp-name: ${{ matrix.config.exp-name }} isl: ${{ matrix.config.isl }} osl: ${{ matrix.config.osl }} max-model-len: ${{ matrix.config.max-model-len }} From 52153c7edb600b6fa328832e7f2dbaf6b6233077 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 13:46:17 -0500 Subject: [PATCH 060/149] removing plottingh --- utils/matrix-logic/generate_sweep_configs.py | 1 + 1 file changed, 
1 insertion(+) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index fd49b7b55..8db70fa8b 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -398,6 +398,7 @@ def generate_custom_test(args): 'dp-attn': False, 'conc': 4, 'model-code': args.model, + 'exp-name': args.exp_name, 'max-model-len': 2048, } ] From db05e34556554251623978077e1d141f519df52b Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 13:49:46 -0500 Subject: [PATCH 061/149] removing plotting python script --- utils/plot_perf.py | 197 --------------------------------------------- 1 file changed, 197 deletions(-) delete mode 100644 utils/plot_perf.py diff --git a/utils/plot_perf.py b/utils/plot_perf.py deleted file mode 100644 index 1cab81cdc..000000000 --- a/utils/plot_perf.py +++ /dev/null @@ -1,197 +0,0 @@ -import sys -import json -from pathlib import Path -import matplotlib.pyplot as plt - - -results_dir = Path(sys.argv[1]) -exp_name = sys.argv[2] -hw_color = { - 'h100': 'lightgreen', - 'h200': 'green', # H200 VLLM - 'h200-trt': 'darkgreen', # H200 TRT-LLM - 'b200': 'black', # B200 VLLM - 'b200-trt': 'gray', # B200 TRT-LLM - 'mi300x': 'pink', - 'mi325x': 'red', - 'mi355x': 'purple', - 'gb200': 'orange', # GB200 TRT-LLM and SGlang -} - -results = [] -for result_path in results_dir.rglob(f'*.json'): - with open(result_path) as f: - result = json.load(f) - results.append(result) - - -def plot_tput_vs_e2el(precision_filter=None): - fig, ax = plt.subplots() - - # Filter results by precision if specified - filtered_results = results - if precision_filter is not None: - filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter] - - for hw_label, color in hw_color.items(): - # Separate fp8 and fp4 results for this hardware - fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] - fp4_results = [r 
for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] - - # Plot fp8 results with circles - if fp8_results: - xs_fp8 = [r['median_e2el'] for r in fp8_results] - ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] - ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) - - # Plot fp4 results with squares - if fp4_results: - xs_fp4 = [r['median_e2el'] for r in fp4_results] - ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] - ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) - - for result in filtered_results: - x, y = result['median_e2el'], result['tput_per_gpu'] - ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) - - ax.set_xlabel('End-to-end Latency (s)') - ax.set_ylabel('Throughput per GPU (tok/s)') - ax.legend(title='GPU Type') - fig.tight_layout() - - precision_suffix = f"_{precision_filter}" if precision_filter else "" - fig.savefig(f'tput_vs_e2el_{exp_name}{precision_suffix}.png', bbox_inches='tight') - plt.close(fig) - - -def plot_tput_vs_intvty(precision_filter=None): - fig, ax = plt.subplots() - - # Filter results by precision if specified - filtered_results = results - if precision_filter is not None: - filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter] - - for hw_label, color in hw_color.items(): - # Separate fp8 and fp4 results for this hardware - fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] - fp4_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] - - # Plot fp8 results with circles - if fp8_results: - xs_fp8 = [r['median_intvty'] for r in fp8_results] - ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] - ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) - - # Plot fp4 results with squares - if 
fp4_results: - xs_fp4 = [r['median_intvty'] for r in fp4_results] - ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] - ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) - - for result in filtered_results: - x, y = result['median_intvty'], result['tput_per_gpu'] - ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) - - ax.set_xlabel('Interactivity (tok/s/user)') - ax.set_ylabel('Throughput per GPU (tok/s)') - ax.legend(title='GPU Type') - fig.tight_layout() - - precision_suffix = f"_{precision_filter}" if precision_filter else "" - fig.savefig(f'tput_vs_intvty_{exp_name}{precision_suffix}.png', bbox_inches='tight') - plt.close(fig) - - -def plot_tput_vs_e2el_for_model(model_results, model_name): - fig, ax = plt.subplots() - - for hw_label, color in hw_color.items(): - # Separate fp8 and fp4 results for this hardware - fp8_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] - fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] - - # Plot fp8 results with circles - if fp8_results: - xs_fp8 = [r['median_e2el'] for r in fp8_results] - ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] - ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) - - # Plot fp4 results with squares - if fp4_results: - xs_fp4 = [r['median_e2el'] for r in fp4_results] - ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] - ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) - - for result in model_results: - x, y = result['median_e2el'], result['tput_per_gpu'] - ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) - - ax.set_xlabel('End-to-end Latency (s)') - ax.set_ylabel('Throughput per GPU (tok/s)') - ax.legend(title='Hardware + Framework') - ax.set_title(f'{model_name} - 
All Frameworks') - fig.tight_layout() - - # Extract model identifier from model name - model_id = model_name.split('/')[-1].split('-')[0] if '/' in model_name else model_name - fig.savefig(f'tput_vs_e2el_{model_id}_{exp_name}.png', bbox_inches='tight') - plt.close(fig) - - -def plot_tput_vs_intvty_for_model(model_results, model_name): - fig, ax = plt.subplots() - - for hw_label, color in hw_color.items(): - # Separate fp8 and fp4 results for this hardware - fp8_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] - fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] - - # Plot fp8 results with circles - if fp8_results: - xs_fp8 = [r['median_intvty'] for r in fp8_results] - ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] - ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) - - # Plot fp4 results with squares - if fp4_results: - xs_fp4 = [r['median_intvty'] for r in fp4_results] - ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] - ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) - - for result in model_results: - x, y = result['median_intvty'], result['tput_per_gpu'] - ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) - - ax.set_xlabel('Interactivity (tok/s/user)') - ax.set_ylabel('Throughput per GPU (tok/s)') - ax.legend(title='Hardware + Framework') - ax.set_title(f'{model_name} - All Frameworks') - fig.tight_layout() - - # Extract model identifier from model name - model_id = model_name.split('/')[-1].split('-')[0] if '/' in model_name else model_name - fig.savefig(f'tput_vs_intvty_{model_id}_{exp_name}.png', bbox_inches='tight') - plt.close(fig) - - -# Create one plot per model showing all frameworks and hardware -# Group results by model family (70b, dsr1, etc.) 
instead of full model name -def get_model_family(model_name): - if '70b' in model_name.lower() or 'llama-3.3-70b' in model_name.lower(): - return '70b' - elif 'dsr1' in model_name.lower() or 'deepseek-r1' in model_name.lower(): - return 'dsr1' - else: - # Fallback to first part of model name - return model_name.split('/')[-1].split('-')[0] if '/' in model_name else model_name - -model_families = set(get_model_family(r.get('model', 'unknown')) for r in results) - -for model_family in model_families: - # Filter results for this model family - model_results = [r for r in results if get_model_family(r.get('model', 'unknown')) == model_family] - - # Create plots for this model family - plot_tput_vs_e2el_for_model(model_results, model_family) - plot_tput_vs_intvty_for_model(model_results, model_family) From efdfcaf14e28e7614c56e7cc4325d2db7ddec182 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 14:08:19 -0500 Subject: [PATCH 062/149] bmk-space -> search-space --- .github/configs/amd-master.yaml | 66 ++++++------ .github/configs/nvidia-master.yaml | 108 +++++++++---------- utils/matrix-logic/generate_sweep_configs.py | 78 +++++++++++--- 3 files changed, 148 insertions(+), 104 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a501ead63..81c436366 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -7,21 +7,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 4, 
conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -36,21 +36,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 64, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -65,21 +65,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 32, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 64, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 16, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 32 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -94,21 +94,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } 
@@ -123,16 +123,16 @@ dsr1-fp4-mi355x-sgl: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-mi300x-sgl: @@ -144,15 +144,15 @@ dsr1-fp8-mi300x-sgl: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-mi325x-sgl: @@ -164,15 +164,15 @@ dsr1-fp8-mi325x-sgl: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-mi355x-sgl: @@ -184,15 +184,15 @@ dsr1-fp8-mi355x-sgl: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } gptoss-fp4-mi300x-vllm: @@ -204,21 +204,21 @@ gptoss-fp4-mi300x-vllm: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 - 
bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -233,21 +233,21 @@ gptoss-fp4-mi325x-vllm: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 64, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 8 } - { tp: 4, conc-start: 4, conc-end: 8 } @@ -262,19 +262,19 @@ gptoss-fp4-mi355x-vllm: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 16 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 16 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 16 } - { tp: 8, conc-start: 4, conc-end: 16 } diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 5c006dc91..fe9ef989d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7,21 +7,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 128, conc-end: 128 } - { tp: 2, conc-start: 64, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 128, conc-end: 128 } - { tp: 2, conc-start: 64, conc-end: 128 } - { 
tp: 4, conc-start: 16, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 32 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 128 } - { tp: 2, conc-start: 16, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -36,21 +36,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 32 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 16, conc-end: 64 } - { tp: 2, conc-start: 16, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 32 } @@ -65,21 +65,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 128, conc-end: 128 } - { tp: 2, conc-start: 64, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 32 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 128, conc-end: 128 } - { tp: 2, conc-start: 64, conc-end: 128 } - { tp: 4, conc-start: 16, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 32 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 128 } - { tp: 2, conc-start: 16, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 128 } @@ -94,21 +94,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 64, conc-end: 64 } - { tp: 4, conc-start: 16, conc-end: 64 } - { 
tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 16, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -123,19 +123,19 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 2, conc-start: 64, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 2, conc-start: 64, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } @@ -149,21 +149,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 128, conc-end: 128 } - { tp: 2, conc-start: 64, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 32 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 128, conc-end: 128 } - { tp: 2, conc-start: 64, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 32 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 16, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 32 } @@ -177,21 +177,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 64, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 16, conc-end: 64 } 
- { tp: 2, conc-start: 16, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -206,17 +206,17 @@ dsr1-fp4-b200-sgl: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 16 } @@ -229,7 +229,7 @@ dsr1-fp4-b200-trt: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: # If TP=4, # If CONC > 32, then EP=4 # If CONC >= 256, DP_ATTN=true @@ -244,7 +244,7 @@ dsr1-fp4-b200-trt: - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } - isl: 1024 osl: 8192 - bmk-space: + search-space: # If TP=4, # If CONC > 32, then EP=4 # If CONC >= 256, DP_ATTN=true @@ -259,7 +259,7 @@ dsr1-fp4-b200-trt: - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } - isl: 8192 osl: 1024 - bmk-space: + search-space: # If TP=4, # If CONC > 32, then EP=4 and DP_ATTN=true - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } @@ -278,15 +278,15 @@ dsr1-fp8-b200-sgl: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-b200-trt: @@ -299,18 +299,18 @@ dsr1-fp8-b200-trt: # For all sequence lengths, EP=TP - isl: 1024 osl: 1024 - bmk-space: + search-space: # If CONC > 32, then DP_ATTN=true - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: # If CONC > 64, then DP_ATTN=true - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + 
search-space: # If CONC > 64, then DP_ATTN=true - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } @@ -323,15 +323,15 @@ dsr1-fp8-h200-sgl: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-h200-trt: @@ -345,17 +345,17 @@ dsr1-fp8-h200-trt: - isl: 1024 osl: 1024 # If CONC > 64, then DP_ATTN=true - bmk-space: + search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 # If CONC > 64, then DP_ATTN=true - bmk-space: + search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 # If CONC > 32, then DP_ATTN=true - bmk-space: + search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } @@ -369,21 +369,21 @@ gptoss-fp4-b200-trt: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -398,21 +398,21 @@ gptoss-fp4-b200-vllm: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, 
conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -427,19 +427,19 @@ gptoss-fp4-h100-vllm: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 32 } @@ -453,21 +453,21 @@ gptoss-fp4-h200-trt: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, ep: 1, conc-start: 32, conc-end: 64 } - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } @@ -482,21 +482,21 @@ gptoss-fp4-h200-vllm: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 16 } - { tp: 2, conc-start: 
4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 8db70fa8b..252c87bf9 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -1,6 +1,8 @@ import json import yaml import argparse +from pydantic import BaseModel, Field, ValidationError, ConfigDict +from typing import List seq_len_stoi = { "1k1k": (1024, 1024), @@ -9,6 +11,39 @@ } +class MatrixEntry(BaseModel): + """Pydantic model for validating matrix entry structure.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + precision: str + framework: str + runner: str + isl: int + osl: int + tp: int + ep: int + dp_attn: bool = Field(alias='dp-attn') + conc: int + max_model_len: int = Field(alias='max-model-len') + exp_name: str = Field(alias='exp-name') + + +def validate_matrix_output(matrix_values: List[dict]) -> List[dict]: + """Validate that matrix_values entries match the expected structure. + + Raises ValueError if any entry fails validation. + Returns the original list if all entries are valid. + """ + for i, entry in enumerate(matrix_values): + try: + MatrixEntry(**entry) + except ValidationError as e: + raise ValueError(f"Matrix entry at index {i} failed validation:\n{e}") + return matrix_values + + def validate_master_configs_structure(all_config_data): """Validate the structure of all master config entries. 
@@ -57,12 +92,12 @@ def validate_master_configs_structure(all_config_data): raise ValueError( f"'osl' must be int in seq-len-config[{i}] for key '{key}'") - bmk_space = seq_config.get('bmk-space') + bmk_space = seq_config.get('search-space') if not bmk_space or not isinstance(bmk_space, list) or len(bmk_space) == 0: raise ValueError( - f"Missing or invalid 'bmk-space' in seq-len-config[{i}] for key '{key}'") + f"Missing or invalid 'search-space' in seq-len-config[{i}] for key '{key}'") - # Validate each benchmark in bmk-space + # Validate each benchmark in search-space for j, bmk in enumerate(bmk_space): # Define allowed fields allowed_fields = {'tp', 'conc-start', @@ -75,23 +110,23 @@ def validate_master_configs_structure(all_config_data): extra_fields = set(bmk.keys()) - allowed_fields if extra_fields: raise ValueError( - f"Extra fields {extra_fields} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + f"Extra fields {extra_fields} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") # Validate required fields for field, expected_type in required_bmk_fields.items(): if field not in bmk or bmk[field] is None: raise ValueError( - f"Missing '{field}' in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + f"Missing '{field}' in search-space[{j}] of seq-len-config[{i}] for key '{key}'") if not isinstance(bmk[field], expected_type): raise ValueError( - f"'{field}' must be {expected_type.__name__} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + f"'{field}' must be {expected_type.__name__} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") # Validate optional fields if they exist for field, expected_type in optional_bmk_fields.items(): if field in bmk and bmk[field] is not None: if not isinstance(bmk[field], expected_type): raise ValueError( - f"'{field}' must be {expected_type.__name__} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + f"'{field}' must be {expected_type.__name__} in search-space[{j}] of 
seq-len-config[{i}] for key '{key}'") def generate_full_sweep(args, all_config_data): @@ -127,7 +162,7 @@ def generate_full_sweep(args, all_config_data): if not matching_seq_config: continue # Skip this config if no matching sequence length - bmk_space = matching_seq_config['bmk-space'] + bmk_space = matching_seq_config['search-space'] for bmk in bmk_space: tp = bmk['tp'] @@ -149,10 +184,10 @@ def generate_full_sweep(args, all_config_data): 'osl': osl, 'tp': tp, 'conc': conc, - 'model-code': model_code, 'max-model-len': isl + osl, 'ep': 1, # Default 'dp-attn': False, # Default + 'exp-name': f"{model_code}_test", } # Add optional fields if they exist @@ -222,7 +257,7 @@ def generate_test_config(args, all_config_data): if seq_lens_filter and (isl, osl) not in seq_lens_filter: continue - bmk_space = seq_config['bmk-space'] + bmk_space = seq_config['search-space'] for bmk in bmk_space: tp = bmk['tp'] @@ -236,15 +271,17 @@ def generate_test_config(args, all_config_data): entry = { 'image': image, 'model': model, - 'model-code': model_code, 'precision': precision, 'framework': framework, 'runner': runner, 'isl': isl, 'osl': osl, 'tp': tp, + 'ep': 1, # Default, + 'dp-attn': False, # Default 'conc': conc_start, 'max-model-len': isl + osl, + 'exp-name': f"{model_code}_test", } # Add optional fields if they exist @@ -261,13 +298,14 @@ def generate_test_config(args, all_config_data): entry = { 'image': image, 'model': model, - 'model-code': model_code, 'precision': precision, 'framework': framework, 'runner': runner, 'isl': isl, 'osl': osl, 'tp': tp, + 'ep': 1, # Default, + 'dp-attn': False, # Default 'conc': conc, 'max-model-len': isl + osl, } @@ -324,7 +362,7 @@ def generate_runner_model_sweep_config(args, all_config_data): target_config = config break - highest_tp_bmk = max(target_config['bmk-space'], key=lambda x: x['tp']) + highest_tp_bmk = max(target_config['search-space'], key=lambda x: x['tp']) # Since we are just testing, pick the highest TP for this config and just 
test # on that TP with the lowest concurrency available highest_tp = highest_tp_bmk['tp'] @@ -345,9 +383,11 @@ def generate_runner_model_sweep_config(args, all_config_data): 'isl': 1024, 'osl': 1024, 'tp': highest_tp, + 'ep': 1, # Default, + 'dp-attn': False, # Default 'conc': lowest_conc, - 'model-code': model_code, 'max-model-len': 2048, + 'exp-name': f"{model_code}_test", } # Add optional fields if they exist @@ -397,7 +437,6 @@ def generate_custom_test(args): 'ep': 1, 'dp-attn': False, 'conc': 4, - 'model-code': args.model, 'exp-name': args.exp_name, 'max-model-len': 2048, } @@ -442,7 +481,7 @@ def generate_runner_sweep_config(args, all_config_data): target_config = config break - highest_tp_bmk = max(target_config['bmk-space'], key=lambda x: x['tp']) + highest_tp_bmk = max(target_config['search-space'], key=lambda x: x['tp']) # Since we are just testing, pick the highest TP for this config and just test # on that TP with the lowest concurrency available highest_tp = highest_tp_bmk['tp'] @@ -463,8 +502,10 @@ def generate_runner_sweep_config(args, all_config_data): 'isl': 1024, 'osl': 1024, 'tp': highest_tp, + 'ep': 1, # Default, + 'dp-attn': False, # Default 'conc': lowest_conc, - 'model-code': model_code, + 'exp-name': f"{model_code}_test", 'max-model-len': 2048, } @@ -737,6 +778,9 @@ def main(): else: parser.error(f"Unknown command: {args.command}") + # Validate output before printing + validate_matrix_output(matrix_values) + print(json.dumps(matrix_values)) return matrix_values From 15eead5a1998950e038261596e9fe9f11b5b4b89 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 14:10:32 -0500 Subject: [PATCH 063/149] updating exp name for full sweep --- utils/matrix-logic/generate_sweep_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 252c87bf9..ed68a9633 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ 
b/utils/matrix-logic/generate_sweep_configs.py @@ -187,7 +187,7 @@ def generate_full_sweep(args, all_config_data): 'max-model-len': isl + osl, 'ep': 1, # Default 'dp-attn': False, # Default - 'exp-name': f"{model_code}_test", + 'exp-name': f"{model_code}_{isl}_{osl}_sweep", } # Add optional fields if they exist From 6bbc02859d119c99887cd3e5281264b0e2cb3db5 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 14:12:55 -0500 Subject: [PATCH 064/149] pip install pydantic --- .github/workflows/1k1k-sweep.yml | 3 +++ .github/workflows/test.yml | 1 + 2 files changed, 4 insertions(+) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index dd8ae9f9c..958fd73b9 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -21,6 +21,7 @@ jobs: - id: get-70b-configs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix 70b) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT @@ -34,6 +35,7 @@ jobs: - id: get-dsr1-configs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT @@ -47,6 +49,7 @@ jobs: - id: get-gptoss-configs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT diff --git 
a/.github/workflows/test.yml b/.github/workflows/test.yml index 9536d6db1..78b9b1f5e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -23,6 +23,7 @@ jobs: - id: get-jobs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py ${{ inputs.generate-cli-command }}) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT From b84fffe77c8305116f52f6c50e8f88e32293c1d0 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 14:31:12 -0500 Subject: [PATCH 065/149] add filtered sweep --- utils/matrix-logic/generate_sweep_configs.py | 192 ++++++++++++++++++- 1 file changed, 191 insertions(+), 1 deletion(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index ed68a9633..bf8ccc065 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -184,7 +184,7 @@ def generate_full_sweep(args, all_config_data): 'osl': osl, 'tp': tp, 'conc': conc, - 'max-model-len': isl + osl, + 'max-model-len': isl + osl + 200, 'ep': 1, # Default 'dp-attn': False, # Default 'exp-name': f"{model_code}_{isl}_{osl}_sweep", @@ -207,6 +207,143 @@ def generate_full_sweep(args, all_config_data): return matrix_values +def generate_filtered_sweep(args, all_config_data): + """Generate sweep configurations with filtering options. + + Allows filtering by model prefix, precision, framework, runner type, and sequence lengths. + Supports test mode to only run highest TP with lowest concurrency. + + Assumes all_config_data has been validated by validate_config_structure(). 
+ """ + matrix_values = [] + + # Convert seq-lens to set of (isl, osl) tuples for filtering + seq_lens_filter = None + if args.seq_lens: + seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} + + for key, val in all_config_data.items(): + # Filter by model prefix if specified + if args.model_prefix and not key.startswith(args.model_prefix): + continue + + # Filter by precision if specified + if args.precision and val['precision'] != args.precision: + continue + + # Filter by framework if specified + if args.framework and val['framework'] != args.framework: + continue + + # Filter by runner type if specified + if args.runner_type and val['runner'] != args.runner_type: + continue + + seq_len_configs = val['seq-len-configs'] + image = val['image'] + model = val['model'] + precision = val['precision'] + framework = val['framework'] + runner = val['runner'] + model_code = key.split('-')[0] + + for seq_config in seq_len_configs: + isl = seq_config['isl'] + osl = seq_config['osl'] + + # Filter by sequence lengths if specified + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + + bmk_space = seq_config['search-space'] + + if args.test_mode: + # In test mode, use highest TP with lowest concurrency + highest_tp_bmk = max(bmk_space, key=lambda x: x['tp']) + tp = highest_tp_bmk['tp'] + conc = highest_tp_bmk['conc-start'] + ep = highest_tp_bmk.get('ep') + dp_attn = highest_tp_bmk.get('dp-attn') + + entry = { + 'image': image, + 'model': model, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'ep': 1, # Default + 'dp-attn': False, # Default + 'conc': conc, + 'max-model-len': isl + osl + 200, + 'exp-name': f"{model_code}_{isl}_{osl}_test", + } + + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + else: + # Full sweep mode + for bmk in bmk_space: + tp = bmk['tp'] + conc_start = bmk['conc-start'] + conc_end = 
bmk['conc-end'] + ep = bmk.get('ep') + dp_attn = bmk.get('dp-attn') + + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc, + 'max-model-len': isl + osl + 200, + 'ep': 1, # Default + 'dp-attn': False, # Default + 'exp-name': f"{model_code}_{isl}_{osl}_sweep", + } + + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + if len(matrix_values) == 0: + error_msg = "No configs found matching filters:" + if args.model_prefix: + error_msg += f" model-prefix='{args.model_prefix}'" + if args.precision: + error_msg += f" precision='{args.precision}'" + if args.framework: + error_msg += f" framework='{args.framework}'" + if args.runner_type: + error_msg += f" runner-type='{args.runner_type}'" + if seq_lens_filter: + error_msg += f" seq-lens={list(args.seq_lens)}" + raise ValueError(error_msg) + + return matrix_values + + def generate_test_config(args, all_config_data): """Generate test configurations for a specific key. 
@@ -606,6 +743,57 @@ def main(): help='Show this help message and exit' ) + # Subcommand: filtered-sweep + filtered_sweep_parser = subparsers.add_parser( + 'filtered-sweep', + parents=[parent_parser], + add_help=False, + help='Generate sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths' + ) + filtered_sweep_parser.add_argument( + '--model-prefix', + required=False, + help='Model prefix to filter configurations (optional)' + ) + filtered_sweep_parser.add_argument( + '--precision', + required=False, + help='Precision to filter by (e.g., fp4, fp8) (optional)' + ) + filtered_sweep_parser.add_argument( + '--framework', + required=False, + help='Framework to filter by (e.g., vllm, trt, sglang) (optional)' + ) + filtered_sweep_parser.add_argument( + '--runner-type', + required=False, + help='Runner type to filter by (e.g., h200, h100) (optional)' + ) + filtered_sweep_parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." 
+ ) + filtered_sweep_parser.add_argument( + '--step-size', + type=int, + default=2, + help='Step size for concurrency values (default: 2)' + ) + filtered_sweep_parser.add_argument( + '--test-mode', + action='store_true', + help='Test mode: only run highest TP with lowest concurrency for each matching config' + ) + filtered_sweep_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + # Subcommand: test-config test_config_parser = subparsers.add_parser( 'test-config', @@ -765,6 +953,8 @@ def main(): # Route to appropriate function based on subcommand if args.command == 'full-sweep': matrix_values = generate_full_sweep(args, all_config_data) + elif args.command == 'filtered-sweep': + matrix_values = generate_filtered_sweep(args, all_config_data) elif args.command == 'test-config': matrix_values = generate_test_config(args, all_config_data) elif args.command == 'runner-model-sweep': From df8877dfbe8705ff63f17b49cb9a49837c205968 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 14:39:56 -0500 Subject: [PATCH 066/149] allow multiple filter values --- utils/matrix-logic/generate_sweep_configs.py | 58 +++++++++++++++----- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index bf8ccc065..9bfc2ac1f 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -215,6 +215,26 @@ def generate_filtered_sweep(args, all_config_data): Assumes all_config_data has been validated by validate_config_structure(). 
""" + # Validate runner types if specified + if args.runner_type: + if not args.runner_config: + raise ValueError( + "--runner-config is required when --runner-type is specified") + + try: + with open(args.runner_config, 'r') as f: + runner_config = yaml.safe_load(f) + except FileNotFoundError: + raise ValueError( + f"Runner config file '{args.runner_config}' does not exist.") + + valid_runner_types = set(runner_config.keys()) + invalid_runners = set(args.runner_type) - valid_runner_types + if invalid_runners: + raise ValueError( + f"Invalid runner type(s): {invalid_runners}. " + f"Valid runner types are: {', '.join(sorted(valid_runner_types))}") + matrix_values = [] # Convert seq-lens to set of (isl, osl) tuples for filtering @@ -224,19 +244,20 @@ def generate_filtered_sweep(args, all_config_data): for key, val in all_config_data.items(): # Filter by model prefix if specified - if args.model_prefix and not key.startswith(args.model_prefix): - continue + if args.model_prefix: + if not any(key.startswith(prefix) for prefix in args.model_prefix): + continue # Filter by precision if specified - if args.precision and val['precision'] != args.precision: + if args.precision and val['precision'] not in args.precision: continue # Filter by framework if specified - if args.framework and val['framework'] != args.framework: + if args.framework and val['framework'] not in args.framework: continue # Filter by runner type if specified - if args.runner_type and val['runner'] != args.runner_type: + if args.runner_type and val['runner'] not in args.runner_type: continue seq_len_configs = val['seq-len-configs'] @@ -330,15 +351,15 @@ def generate_filtered_sweep(args, all_config_data): if len(matrix_values) == 0: error_msg = "No configs found matching filters:" if args.model_prefix: - error_msg += f" model-prefix='{args.model_prefix}'" + error_msg += f" model-prefix={args.model_prefix}" if args.precision: - error_msg += f" precision='{args.precision}'" + error_msg += f" 
precision={args.precision}" if args.framework: - error_msg += f" framework='{args.framework}'" + error_msg += f" framework={args.framework}" if args.runner_type: - error_msg += f" runner-type='{args.runner_type}'" + error_msg += f" runner-type={args.runner_type}" if seq_lens_filter: - error_msg += f" seq-lens={list(args.seq_lens)}" + error_msg += f" seq-lens={args.seq_lens}" raise ValueError(error_msg) return matrix_values @@ -752,23 +773,32 @@ def main(): ) filtered_sweep_parser.add_argument( '--model-prefix', + nargs='+', required=False, - help='Model prefix to filter configurations (optional)' + help='Model prefix(es) to filter configurations (optional, can specify multiple)' ) filtered_sweep_parser.add_argument( '--precision', + nargs='+', required=False, - help='Precision to filter by (e.g., fp4, fp8) (optional)' + help='Precision(s) to filter by (e.g., fp4, fp8) (optional, can specify multiple)' ) filtered_sweep_parser.add_argument( '--framework', + nargs='+', required=False, - help='Framework to filter by (e.g., vllm, trt, sglang) (optional)' + help='Framework(s) to filter by (e.g., vllm, trt, sglang) (optional, can specify multiple)' ) filtered_sweep_parser.add_argument( '--runner-type', + nargs='+', + required=False, + help='Runner type(s) to filter by (e.g., h200, h100) (optional, can specify multiple)' + ) + filtered_sweep_parser.add_argument( + '--runner-config', required=False, - help='Runner type to filter by (e.g., h200, h100) (optional)' + help='Configuration file holding runner information (required if --runner-type is specified)' ) filtered_sweep_parser.add_argument( '--seq-lens', From b0aaf6a6a93efda49738d65e595a62bb61e8365b Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 14:45:02 -0500 Subject: [PATCH 067/149] reverse seq len mapping --- utils/matrix-logic/generate_sweep_configs.py | 21 +++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py 
b/utils/matrix-logic/generate_sweep_configs.py index 9bfc2ac1f..7f1b76490 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -10,6 +10,18 @@ "8k1k": (8192, 1024) } +# Reverse mapping for exp-name generation +seq_len_itos = {v: k for k, v in seq_len_stoi.items()} + + +def seq_len_to_str(isl: int, osl: int) -> str: + """Convert sequence lengths to short string representation. + + Returns the short name (e.g., '1k1k') if it exists in the mapping, + otherwise returns 'isl_osl' format. + """ + return seq_len_itos.get((isl, osl), f"{isl}_{osl}") + class MatrixEntry(BaseModel): """Pydantic model for validating matrix entry structure.""" @@ -174,6 +186,7 @@ def generate_full_sweep(args, all_config_data): # Generate entries for each concurrency value in the range conc = conc_start while conc <= conc_end: + seq_len_str = seq_len_to_str(isl, osl) entry = { 'image': image, 'model': model, @@ -187,7 +200,7 @@ def generate_full_sweep(args, all_config_data): 'max-model-len': isl + osl + 200, 'ep': 1, # Default 'dp-attn': False, # Default - 'exp-name': f"{model_code}_{isl}_{osl}_sweep", + 'exp-name': f"{model_code}_{seq_len_str}_sweep", } # Add optional fields if they exist @@ -286,6 +299,7 @@ def generate_filtered_sweep(args, all_config_data): ep = highest_tp_bmk.get('ep') dp_attn = highest_tp_bmk.get('dp-attn') + seq_len_str = seq_len_to_str(isl, osl) entry = { 'image': image, 'model': model, @@ -299,7 +313,7 @@ def generate_filtered_sweep(args, all_config_data): 'dp-attn': False, # Default 'conc': conc, 'max-model-len': isl + osl + 200, - 'exp-name': f"{model_code}_{isl}_{osl}_test", + 'exp-name': f"{model_code}_{seq_len_str}_test", } if ep is not None: @@ -319,6 +333,7 @@ def generate_filtered_sweep(args, all_config_data): conc = conc_start while conc <= conc_end: + seq_len_str = seq_len_to_str(isl, osl) entry = { 'image': image, 'model': model, @@ -332,7 +347,7 @@ def generate_filtered_sweep(args, all_config_data): 
'max-model-len': isl + osl + 200, 'ep': 1, # Default 'dp-attn': False, # Default - 'exp-name': f"{model_code}_{isl}_{osl}_sweep", + 'exp-name': f"{model_code}_{seq_len_str}_sweep", } if ep is not None: From de9e367123ce3017aa6a64e3db4917d220a9b422 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 14:47:43 -0500 Subject: [PATCH 068/149] less verbose --- utils/matrix-logic/generate_sweep_configs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 7f1b76490..6d092eac8 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -200,7 +200,7 @@ def generate_full_sweep(args, all_config_data): 'max-model-len': isl + osl + 200, 'ep': 1, # Default 'dp-attn': False, # Default - 'exp-name': f"{model_code}_{seq_len_str}_sweep", + 'exp-name': f"{model_code}_{seq_len_str}", } # Add optional fields if they exist @@ -313,7 +313,7 @@ def generate_filtered_sweep(args, all_config_data): 'dp-attn': False, # Default 'conc': conc, 'max-model-len': isl + osl + 200, - 'exp-name': f"{model_code}_{seq_len_str}_test", + 'exp-name': f"{model_code}_{seq_len_str}", } if ep is not None: @@ -347,7 +347,7 @@ def generate_filtered_sweep(args, all_config_data): 'max-model-len': isl + osl + 200, 'ep': 1, # Default 'dp-attn': False, # Default - 'exp-name': f"{model_code}_{seq_len_str}_sweep", + 'exp-name': f"{model_code}_{seq_len_str}", } if ep is not None: From 6df2657dfaa2f81f161fcf577c55b4af3375b483 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 17:05:27 -0500 Subject: [PATCH 069/149] deleting files --- .github/workflows/{test.yml => e2e-tests.yml} | 2 +- .../workflows/full-sweep-1k1k-scheduler.yml | 59 - .../workflows/full-sweep-1k8k-scheduler.yml | 59 - .../workflows/full-sweep-8k1k-scheduler.yml | 59 - .github/workflows/full-sweep-test.yml | 89 - 
.github/workflows/full-sweep-tmpl.yml | 263 --- .github/workflows/runner-model-sweep-test.yml | 300 ---- .github/workflows/runner-sweep-test.yml | 333 ---- .github/workflows/runner-test.yml | 136 -- .gitignore | 2 + utils/matrix-logic/generate_sweep_configs.py | 135 +- utils/matrix-logic/pytest.ini | 12 + .../test_generate_sweep_configs.py | 1545 +++++++++++++++++ 13 files changed, 1573 insertions(+), 1421 deletions(-) rename .github/workflows/{test.yml => e2e-tests.yml} (99%) delete mode 100644 .github/workflows/full-sweep-1k1k-scheduler.yml delete mode 100644 .github/workflows/full-sweep-1k8k-scheduler.yml delete mode 100644 .github/workflows/full-sweep-8k1k-scheduler.yml delete mode 100644 .github/workflows/full-sweep-test.yml delete mode 100644 .github/workflows/full-sweep-tmpl.yml delete mode 100644 .github/workflows/runner-model-sweep-test.yml delete mode 100644 .github/workflows/runner-sweep-test.yml delete mode 100644 .github/workflows/runner-test.yml create mode 100644 .gitignore create mode 100644 utils/matrix-logic/pytest.ini create mode 100644 utils/matrix-logic/test_generate_sweep_configs.py diff --git a/.github/workflows/test.yml b/.github/workflows/e2e-tests.yml similarity index 99% rename from .github/workflows/test.yml rename to .github/workflows/e2e-tests.yml index 78b9b1f5e..ff7ecb92b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/e2e-tests.yml @@ -1,4 +1,4 @@ -name: Test Sweep +name: End-to-End Tests # concurrency: # group: benchmark-lock diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml deleted file mode 100644 index 601c760b3..000000000 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: Full Sweep Scheduler - 1k1k - -concurrency: - group: benchmark-lock-1k1k - cancel-in-progress: true - -on: - workflow_dispatch: - schedule: - - cron: '0 23 * * *' - -jobs: - mega-run: - uses: ./.github/workflows/full-sweep-tmpl.yml - secrets: 
inherit - with: - run_1k1k: true - run_8k1k: false - run_1k8k: false - use_h100: true - use_h200: true - use_b200: true - use_mi300x: true - use_mi325x: true - use_mi355x: true - use_gb200: true - - calc-success-rate: - needs: mega-run - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml deleted file mode 100644 index 967935335..000000000 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: Full Sweep Scheduler - 1k8k - -concurrency: - group: benchmark-lock-1k8k - cancel-in-progress: true - -on: - workflow_dispatch: - schedule: - - cron: '0 23 * * *' - -jobs: - mega-run: - uses: ./.github/workflows/full-sweep-tmpl.yml - secrets: inherit - with: - run_1k1k: false - run_8k1k: false - run_1k8k: true - use_h100: true - use_h200: true - use_b200: true - use_mi300x: true - use_mi325x: true - use_mi355x: true - use_gb200: true - - calc-success-rate: - needs: mega-run - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - 
pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml deleted file mode 100644 index 791d9e017..000000000 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: Full Sweep Scheduler - 8k1k - -concurrency: - group: benchmark-lock-8k1k - cancel-in-progress: true - -on: - workflow_dispatch: - schedule: - - cron: '0 23 * * *' - -jobs: - mega-run: - uses: ./.github/workflows/full-sweep-tmpl.yml - secrets: inherit - with: - run_1k1k: false - run_8k1k: true - run_1k8k: false - use_h100: true - use_h200: true - use_b200: true - use_mi300x: true - use_mi325x: true - use_mi355x: true - use_gb200: true - - calc-success-rate: - needs: mega-run - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml deleted file mode 100644 index b134e407c..000000000 --- a/.github/workflows/full-sweep-test.yml +++ /dev/null @@ -1,89 +0,0 @@ -name: Test - Full Sweep - -concurrency: - group: benchmark-lock - cancel-in-progress: false - -on: - 
workflow_dispatch: - inputs: - run_1k1k: - type: boolean - required: false - run_8k1k: - type: boolean - required: false - run_1k8k: - type: boolean - required: false - - use_h100: - type: boolean - required: false - use_h200: - type: boolean - required: false - use_b200: - type: boolean - required: false - use_mi300x: - type: boolean - required: false - use_mi325x: - type: boolean - required: false - use_mi355x: - type: boolean - required: false - use_gb200: - type: boolean - required: false - -jobs: - mega-test-run: - uses: ./.github/workflows/full-sweep-tmpl.yml - secrets: inherit - with: - run_1k1k: ${{ inputs.run_1k1k }} - run_8k1k: ${{ inputs.run_8k1k }} - run_1k8k: ${{ inputs.run_1k8k }} - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - use_gb200: ${{ inputs.use_gb200 }} - - calc-success-rate: - needs: mega-test-run - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml deleted file mode 100644 index b086460df..000000000 --- a/.github/workflows/full-sweep-tmpl.yml +++ /dev/null @@ -1,263 +0,0 @@ -name: Template - Full Sweep - -on: - workflow_call: - inputs: - run_1k1k: - type: boolean - required: true - 
run_8k1k: - type: boolean - required: true - run_1k8k: - type: boolean - required: true - - use_h100: - type: boolean - required: true - use_h200: - type: boolean - required: true - use_b200: - type: boolean - required: true - use_mi300x: - type: boolean - required: true - use_mi325x: - type: boolean - required: true - use_mi355x: - type: boolean - required: true - use_gb200: - type: boolean - required: false - default: false - -jobs: - _70b-1k1k: - if: ${{ inputs.run_1k1k }} - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-70b-1k1k-results: - needs: _70b-1k1k - if: ${{ inputs.run_1k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_1k1k' - - dsr1-1k1k: - if: ${{ inputs.run_1k1k }} - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - use_gb200: ${{ inputs.use_gb200 }} - - collect-dsr1-1k1k-results: - needs: dsr1-1k1k - if: ${{ inputs.run_1k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'dsr1_1k1k' - - gptoss-1k1k: - if: ${{ inputs.run_1k1k }} - uses: ./.github/workflows/gptoss-tmpl.yml - secrets: inherit - with: - exp-name: 'gptoss_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - 
use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-gptoss-1k1k-results: - needs: gptoss-1k1k - if: ${{ inputs.run_1k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'gptoss_1k1k' - - _70b-8k1k: - if: ${{ inputs.run_8k1k }} - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-70b-8k1k-results: - needs: _70b-8k1k - if: ${{ inputs.run_8k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_8k1k' - - dsr1-8k1k: - if: ${{ inputs.run_8k1k }} - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - use_gb200: ${{ inputs.use_gb200 }} - - collect-dsr1-8k1k-results: - needs: dsr1-8k1k - if: ${{ inputs.run_8k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'dsr1_8k1k' - - gptoss-8k1k: - if: ${{ inputs.run_8k1k }} - uses: ./.github/workflows/gptoss-tmpl.yml - secrets: inherit - with: - exp-name: 'gptoss_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - 
collect-gptoss-8k1k-results: - needs: gptoss-8k1k - if: ${{ inputs.run_8k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'gptoss_8k1k' - - _70b-1k8k: - if: ${{ inputs.run_1k8k }} - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-70b-1k8k-results: - needs: _70b-1k8k - if: ${{ inputs.run_1k8k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_1k8k' - - dsr1-1k8k: - if: ${{ inputs.run_1k8k }} - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - use_gb200: ${{ inputs.use_gb200 }} - - collect-dsr1-1k8k-results: - needs: dsr1-1k8k - if: ${{ inputs.run_1k8k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'dsr1_1k8k' - - gptoss-1k8k: - if: ${{ inputs.run_1k8k }} - uses: ./.github/workflows/gptoss-tmpl.yml - secrets: inherit - with: - exp-name: 'gptoss_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-gptoss-1k8k-results: - needs: gptoss-1k8k - if: ${{ inputs.run_1k8k && always() }} - uses: 
./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'gptoss_1k8k' diff --git a/.github/workflows/runner-model-sweep-test.yml b/.github/workflows/runner-model-sweep-test.yml deleted file mode 100644 index 212ffc07c..000000000 --- a/.github/workflows/runner-model-sweep-test.yml +++ /dev/null @@ -1,300 +0,0 @@ -name: 'Test - Runner Model Sweep' -run-name: '${{ github.event.inputs.runner }} Sweep' -on: - workflow_dispatch: - inputs: - runner: - description: 'Runner Type' - required: true - type: choice - options: - - 'h100' - - 'h200' - - 'h200-trt' - - 'b200' - - 'b200-trt' - - 'mi300x' - - 'mi325x' - - 'mi355x' - -env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - HF_HUB_CACHE: '/mnt/hf_hub_cache/' - -jobs: - bmk-h100: - if: ${{ inputs.runner == 'h100' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h100-cr_0' - - 'h100-cr_1' - - 'h100-cw_0' - - 'h100-cw_1' - config: - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-h200: - if: ${{ inputs.runner == 'h200' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h200-cw_0' - - 'h200-cw_1' - - 'h200-nb_0' - - 'h200-nb_1' - - 'h200-nb_2' - - 'h200-nb_3' - - 'h200-nv_0' - - 'h200-nv_1' - - 'h200-nv_2' - - 'h200-nv_3' - config: - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', 
framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - - { image: 'lmsysorg/sglang:v0.5.2rc2-cu126', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-h200-trt: - if: ${{ inputs.runner == 'h200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h200-cw_0' - - 'h200-cw_1' - - 'h200-nb_0' - - 'h200-nb_1' - - 'h200-nb_2' - - 'h200-nb_3' - - 'h200-nv_0' - - 'h200-nv_1' - - 'h200-nv_2' - - 'h200-nv_3' - config: - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - 
conc-list: '[1]' - - bmk-b200: - if: ${{ inputs.runner == 'b200' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nvd_0' - - 'b200-nvd_1' - - 'b200-nvd_2' - - 'b200-nvd_3' - config: - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' } - - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } - - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[4]' - - bmk-b200-trt: - if: ${{ inputs.runner == 'b200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nv_0' - - 'b200-nv_1' - - 'b200-nb_0' - - 'b200-nb_1' - config: - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', framework: 'trt', precision: 'fp4', exp-name: '70b_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 
'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'trt', precision: 'fp4', exp-name: 'dsr1_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-mi300x: - if: ${{ inputs.runner == 'mi300x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi300x-amd_0' - - 'mi300x-amd_1' - - 'mi300x-amd_2' - - 'mi300x-amd_3' - - 'mi300x-amd_4' - - 'mi300x-cr_0' - - 'mi300x-oci_0' - config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - 
isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-mi325x: - if: ${{ inputs.runner == 'mi325x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi325x-amd_0' - - 'mi325x-tw_0' - - 'mi325x-tw_1' - - 'mi325x-tw_2' - - 'mi325x-tw_3' - config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-mi355x: - if: ${{ inputs.runner == 'mi355x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi355x-amd_0' - - 'mi355x-amd_1' - - 'mi355x-amd_2' - - 'mi355x-amd_3' - config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', 
framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'amd/DeepSeek-R1-0528-MXFP4-Preview', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' diff --git a/.github/workflows/runner-sweep-test.yml b/.github/workflows/runner-sweep-test.yml deleted file mode 100644 index fd100474f..000000000 --- a/.github/workflows/runner-sweep-test.yml +++ /dev/null @@ -1,333 +0,0 @@ -name: 'Test - Runner Sweep' -run-name: '${{ github.event.inputs.runner }} Sweep - ${{ github.event.inputs.model }}' -on: - workflow_dispatch: - inputs: - runner: - description: 'Runner Type' - required: true - type: choice - options: - - 'h100' - - 'h200' - - 'b200' - - 'h200-trt' - - 'b200-trt' - - 'mi300x' - - 'mi325x' - - 'mi355x' - - 'gb200' - - image: - description: 'Docker Image' - required: true - type: choice - options: - - 'lmsysorg/sglang:v0.4.9.post1-cu126' - - 'lmsysorg/sglang:v0.5.0rc1-cu128-b200' - - 'lmsysorg/sglang:v0.5.2rc2-cu126' - - 'lmsysorg/sglang:v0.5.3rc1-cu129-b200' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' - - 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' - - 
'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - - 'vllm/vllm-openai:v0.10.2' - - model: - description: 'Model' - required: true - type: choice - options: - - 'amd/DeepSeek-R1-0528-MXFP4-Preview' - - 'amd/Llama-3.3-70B-Instruct-FP8-KV' - - 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview' - - 'deepseek-ai/DeepSeek-R1-0528' - - 'nvidia/Llama-3.3-70B-Instruct-FP8' - - 'nvidia/Llama-3.3-70B-Instruct-FP4' - - 'nvidia/DeepSeek-R1-0528-FP4' - - 'nvidia/DeepSeek-R1-0528-FP4-v2' - - 'openai/gpt-oss-120b' - - framework: - description: 'Framework' - required: true - type: choice - options: - - 'vllm' - - 'sglang' - - 'trt' - - precision: - description: 'Precision' - required: true - type: choice - options: - - 'fp8' - - 'fp4' - - exp-name: - description: 'Experiment Name' - required: true - type: choice - options: - - '70b_test' - - 'dsr1_test' - - 'gptoss_test' - - -env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - HF_HUB_CACHE: '/mnt/hf_hub_cache/' - -jobs: - bmk_h100: - if: ${{ inputs.runner == 'h100' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h100-cr_0' - - 'h100-cr_1' - - 'h100-cw_0' - - 'h100-cw_1' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_h200: - if: ${{ inputs.runner == 'h200' || inputs.runner == 'h200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h200-cw_0' - - 'h200-cw_1' - - 
'h200-nb_0' - - 'h200-nb_1' - - 'h200-nb_2' - - 'h200-nb_3' - - 'h200-nv_0' - - 'h200-nv_1' - - 'h200-nv_2' - - 'h200-nv_3' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[4]' - conc-list: '[64]' - - bmk_b200: - if: ${{ inputs.runner == 'b200' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nv_0' - - 'b200-nv_1' - - 'b200-nvd_0' - - 'b200-nvd_1' - - 'b200-tg_0' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_b200-trt: - if: ${{ inputs.runner == 'b200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nv_0' - - 'b200-nv_1' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_mi300x: - if: ${{ inputs.runner == 'mi300x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi300x-amd_0' - - 'mi300x-amd_1' - - 'mi300x-amd_2' - - 'mi300x-amd_3' - - 'mi300x-amd_4' - - 'mi300x-cr_0' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: 
inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_mi325x: - if: ${{ inputs.runner == 'mi325x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi325x-amd_0' - - 'mi325x-tw_0' - - 'mi325x-tw_1' - - 'mi325x-tw_2' - - 'mi325x-tw_3' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_mi355x: - if: ${{ inputs.runner == 'mi355x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi355x-amd_0' - - 'mi355x-amd_1' - - 'mi355x-amd_2' - - 'mi355x-amd_3' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_gb200: - if: ${{ inputs.runner == 'gb200' && inputs.framework == 'trt' }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - secrets: inherit - with: - runner: gb200 - image: 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' - model: 'deepseek-r1-fp4' - framework: 'dynamo-trtllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - mtp-mode: 'off' - - bmk_gb200-sgl: 
- if: ${{ inputs.runner == 'gb200' && inputs.framework == 'sglang' }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - secrets: inherit - with: - runner: gb200 - image: 'nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'dynamo-sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: 8192 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - mtp-mode: 'off' - - collect-test-results: - needs: [ bmk_h100, bmk_h200, bmk_b200, bmk_b200-trt, bmk_mi300x, bmk_mi325x, bmk_mi355x, bmk_gb200, bmk_gb200-sgl ] - if: ${{ always() && !cancelled() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} diff --git a/.github/workflows/runner-test.yml b/.github/workflows/runner-test.yml deleted file mode 100644 index 983394035..000000000 --- a/.github/workflows/runner-test.yml +++ /dev/null @@ -1,136 +0,0 @@ -name: Test - Runner -run-name: '${{ github.event.inputs.runner }} - ${{ github.event.inputs.model }}' -on: - workflow_dispatch: - inputs: - runner: - description: 'Runner' - required: true - type: choice - options: - - 'h100-cr_0' - - 'h100-cr_1' - - 'h100-cw_0' - - 'h100-cw_1' - - 'h200-cw_0' - - 'h200-cw_1' - - 'h200-nb_0' - - 'h200-nb_1' - - 'h200-nb_2' - - 'h200-nb_3' - - 'h200-nv_0' - - 'h200-nv_1' - - 'h200-nv_2' - - 'h200-nv_3' - - 'b200-nv_0' - - 'b200-nv_1' - - 'b200-nb_0' - - 'b200-nb_1' - - 'b200-nvd_0' - - 'b200-nvd_1' - - 'b200-nvd_2' - - 'b200-nvd_3' - - 'b200-tg_0' - - 'mi300x-amd_0' - - 'mi300x-amd_1' - - 'mi300x-amd_2' - - 'mi300x-amd_3' - - 'mi300x-amd_4' - - 'mi300x-cr_0' - - 'mi300x-oci_0' - - 'mi325x-amd_0' - - 'mi325x-tw_0' - - 'mi325x-tw_1' - - 'mi325x-tw_2' - - 'mi325x-tw_3' - - 'mi355x-amd_0' - - 'mi355x-amd_1' - - 'mi355x-amd_2' - - 'mi355x-amd_3' - - image: - description: 'Docker Image' - required: true - type: choice - options: - - 'lmsysorg/sglang:v0.4.9.post1-cu126' - - 
'lmsysorg/sglang:v0.5.0rc1-cu128-b200' - - 'lmsysorg/sglang:v0.5.2rc2-cu126' - - 'lmsysorg/sglang:v0.5.3rc1-cu129-b200' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' - - 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' - - 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.10.1_instinct_rc1' - - 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_sgl-dev-v0.5.2rc2-mi30x_rc1' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - - 'vllm/vllm-openai:v0.10.2' - model: - description: 'Model' - required: true - type: choice - options: - - 'amd/DeepSeek-R1-0528-MXFP4-Preview' - - 'amd/Llama-3.3-70B-Instruct-FP8-KV' - - 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview' - - 'deepseek-ai/DeepSeek-R1-0528' - - 'nvidia/Llama-3.3-70B-Instruct-FP8' - - 'nvidia/Llama-3.3-70B-Instruct-FP4' - - 'nvidia/DeepSeek-R1-0528-FP4' - - 'nvidia/DeepSeek-R1-0528-FP4-v2' - - 'openai/gpt-oss-120b' - - framework: - description: 'Framework' - required: true - type: choice - options: - - 'vllm' - - 'sglang' - - 'trt' - - precision: - description: 'Precision' - required: true - type: choice - options: - - 'fp8' - - 'fp4' - - exp-name: - description: 'Experiment Name' - required: true - type: choice - options: - - '70b_test' - - 'dsr1_test' - - 'gptoss_test' - -jobs: - runner-test: - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ inputs.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - 
random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[4]' - - collect-test-results: - needs: runner-test - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..03d36472a --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +**/__pycache__/** +**/.coverage \ No newline at end of file diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 6d092eac8..7574579af 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -142,90 +142,13 @@ def validate_master_configs_structure(all_config_data): def generate_full_sweep(args, all_config_data): - """Generate full sweep configurations based on model prefix and sequence lengths. + """Generate full sweep configurations with optional filtering. - Assumes all_config_data has been validated by validate_config_structure(). - """ - isl, osl = seq_len_stoi[args.seq_lens] - - matrix_values = [] - for key, val in all_config_data.items(): - # Filter by model prefix - if not key.startswith(args.model_prefix): - continue - - seq_len_configs = val['seq-len-configs'] - image = val['image'] - model = val['model'] - precision = val['precision'] - framework = val['framework'] - runner = val['runner'] - # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name - # so that it can be bubbled down to bash script benchmarks... 
this is probably a FIXME - model_code = key.split('-')[0] - - # Check if this config has matching sequence lengths - matching_seq_config = None - for slq in seq_len_configs: - if slq['isl'] == isl and slq['osl'] == osl: - matching_seq_config = slq - break - - if not matching_seq_config: - continue # Skip this config if no matching sequence length - - bmk_space = matching_seq_config['search-space'] - - for bmk in bmk_space: - tp = bmk['tp'] - conc_start = bmk['conc-start'] - conc_end = bmk['conc-end'] - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') - - # Generate entries for each concurrency value in the range - conc = conc_start - while conc <= conc_end: - seq_len_str = seq_len_to_str(isl, osl) - entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc, - 'max-model-len': isl + osl + 200, - 'ep': 1, # Default - 'dp-attn': False, # Default - 'exp-name': f"{model_code}_{seq_len_str}", - } - - # Add optional fields if they exist - if ep is not None: - entry['ep'] = ep - if dp_attn is not None: - entry['dp-attn'] = dp_attn - - matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - return matrix_values - - -def generate_filtered_sweep(args, all_config_data): - """Generate sweep configurations with filtering options. - - Allows filtering by model prefix, precision, framework, runner type, and sequence lengths. + Supports filtering by model prefix, precision, framework, runner type, and sequence lengths. Supports test mode to only run highest TP with lowest concurrency. + All filters are optional - can generate sweeps for all configs or filter by specific criteria. + Assumes all_config_data has been validated by validate_config_structure(). 
""" # Validate runner types if specified @@ -754,86 +677,56 @@ def main(): 'full-sweep', parents=[parent_parser], add_help=False, - help='Generate full sweep configurations based on model prefix' - ) - full_sweep_parser.add_argument( - '--seq-lens', - choices=list(seq_len_stoi.keys()), - required=True, - help=f"Sequence length configuration: {', '.join(seq_len_stoi.keys())}" + help='Generate full sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths' ) full_sweep_parser.add_argument( - '--model-prefix', - required=True, - help='Model prefix to filter configurations' - ) - full_sweep_parser.add_argument( - '--step-size', - type=int, - default=2, - help='Step size for concurrency values (default: 2)' - ) - full_sweep_parser.add_argument( - '-h', '--help', - action='help', - help='Show this help message and exit' - ) - - # Subcommand: filtered-sweep - filtered_sweep_parser = subparsers.add_parser( - 'filtered-sweep', - parents=[parent_parser], - add_help=False, - help='Generate sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths' - ) - filtered_sweep_parser.add_argument( '--model-prefix', nargs='+', required=False, help='Model prefix(es) to filter configurations (optional, can specify multiple)' ) - filtered_sweep_parser.add_argument( + full_sweep_parser.add_argument( '--precision', nargs='+', required=False, help='Precision(s) to filter by (e.g., fp4, fp8) (optional, can specify multiple)' ) - filtered_sweep_parser.add_argument( + full_sweep_parser.add_argument( '--framework', nargs='+', required=False, help='Framework(s) to filter by (e.g., vllm, trt, sglang) (optional, can specify multiple)' ) - filtered_sweep_parser.add_argument( + full_sweep_parser.add_argument( '--runner-type', nargs='+', required=False, help='Runner type(s) to filter by (e.g., h200, h100) (optional, can specify multiple)' ) - filtered_sweep_parser.add_argument( + 
full_sweep_parser.add_argument( '--runner-config', required=False, help='Configuration file holding runner information (required if --runner-type is specified)' ) - filtered_sweep_parser.add_argument( + full_sweep_parser.add_argument( '--seq-lens', nargs='+', choices=list(seq_len_stoi.keys()), required=False, help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." ) - filtered_sweep_parser.add_argument( + full_sweep_parser.add_argument( '--step-size', type=int, default=2, help='Step size for concurrency values (default: 2)' ) - filtered_sweep_parser.add_argument( + full_sweep_parser.add_argument( '--test-mode', action='store_true', help='Test mode: only run highest TP with lowest concurrency for each matching config' ) - filtered_sweep_parser.add_argument( + full_sweep_parser.add_argument( '-h', '--help', action='help', help='Show this help message and exit' @@ -998,8 +891,6 @@ def main(): # Route to appropriate function based on subcommand if args.command == 'full-sweep': matrix_values = generate_full_sweep(args, all_config_data) - elif args.command == 'filtered-sweep': - matrix_values = generate_filtered_sweep(args, all_config_data) elif args.command == 'test-config': matrix_values = generate_test_config(args, all_config_data) elif args.command == 'runner-model-sweep': diff --git a/utils/matrix-logic/pytest.ini b/utils/matrix-logic/pytest.ini new file mode 100644 index 000000000..c3cd9aac7 --- /dev/null +++ b/utils/matrix-logic/pytest.ini @@ -0,0 +1,12 @@ +[pytest] +testpaths = . 
+python_files = test_*.py +python_classes = Test* +python_functions = test_* +addopts = + -v + --strict-markers + --tb=short +markers = + slow: marks tests as slow (deselect with '-m "not slow"') + integration: marks tests as integration tests diff --git a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix-logic/test_generate_sweep_configs.py new file mode 100644 index 000000000..36cb14cd7 --- /dev/null +++ b/utils/matrix-logic/test_generate_sweep_configs.py @@ -0,0 +1,1545 @@ +import pytest +import yaml +from unittest.mock import patch +from generate_sweep_configs import ( + validate_master_configs_structure, + validate_matrix_output, + seq_len_to_str, + generate_full_sweep, + generate_test_config, + generate_runner_model_sweep_config, + generate_runner_sweep_config, + generate_custom_test, + load_config_files, + main, + MatrixEntry, +) + + +# Fixtures for test config files +@pytest.fixture +def sample_master_config(): + """Sample master config with valid entries.""" + return { + "70b-fp8-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "meta-llama/Llama-3-70b", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "conc-start": 1, "conc-end": 4}, + {"tp": 8, "conc-start": 2, "conc-end": 8, "ep": 2, "dp-attn": True} + ] + }, + { + "isl": 1024, + "osl": 8192, + "search-space": [ + {"tp": 8, "conc-start": 1, "conc-end": 2} + ] + } + ] + }, + "8b-fp4-trt": { + "image": "nvcr.io/nvidia/tritonserver:24.01", + "model": "meta-llama/Llama-3-8b", + "precision": "fp4", + "framework": "trt", + "runner": "h100", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 2, "conc-start": 4, "conc-end": 16} + ] + } + ] + }, + "gptoss-120b-fp8-vllm": { + "image": "vllm/vllm-openai:latest", + "model": "openai/gpt-oss-120b", + "precision": "fp8", + "framework": "vllm", + "runner": "h200-trt", + "seq-len-configs": [ + { + "isl": 
1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 1, "conc-end": 4} + ] + } + ] + } + } + + +@pytest.fixture +def sample_runner_config(): + """Sample runner config.""" + return { + "h200": ["h200-nv_1", "h200-nv_2"], + "h100": ["h100-aws_1"], + "h200-trt": ["h200-trt_1", "h200-trt_2", "h200-trt_3"] + } + + +@pytest.fixture +def temp_config_files(tmp_path, sample_master_config, sample_runner_config): + """Create temporary config files.""" + master_file = tmp_path / "master.yaml" + runner_file = tmp_path / "runners.yaml" + + with open(master_file, 'w') as f: + yaml.dump(sample_master_config, f) + + with open(runner_file, 'w') as f: + yaml.dump(sample_runner_config, f) + + return str(master_file), str(runner_file) + + +@pytest.fixture +def invalid_master_config(): + """Master config with validation errors.""" + return { + "missing-field": { + "image": "test:latest", + "model": "test/model", + # Missing precision, framework, runner, seq-len-configs + } + } + + +# Tests for seq_len_to_str +def test_seq_len_to_str_with_mapping(): + """Test seq_len_to_str with known mappings.""" + assert seq_len_to_str(1024, 1024) == "1k1k" + assert seq_len_to_str(1024, 8192) == "1k8k" + assert seq_len_to_str(8192, 1024) == "8k1k" + + +def test_seq_len_to_str_without_mapping(): + """Test seq_len_to_str fallback for unknown mappings.""" + assert seq_len_to_str(2048, 4096) == "2048_4096" + assert seq_len_to_str(512, 512) == "512_512" + + +# Tests for MatrixEntry validation +def test_matrix_entry_valid(): + """Test valid MatrixEntry.""" + entry = { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "isl": 1024, + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2048, + "exp-name": "test_exp" + } + result = MatrixEntry(**entry) + assert result.image == "test:latest" + assert result.tp == 8 + + +def test_matrix_entry_missing_field(): + """Test MatrixEntry with missing 
required field.""" + entry = { + "image": "test:latest", + "model": "test/model", + # Missing other required fields + } + with pytest.raises(Exception): # Pydantic ValidationError + MatrixEntry(**entry) + + +def test_matrix_entry_wrong_type(): + """Test MatrixEntry with wrong type.""" + entry = { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "isl": "not_an_int", # Wrong type + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2048, + "exp-name": "test_exp" + } + with pytest.raises(Exception): # Pydantic ValidationError + MatrixEntry(**entry) + + +def test_matrix_entry_extra_field(): + """Test MatrixEntry with extra field (should be forbidden).""" + entry = { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "isl": 1024, + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2048, + "exp-name": "test_exp", + "extra-field": "should_fail" + } + with pytest.raises(Exception): # Pydantic ValidationError + MatrixEntry(**entry) + + +# Tests for validate_matrix_output +def test_validate_matrix_output_valid(): + """Test validate_matrix_output with valid entries.""" + entries = [ + { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "isl": 1024, + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2048, + "exp-name": "test_exp" + } + ] + result = validate_matrix_output(entries) + assert result == entries + + +def test_validate_matrix_output_invalid(): + """Test validate_matrix_output with invalid entry.""" + entries = [ + { + "image": "test:latest", + "model": "test/model", + # Missing required fields + } + ] + with pytest.raises(ValueError, match="Matrix entry at index 0 failed validation"): + validate_matrix_output(entries) + + +def 
test_validate_matrix_output_multiple_entries(): + """Test validate_matrix_output with multiple entries.""" + entries = [ + { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "isl": 1024, + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2048, + "exp-name": "test_exp" + }, + { + "image": "test2:latest", + "model": "test2/model", + "precision": "fp4", + "framework": "trt", + "runner": "h100", + "isl": 1024, + "osl": 1024, + "tp": 4, + "ep": 2, + "dp-attn": True, + "conc": 8, + "max-model-len": 2048, + "exp-name": "test_exp2" + } + ] + result = validate_matrix_output(entries) + assert len(result) == 2 + + +# Tests for validate_master_configs_structure +def test_validate_master_configs_structure_valid(sample_master_config): + """Test validation of valid master config.""" + validate_master_configs_structure(sample_master_config) + + +def test_validate_master_configs_structure_missing_field(): + """Test validation with missing required field.""" + config = { + "test-key": { + "image": "test:latest", + # Missing other required fields + } + } + with pytest.raises(ValueError, match="Missing required field"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_type(): + """Test validation with wrong field type.""" + config = { + "test-key": { + "image": 123, # Should be string + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [] + } + } + with pytest.raises(ValueError, match="must be str"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_empty_seq_len_configs(): + """Test validation with empty seq-len-configs.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [] + } + } + with pytest.raises(ValueError, 
match="must be a non-empty list"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_invalid_search_space(): + """Test validation with invalid search-space.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8} # Missing conc-start and conc-end + ] + } + ] + } + } + with pytest.raises(ValueError, match="Missing 'conc-start'"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_missing_search_space(): + """Test validation with missing search-space.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024 + # Missing search-space + } + ] + } + } + with pytest.raises(ValueError, match="Missing or invalid 'search-space'"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_search_space_not_list(): + """Test validation with search-space not being a list.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": "not_a_list" + } + ] + } + } + with pytest.raises(ValueError, match="Missing or invalid 'search-space'"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_extra_fields_in_search_space(): + """Test validation with extra fields in search-space.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "tp": 8, + "conc-start": 1, + "conc-end": 4, 
+ "invalid-field": "value" + } + ] + } + ] + } + } + with pytest.raises(ValueError, match="Extra fields"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_missing_isl(): + """Test validation with missing isl.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "osl": 1024, + "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] + } + ] + } + } + with pytest.raises(ValueError, match="Missing 'isl'"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_isl_type(): + """Test validation with wrong isl type.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": "not_int", + "osl": 1024, + "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] + } + ] + } + } + with pytest.raises(ValueError, match="'isl' must be int"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_missing_osl(): + """Test validation with missing osl.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] + } + ] + } + } + with pytest.raises(ValueError, match="Missing 'osl'"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_osl_type(): + """Test validation with wrong osl type.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": "not_int", + "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] + } + ] + } + } + with 
pytest.raises(ValueError, match="'osl' must be int"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_tp_type(): + """Test validation with wrong tp type.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": "not_int", "conc-start": 1, "conc-end": 4}] + } + ] + } + } + with pytest.raises(ValueError, match="'tp' must be int"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_conc_start_type(): + """Test validation with wrong conc-start type.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 8, "conc-start": "not_int", "conc-end": 4}] + } + ] + } + } + with pytest.raises(ValueError, match="'conc-start' must be int"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_conc_end_type(): + """Test validation with wrong conc-end type.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 8, "conc-start": 1, "conc-end": "not_int"}] + } + ] + } + } + with pytest.raises(ValueError, match="'conc-end' must be int"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_ep_type(): + """Test validation with wrong ep type.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 8, "conc-start": 1, 
"conc-end": 4, "ep": "not_int"}] + } + ] + } + } + with pytest.raises(ValueError, match="'ep' must be int"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_dp_attn_type(): + """Test validation with wrong dp-attn type.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4, "dp-attn": "not_bool"}] + } + ] + } + } + with pytest.raises(ValueError, match="'dp-attn' must be bool"): + validate_master_configs_structure(config) + + +# Tests for load_config_files +def test_load_config_files_valid(temp_config_files): + """Test loading valid config files.""" + master_file, _ = temp_config_files + result = load_config_files([master_file]) + assert len(result) == 3 + assert "70b-fp8-vllm" in result + + +def test_load_config_files_multiple(tmp_path, sample_master_config): + """Test loading multiple config files.""" + file1 = tmp_path / "config1.yaml" + file2 = tmp_path / "config2.yaml" + + config1 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} + config2 = {"8b-fp4-trt": sample_master_config["8b-fp4-trt"]} + + with open(file1, 'w') as f: + yaml.dump(config1, f) + with open(file2, 'w') as f: + yaml.dump(config2, f) + + result = load_config_files([str(file1), str(file2)]) + assert len(result) == 2 + + +def test_load_config_files_not_found(): + """Test loading non-existent config file.""" + with pytest.raises(ValueError, match="does not exist"): + load_config_files(["/nonexistent/file.yaml"]) + + +def test_load_config_files_duplicate_keys(tmp_path, sample_master_config): + """Test loading files with duplicate keys.""" + file1 = tmp_path / "config1.yaml" + file2 = tmp_path / "config2.yaml" + + config1 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} + config2 = {"70b-fp8-vllm": 
sample_master_config["70b-fp8-vllm"]} # Duplicate + + with open(file1, 'w') as f: + yaml.dump(config1, f) + with open(file2, 'w') as f: + yaml.dump(config2, f) + + with pytest.raises(ValueError, match="Duplicate configuration keys"): + load_config_files([str(file1), str(file2)]) + + +# Tests for generate_full_sweep +def test_generate_full_sweep_basic(sample_master_config, temp_config_files): + """Test basic full sweep generation.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + seq_lens = ["1k1k"] + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + assert len(result) > 0 + assert all(entry['exp-name'].startswith('70b_1k1k') for entry in result) + assert all(entry['isl'] == 1024 and entry['osl'] == 1024 for entry in result) + + +def test_generate_full_sweep_with_optionals(sample_master_config, temp_config_files): + """Test full sweep with optional ep and dp-attn.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + seq_lens = ["1k1k"] + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + # Find entry with tp=8 which should have ep=2 and dp-attn=True + tp8_entries = [e for e in result if e['tp'] == 8] + assert len(tp8_entries) > 0 + assert all(e['ep'] == 2 for e in tp8_entries) + assert all(e['dp-attn'] == True for e in tp8_entries) + + +def test_generate_full_sweep_no_matches(sample_master_config, temp_config_files): + """Test full sweep with no matching configs.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["nonexistent"] + seq_lens = ["1k1k"] + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + with pytest.raises(ValueError, 
match="No configs found matching filters"): + generate_full_sweep(Args(), sample_master_config) + + +def test_generate_full_sweep_different_seq_len(sample_master_config, temp_config_files): + """Test full sweep with different sequence length.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + seq_lens = ["1k8k"] + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + assert len(result) > 0 + assert all(entry['isl'] == 1024 and entry['osl'] == 8192 for entry in result) + + +def test_generate_full_sweep_step_size(sample_master_config, temp_config_files): + """Test full sweep with different step size.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["8b"] + seq_lens = ["1k1k"] + step_size = 4 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + # Should have entries at conc=4, 8, 16 (step_size=4, conc-start=4, conc-end=16) + conc_values = sorted(set(e['conc'] for e in result)) + assert 4 in conc_values + assert 16 in conc_values + + +def test_generate_full_sweep_seq_len_not_in_config(temp_config_files): + """Test full sweep when requested seq-len is not in config.""" + _, runner_file = temp_config_files + + config = { + "test-fp8-vllm": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 8192, + "osl": 1024, # Only has 8k1k, not 1k1k + "search-space": [ + {"tp": 4, "conc-start": 1, "conc-end": 4} + ] + } + ] + } + } + + class Args: + model_prefix = ["test"] + seq_lens = ["1k1k"] # Requesting 1k1k but config only has 8k1k + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + # Should raise 
error since no matching seq-len + with pytest.raises(ValueError, match="No configs found matching filters"): + generate_full_sweep(Args(), config) + + +def test_generate_full_sweep_concurrency_overshoot(temp_config_files): + """Test full sweep when concurrency step overshoots end value.""" + _, runner_file = temp_config_files + + config = { + "test-fp8-vllm": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "conc-start": 1, "conc-end": 5} # 1, 3*2=6 overshoots, clamps to 5 + ] + } + ] + } + } + + class Args: + model_prefix = ["test"] + seq_lens = ["1k1k"] + step_size = 3 # Will overshoot: 1, 3, 9 (clamped to 5) + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), config) + conc_values = sorted(set(e['conc'] for e in result)) + # Should have 1, 3, 5 (5 is the clamped value) + assert conc_values == [1, 3, 5] + + +# Tests for generate_full_sweep with filters +def test_generate_full_sweep_no_filters(sample_master_config, temp_config_files): + """Test filtered sweep with no filters.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = None + precision = None + framework = None + runner_type = None + seq_lens = None + step_size = 2 + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + assert len(result) > 0 + + +def test_generate_full_sweep_with_filters_model_prefix(sample_master_config, temp_config_files): + """Test filtered sweep with model prefix filter.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + precision = None + framework = None + runner_type = None + seq_lens = None + step_size = 2 + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + 
assert all("70b" in entry['exp-name'] for entry in result) + + +def test_generate_full_sweep_with_filters_multiple_filters(sample_master_config, temp_config_files): + """Test filtered sweep with multiple filters.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + precision = ["fp8"] + framework = ["vllm"] + runner_type = None + seq_lens = ["1k1k"] + step_size = 2 + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + assert len(result) > 0 + assert all(entry['precision'] == 'fp8' for entry in result) + assert all(entry['framework'] == 'vllm' for entry in result) + + +def test_generate_full_sweep_with_filters_test_mode(sample_master_config, temp_config_files): + """Test filtered sweep in test mode.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + precision = None + framework = None + runner_type = None + seq_lens = ["1k1k"] + step_size = 2 + test_mode = True + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + # In test mode, should only get one entry per seq-len (highest TP, lowest conc) + assert len(result) == 1 # Only one config matches 70b with 1k1k + assert result[0]['tp'] == 8 # Highest TP + assert '70b_1k1k' in result[0]['exp-name'] + + +def test_generate_full_sweep_with_filters_runner_type_validation(sample_master_config, temp_config_files): + """Test filtered sweep with invalid runner type.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = None + precision = None + framework = None + runner_type = ["invalid-runner"] + seq_lens = None + step_size = 2 + test_mode = False + runner_config = runner_file + + with pytest.raises(ValueError, match="Invalid runner type"): + generate_full_sweep(Args(), sample_master_config) + + +def test_generate_full_sweep_with_filters_runner_type_no_config(sample_master_config): + """Test filtered sweep with runner type but no config file.""" + 
class Args: + model_prefix = None + precision = None + framework = None + runner_type = ["h200"] + seq_lens = None + step_size = 2 + test_mode = False + runner_config = None + + with pytest.raises(ValueError, match="runner-config is required"): + generate_full_sweep(Args(), sample_master_config) + + +def test_generate_full_sweep_with_filters_multiple_runner_types(sample_master_config, temp_config_files): + """Test filtered sweep with multiple runner types.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = None + precision = None + framework = None + runner_type = ["h200", "h100"] + seq_lens = ["1k1k"] + step_size = 2 + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + runners = set(entry['runner'] for entry in result) + assert 'h200' in runners or 'h100' in runners + + +def test_generate_full_sweep_with_filters_no_matches(sample_master_config, temp_config_files): + """Test filtered sweep with no matching configs.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["nonexistent"] + precision = None + framework = None + runner_type = None + seq_lens = None + step_size = 2 + test_mode = False + runner_config = runner_file + + with pytest.raises(ValueError, match="No configs found matching filters"): + generate_full_sweep(Args(), sample_master_config) + + +def test_generate_full_sweep_with_filters_concurrency_overshoot(temp_config_files): + """Test filtered sweep when concurrency step overshoots end value.""" + _, runner_file = temp_config_files + + config = { + "test-fp8-vllm": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "conc-start": 2, "conc-end": 7} # 2, 8 overshoots, clamps to 7 + ] + } + ] + } + } + + class Args: + model_prefix = None + precision = None + framework = None + runner_type = None + 
seq_lens = None + step_size = 4 # Will overshoot: 2, 8 (clamped to 7) + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), config) + conc_values = sorted(set(e['conc'] for e in result)) + # Should have 2, 7 (7 is the clamped value) + assert 2 in conc_values + assert 7 in conc_values + + +# Tests for generate_test_config +def test_generate_test_config_basic(sample_master_config, temp_config_files): + """Test basic test config generation.""" + _, runner_file = temp_config_files + + class Args: + key = "70b-fp8-vllm" + runner_config = runner_file + runner_node = "h200-nv_1" + seq_lens = None + step_size = 2 + test_mode = False + + result = generate_test_config(Args(), sample_master_config) + assert len(result) > 0 + + +def test_generate_test_config_test_mode(sample_master_config, temp_config_files): + """Test test config in test mode.""" + _, runner_file = temp_config_files + + class Args: + key = "70b-fp8-vllm" + runner_config = runner_file + runner_node = "h200-nv_1" + seq_lens = ["1k1k"] + step_size = 2 + test_mode = True + + result = generate_test_config(Args(), sample_master_config) + # In test mode, should only use lowest concurrency + assert all(entry['conc'] == 1 or entry['conc'] == 2 for entry in result) + + +def test_generate_test_config_specific_runner_node(sample_master_config, temp_config_files): + """Test test config with specific runner node.""" + _, runner_file = temp_config_files + + class Args: + key = "70b-fp8-vllm" + runner_config = runner_file + runner_node = "h200-nv_1" + seq_lens = None + step_size = 2 + test_mode = False + + result = generate_test_config(Args(), sample_master_config) + assert all(entry['runner'] == 'h200-nv_1' for entry in result) + + +def test_generate_test_config_invalid_key(sample_master_config, temp_config_files): + """Test test config with invalid key.""" + _, runner_file = temp_config_files + + class Args: + key = "nonexistent-key" + runner_config = runner_file + runner_node = None + 
seq_lens = None + step_size = 2 + test_mode = False + + with pytest.raises(ValueError, match="does not exist in config files"): + generate_test_config(Args(), sample_master_config) + + +def test_generate_test_config_invalid_runner_node(sample_master_config, temp_config_files): + """Test test config with invalid runner node.""" + _, runner_file = temp_config_files + + class Args: + key = "70b-fp8-vllm" + runner_config = runner_file + runner_node = "invalid-node" + seq_lens = None + step_size = 2 + test_mode = False + + with pytest.raises(ValueError, match="is not compatible"): + generate_test_config(Args(), sample_master_config) + + +def test_generate_test_config_missing_runner_config(sample_master_config): + """Test test config with missing runner config file.""" + class Args: + key = "70b-fp8-vllm" + runner_config = "/nonexistent/file.yaml" + runner_node = None + seq_lens = None + step_size = 2 + test_mode = False + + with pytest.raises(ValueError, match="does not exist"): + generate_test_config(Args(), sample_master_config) + + +def test_generate_test_config_concurrency_overshoot(temp_config_files): + """Test test config when concurrency step overshoots end value.""" + _, runner_file = temp_config_files + + config = { + "test-fp8-vllm": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "conc-start": 1, "conc-end": 6} + ] + } + ] + } + } + + class Args: + key = "test-fp8-vllm" + runner_config = runner_file + runner_node = "h200-nv_1" + seq_lens = None + step_size = 4 # Will overshoot: 1, 4, 16 (clamped to 6) + test_mode = False + + result = generate_test_config(Args(), config) + conc_values = sorted(set(e['conc'] for e in result)) + assert 1 in conc_values + assert 4 in conc_values + assert 6 in conc_values + + +# Tests for generate_runner_model_sweep_config +def 
test_generate_runner_model_sweep_config(sample_master_config, temp_config_files): + """Test runner-model sweep config generation.""" + _, runner_file = temp_config_files + + class Args: + runner_type = "h200" + runner_config = runner_file + + result = generate_runner_model_sweep_config(Args(), sample_master_config) + assert len(result) > 0 + # Should have entries for each runner node under h200 + runners = set(entry['runner'] for entry in result) + assert 'h200-nv_1' in runners + assert 'h200-nv_2' in runners + + +def test_generate_runner_model_sweep_config_invalid_runner(sample_master_config, temp_config_files): + """Test runner-model sweep with invalid runner type.""" + _, runner_file = temp_config_files + + class Args: + runner_type = "invalid-runner" + runner_config = runner_file + + with pytest.raises(ValueError, match="does not exist in runner config"): + generate_runner_model_sweep_config(Args(), sample_master_config) + + +# Tests for generate_runner_sweep_config +def test_generate_runner_sweep_config(sample_master_config, temp_config_files): + """Test runner sweep config generation.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = "70b" + precision = None + framework = None + runner_config = runner_file + + result = generate_runner_sweep_config(Args(), sample_master_config) + assert len(result) > 0 + + +def test_generate_runner_sweep_config_with_filters(sample_master_config, temp_config_files): + """Test runner sweep with precision and framework filters.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = "70b" + precision = "fp8" + framework = "vllm" + runner_config = runner_file + + result = generate_runner_sweep_config(Args(), sample_master_config) + assert all(entry['precision'] == 'fp8' for entry in result) + assert all(entry['framework'] == 'vllm' for entry in result) + + +def test_generate_runner_sweep_config_no_matches(sample_master_config, temp_config_files): + """Test runner sweep with no matching 
configs.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = "nonexistent" + precision = None + framework = None + runner_config = runner_file + + with pytest.raises(ValueError, match="No configs found matching"): + generate_runner_sweep_config(Args(), sample_master_config) + + +# Tests for generate_custom_test +def test_generate_custom_test(temp_config_files): + """Test custom test generation.""" + _, runner_file = temp_config_files + + class Args: + runner_label = "h200" + image = "vllm/vllm-openai:latest" + model = "test/model" + framework = "vllm" + precision = "fp8" + exp_name = "custom_test" + runner_config = runner_file + + result = generate_custom_test(Args()) + assert len(result) == 1 + assert result[0]['image'] == "vllm/vllm-openai:latest" + assert result[0]['exp-name'] == "custom_test" + + +def test_generate_custom_test_invalid_runner(temp_config_files): + """Test custom test with invalid runner label.""" + _, runner_file = temp_config_files + + class Args: + runner_label = "invalid-runner" + image = "vllm/vllm-openai:latest" + model = "test/model" + framework = "vllm" + precision = "fp8" + exp_name = "custom_test" + runner_config = runner_file + + with pytest.raises(ValueError, match="Unable to find specified runner label"): + generate_custom_test(Args()) + + +# Tests for main function +def test_main_full_sweep(temp_config_files): + """Test main function with full-sweep command.""" + master_file, _ = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "full-sweep", + "--config-files", master_file, + "--seq-lens", "1k1k", + "--model-prefix", "70b", + "--step-size", "2" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + + +def test_main_full_sweep_with_filters(temp_config_files): + """Test main function with full-sweep command with filters.""" + master_file, runner_file = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "full-sweep", + "--config-files", 
master_file, + "--runner-config", runner_file, + "--model-prefix", "70b", + "--precision", "fp8", + "--test-mode" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + + +def test_main_test_config(temp_config_files): + """Test main function with test-config command.""" + master_file, runner_file = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "test-config", + "--config-files", master_file, + "--runner-config", runner_file, + "--key", "70b-fp8-vllm", + "--runner-node", "h200-nv_1", + "--test-mode" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + + +def test_main_runner_model_sweep(temp_config_files): + """Test main function with runner-model-sweep command.""" + master_file, runner_file = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "runner-model-sweep", + "--config-files", master_file, + "--runner-config", runner_file, + "--runner-type", "h200" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + + +def test_main_runner_sweep(temp_config_files): + """Test main function with runner-sweep command.""" + master_file, runner_file = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "runner-sweep", + "--config-files", master_file, + "--runner-config", runner_file, + "--model-prefix", "70b" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + + +def test_main_custom(temp_config_files): + """Test main function with custom command.""" + master_file, runner_file = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "custom", + "--config-files", master_file, + "--runner-config", runner_file, + "--runner-label", "h200", + "--image", "test:latest", + "--model", "test/model", + "--framework", "vllm", + "--precision", "fp8", + "--exp-name", "custom_test" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) == 1 + + +def 
test_main_invalid_config_structure(tmp_path): + """Test main with invalid config structure.""" + invalid_file = tmp_path / "invalid.yaml" + with open(invalid_file, 'w') as f: + yaml.dump({"key": {"image": "test"}}, f) # Missing required fields + + test_args = [ + "generate_sweep_configs.py", + "full-sweep", + "--config-files", str(invalid_file), + "--seq-lens", "1k1k", + "--model-prefix", "test" + ] + + with patch('sys.argv', test_args): + with pytest.raises(ValueError): + main() + + +def test_main_validation_failure(temp_config_files, monkeypatch): + """Test main with validation failure on output.""" + master_file, _ = temp_config_files + + # Monkey patch validate_matrix_output to always fail + def mock_validate(entries): + raise ValueError("Validation failed") + + monkeypatch.setattr('generate_sweep_configs.validate_matrix_output', mock_validate) + + test_args = [ + "generate_sweep_configs.py", + "full-sweep", + "--config-files", master_file, + "--seq-lens", "1k1k", + "--model-prefix", "70b" + ] + + with patch('sys.argv', test_args): + with pytest.raises(ValueError, match="Validation failed"): + main() + + +# Edge case tests +def test_concurrency_step_reaches_exact_end(sample_master_config, temp_config_files): + """Test that concurrency stepping reaches exact end value.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["8b"] + seq_lens = ["1k1k"] + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + # conc-start=4, conc-end=16, step=2 should give 4,8,16 + conc_values = sorted(set(e['conc'] for e in result)) + assert 16 in conc_values + + +def test_multiple_model_prefixes_filtered_sweep(sample_master_config, temp_config_files): + """Test filtered sweep with multiple model prefixes.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b", "8b"] + precision = None + framework = None + 
runner_type = None + seq_lens = ["1k1k"] + step_size = 2 + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + exp_names = [e['exp-name'] for e in result] + assert any('70b' in name for name in exp_names) + assert any('8b' in name for name in exp_names) + + +def test_seq_len_filter_multiple(sample_master_config, temp_config_files): + """Test filtering with multiple sequence lengths.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + precision = None + framework = None + runner_type = None + seq_lens = ["1k1k", "1k8k"] + step_size = 2 + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + seq_lens = set((e['isl'], e['osl']) for e in result) + assert (1024, 1024) in seq_lens + assert (1024, 8192) in seq_lens + + +def test_default_ep_dp_attn_values(sample_master_config, temp_config_files): + """Test that default ep and dp-attn values are set correctly.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["8b"] + seq_lens = ["1k1k"] + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + # 8b config doesn't specify ep/dp-attn, so should use defaults + assert all(e['ep'] == 1 for e in result) + assert all(e['dp-attn'] == False for e in result) + + +def test_max_model_len_calculation(sample_master_config, temp_config_files): + """Test that max-model-len is calculated correctly.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + seq_lens = ["1k8k"] + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + # isl=1024, osl=8192, so max-model-len should be 1024+8192+200=9416 + assert all(e['max-model-len'] == 
9416 for e in result) + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "--cov=generate_sweep_configs", "--cov-report=term-missing"]) From 5729c677cf3f55b3cf1dc536b04c57c41d4721ed Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 09:08:45 -0500 Subject: [PATCH 070/149] list tp ep dpa then conc --- utils/summarize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/summarize.py b/utils/summarize.py index de8863c78..6d926255e 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -12,7 +12,7 @@ results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['ep'], r['conc'])) summary_header = f'''\ -| Hardware | Framework | Precision | TP | EP | Conc | DP Attention | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | +| Hardware | Framework | Precision | TP | EP | DP Attention | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' print(summary_header) @@ -26,8 +26,8 @@ f"| {precision.upper()} " f"| {result['tp']} " f"| {result['ep']} " - f"| {result['conc']} " f"| {result['dp_attention']} " + f"| {result['conc']} " f"| {(result['median_ttft'] * 1000):.4f} " f"| {(result['median_tpot'] * 1000):.4f} " f"| {result['median_e2el']:.4f} " From 6edcc3ac6d0b0755a45f60a6236d134976a33f05 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 09:15:01 -0500 Subject: [PATCH 071/149] removing 70b stuff --- .github/configs/amd-master.yaml | 116 --------------- .github/configs/nvidia-master.yaml | 199 -------------------------- .github/workflows/1k1k-sweep.yml | 46 ------ .github/workflows/1k8k-sweep.yml | 45 ------ .github/workflows/8k1k-sweep.yml | 45 ------ benchmarks/70b_fp4_b200_docker.sh | 48 ------- benchmarks/70b_fp4_b200_trt_docker.sh | 46 ------ benchmarks/70b_fp4_b200_trt_slurm.sh | 81 ----------- benchmarks/70b_fp4_mi355x_docker.sh | 55 ------- 
benchmarks/70b_fp4_mi355x_slurm.sh | 84 ----------- benchmarks/70b_fp8_b200_docker.sh | 46 ------ benchmarks/70b_fp8_b200_trt_docker.sh | 46 ------ benchmarks/70b_fp8_b200_trt_slurm.sh | 81 ----------- benchmarks/70b_fp8_h100_docker.sh | 29 ---- benchmarks/70b_fp8_h100_slurm.sh | 60 -------- benchmarks/70b_fp8_h200_slurm.sh | 76 ---------- benchmarks/70b_fp8_h200_trt_slurm.sh | 76 ---------- benchmarks/70b_fp8_mi300x_docker.sh | 59 -------- benchmarks/70b_fp8_mi300x_slurm.sh | 92 ------------ benchmarks/70b_fp8_mi325x_docker.sh | 53 ------- benchmarks/70b_fp8_mi325x_slurm.sh | 92 ------------ benchmarks/70b_fp8_mi355x_docker.sh | 50 ------- benchmarks/70b_fp8_mi355x_slurm.sh | 75 ---------- 23 files changed, 1600 deletions(-) delete mode 100644 benchmarks/70b_fp4_b200_docker.sh delete mode 100644 benchmarks/70b_fp4_b200_trt_docker.sh delete mode 100644 benchmarks/70b_fp4_b200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp4_mi355x_docker.sh delete mode 100644 benchmarks/70b_fp4_mi355x_slurm.sh delete mode 100644 benchmarks/70b_fp8_b200_docker.sh delete mode 100644 benchmarks/70b_fp8_b200_trt_docker.sh delete mode 100644 benchmarks/70b_fp8_b200_trt_slurm.sh delete mode 100755 benchmarks/70b_fp8_h100_docker.sh delete mode 100644 benchmarks/70b_fp8_h100_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi300x_docker.sh delete mode 100644 benchmarks/70b_fp8_mi300x_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi325x_docker.sh delete mode 100644 benchmarks/70b_fp8_mi325x_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi355x_docker.sh delete mode 100644 benchmarks/70b_fp8_mi355x_slurm.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 81c436366..55086d443 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1,119 +1,3 @@ -70b-fp4-mi355x-vllm: - image: 
rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-MXFP4-Preview - runner: mi355x - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - -70b-fp8-mi300x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-FP8-KV - runner: mi300x - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -70b-fp8-mi325x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-FP8-KV - runner: mi325x - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, 
conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 32, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 64, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 16, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 32 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -70b-fp8-mi355x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-FP8-KV - runner: mi355x - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - dsr1-fp4-mi355x-sgl: image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 model: amd/DeepSeek-R1-0528-MXFP4-Preview diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index fe9ef989d..9da1cd0f9 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1,202 +1,3 @@ -70b-fp4-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: nvidia/Llama-3.3-70B-Instruct-FP4 - runner: b200-trt - precision: fp4 - framework: trt - seq-len-configs: - - isl: 1024 - osl: 1024 - 
search-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 16, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 128 } - - { tp: 2, conc-start: 16, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - -70b-fp4-b200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP4 - runner: b200 - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 16, conc-end: 64 } - - { tp: 2, conc-start: 16, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 32 } - - { tp: 8, conc-start: 4, conc-end: 8 } - -70b-fp8-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: b200-trt - precision: fp8 - framework: trt - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { 
tp: 4, conc-start: 16, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 128 } - - { tp: 2, conc-start: 16, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 16 } - -70b-fp8-b200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: b200 - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 16, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 16, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } - -70b-fp8-h100-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: h100 - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -70b-fp8-h200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: h200-trt - precision: fp8 - framework: trt - seq-len-configs: - 
- isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 16, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - -70b-fp8-h200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: h200 - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 16, conc-end: 64 } - - { tp: 2, conc-start: 16, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - dsr1-fp4-b200-sgl: image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 model: nvidia/DeepSeek-R1-0528-FP4-V2 diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 958fd73b9..cbdc490e2 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -11,20 +11,6 @@ on: # - cron: '0 23 * * *' jobs: - get-70b-configs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - id: get-70b-configs - 
run: | - pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix 70b) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - get-dsr1-configs: runs-on: ubuntu-latest outputs: @@ -53,30 +39,6 @@ jobs: CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - benchmark-70b: - needs: get-70b-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: 70b 1k1k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "70b_1k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - benchmark-dsr1: needs: get-dsr1-configs uses: ./.github/workflows/benchmark-tmpl.yml @@ -125,14 +87,6 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} - collect-70b-results: - needs: benchmark-70b - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "70b_1k1k" - collect-dsr1-results: needs: benchmark-dsr1 if: ${{ always() }} diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 68fbac028..25fc3a362 100644 --- a/.github/workflows/1k8k-sweep.yml +++ 
b/.github/workflows/1k8k-sweep.yml @@ -11,19 +11,6 @@ on: # - cron: '0 23 * * *' jobs: - get-70b-configs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - id: get-70b-configs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix 70b) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - get-dsr1-configs: runs-on: ubuntu-latest outputs: @@ -50,30 +37,6 @@ jobs: CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - benchmark-70b: - needs: get-70b-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: 70b 1k8k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "70b_1k8k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - benchmark-dsr1: needs: get-dsr1-configs uses: ./.github/workflows/benchmark-tmpl.yml @@ -122,14 +85,6 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} - collect-70b-results: - needs: benchmark-70b - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 
"70b_1k8k" - collect-dsr1-results: needs: benchmark-dsr1 if: ${{ always() }} diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index 7be91c4fb..c8338d533 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -11,19 +11,6 @@ on: # - cron: '0 23 * * *' jobs: - get-70b-configs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - id: get-70b-configs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix 70b) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - get-dsr1-configs: runs-on: ubuntu-latest outputs: @@ -50,30 +37,6 @@ jobs: CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - benchmark-70b: - needs: get-70b-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: 70b 8k1k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "70b_8k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - benchmark-dsr1: needs: get-dsr1-configs uses: 
./.github/workflows/benchmark-tmpl.yml @@ -122,14 +85,6 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} - collect-70b-results: - needs: benchmark-70b - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "70b_8k1k" - collect-dsr1-results: needs: benchmark-dsr1 if: ${{ always() }} diff --git a/benchmarks/70b_fp4_b200_docker.sh b/benchmarks/70b_fp4_b200_docker.sh deleted file mode 100644 index a76ffb9f8..000000000 --- a/benchmarks/70b_fp4_b200_docker.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -nvidia-smi - -# To improve CI stability, we patch this helper function to prevent a race condition that -# happens 1% of the time. ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 -sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py - -# Calculate max-model-len based on ISL and OSL -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) -elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) -else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} -fi - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}' -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -export TORCH_CUDA_ARCH_LIST="10.0" -export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' -export PYTHONNOUSERSITE=1 - -set -x -vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ 
---gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ ---disable-log-requests diff --git a/benchmarks/70b_fp4_b200_trt_docker.sh b/benchmarks/70b_fp4_b200_trt_docker.sh deleted file mode 100644 index e30478672..000000000 --- a/benchmarks/70b_fp4_b200_trt_docker.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -set -x -# Launch TRT-LLM server -mpirun -n 1 --allow-run-as-root --oversubscribe trtllm-serve $MODEL --tp_size $TP --trust_remote_code \ ---max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT diff --git a/benchmarks/70b_fp4_b200_trt_slurm.sh b/benchmarks/70b_fp4_b200_trt_slurm.sh deleted file mode 100644 index a480ca910..000000000 --- a/benchmarks/70b_fp4_b200_trt_slurm.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + 
$PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp4_mi355x_docker.sh b/benchmarks/70b_fp4_mi355x_docker.sh deleted file mode 100644 index 681a629fb..000000000 --- 
a/benchmarks/70b_fp4_mi355x_docker.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi - if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -fi - -set -x -vllm serve $MODEL \ ---host=0.0.0.0 \ ---port $PORT \ ---swap-space 64 \ ---max-model-len $MAX_MODEL_LEN \ ---tensor-parallel-size $TP \ ---max-num-seqs 1024 \ ---kv-cache-dtype fp8 \ ---gpu-memory-utilization 0.94 \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ ---max-num-batched-tokens 131072 \ ---no-enable-prefix-caching \ ---disable-log-requests \ ---async-scheduling diff --git a/benchmarks/70b_fp4_mi355x_slurm.sh b/benchmarks/70b_fp4_mi355x_slurm.sh deleted file mode 100644 index 0d5a469d0..000000000 --- a/benchmarks/70b_fp4_mi355x_slurm.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# PORT -# RESULT_FILENAME - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=8888 - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export 
VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi - if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -fi - - -set -x -vllm serve $MODEL \ ---host=0.0.0.0 \ ---port $PORT \ ---swap-space 64 \ ---max-model-len $MAX_MODEL_LEN \ ---tensor-parallel-size $TP \ ---max-num-seqs 1024 \ ---kv-cache-dtype fp8 \ ---gpu-memory-utilization 0.94 \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ ---max-num-batched-tokens 131072 \ ---no-enable-prefix-caching \ ---disable-log-requests \ ---async-scheduling > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json - diff --git a/benchmarks/70b_fp8_b200_docker.sh b/benchmarks/70b_fp8_b200_docker.sh deleted file mode 100644 index dbcfaf6fd..000000000 --- a/benchmarks/70b_fp8_b200_docker.sh +++ 
/dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -nvidia-smi - -# To improve CI stability, we patch this helper function to prevent a race condition that -# happens 1% of the time. ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 -sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py - - - -FUSION_FLAG='{'\ -'"pass_config": {"enable_fi_allreduce_fusion": true, "enable_attn_fusion": true, "enable_noop": true},'\ -'"custom_ops": ["+quant_fp8", "+rms_norm"],'\ -'"cudagraph_mode": "FULL_DECODE_ONLY",'\ -'"splitting_ops": []'\ -'}' -cat > config.yaml <<-EOF -kv-cache-dtype: fp8 -compilation-config: '$FUSION_FLAG' -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $MAX_MODEL_LEN -EOF - -cat config.yaml # Debugging - -export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' -export PYTHONNOUSERSITE=1 - -set -x -vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=512 \ ---config config.yaml \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_b200_trt_docker.sh b/benchmarks/70b_fp8_b200_trt_docker.sh deleted file mode 100644 index e30478672..000000000 --- a/benchmarks/70b_fp8_b200_trt_docker.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - 
enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -set -x -# Launch TRT-LLM server -mpirun -n 1 --allow-run-as-root --oversubscribe trtllm-serve $MODEL --tp_size $TP --trust_remote_code \ ---max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT diff --git a/benchmarks/70b_fp8_b200_trt_slurm.sh b/benchmarks/70b_fp8_b200_trt_slurm.sh deleted file mode 100644 index a480ca910..000000000 --- a/benchmarks/70b_fp8_b200_trt_slurm.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN 
--max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h100_docker.sh b/benchmarks/70b_fp8_h100_docker.sh deleted file mode 100755 index 5d8df1bac..000000000 --- a/benchmarks/70b_fp8_h100_docker.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# MAX_MODEL_LEN -# TP -# CONC - -pip install -q datasets pandas - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: 10240 -EOF - -export PYTHONNOUSERSITE=1 - -vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---config=config.yaml \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=$CONC \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_h100_slurm.sh b/benchmarks/70b_fp8_h100_slurm.sh deleted file mode 100644 index 485aa8817..000000000 --- a/benchmarks/70b_fp8_h100_slurm.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL 
-# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: 10240 -EOF - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) - -export TORCH_CUDA_ARCH_LIST="9.0" - -set -x -PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---config=config.yaml \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=$CONC \ ---disable-log-requests > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -pip install -q datasets pandas -git clone https://github.com/kimbochen/bench_serving.git -set -x -python3 bench_serving/benchmark_serving.py \ ---model=$MODEL \ ---backend=vllm \ ---base-url="http://0.0.0.0:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_slurm.sh b/benchmarks/70b_fp8_h200_slurm.sh deleted file mode 100644 index 86eefd8ce..000000000 --- a/benchmarks/70b_fp8_h200_slurm.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -set -x -hf download $MODEL -pip install datasets pandas - -# Calculate max-model-len based on ISL and OSL -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - 
CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) -elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) -else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} -fi - -# Create config.yaml -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -export TORCH_CUDA_ARCH_LIST="9.0" - -PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ - --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ - --disable-log-requests > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - # Ignore intel_extension_for_pytorch import errors - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]] && [[ ! "$line" =~ "intel_extension_for_pytorch" ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_trt_slurm.sh b/benchmarks/70b_fp8_h200_trt_slurm.sh deleted file mode 100644 index 28112196f..000000000 --- a/benchmarks/70b_fp8_h200_trt_slurm.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE 
-# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ 
---result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/70b_fp8_mi300x_docker.sh b/benchmarks/70b_fp8_mi300x_docker.sh deleted file mode 100644 index 941e95023..000000000 --- a/benchmarks/70b_fp8_mi300x_docker.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -# If the machine runs a MEC FW older than 177, RCCL -# cannot reclaim some memory. -# Disable that features to avoid crashes. -# This is related to the changes in the driver at: -# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` -if [[ "$version" == "" || $version -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). 
-set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_mi300x_slurm.sh b/benchmarks/70b_fp8_mi300x_slurm.sh deleted file mode 100644 index b387505f0..000000000 --- a/benchmarks/70b_fp8_mi300x_slurm.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -huggingface-cli download $MODEL - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=8888 - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -# If the machine runs a MEC FW older than 177, RCCL -# cannot reclaim some memory. -# Disable that features to avoid crashes. 
-# This is related to the changes in the driver at: -# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` -if [[ "$version" == "" || $version -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm \ ---base-url="http://0.0.0.0:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ 
---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_mi325x_docker.sh b/benchmarks/70b_fp8_mi325x_docker.sh deleted file mode 100644 index 9e1fcdf8b..000000000 --- a/benchmarks/70b_fp8_mi325x_docker.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# Patch the aiter config script to deal -# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. -file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' -sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch - - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). 
-set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh deleted file mode 100644 index 105ba7185..000000000 --- a/benchmarks/70b_fp8_mi325x_slurm.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -huggingface-cli download $MODEL - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# Patch the aiter config script to deal -# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. 
-file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' -sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch - - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_mi355x_docker.sh b/benchmarks/70b_fp8_mi355x_docker.sh deleted file mode 100644 index 6310a5f64..000000000 --- a/benchmarks/70b_fp8_mi355x_docker.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC 
-# MAX_MODEL_LEN - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -sleep 5 -cat config.yaml - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=auto --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests - diff --git a/benchmarks/70b_fp8_mi355x_slurm.sh b/benchmarks/70b_fp8_mi355x_slurm.sh deleted file mode 100644 index 2abfee137..000000000 --- a/benchmarks/70b_fp8_mi355x_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# PORT -# RESULT_FILENAME - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export 
VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=auto --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json - -exit From 69844b2eadc804093fab94a1359aee8c21aaf4b5 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 17:37:58 -0500 Subject: [PATCH 072/149] temp fix (#148) --- benchmarks/70b_fp4_b200_trt_slurm.sh | 75 +++++++++++++++++++++ benchmarks/70b_fp8_b200_trt_slurm.sh | 75 +++++++++++++++++++++ benchmarks/70b_fp8_h200_slurm.sh | 69 ++++++++++++++++++++ benchmarks/70b_fp8_h200_trt_slurm.sh | 70 ++++++++++++++++++++ benchmarks/70b_fp8_mi325x_slurm.sh | 86 +++++++++++++++++++++++++ benchmarks/dsr1_fp4_b200_trt_slurm.sh | 6 -- 
benchmarks/dsr1_fp8_b200_trt_slurm.sh | 6 -- benchmarks/dsr1_fp8_h200_slurm.sh | 6 -- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 6 -- benchmarks/dsr1_fp8_mi325x_slurm.sh | 6 -- benchmarks/gptoss_fp4_h200_slurm.sh | 7 -- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 6 -- 12 files changed, 375 insertions(+), 43 deletions(-) create mode 100644 benchmarks/70b_fp4_b200_trt_slurm.sh create mode 100644 benchmarks/70b_fp8_b200_trt_slurm.sh create mode 100644 benchmarks/70b_fp8_h200_slurm.sh create mode 100644 benchmarks/70b_fp8_h200_trt_slurm.sh create mode 100644 benchmarks/70b_fp8_mi325x_slurm.sh diff --git a/benchmarks/70b_fp4_b200_trt_slurm.sh b/benchmarks/70b_fp4_b200_trt_slurm.sh new file mode 100644 index 000000000..ad24453b3 --- /dev/null +++ b/benchmarks/70b_fp4_b200_trt_slurm.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + + +set -x + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len 
$MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_b200_trt_slurm.sh b/benchmarks/70b_fp8_b200_trt_slurm.sh new file mode 100644 index 000000000..ad24453b3 --- /dev/null +++ b/benchmarks/70b_fp8_b200_trt_slurm.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + + +set -x + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 
+kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_slurm.sh b/benchmarks/70b_fp8_h200_slurm.sh new file mode 100644 index 000000000..094fbd19c --- /dev/null +++ b/benchmarks/70b_fp8_h200_slurm.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +set -x +hf download $MODEL +pip install datasets pandas + +# Calculate max-model-len based on ISL and OSL +if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) +elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) +else + CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} +fi + +# Create config.yaml +cat > config.yaml << EOF +kv-cache-dtype: fp8 +async-scheduling: true +no-enable-prefix-caching: true 
+max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +export TORCH_CUDA_ARCH_LIST="9.0" + +PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ + --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ + --disable-log-requests > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_trt_slurm.sh b/benchmarks/70b_fp8_h200_trt_slurm.sh new file mode 100644 index 000000000..dfb2324b9 --- /dev/null +++ b/benchmarks/70b_fp8_h200_trt_slurm.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 
+batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh new file mode 100644 index 000000000..1febeff13 --- /dev/null +++ b/benchmarks/70b_fp8_mi325x_slurm.sh @@ -0,0 +1,86 @@ +#!/usr/bin/bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +huggingface-cli download $MODEL + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +# Reference +# 
https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +# Patch the aiter config script to deal +# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. +file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' +sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch + + +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). 
+set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=float16 --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests \ +> $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index ababfa150..6f4f814a0 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -104,12 +104,6 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 509cca7ba..58d4525f1 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -74,12 +74,6 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ 
set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 765cf7dcd..74a005a78 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -47,12 +47,6 @@ fi set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 174d67b53..7b566c0ab 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -74,12 +74,6 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index acbe78d08..d502093d8 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -26,12 +26,6 @@ python3 -m sglang.launch_server \ set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 "$SERVER_LOG" - echo "JOB $SLURM_JOB_ID ran on $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then break fi diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 
61bef8aaa..23ac0bfa1 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -50,13 +50,6 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config set +x while IFS= read -r line; do printf '%s\n' "$line" - # Ignore intel_extension_for_pytorch import errors - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]] && [[ ! "$line" =~ "intel_extension_for_pytorch" ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 969d65310..c148a3cb7 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -51,12 +51,6 @@ mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --max_batch_ set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi From 1105aea6ae2ed4ff700505ee73bf49678b7ab7d1 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Wed, 29 Oct 2025 22:01:12 -0400 Subject: [PATCH 073/149] remove: llama 70b --- .github/workflows/full-sweep-tmpl.yml | 188 ++++++++++ .github/workflows/runner-model-sweep-test.yml | 289 +++++++++++++++ .github/workflows/runner-sweep-test.yml | 328 ++++++++++++++++++ benchmarks/70b_fp4_b200_trt_slurm.sh | 75 ---- benchmarks/70b_fp8_b200_trt_slurm.sh | 75 ---- benchmarks/70b_fp8_h200_slurm.sh | 69 ---- benchmarks/70b_fp8_h200_trt_slurm.sh | 70 ---- benchmarks/70b_fp8_mi325x_slurm.sh | 86 ----- 8 files changed, 805 insertions(+), 375 deletions(-) create mode 100644 .github/workflows/full-sweep-tmpl.yml create mode 100644 
.github/workflows/runner-model-sweep-test.yml create mode 100644 .github/workflows/runner-sweep-test.yml delete mode 100644 benchmarks/70b_fp4_b200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_b200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi325x_slurm.sh diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml new file mode 100644 index 000000000..869928cb7 --- /dev/null +++ b/.github/workflows/full-sweep-tmpl.yml @@ -0,0 +1,188 @@ +name: Template - Full Sweep + +on: + workflow_call: + inputs: + run_1k1k: + type: boolean + required: true + run_8k1k: + type: boolean + required: true + run_1k8k: + type: boolean + required: true + + use_h100: + type: boolean + required: true + use_h200: + type: boolean + required: true + use_b200: + type: boolean + required: true + use_mi300x: + type: boolean + required: true + use_mi325x: + type: boolean + required: true + use_mi355x: + type: boolean + required: true + use_gb200: + type: boolean + required: false + default: false + +jobs: + dsr1-1k1k: + if: ${{ inputs.run_1k1k }} + uses: ./.github/workflows/dsr1-tmpl.yml + secrets: inherit + with: + exp-name: 'dsr1_1k1k' + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + use_gb200: ${{ inputs.use_gb200 }} + + collect-dsr1-1k1k-results: + needs: dsr1-1k1k + if: ${{ inputs.run_1k1k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'dsr1_1k1k' + + gptoss-1k1k: + if: ${{ inputs.run_1k1k }} + uses: ./.github/workflows/gptoss-tmpl.yml + secrets: inherit + with: + exp-name: 'gptoss_1k1k' + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + use_h100: ${{ 
inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + + collect-gptoss-1k1k-results: + needs: gptoss-1k1k + if: ${{ inputs.run_1k1k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'gptoss_1k1k' + + dsr1-8k1k: + if: ${{ inputs.run_8k1k }} + uses: ./.github/workflows/dsr1-tmpl.yml + secrets: inherit + with: + exp-name: 'dsr1_8k1k' + isl: 8192 + osl: 1024 + max-model-len: 9216 + random-range-ratio: 0.8 + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + use_gb200: ${{ inputs.use_gb200 }} + + collect-dsr1-8k1k-results: + needs: dsr1-8k1k + if: ${{ inputs.run_8k1k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'dsr1_8k1k' + + gptoss-8k1k: + if: ${{ inputs.run_8k1k }} + uses: ./.github/workflows/gptoss-tmpl.yml + secrets: inherit + with: + exp-name: 'gptoss_8k1k' + isl: 8192 + osl: 1024 + max-model-len: 9216 + random-range-ratio: 0.8 + use_h100: ${{ inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + + collect-gptoss-8k1k-results: + needs: gptoss-8k1k + if: ${{ inputs.run_8k1k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'gptoss_8k1k' + + dsr1-1k8k: + if: ${{ inputs.run_1k8k }} + uses: ./.github/workflows/dsr1-tmpl.yml + secrets: inherit + with: + exp-name: 'dsr1_1k8k' + isl: 1024 + osl: 8192 + max-model-len: 9216 + random-range-ratio: 0.8 + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ 
inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + use_gb200: ${{ inputs.use_gb200 }} + + collect-dsr1-1k8k-results: + needs: dsr1-1k8k + if: ${{ inputs.run_1k8k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'dsr1_1k8k' + + gptoss-1k8k: + if: ${{ inputs.run_1k8k }} + uses: ./.github/workflows/gptoss-tmpl.yml + secrets: inherit + with: + exp-name: 'gptoss_1k8k' + isl: 1024 + osl: 8192 + max-model-len: 9216 + random-range-ratio: 0.8 + use_h100: ${{ inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + + collect-gptoss-1k8k-results: + needs: gptoss-1k8k + if: ${{ inputs.run_1k8k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'gptoss_1k8k' diff --git a/.github/workflows/runner-model-sweep-test.yml b/.github/workflows/runner-model-sweep-test.yml new file mode 100644 index 000000000..e4f2b7303 --- /dev/null +++ b/.github/workflows/runner-model-sweep-test.yml @@ -0,0 +1,289 @@ +name: 'Test - Runner Model Sweep' +run-name: '${{ github.event.inputs.runner }} Sweep' +on: + workflow_dispatch: + inputs: + runner: + description: 'Runner Type' + required: true + type: choice + options: + - 'h100' + - 'h200' + - 'h200-trt' + - 'b200' + - 'b200-trt' + - 'mi300x' + - 'mi325x' + - 'mi355x' + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_CACHE: '/mnt/hf_hub_cache/' + +jobs: + bmk-h100: + if: ${{ inputs.runner == 'h100' }} + strategy: + fail-fast: false + matrix: + runner: + - 'h100-cr_0' + - 'h100-cr_1' + - 'h100-cw_0' + - 'h100-cw_1' + config: + - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner 
}} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk-h200: + if: ${{ inputs.runner == 'h200' }} + strategy: + fail-fast: false + matrix: + runner: + - 'h200-cw_0' + - 'h200-cw_1' + - 'h200-nb_0' + - 'h200-nb_1' + - 'h200-nb_2' + - 'h200-nb_3' + - 'h200-nv_0' + - 'h200-nv_1' + - 'h200-nv_2' + - 'h200-nv_3' + config: + - { image: 'lmsysorg/sglang:v0.5.2rc2-cu126', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } + - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk-h200-trt: + if: ${{ inputs.runner == 'h200-trt' }} + strategy: + fail-fast: false + matrix: + runner: + - 'h200-cw_0' + - 'h200-cw_1' + - 'h200-nb_0' + - 'h200-nb_1' + - 'h200-nb_2' + - 'h200-nb_3' + - 'h200-nv_0' + - 'h200-nv_1' + - 'h200-nv_2' + - 'h200-nv_3' + config: + - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } + - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: 
./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk-b200: + if: ${{ inputs.runner == 'b200' }} + strategy: + fail-fast: false + matrix: + runner: + - 'b200-nvd_0' + - 'b200-nvd_1' + - 'b200-nvd_2' + - 'b200-nvd_3' + config: + - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } + - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } + - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[4]' + + bmk-b200-trt: + if: ${{ inputs.runner == 'b200-trt' }} + strategy: + fail-fast: false + matrix: + runner: + - 'b200-nv_0' + - 'b200-nv_1' + - 'b200-nb_0' + - 'b200-nb_1' + config: + - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } + - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'trt', precision: 'fp4', exp-name: 'dsr1_test' } + - { 
image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk-mi300x: + if: ${{ inputs.runner == 'mi300x' }} + strategy: + fail-fast: false + matrix: + runner: + - 'mi300x-amd_0' + - 'mi300x-amd_1' + - 'mi300x-amd_2' + - 'mi300x-amd_3' + - 'mi300x-amd_4' + - 'mi300x-cr_0' + - 'mi300x-oci_0' + config: + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk-mi325x: + if: ${{ inputs.runner == 'mi325x' }} + strategy: + fail-fast: false + matrix: + runner: + - 'mi325x-amd_0' + - 'mi325x-tw_0' + - 'mi325x-tw_1' + - 'mi325x-tw_2' + - 'mi325x-tw_3' + config: + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 
'sglang', precision: 'fp8', exp-name: 'dsr1_test' } + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk-mi355x: + if: ${{ inputs.runner == 'mi355x' }} + strategy: + fail-fast: false + matrix: + runner: + - 'mi355x-amd_0' + - 'mi355x-amd_1' + - 'mi355x-amd_2' + - 'mi355x-amd_3' + config: + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'amd/DeepSeek-R1-0528-MXFP4-Preview', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' diff --git a/.github/workflows/runner-sweep-test.yml b/.github/workflows/runner-sweep-test.yml new file mode 100644 index 
000000000..8f824c4d1 --- /dev/null +++ b/.github/workflows/runner-sweep-test.yml @@ -0,0 +1,328 @@ +name: 'Test - Runner Sweep' +run-name: '${{ github.event.inputs.runner }} Sweep - ${{ github.event.inputs.model }}' +on: + workflow_dispatch: + inputs: + runner: + description: 'Runner Type' + required: true + type: choice + options: + - 'h100' + - 'h200' + - 'b200' + - 'h200-trt' + - 'b200-trt' + - 'mi300x' + - 'mi325x' + - 'mi355x' + - 'gb200' + + image: + description: 'Docker Image' + required: true + type: choice + options: + - 'lmsysorg/sglang:v0.4.9.post1-cu126' + - 'lmsysorg/sglang:v0.5.0rc1-cu128-b200' + - 'lmsysorg/sglang:v0.5.2rc2-cu126' + - 'lmsysorg/sglang:v0.5.3rc1-cu129-b200' + - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' + - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' + - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' + - 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' + - 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' + - 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' + - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' + - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915' + - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250915' + - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' + - 'vllm/vllm-openai:v0.10.2' + + model: + description: 'Model' + required: true + type: choice + options: + - 'amd/DeepSeek-R1-0528-MXFP4-Preview' + - 'deepseek-ai/DeepSeek-R1-0528' + - 'nvidia/DeepSeek-R1-0528-FP4' + - 'nvidia/DeepSeek-R1-0528-FP4-v2' + - 'openai/gpt-oss-120b' + + framework: + description: 'Framework' + required: true + type: choice + options: + - 'vllm' + - 'sglang' + - 'trt' + + precision: + description: 'Precision' + required: true + type: choice + options: + - 'fp8' + - 'fp4' + + exp-name: + description: 'Experiment Name' + required: true + type: choice + options: + - 'dsr1_test' + - 'gptoss_test' + + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + 
HF_HUB_CACHE: '/mnt/hf_hub_cache/' + +jobs: + bmk_h100: + if: ${{ inputs.runner == 'h100' }} + strategy: + fail-fast: false + matrix: + runner: + - 'h100-cr_0' + - 'h100-cr_1' + - 'h100-cw_0' + - 'h100-cw_1' + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk_h200: + if: ${{ inputs.runner == 'h200' || inputs.runner == 'h200-trt' }} + strategy: + fail-fast: false + matrix: + runner: + - 'h200-cw_0' + - 'h200-cw_1' + - 'h200-nb_0' + - 'h200-nb_1' + - 'h200-nb_2' + - 'h200-nb_3' + - 'h200-nv_0' + - 'h200-nv_1' + - 'h200-nv_2' + - 'h200-nv_3' + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[4]' + conc-list: '[64]' + + bmk_b200: + if: ${{ inputs.runner == 'b200' }} + strategy: + fail-fast: false + matrix: + runner: + - 'b200-nv_0' + - 'b200-nv_1' + - 'b200-nvd_0' + - 'b200-nvd_1' + - 'b200-tg_0' + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk_b200-trt: + if: ${{ inputs.runner == 'b200-trt' }} + strategy: + 
fail-fast: false + matrix: + runner: + - 'b200-nv_0' + - 'b200-nv_1' + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk_mi300x: + if: ${{ inputs.runner == 'mi300x' }} + strategy: + fail-fast: false + matrix: + runner: + - 'mi300x-amd_0' + - 'mi300x-amd_1' + - 'mi300x-amd_2' + - 'mi300x-amd_3' + - 'mi300x-amd_4' + - 'mi300x-cr_0' + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk_mi325x: + if: ${{ inputs.runner == 'mi325x' }} + strategy: + fail-fast: false + matrix: + runner: + - 'mi325x-amd_0' + - 'mi325x-tw_0' + - 'mi325x-tw_1' + - 'mi325x-tw_2' + - 'mi325x-tw_3' + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk_mi355x: + if: ${{ inputs.runner == 'mi355x' }} + strategy: + fail-fast: false + matrix: + runner: + - 'mi355x-amd_0' + - 'mi355x-amd_1' + - 'mi355x-amd_2' + - 'mi355x-amd_3' + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: 
inherit + with: + runner: ${{ matrix.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk_gb200: + if: ${{ inputs.runner == 'gb200' && inputs.framework == 'trt' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + secrets: inherit + with: + runner: gb200 + image: 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' + model: 'deepseek-r1-fp4' + framework: 'dynamo-trtllm' + precision: 'fp4' + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + mtp-mode: 'off' + + bmk_gb200-sgl: + if: ${{ inputs.runner == 'gb200' && inputs.framework == 'sglang' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + secrets: inherit + with: + runner: gb200 + image: 'nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1' + model: 'deepseek-ai/DeepSeek-R1-0528' + framework: 'dynamo-sglang' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: 8192 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + mtp-mode: 'off' + + collect-test-results: + needs: [ bmk_h100, bmk_h200, bmk_b200, bmk_b200-trt, bmk_mi300x, bmk_mi325x, bmk_mi355x, bmk_gb200, bmk_gb200-sgl ] + if: ${{ always() && !cancelled() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} diff --git a/benchmarks/70b_fp4_b200_trt_slurm.sh b/benchmarks/70b_fp4_b200_trt_slurm.sh deleted file mode 100644 index ad24453b3..000000000 --- a/benchmarks/70b_fp4_b200_trt_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on 
$SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_b200_trt_slurm.sh b/benchmarks/70b_fp8_b200_trt_slurm.sh deleted file mode 100644 index ad24453b3..000000000 --- a/benchmarks/70b_fp8_b200_trt_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env 
bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git 
a/benchmarks/70b_fp8_h200_slurm.sh b/benchmarks/70b_fp8_h200_slurm.sh deleted file mode 100644 index 094fbd19c..000000000 --- a/benchmarks/70b_fp8_h200_slurm.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -set -x -hf download $MODEL -pip install datasets pandas - -# Calculate max-model-len based on ISL and OSL -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) -elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) -else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} -fi - -# Create config.yaml -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -export TORCH_CUDA_ARCH_LIST="9.0" - -PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ - --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ - --disable-log-requests > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir 
/workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_trt_slurm.sh b/benchmarks/70b_fp8_h200_trt_slurm.sh deleted file mode 100644 index dfb2324b9..000000000 --- a/benchmarks/70b_fp8_h200_trt_slurm.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio 
$RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh deleted file mode 100644 index 1febeff13..000000000 --- a/benchmarks/70b_fp8_mi325x_slurm.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -huggingface-cli download $MODEL - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# Patch the aiter config script to deal -# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. -file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' -sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch - - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). 
-set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json From 24ea7def4aeaade77e95e35ae0cab30c5259a9ff Mon Sep 17 00:00:00 2001 From: "kimbo@semianalysis.com" Date: Thu, 30 Oct 2025 02:05:35 +0000 Subject: [PATCH 074/149] revert remove: llama 70b --- .github/workflows/70b-tmpl.yml | 230 ++++++++++++++++++ .github/workflows/full-sweep-tmpl.yml | 75 ++++++ .github/workflows/runner-model-sweep-test.yml | 11 + .github/workflows/runner-sweep-test.yml | 5 + benchmarks/70b_fp4_b200_docker.sh | 48 ++++ benchmarks/70b_fp4_b200_trt_docker.sh | 46 ++++ benchmarks/70b_fp4_b200_trt_slurm.sh | 75 ++++++ benchmarks/70b_fp4_mi355x_docker.sh | 55 +++++ benchmarks/70b_fp4_mi355x_slurm.sh | 84 +++++++ benchmarks/70b_fp8_b200_docker.sh | 46 ++++ benchmarks/70b_fp8_b200_trt_docker.sh | 46 ++++ benchmarks/70b_fp8_b200_trt_slurm.sh | 75 ++++++ benchmarks/70b_fp8_h100_docker.sh | 29 +++ 
benchmarks/70b_fp8_h100_slurm.sh | 60 +++++ benchmarks/70b_fp8_h200_slurm.sh | 69 ++++++ benchmarks/70b_fp8_h200_trt_slurm.sh | 70 ++++++ benchmarks/70b_fp8_mi300x_docker.sh | 59 +++++ benchmarks/70b_fp8_mi300x_slurm.sh | 92 +++++++ benchmarks/70b_fp8_mi325x_docker.sh | 53 ++++ benchmarks/70b_fp8_mi325x_slurm.sh | 86 +++++++ benchmarks/70b_fp8_mi355x_docker.sh | 50 ++++ benchmarks/70b_fp8_mi355x_slurm.sh | 75 ++++++ 22 files changed, 1439 insertions(+) create mode 100644 .github/workflows/70b-tmpl.yml create mode 100644 benchmarks/70b_fp4_b200_docker.sh create mode 100644 benchmarks/70b_fp4_b200_trt_docker.sh create mode 100644 benchmarks/70b_fp4_b200_trt_slurm.sh create mode 100644 benchmarks/70b_fp4_mi355x_docker.sh create mode 100644 benchmarks/70b_fp4_mi355x_slurm.sh create mode 100644 benchmarks/70b_fp8_b200_docker.sh create mode 100644 benchmarks/70b_fp8_b200_trt_docker.sh create mode 100644 benchmarks/70b_fp8_b200_trt_slurm.sh create mode 100755 benchmarks/70b_fp8_h100_docker.sh create mode 100644 benchmarks/70b_fp8_h100_slurm.sh create mode 100644 benchmarks/70b_fp8_h200_slurm.sh create mode 100644 benchmarks/70b_fp8_h200_trt_slurm.sh create mode 100644 benchmarks/70b_fp8_mi300x_docker.sh create mode 100644 benchmarks/70b_fp8_mi300x_slurm.sh create mode 100644 benchmarks/70b_fp8_mi325x_docker.sh create mode 100644 benchmarks/70b_fp8_mi325x_slurm.sh create mode 100644 benchmarks/70b_fp8_mi355x_docker.sh create mode 100644 benchmarks/70b_fp8_mi355x_slurm.sh diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml new file mode 100644 index 000000000..3d1dd5051 --- /dev/null +++ b/.github/workflows/70b-tmpl.yml @@ -0,0 +1,230 @@ +name: Template - LLaMA 70B + +on: + workflow_call: + inputs: + exp-name: + required: true + type: string + isl: + required: true + type: string + osl: + required: true + type: string + max-model-len: + required: true + type: string + random-range-ratio: + required: true + type: string + + use_h100: + type: boolean 
+ required: true + use_h200: + type: boolean + required: true + use_b200: + type: boolean + required: true + use_mi300x: + type: boolean + required: true + use_mi325x: + type: boolean + required: true + use_mi355x: + type: boolean + required: true + +jobs: + bmk-h100-fp8: + if: ${{ inputs.use_h100 }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: h100 + image: 'vllm/vllm-openai:v0.10.2' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + framework: 'vllm' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[2, 4, 8]' + + bmk-h200-fp8: + if: ${{ inputs.use_h200 }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: h200 + image: 'vllm/vllm-openai:v0.10.2' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + framework: 'vllm' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' + + bmk-h200-trt-fp8: + if: ${{ inputs.use_h200 }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: h200-trt + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + framework: 'trt' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' + conc-list: '[4, 8, 16, 32, 64, 128]' # H200 can achieve TPS/User >= 30 with larger concurrency till 128 + + bmk-b200-fp8: + if: ${{ inputs.use_b200 }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: b200 + image: 'vllm/vllm-openai:v0.10.2' + model: 
'nvidia/Llama-3.3-70B-Instruct-FP8' + framework: 'vllm' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has + + bmk-b200-trt-fp8: + if: ${{ inputs.use_b200 }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: b200-trt + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + framework: 'trt' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has + conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 256 + + bmk-mi300x-fp8: + if: ${{ inputs.use_mi300x }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: mi300x + image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' + model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + framework: 'vllm' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' + + bmk-mi325x-fp8: + if: ${{ inputs.use_mi325x }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: mi325x + image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' + model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + framework: 'vllm' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 
4, 8]' + + bmk-mi355x-fp8: + if: ${{ inputs.use_mi355x }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: mi355x + image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' + model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + framework: 'vllm' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' + + bmk-b200-fp4: + if: ${{ inputs.use_b200 }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: b200 + image: 'vllm/vllm-openai:v0.10.2' + model: 'nvidia/Llama-3.3-70B-Instruct-FP4' + framework: 'vllm' + precision: 'fp4' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has + + bmk-b200-trt-fp4: + if: ${{ inputs.use_b200 }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: b200-trt + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' + model: 'nvidia/Llama-3.3-70B-Instruct-FP4' + framework: 'trt' + precision: 'fp4' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has + conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 128 + + bmk-mi355x-fp4: + if: ${{ inputs.use_mi355x }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: mi355x + image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' + model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview' + framework: 'vllm' + precision: 'fp4' + 
exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml index 869928cb7..b086460df 100644 --- a/.github/workflows/full-sweep-tmpl.yml +++ b/.github/workflows/full-sweep-tmpl.yml @@ -37,6 +37,31 @@ on: default: false jobs: + _70b-1k1k: + if: ${{ inputs.run_1k1k }} + uses: ./.github/workflows/70b-tmpl.yml + secrets: inherit + with: + exp-name: '70b_1k1k' + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + use_h100: ${{ inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + + collect-70b-1k1k-results: + needs: _70b-1k1k + if: ${{ inputs.run_1k1k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: '70b_1k1k' + dsr1-1k1k: if: ${{ inputs.run_1k1k }} uses: ./.github/workflows/dsr1-tmpl.yml @@ -87,6 +112,31 @@ jobs: with: exp-name: 'gptoss_1k1k' + _70b-8k1k: + if: ${{ inputs.run_8k1k }} + uses: ./.github/workflows/70b-tmpl.yml + secrets: inherit + with: + exp-name: '70b_8k1k' + isl: 8192 + osl: 1024 + max-model-len: 9216 + random-range-ratio: 0.8 + use_h100: ${{ inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + + collect-70b-8k1k-results: + needs: _70b-8k1k + if: ${{ inputs.run_8k1k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: '70b_8k1k' + dsr1-8k1k: if: ${{ inputs.run_8k1k }} uses: ./.github/workflows/dsr1-tmpl.yml @@ -137,6 +187,31 @@ jobs: with: exp-name: 'gptoss_8k1k' + _70b-1k8k: + if: ${{ 
inputs.run_1k8k }} + uses: ./.github/workflows/70b-tmpl.yml + secrets: inherit + with: + exp-name: '70b_1k8k' + isl: 1024 + osl: 8192 + max-model-len: 9216 + random-range-ratio: 0.8 + use_h100: ${{ inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + + collect-70b-1k8k-results: + needs: _70b-1k8k + if: ${{ inputs.run_1k8k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: '70b_1k8k' + dsr1-1k8k: if: ${{ inputs.run_1k8k }} uses: ./.github/workflows/dsr1-tmpl.yml diff --git a/.github/workflows/runner-model-sweep-test.yml b/.github/workflows/runner-model-sweep-test.yml index e4f2b7303..212ffc07c 100644 --- a/.github/workflows/runner-model-sweep-test.yml +++ b/.github/workflows/runner-model-sweep-test.yml @@ -33,6 +33,7 @@ jobs: - 'h100-cw_0' - 'h100-cw_1' config: + - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } name: '${{ matrix.runner }}' @@ -69,6 +70,7 @@ jobs: - 'h200-nv_2' - 'h200-nv_3' config: + - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - { image: 'lmsysorg/sglang:v0.5.2rc2-cu126', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -106,6 +108,7 @@ jobs: - 'h200-nv_2' - 'h200-nv_3' config: + - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' } 
- { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } @@ -137,6 +140,8 @@ jobs: - 'b200-nvd_2' - 'b200-nvd_3' config: + - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } + - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' } - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -169,6 +174,8 @@ jobs: - 'b200-nb_0' - 'b200-nb_1' config: + - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' } + - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', framework: 'trt', precision: 'fp4', exp-name: '70b_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'trt', precision: 'fp4', exp-name: 'dsr1_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } 
@@ -204,6 +211,7 @@ jobs: - 'mi300x-cr_0' - 'mi300x-oci_0' config: + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -236,6 +244,7 @@ jobs: - 'mi325x-tw_2' - 'mi325x-tw_3' config: + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -267,6 +276,8 @@ jobs: - 'mi355x-amd_2' - 'mi355x-amd_3' config: + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'amd/DeepSeek-R1-0528-MXFP4-Preview', framework: 'sglang', precision: 
'fp4', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } diff --git a/.github/workflows/runner-sweep-test.yml b/.github/workflows/runner-sweep-test.yml index 8f824c4d1..fd100474f 100644 --- a/.github/workflows/runner-sweep-test.yml +++ b/.github/workflows/runner-sweep-test.yml @@ -45,7 +45,11 @@ on: type: choice options: - 'amd/DeepSeek-R1-0528-MXFP4-Preview' + - 'amd/Llama-3.3-70B-Instruct-FP8-KV' + - 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview' - 'deepseek-ai/DeepSeek-R1-0528' + - 'nvidia/Llama-3.3-70B-Instruct-FP8' + - 'nvidia/Llama-3.3-70B-Instruct-FP4' - 'nvidia/DeepSeek-R1-0528-FP4' - 'nvidia/DeepSeek-R1-0528-FP4-v2' - 'openai/gpt-oss-120b' @@ -72,6 +76,7 @@ on: required: true type: choice options: + - '70b_test' - 'dsr1_test' - 'gptoss_test' diff --git a/benchmarks/70b_fp4_b200_docker.sh b/benchmarks/70b_fp4_b200_docker.sh new file mode 100644 index 000000000..a76ffb9f8 --- /dev/null +++ b/benchmarks/70b_fp4_b200_docker.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +nvidia-smi + +# To improve CI stability, we patch this helper function to prevent a race condition that +# happens 1% of the time. 
ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 +sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py + +# Calculate max-model-len based on ISL and OSL +if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) +elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) +else + CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}' +async-scheduling: true +no-enable-prefix-caching: true +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +export TORCH_CUDA_ARCH_LIST="10.0" +export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' +export PYTHONNOUSERSITE=1 + +set -x +vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ +--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ +--disable-log-requests diff --git a/benchmarks/70b_fp4_b200_trt_docker.sh b/benchmarks/70b_fp4_b200_trt_docker.sh new file mode 100644 index 000000000..e30478672 --- /dev/null +++ b/benchmarks/70b_fp4_b200_trt_docker.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 
+kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +set -x +# Launch TRT-LLM server +mpirun -n 1 --allow-run-as-root --oversubscribe trtllm-serve $MODEL --tp_size $TP --trust_remote_code \ +--max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT diff --git a/benchmarks/70b_fp4_b200_trt_slurm.sh b/benchmarks/70b_fp4_b200_trt_slurm.sh new file mode 100644 index 000000000..ad24453b3 --- /dev/null +++ b/benchmarks/70b_fp4_b200_trt_slurm.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + + +set -x + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options 
llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp4_mi355x_docker.sh b/benchmarks/70b_fp4_mi355x_docker.sh new file mode 100644 index 000000000..681a629fb --- /dev/null +++ b/benchmarks/70b_fp4_mi355x_docker.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash + +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# PORT +# TP +# CONC +# MAX_MODEL_LEN + +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 + if [[ "$CONC" -le "16" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 + fi +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 + if [[ "$CONC" -le "16" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 + fi +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi + if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 + fi +fi + +set -x +vllm serve $MODEL \ +--host=0.0.0.0 \ +--port $PORT \ 
+--swap-space 64 \ +--max-model-len $MAX_MODEL_LEN \ +--tensor-parallel-size $TP \ +--max-num-seqs 1024 \ +--kv-cache-dtype fp8 \ +--gpu-memory-utilization 0.94 \ +--max-seq-len-to-capture $MAX_MODEL_LEN \ +--max-num-batched-tokens 131072 \ +--no-enable-prefix-caching \ +--disable-log-requests \ +--async-scheduling diff --git a/benchmarks/70b_fp4_mi355x_slurm.sh b/benchmarks/70b_fp4_mi355x_slurm.sh new file mode 100644 index 000000000..0d5a469d0 --- /dev/null +++ b/benchmarks/70b_fp4_mi355x_slurm.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash + +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# PORT +# RESULT_FILENAME + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=8888 + +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 + if [[ "$CONC" -le "16" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 + fi +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 + if [[ "$CONC" -le "16" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 + fi +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi + if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 + fi +fi + + +set -x +vllm serve $MODEL \ +--host=0.0.0.0 \ +--port $PORT \ +--swap-space 64 \ +--max-model-len $MAX_MODEL_LEN \ +--tensor-parallel-size $TP \ +--max-num-seqs 1024 \ +--kv-cache-dtype fp8 \ +--gpu-memory-utilization 0.94 \ +--max-seq-len-to-capture $MAX_MODEL_LEN \ +--max-num-batched-tokens 131072 \ +--no-enable-prefix-caching \ +--disable-log-requests \ +--async-scheduling > $SERVER_LOG 
2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url "http://0.0.0.0:$PORT" \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics "ttft,tpot,itl,e2el" \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json + diff --git a/benchmarks/70b_fp8_b200_docker.sh b/benchmarks/70b_fp8_b200_docker.sh new file mode 100644 index 000000000..dbcfaf6fd --- /dev/null +++ b/benchmarks/70b_fp8_b200_docker.sh @@ -0,0 +1,46 @@ +#!/usr/bin/bash + +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# PORT +# TP +# CONC +# MAX_MODEL_LEN + +nvidia-smi + +# To improve CI stability, we patch this helper function to prevent a race condition that +# happens 1% of the time. 
ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 +sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py + + + +FUSION_FLAG='{'\ +'"pass_config": {"enable_fi_allreduce_fusion": true, "enable_attn_fusion": true, "enable_noop": true},'\ +'"custom_ops": ["+quant_fp8", "+rms_norm"],'\ +'"cudagraph_mode": "FULL_DECODE_ONLY",'\ +'"splitting_ops": []'\ +'}' +cat > config.yaml <<-EOF +kv-cache-dtype: fp8 +compilation-config: '$FUSION_FLAG' +async-scheduling: true +no-enable-prefix-caching: true +max-num-batched-tokens: 8192 +max-model-len: $MAX_MODEL_LEN +EOF + +cat config.yaml # Debugging + +export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' +export PYTHONNOUSERSITE=1 + +set -x +vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ +--gpu-memory-utilization=0.9 \ +--tensor-parallel-size=$TP \ +--max-num-seqs=512 \ +--config config.yaml \ +--disable-log-requests diff --git a/benchmarks/70b_fp8_b200_trt_docker.sh b/benchmarks/70b_fp8_b200_trt_docker.sh new file mode 100644 index 000000000..e30478672 --- /dev/null +++ b/benchmarks/70b_fp8_b200_trt_docker.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false 
+stream_interval: 10 +EOF +fi + +set -x +# Launch TRT-LLM server +mpirun -n 1 --allow-run-as-root --oversubscribe trtllm-serve $MODEL --tp_size $TP --trust_remote_code \ +--max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT diff --git a/benchmarks/70b_fp8_b200_trt_slurm.sh b/benchmarks/70b_fp8_b200_trt_slurm.sh new file mode 100644 index 000000000..ad24453b3 --- /dev/null +++ b/benchmarks/70b_fp8_b200_trt_slurm.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + + +set -x + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone 
https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h100_docker.sh b/benchmarks/70b_fp8_h100_docker.sh new file mode 100755 index 000000000..5d8df1bac --- /dev/null +++ b/benchmarks/70b_fp8_h100_docker.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# MAX_MODEL_LEN +# TP +# CONC + +pip install -q datasets pandas + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +async-scheduling: true +no-enable-prefix-caching: true +max-num-batched-tokens: 8192 +max-model-len: 10240 +EOF + +export PYTHONNOUSERSITE=1 + +vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ +--config=config.yaml \ +--gpu-memory-utilization=0.9 \ +--tensor-parallel-size=$TP \ +--max-num-seqs=$CONC \ +--disable-log-requests diff --git a/benchmarks/70b_fp8_h100_slurm.sh b/benchmarks/70b_fp8_h100_slurm.sh new file mode 100644 index 000000000..485aa8817 --- /dev/null +++ b/benchmarks/70b_fp8_h100_slurm.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +async-scheduling: true +no-enable-prefix-caching: true +max-num-batched-tokens: 8192 +max-model-len: 10240 +EOF + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + +export TORCH_CUDA_ARCH_LIST="9.0" + +set -x +PYTHONNOUSERSITE=1 vllm serve $MODEL 
--host=0.0.0.0 --port=$PORT \ +--config=config.yaml \ +--gpu-memory-utilization=0.9 \ +--tensor-parallel-size=$TP \ +--max-num-seqs=$CONC \ +--disable-log-requests > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +pip install -q datasets pandas +git clone https://github.com/kimbochen/bench_serving.git +set -x +python3 bench_serving/benchmark_serving.py \ +--model=$MODEL \ +--backend=vllm \ +--base-url="http://0.0.0.0:$PORT" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics='ttft,tpot,itl,e2el' \ +--result-dir=/workspace/ \ +--result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_slurm.sh b/benchmarks/70b_fp8_h200_slurm.sh new file mode 100644 index 000000000..094fbd19c --- /dev/null +++ b/benchmarks/70b_fp8_h200_slurm.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +set -x +hf download $MODEL +pip install datasets pandas + +# Calculate max-model-len based on ISL and OSL +if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) +elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) +else + CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} +fi + +# Create config.yaml +cat > config.yaml << EOF +kv-cache-dtype: fp8 +async-scheduling: true +no-enable-prefix-caching: true +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) 
+PORT=$(( 8888 + $PORT_OFFSET )) + +export TORCH_CUDA_ARCH_LIST="9.0" + +PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ + --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ + --disable-log-requests > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_trt_slurm.sh b/benchmarks/70b_fp8_h200_trt_slurm.sh new file mode 100644 index 000000000..dfb2324b9 --- /dev/null +++ b/benchmarks/70b_fp8_h200_trt_slurm.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + 
enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/70b_fp8_mi300x_docker.sh b/benchmarks/70b_fp8_mi300x_docker.sh new file mode 100644 index 000000000..941e95023 --- /dev/null +++ b/benchmarks/70b_fp8_mi300x_docker.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash + +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# PORT +# TP +# CONC +# MAX_MODEL_LEN + +# Reference +# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +# If the machine runs a MEC FW older than 177, RCCL +# cannot reclaim some memory. +# Disable that features to avoid crashes. 
+# This is related to the changes in the driver at: +# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). +set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=float16 --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests diff --git a/benchmarks/70b_fp8_mi300x_slurm.sh b/benchmarks/70b_fp8_mi300x_slurm.sh new file mode 100644 index 000000000..b387505f0 --- /dev/null +++ b/benchmarks/70b_fp8_mi300x_slurm.sh @@ -0,0 +1,92 @@ +#!/usr/bin/bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +huggingface-cli download $MODEL + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=8888 + +# Reference +# 
https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +# If the machine runs a MEC FW older than 177, RCCL +# cannot reclaim some memory. +# Disable that features to avoid crashes. +# This is related to the changes in the driver at: +# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). 
+set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=float16 --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests \ +> $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model=$MODEL --backend=vllm \ +--base-url="http://0.0.0.0:$PORT" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics='ttft,tpot,itl,e2el' \ +--result-dir=/workspace/ \ +--result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_mi325x_docker.sh b/benchmarks/70b_fp8_mi325x_docker.sh new file mode 100644 index 000000000..9e1fcdf8b --- /dev/null +++ b/benchmarks/70b_fp8_mi325x_docker.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash + +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# PORT +# TP +# CONC +# MAX_MODEL_LEN + +# Reference +# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" 
]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +# Patch the aiter config script to deal +# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. +file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' +sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch + + +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). +set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=float16 --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh new file mode 100644 index 000000000..1febeff13 --- /dev/null +++ b/benchmarks/70b_fp8_mi325x_slurm.sh @@ -0,0 +1,86 @@ +#!/usr/bin/bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +huggingface-cli download $MODEL + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +# Reference +# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == 
"8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +# Patch the aiter config script to deal +# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. +file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' +sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch + + +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). +set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=float16 --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests \ +> $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_mi355x_docker.sh b/benchmarks/70b_fp8_mi355x_docker.sh new file mode 100644 index 000000000..6310a5f64 --- /dev/null +++ 
b/benchmarks/70b_fp8_mi355x_docker.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# PORT +# TP +# CONC +# MAX_MODEL_LEN + +# Reference +# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + +sleep 5 +cat config.yaml + +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=auto --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests + diff --git a/benchmarks/70b_fp8_mi355x_slurm.sh b/benchmarks/70b_fp8_mi355x_slurm.sh new file mode 100644 index 000000000..2abfee137 --- /dev/null +++ b/benchmarks/70b_fp8_mi355x_slurm.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# PORT +# RESULT_FILENAME + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + +# Reference +# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": 
["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=auto --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests \ +> $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url "http://0.0.0.0:$PORT" \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics "ttft,tpot,itl,e2el" \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json + +exit From b89047d5fc054f47e046433351a49014ec4cd95d Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Wed, 29 Oct 2025 22:18:46 -0400 Subject: [PATCH 075/149] remove llama 70b (#149) --- .github/workflows/70b-tmpl.yml | 230 ------------------ .github/workflows/full-sweep-tmpl.yml | 75 ------ 
.github/workflows/runner-model-sweep-test.yml | 11 - .github/workflows/runner-sweep-test.yml | 5 - benchmarks/70b_fp4_b200_docker.sh | 48 ---- benchmarks/70b_fp4_b200_trt_docker.sh | 46 ---- benchmarks/70b_fp4_b200_trt_slurm.sh | 75 ------ benchmarks/70b_fp4_mi355x_docker.sh | 55 ----- benchmarks/70b_fp4_mi355x_slurm.sh | 84 ------- benchmarks/70b_fp8_b200_docker.sh | 46 ---- benchmarks/70b_fp8_b200_trt_docker.sh | 46 ---- benchmarks/70b_fp8_b200_trt_slurm.sh | 75 ------ benchmarks/70b_fp8_h100_docker.sh | 29 --- benchmarks/70b_fp8_h100_slurm.sh | 60 ----- benchmarks/70b_fp8_h200_slurm.sh | 69 ------ benchmarks/70b_fp8_h200_trt_slurm.sh | 70 ------ benchmarks/70b_fp8_mi300x_docker.sh | 59 ----- benchmarks/70b_fp8_mi300x_slurm.sh | 92 ------- benchmarks/70b_fp8_mi325x_docker.sh | 53 ---- benchmarks/70b_fp8_mi325x_slurm.sh | 86 ------- benchmarks/70b_fp8_mi355x_docker.sh | 50 ---- benchmarks/70b_fp8_mi355x_slurm.sh | 75 ------ 22 files changed, 1439 deletions(-) delete mode 100644 .github/workflows/70b-tmpl.yml delete mode 100644 benchmarks/70b_fp4_b200_docker.sh delete mode 100644 benchmarks/70b_fp4_b200_trt_docker.sh delete mode 100644 benchmarks/70b_fp4_b200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp4_mi355x_docker.sh delete mode 100644 benchmarks/70b_fp4_mi355x_slurm.sh delete mode 100644 benchmarks/70b_fp8_b200_docker.sh delete mode 100644 benchmarks/70b_fp8_b200_trt_docker.sh delete mode 100644 benchmarks/70b_fp8_b200_trt_slurm.sh delete mode 100755 benchmarks/70b_fp8_h100_docker.sh delete mode 100644 benchmarks/70b_fp8_h100_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi300x_docker.sh delete mode 100644 benchmarks/70b_fp8_mi300x_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi325x_docker.sh delete mode 100644 benchmarks/70b_fp8_mi325x_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi355x_docker.sh delete mode 100644 
benchmarks/70b_fp8_mi355x_slurm.sh diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml deleted file mode 100644 index 3d1dd5051..000000000 --- a/.github/workflows/70b-tmpl.yml +++ /dev/null @@ -1,230 +0,0 @@ -name: Template - LLaMA 70B - -on: - workflow_call: - inputs: - exp-name: - required: true - type: string - isl: - required: true - type: string - osl: - required: true - type: string - max-model-len: - required: true - type: string - random-range-ratio: - required: true - type: string - - use_h100: - type: boolean - required: true - use_h200: - type: boolean - required: true - use_b200: - type: boolean - required: true - use_mi300x: - type: boolean - required: true - use_mi325x: - type: boolean - required: true - use_mi355x: - type: boolean - required: true - -jobs: - bmk-h100-fp8: - if: ${{ inputs.use_h100 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h100 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[2, 4, 8]' - - bmk-h200-fp8: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - - bmk-h200-trt-fp8: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 
'trt' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - conc-list: '[4, 8, 16, 32, 64, 128]' # H200 can achieve TPS/User >= 30 with larger concurrency till 128 - - bmk-b200-fp8: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - - bmk-b200-trt-fp8: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'trt' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 256 - - bmk-mi300x-fp8: - if: ${{ inputs.use_mi300x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi300x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - 
tp-list: '[1, 2, 4, 8]' - - bmk-mi325x-fp8: - if: ${{ inputs.use_mi325x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi325x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - - bmk-mi355x-fp8: - if: ${{ inputs.use_mi355x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi355x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - - bmk-b200-fp4: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP4' - framework: 'vllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - - bmk-b200-trt-fp4: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP4' - framework: 'trt' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - 
random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 128 - - bmk-mi355x-fp4: - if: ${{ inputs.use_mi355x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi355x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview' - framework: 'vllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml index b086460df..869928cb7 100644 --- a/.github/workflows/full-sweep-tmpl.yml +++ b/.github/workflows/full-sweep-tmpl.yml @@ -37,31 +37,6 @@ on: default: false jobs: - _70b-1k1k: - if: ${{ inputs.run_1k1k }} - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-70b-1k1k-results: - needs: _70b-1k1k - if: ${{ inputs.run_1k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_1k1k' - dsr1-1k1k: if: ${{ inputs.run_1k1k }} uses: ./.github/workflows/dsr1-tmpl.yml @@ -112,31 +87,6 @@ jobs: with: exp-name: 'gptoss_1k1k' - _70b-8k1k: - if: ${{ inputs.run_8k1k }} - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - 
use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-70b-8k1k-results: - needs: _70b-8k1k - if: ${{ inputs.run_8k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_8k1k' - dsr1-8k1k: if: ${{ inputs.run_8k1k }} uses: ./.github/workflows/dsr1-tmpl.yml @@ -187,31 +137,6 @@ jobs: with: exp-name: 'gptoss_8k1k' - _70b-1k8k: - if: ${{ inputs.run_1k8k }} - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-70b-1k8k-results: - needs: _70b-1k8k - if: ${{ inputs.run_1k8k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_1k8k' - dsr1-1k8k: if: ${{ inputs.run_1k8k }} uses: ./.github/workflows/dsr1-tmpl.yml diff --git a/.github/workflows/runner-model-sweep-test.yml b/.github/workflows/runner-model-sweep-test.yml index 212ffc07c..e4f2b7303 100644 --- a/.github/workflows/runner-model-sweep-test.yml +++ b/.github/workflows/runner-model-sweep-test.yml @@ -33,7 +33,6 @@ jobs: - 'h100-cw_0' - 'h100-cw_1' config: - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } name: '${{ matrix.runner }}' @@ -70,7 +69,6 @@ jobs: - 'h200-nv_2' - 'h200-nv_3' config: - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', 
exp-name: '70b_test' } - { image: 'lmsysorg/sglang:v0.5.2rc2-cu126', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -108,7 +106,6 @@ jobs: - 'h200-nv_2' - 'h200-nv_3' config: - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } @@ -140,8 +137,6 @@ jobs: - 'b200-nvd_2' - 'b200-nvd_3' config: - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' } - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -174,8 +169,6 @@ jobs: - 'b200-nb_0' - 'b200-nb_1' config: - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', 
framework: 'trt', precision: 'fp4', exp-name: '70b_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'trt', precision: 'fp4', exp-name: 'dsr1_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } @@ -211,7 +204,6 @@ jobs: - 'mi300x-cr_0' - 'mi300x-oci_0' config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -244,7 +236,6 @@ jobs: - 'mi325x-tw_2' - 'mi325x-tw_3' config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -276,8 +267,6 @@ jobs: - 'mi355x-amd_2' - 'mi355x-amd_3' config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: 
'70b_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'amd/DeepSeek-R1-0528-MXFP4-Preview', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } diff --git a/.github/workflows/runner-sweep-test.yml b/.github/workflows/runner-sweep-test.yml index fd100474f..8f824c4d1 100644 --- a/.github/workflows/runner-sweep-test.yml +++ b/.github/workflows/runner-sweep-test.yml @@ -45,11 +45,7 @@ on: type: choice options: - 'amd/DeepSeek-R1-0528-MXFP4-Preview' - - 'amd/Llama-3.3-70B-Instruct-FP8-KV' - - 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview' - 'deepseek-ai/DeepSeek-R1-0528' - - 'nvidia/Llama-3.3-70B-Instruct-FP8' - - 'nvidia/Llama-3.3-70B-Instruct-FP4' - 'nvidia/DeepSeek-R1-0528-FP4' - 'nvidia/DeepSeek-R1-0528-FP4-v2' - 'openai/gpt-oss-120b' @@ -76,7 +72,6 @@ on: required: true type: choice options: - - '70b_test' - 'dsr1_test' - 'gptoss_test' diff --git a/benchmarks/70b_fp4_b200_docker.sh b/benchmarks/70b_fp4_b200_docker.sh deleted file mode 100644 index a76ffb9f8..000000000 --- a/benchmarks/70b_fp4_b200_docker.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -nvidia-smi - -# To improve CI stability, we patch this helper function to prevent a race condition that -# happens 1% of the time. 
ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 -sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py - -# Calculate max-model-len based on ISL and OSL -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) -elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) -else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} -fi - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}' -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -export TORCH_CUDA_ARCH_LIST="10.0" -export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' -export PYTHONNOUSERSITE=1 - -set -x -vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ ---gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ ---disable-log-requests diff --git a/benchmarks/70b_fp4_b200_trt_docker.sh b/benchmarks/70b_fp4_b200_trt_docker.sh deleted file mode 100644 index e30478672..000000000 --- a/benchmarks/70b_fp4_b200_trt_docker.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 
-kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -set -x -# Launch TRT-LLM server -mpirun -n 1 --allow-run-as-root --oversubscribe trtllm-serve $MODEL --tp_size $TP --trust_remote_code \ ---max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT diff --git a/benchmarks/70b_fp4_b200_trt_slurm.sh b/benchmarks/70b_fp4_b200_trt_slurm.sh deleted file mode 100644 index ad24453b3..000000000 --- a/benchmarks/70b_fp4_b200_trt_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options 
llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp4_mi355x_docker.sh b/benchmarks/70b_fp4_mi355x_docker.sh deleted file mode 100644 index 681a629fb..000000000 --- a/benchmarks/70b_fp4_mi355x_docker.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi - if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -fi - -set -x -vllm serve $MODEL \ ---host=0.0.0.0 \ ---port $PORT \ 
---swap-space 64 \ ---max-model-len $MAX_MODEL_LEN \ ---tensor-parallel-size $TP \ ---max-num-seqs 1024 \ ---kv-cache-dtype fp8 \ ---gpu-memory-utilization 0.94 \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ ---max-num-batched-tokens 131072 \ ---no-enable-prefix-caching \ ---disable-log-requests \ ---async-scheduling diff --git a/benchmarks/70b_fp4_mi355x_slurm.sh b/benchmarks/70b_fp4_mi355x_slurm.sh deleted file mode 100644 index 0d5a469d0..000000000 --- a/benchmarks/70b_fp4_mi355x_slurm.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# PORT -# RESULT_FILENAME - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=8888 - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi - if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -fi - - -set -x -vllm serve $MODEL \ ---host=0.0.0.0 \ ---port $PORT \ ---swap-space 64 \ ---max-model-len $MAX_MODEL_LEN \ ---tensor-parallel-size $TP \ ---max-num-seqs 1024 \ ---kv-cache-dtype fp8 \ ---gpu-memory-utilization 0.94 \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ ---max-num-batched-tokens 131072 \ ---no-enable-prefix-caching \ ---disable-log-requests \ ---async-scheduling > 
$SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json - diff --git a/benchmarks/70b_fp8_b200_docker.sh b/benchmarks/70b_fp8_b200_docker.sh deleted file mode 100644 index dbcfaf6fd..000000000 --- a/benchmarks/70b_fp8_b200_docker.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -nvidia-smi - -# To improve CI stability, we patch this helper function to prevent a race condition that -# happens 1% of the time. 
ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 -sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py - - - -FUSION_FLAG='{'\ -'"pass_config": {"enable_fi_allreduce_fusion": true, "enable_attn_fusion": true, "enable_noop": true},'\ -'"custom_ops": ["+quant_fp8", "+rms_norm"],'\ -'"cudagraph_mode": "FULL_DECODE_ONLY",'\ -'"splitting_ops": []'\ -'}' -cat > config.yaml <<-EOF -kv-cache-dtype: fp8 -compilation-config: '$FUSION_FLAG' -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $MAX_MODEL_LEN -EOF - -cat config.yaml # Debugging - -export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' -export PYTHONNOUSERSITE=1 - -set -x -vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=512 \ ---config config.yaml \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_b200_trt_docker.sh b/benchmarks/70b_fp8_b200_trt_docker.sh deleted file mode 100644 index e30478672..000000000 --- a/benchmarks/70b_fp8_b200_trt_docker.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false 
-stream_interval: 10 -EOF -fi - -set -x -# Launch TRT-LLM server -mpirun -n 1 --allow-run-as-root --oversubscribe trtllm-serve $MODEL --tp_size $TP --trust_remote_code \ ---max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT diff --git a/benchmarks/70b_fp8_b200_trt_slurm.sh b/benchmarks/70b_fp8_b200_trt_slurm.sh deleted file mode 100644 index ad24453b3..000000000 --- a/benchmarks/70b_fp8_b200_trt_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone 
https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h100_docker.sh b/benchmarks/70b_fp8_h100_docker.sh deleted file mode 100755 index 5d8df1bac..000000000 --- a/benchmarks/70b_fp8_h100_docker.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# MAX_MODEL_LEN -# TP -# CONC - -pip install -q datasets pandas - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: 10240 -EOF - -export PYTHONNOUSERSITE=1 - -vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---config=config.yaml \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=$CONC \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_h100_slurm.sh b/benchmarks/70b_fp8_h100_slurm.sh deleted file mode 100644 index 485aa8817..000000000 --- a/benchmarks/70b_fp8_h100_slurm.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: 10240 -EOF - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) - -export TORCH_CUDA_ARCH_LIST="9.0" - -set -x -PYTHONNOUSERSITE=1 vllm serve $MODEL 
--host=0.0.0.0 --port=$PORT \ ---config=config.yaml \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=$CONC \ ---disable-log-requests > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -pip install -q datasets pandas -git clone https://github.com/kimbochen/bench_serving.git -set -x -python3 bench_serving/benchmark_serving.py \ ---model=$MODEL \ ---backend=vllm \ ---base-url="http://0.0.0.0:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_slurm.sh b/benchmarks/70b_fp8_h200_slurm.sh deleted file mode 100644 index 094fbd19c..000000000 --- a/benchmarks/70b_fp8_h200_slurm.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -set -x -hf download $MODEL -pip install datasets pandas - -# Calculate max-model-len based on ISL and OSL -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) -elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) -else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} -fi - -# Create config.yaml -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) 
-PORT=$(( 8888 + $PORT_OFFSET )) - -export TORCH_CUDA_ARCH_LIST="9.0" - -PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ - --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ - --disable-log-requests > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_trt_slurm.sh b/benchmarks/70b_fp8_h200_trt_slurm.sh deleted file mode 100644 index dfb2324b9..000000000 --- a/benchmarks/70b_fp8_h200_trt_slurm.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - 
enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/70b_fp8_mi300x_docker.sh b/benchmarks/70b_fp8_mi300x_docker.sh deleted file mode 100644 index 941e95023..000000000 --- a/benchmarks/70b_fp8_mi300x_docker.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -# If the machine runs a MEC FW older than 177, RCCL -# cannot reclaim some memory. -# Disable that features to avoid crashes. 
-# This is related to the changes in the driver at: -# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` -if [[ "$version" == "" || $version -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_mi300x_slurm.sh b/benchmarks/70b_fp8_mi300x_slurm.sh deleted file mode 100644 index b387505f0..000000000 --- a/benchmarks/70b_fp8_mi300x_slurm.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -huggingface-cli download $MODEL - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=8888 - -# Reference -# 
https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -# If the machine runs a MEC FW older than 177, RCCL -# cannot reclaim some memory. -# Disable that features to avoid crashes. -# This is related to the changes in the driver at: -# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` -if [[ "$version" == "" || $version -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). 
-set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm \ ---base-url="http://0.0.0.0:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_mi325x_docker.sh b/benchmarks/70b_fp8_mi325x_docker.sh deleted file mode 100644 index 9e1fcdf8b..000000000 --- a/benchmarks/70b_fp8_mi325x_docker.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == 
"1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# Patch the aiter config script to deal -# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. -file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' -sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch - - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh deleted file mode 100644 index 1febeff13..000000000 --- a/benchmarks/70b_fp8_mi325x_slurm.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -huggingface-cli download $MODEL - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && 
"$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# Patch the aiter config script to deal -# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. -file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' -sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch - - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_mi355x_docker.sh b/benchmarks/70b_fp8_mi355x_docker.sh deleted file mode 100644 index 6310a5f64..000000000 --- 
a/benchmarks/70b_fp8_mi355x_docker.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -sleep 5 -cat config.yaml - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=auto --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests - diff --git a/benchmarks/70b_fp8_mi355x_slurm.sh b/benchmarks/70b_fp8_mi355x_slurm.sh deleted file mode 100644 index 2abfee137..000000000 --- a/benchmarks/70b_fp8_mi355x_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# PORT -# RESULT_FILENAME - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: 
'{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=auto --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json - -exit From be3b40f5314e4bd001dcb4a12e024813d32befc6 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 076/149] testing concurrency From 13803ac4347461c77a72eba1746b70dbdb6af172 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 14:42:41 -0500 Subject: [PATCH 077/149] adding more workflows --- 
.github/workflows/1k8k-sweep.yml | 5 + .github/workflows/test.yml | 147 ++++++++++++++++++ utils/matrix-logic/get_test_sweep_configs.py | 151 +++++++++++++++++++ 3 files changed, 303 insertions(+) create mode 100644 .github/workflows/test.yml create mode 100644 utils/matrix-logic/get_test_sweep_configs.py diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 25fc3a362..581ec07cf 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -4,6 +4,11 @@ concurrency: group: benchmark-lock-1k8k cancel-in-progress: false +on: + workflow_dispatch: + schedule: + - cron: '0 23 * * *' + on: # pull_request: workflow_dispatch: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 000000000..0d92952da --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,147 @@ +name: Test - Full Sweep + +concurrency: + group: benchmark-lock + cancel-in-progress: false + +on: + pull_request: + workflow_dispatch: + inputs: + name: + description: "Name of benchmark from master configs" + required: true + type: string + default: 70b-fp4-mi355x-vllm + + run_1k1k: + description: "Run ISL/OSL 1k/1k" + type: boolean + required: true + run_1k8k: + description: "Run ISL/OSL 1k/8k" + type: boolean + required: true + run_8k1k: + description: "Run ISL/OSL 8k/1k" + type: boolean + required: true + + runner: + description: "Specific runner node to run on" + required: false + type: choice + options: + - "h100-cr_0" + - "h100-cr_1" + - "h100-cw_0" + - "h100-cw_1" + - "h200-cw_0" + - "h200-cw_1" + - "h200-nb_0" + - "h200-nb_1" + - "h200-nb_2" + - "h200-nb_3" + - "h200-nv_0" + - "h200-nv_1" + - "h200-nv_2" + - "h200-nv_3" + - "b200-nv_0" + - "b200-nv_1" + - "b200-nb_0" + - "b200-nb_1" + - "b200-nvd_0" + - "b200-nvd_1" + - "b200-nvd_2" + - "b200-nvd_3" + - "b200-tg_0" + - "mi300x-amd_0" + - "mi300x-amd_1" + - "mi300x-amd_2" + - "mi300x-amd_3" + - "mi300x-amd_4" + - "mi300x-cr_0" + - "mi300x-oci_0" + - 
"mi325x-amd_0" + - "mi325x-tw_0" + - "mi325x-tw_1" + - "mi325x-tw_2" + - "mi325x-tw_3" + - "mi355x-amd_0" + - "mi355x-amd_1" + - "mi355x-amd_2" + - "mi355x-amd_3" + +jobs: + get-jobs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-jobs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ + --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ + --key ${{ inputs.name }} \ + ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + test-sweep: + needs: get-jobs + uses: ./.github/workflows/benchmark-tmpl.yml + name: test sweep - ${{ inputs.name }} + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_1k1k" + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + calc-success-rate: + needs: test-sweep + if: ${{ always() }} + runs-on: ubuntu-latest + + env: + RESULTS_DIR: "results/" + STATS_FILENAME: "run_stats" + GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download 
results artifacts + uses: actions/download-artifact@v4 + with: + path: ${{ env.RESULTS_DIR }} + pattern: results_* + + - name: Install python dependencies + run: pip install PyGithub + + - name: Calculate success rate + run: python3 utils/calc_success_rate.py $STATS_FILENAME + + - uses: actions/upload-artifact@v4 + with: + name: "run-stats" + path: ${{ env.STATS_FILENAME }}.json diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py new file mode 100644 index 000000000..87ab0457b --- /dev/null +++ b/utils/matrix-logic/get_test_sweep_configs.py @@ -0,0 +1,151 @@ +import json +import yaml +import sys +import argparse + +seq_len_stoi = { + "1k1k": (1024, 1024), + "1k8k": (1024, 8192), + "8k1k": (8192, 1024) +} + +def main(): + parser = argparse.ArgumentParser( + description='Generate benchmark matrix from a specific configuration key' + ) + parser.add_argument( + '--config-files', + nargs='+', + required=True, + help='One or more configuration files (YAML format)' + ) + parser.add_argument( + '--key', + required=True, + help='Configuration key to use' + ) + parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." 
+ ) + parser.add_argument( + '--step-size', + type=int, + default=2, + help='Step size for concurrency values (default: 2)' + ) + + args = parser.parse_args() + + # Convert seq-lens to set of (isl, osl) tuples for filtering + seq_lens_filter = None + if args.seq_lens: + seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} + + # Load and merge all config files + all_config_data = {} + for config_file in args.config_files: + try: + with open(config_file, 'r') as f: + config_data = yaml.safe_load(f) + assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" + + # Check for duplicate keys + duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) + if duplicate_keys: + raise ValueError( + f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" + ) + + all_config_data.update(config_data) + except FileNotFoundError: + raise ValueError(f"Input file '{config_file}' does not exist.") + + # Check if the key exists + if args.key not in all_config_data: + available_keys = ', '.join(sorted(all_config_data.keys())) + raise ValueError( + f"Key '{args.key}' not found in configuration files. 
" + f"Available keys: {available_keys}" + ) + + val = all_config_data[args.key] + + # Validate required fields + seq_len_configs = val.get('seq-len-configs') + assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" + + image = val.get('image') + model = val.get('model') + precision = val.get('precision') + framework = val.get('framework') + runner = val.get('runner') + + assert None not in (image, model, precision, framework, runner), \ + f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" + + matrix_values = [] + + # Process each sequence length configuration + for seq_config in seq_len_configs: + isl = seq_config.get('isl') + osl = seq_config.get('osl') + + assert None not in (isl, osl), \ + f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" + + # Filter by sequence lengths if specified + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + + bmk_space = seq_config.get('bmk-space') + assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" + + for bmk in bmk_space: + tp = bmk.get('tp') + conc_start = bmk.get('conc-start') + conc_end = bmk.get('conc-end') + ep = bmk.get('ep') + dp_attn = bmk.get('dp-attn') + + assert None not in (tp, conc_start, conc_end), \ + f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" + + # Generate entries for each concurrency value in the range + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc, + 'max-model-len': isl + osl, + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + print(json.dumps(matrix_values)) + 
return matrix_values + +if __name__ == "__main__": + main() \ No newline at end of file From 422e5b86a01700f0f346b319fdba01631b4314bd Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 17:05:27 -0500 Subject: [PATCH 078/149] deleting files --- .github/workflows/full-sweep-tmpl.yml | 188 ---------- .github/workflows/runner-model-sweep-test.yml | 289 --------------- .github/workflows/runner-sweep-test.yml | 328 ------------------ .github/workflows/test.yml | 147 -------- 4 files changed, 952 deletions(-) delete mode 100644 .github/workflows/full-sweep-tmpl.yml delete mode 100644 .github/workflows/runner-model-sweep-test.yml delete mode 100644 .github/workflows/runner-sweep-test.yml delete mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml deleted file mode 100644 index 869928cb7..000000000 --- a/.github/workflows/full-sweep-tmpl.yml +++ /dev/null @@ -1,188 +0,0 @@ -name: Template - Full Sweep - -on: - workflow_call: - inputs: - run_1k1k: - type: boolean - required: true - run_8k1k: - type: boolean - required: true - run_1k8k: - type: boolean - required: true - - use_h100: - type: boolean - required: true - use_h200: - type: boolean - required: true - use_b200: - type: boolean - required: true - use_mi300x: - type: boolean - required: true - use_mi325x: - type: boolean - required: true - use_mi355x: - type: boolean - required: true - use_gb200: - type: boolean - required: false - default: false - -jobs: - dsr1-1k1k: - if: ${{ inputs.run_1k1k }} - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - use_gb200: ${{ inputs.use_gb200 }} - - collect-dsr1-1k1k-results: - needs: 
dsr1-1k1k - if: ${{ inputs.run_1k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'dsr1_1k1k' - - gptoss-1k1k: - if: ${{ inputs.run_1k1k }} - uses: ./.github/workflows/gptoss-tmpl.yml - secrets: inherit - with: - exp-name: 'gptoss_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-gptoss-1k1k-results: - needs: gptoss-1k1k - if: ${{ inputs.run_1k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'gptoss_1k1k' - - dsr1-8k1k: - if: ${{ inputs.run_8k1k }} - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - use_gb200: ${{ inputs.use_gb200 }} - - collect-dsr1-8k1k-results: - needs: dsr1-8k1k - if: ${{ inputs.run_8k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'dsr1_8k1k' - - gptoss-8k1k: - if: ${{ inputs.run_8k1k }} - uses: ./.github/workflows/gptoss-tmpl.yml - secrets: inherit - with: - exp-name: 'gptoss_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-gptoss-8k1k-results: - needs: gptoss-8k1k - if: ${{ inputs.run_8k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: 
inherit - with: - exp-name: 'gptoss_8k1k' - - dsr1-1k8k: - if: ${{ inputs.run_1k8k }} - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - use_gb200: ${{ inputs.use_gb200 }} - - collect-dsr1-1k8k-results: - needs: dsr1-1k8k - if: ${{ inputs.run_1k8k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'dsr1_1k8k' - - gptoss-1k8k: - if: ${{ inputs.run_1k8k }} - uses: ./.github/workflows/gptoss-tmpl.yml - secrets: inherit - with: - exp-name: 'gptoss_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-gptoss-1k8k-results: - needs: gptoss-1k8k - if: ${{ inputs.run_1k8k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'gptoss_1k8k' diff --git a/.github/workflows/runner-model-sweep-test.yml b/.github/workflows/runner-model-sweep-test.yml deleted file mode 100644 index e4f2b7303..000000000 --- a/.github/workflows/runner-model-sweep-test.yml +++ /dev/null @@ -1,289 +0,0 @@ -name: 'Test - Runner Model Sweep' -run-name: '${{ github.event.inputs.runner }} Sweep' -on: - workflow_dispatch: - inputs: - runner: - description: 'Runner Type' - required: true - type: choice - options: - - 'h100' - - 'h200' - - 'h200-trt' - - 'b200' - - 'b200-trt' - - 'mi300x' - - 'mi325x' - - 'mi355x' - -env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - HF_HUB_CACHE: '/mnt/hf_hub_cache/' - -jobs: - bmk-h100: - if: ${{ inputs.runner == 'h100' }} - strategy: 
- fail-fast: false - matrix: - runner: - - 'h100-cr_0' - - 'h100-cr_1' - - 'h100-cw_0' - - 'h100-cw_1' - config: - - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-h200: - if: ${{ inputs.runner == 'h200' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h200-cw_0' - - 'h200-cw_1' - - 'h200-nb_0' - - 'h200-nb_1' - - 'h200-nb_2' - - 'h200-nb_3' - - 'h200-nv_0' - - 'h200-nv_1' - - 'h200-nv_2' - - 'h200-nv_3' - config: - - { image: 'lmsysorg/sglang:v0.5.2rc2-cu126', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-h200-trt: - if: ${{ inputs.runner == 'h200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h200-cw_0' - - 'h200-cw_1' - - 'h200-nb_0' - - 'h200-nb_1' - - 'h200-nb_2' - - 'h200-nb_3' - - 'h200-nv_0' - - 'h200-nv_1' - - 'h200-nv_2' - - 'h200-nv_3' - config: - - { image: 
'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-b200: - if: ${{ inputs.runner == 'b200' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nvd_0' - - 'b200-nvd_1' - - 'b200-nvd_2' - - 'b200-nvd_3' - config: - - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } - - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[4]' - - bmk-b200-trt: - if: ${{ inputs.runner == 'b200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nv_0' - - 'b200-nv_1' - - 'b200-nb_0' - - 
'b200-nb_1' - config: - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'trt', precision: 'fp4', exp-name: 'dsr1_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-mi300x: - if: ${{ inputs.runner == 'mi300x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi300x-amd_0' - - 'mi300x-amd_1' - - 'mi300x-amd_2' - - 'mi300x-amd_3' - - 'mi300x-amd_4' - - 'mi300x-cr_0' - - 'mi300x-oci_0' - config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - 
conc-list: '[1]' - - bmk-mi325x: - if: ${{ inputs.runner == 'mi325x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi325x-amd_0' - - 'mi325x-tw_0' - - 'mi325x-tw_1' - - 'mi325x-tw_2' - - 'mi325x-tw_3' - config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-mi355x: - if: ${{ inputs.runner == 'mi355x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi355x-amd_0' - - 'mi355x-amd_1' - - 'mi355x-amd_2' - - 'mi355x-amd_3' - config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'amd/DeepSeek-R1-0528-MXFP4-Preview', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: 
${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' diff --git a/.github/workflows/runner-sweep-test.yml b/.github/workflows/runner-sweep-test.yml deleted file mode 100644 index 8f824c4d1..000000000 --- a/.github/workflows/runner-sweep-test.yml +++ /dev/null @@ -1,328 +0,0 @@ -name: 'Test - Runner Sweep' -run-name: '${{ github.event.inputs.runner }} Sweep - ${{ github.event.inputs.model }}' -on: - workflow_dispatch: - inputs: - runner: - description: 'Runner Type' - required: true - type: choice - options: - - 'h100' - - 'h200' - - 'b200' - - 'h200-trt' - - 'b200-trt' - - 'mi300x' - - 'mi325x' - - 'mi355x' - - 'gb200' - - image: - description: 'Docker Image' - required: true - type: choice - options: - - 'lmsysorg/sglang:v0.4.9.post1-cu126' - - 'lmsysorg/sglang:v0.5.0rc1-cu128-b200' - - 'lmsysorg/sglang:v0.5.2rc2-cu126' - - 'lmsysorg/sglang:v0.5.3rc1-cu129-b200' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' - - 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' - - 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - - 'vllm/vllm-openai:v0.10.2' - - model: - description: 'Model' - required: true - type: choice - options: - - 'amd/DeepSeek-R1-0528-MXFP4-Preview' - - 'deepseek-ai/DeepSeek-R1-0528' - - 'nvidia/DeepSeek-R1-0528-FP4' - - 'nvidia/DeepSeek-R1-0528-FP4-v2' - - 'openai/gpt-oss-120b' - - framework: - description: 'Framework' - 
required: true - type: choice - options: - - 'vllm' - - 'sglang' - - 'trt' - - precision: - description: 'Precision' - required: true - type: choice - options: - - 'fp8' - - 'fp4' - - exp-name: - description: 'Experiment Name' - required: true - type: choice - options: - - 'dsr1_test' - - 'gptoss_test' - - -env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - HF_HUB_CACHE: '/mnt/hf_hub_cache/' - -jobs: - bmk_h100: - if: ${{ inputs.runner == 'h100' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h100-cr_0' - - 'h100-cr_1' - - 'h100-cw_0' - - 'h100-cw_1' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_h200: - if: ${{ inputs.runner == 'h200' || inputs.runner == 'h200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h200-cw_0' - - 'h200-cw_1' - - 'h200-nb_0' - - 'h200-nb_1' - - 'h200-nb_2' - - 'h200-nb_3' - - 'h200-nv_0' - - 'h200-nv_1' - - 'h200-nv_2' - - 'h200-nv_3' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[4]' - conc-list: '[64]' - - bmk_b200: - if: ${{ inputs.runner == 'b200' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nv_0' - - 'b200-nv_1' - - 'b200-nvd_0' - - 'b200-nvd_1' - - 'b200-tg_0' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner 
}} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_b200-trt: - if: ${{ inputs.runner == 'b200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nv_0' - - 'b200-nv_1' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_mi300x: - if: ${{ inputs.runner == 'mi300x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi300x-amd_0' - - 'mi300x-amd_1' - - 'mi300x-amd_2' - - 'mi300x-amd_3' - - 'mi300x-amd_4' - - 'mi300x-cr_0' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_mi325x: - if: ${{ inputs.runner == 'mi325x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi325x-amd_0' - - 'mi325x-tw_0' - - 'mi325x-tw_1' - - 'mi325x-tw_2' - - 'mi325x-tw_3' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - 
max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_mi355x: - if: ${{ inputs.runner == 'mi355x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi355x-amd_0' - - 'mi355x-amd_1' - - 'mi355x-amd_2' - - 'mi355x-amd_3' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_gb200: - if: ${{ inputs.runner == 'gb200' && inputs.framework == 'trt' }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - secrets: inherit - with: - runner: gb200 - image: 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' - model: 'deepseek-r1-fp4' - framework: 'dynamo-trtllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - mtp-mode: 'off' - - bmk_gb200-sgl: - if: ${{ inputs.runner == 'gb200' && inputs.framework == 'sglang' }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - secrets: inherit - with: - runner: gb200 - image: 'nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'dynamo-sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: 8192 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - mtp-mode: 'off' - - collect-test-results: - needs: [ bmk_h100, bmk_h200, bmk_b200, bmk_b200-trt, bmk_mi300x, bmk_mi325x, bmk_mi355x, bmk_gb200, bmk_gb200-sgl ] - if: ${{ always() && !cancelled() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml deleted file mode 100644 index 
0d92952da..000000000 --- a/.github/workflows/test.yml +++ /dev/null @@ -1,147 +0,0 @@ -name: Test - Full Sweep - -concurrency: - group: benchmark-lock - cancel-in-progress: false - -on: - pull_request: - workflow_dispatch: - inputs: - name: - description: "Name of benchmark from master configs" - required: true - type: string - default: 70b-fp4-mi355x-vllm - - run_1k1k: - description: "Run ISL/OSL 1k/1k" - type: boolean - required: true - run_1k8k: - description: "Run ISL/OSL 1k/8k" - type: boolean - required: true - run_8k1k: - description: "Run ISL/OSL 8k/1k" - type: boolean - required: true - - runner: - description: "Specific runner node to run on" - required: false - type: choice - options: - - "h100-cr_0" - - "h100-cr_1" - - "h100-cw_0" - - "h100-cw_1" - - "h200-cw_0" - - "h200-cw_1" - - "h200-nb_0" - - "h200-nb_1" - - "h200-nb_2" - - "h200-nb_3" - - "h200-nv_0" - - "h200-nv_1" - - "h200-nv_2" - - "h200-nv_3" - - "b200-nv_0" - - "b200-nv_1" - - "b200-nb_0" - - "b200-nb_1" - - "b200-nvd_0" - - "b200-nvd_1" - - "b200-nvd_2" - - "b200-nvd_3" - - "b200-tg_0" - - "mi300x-amd_0" - - "mi300x-amd_1" - - "mi300x-amd_2" - - "mi300x-amd_3" - - "mi300x-amd_4" - - "mi300x-cr_0" - - "mi300x-oci_0" - - "mi325x-amd_0" - - "mi325x-tw_0" - - "mi325x-tw_1" - - "mi325x-tw_2" - - "mi325x-tw_3" - - "mi355x-amd_0" - - "mi355x-amd_1" - - "mi355x-amd_2" - - "mi355x-amd_3" - -jobs: - get-jobs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - id: get-jobs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ - --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ - --key ${{ inputs.name }} \ - ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || 
'', inputs.run_8k1k && ' 8k1k' || '') || '' }}) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - - test-sweep: - needs: get-jobs - uses: ./.github/workflows/benchmark-tmpl.yml - name: test sweep - ${{ inputs.name }} - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "dsr1_1k1k" - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} - - calc-success-rate: - needs: test-sweep - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json From 2d1e45763befe5b095ac197fdbabf6d8aab82a2c Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 09:20:47 -0500 Subject: [PATCH 079/149] cleaning up after rebase --- .github/workflows/1k8k-sweep.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 581ec07cf..25fc3a362 100644 --- 
a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -4,11 +4,6 @@ concurrency: group: benchmark-lock-1k8k cancel-in-progress: false -on: - workflow_dispatch: - schedule: - - cron: '0 23 * * *' - on: # pull_request: workflow_dispatch: From 534d98c2ba3e535a6341b824272313751a07699c Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:38:01 -0500 Subject: [PATCH 080/149] adding docs for configs; adding field to configs --- .github/configs/CONFIGS.md | 52 +++++++++++++++++++ .github/configs/amd-master.yaml | 15 ++++-- .github/configs/nvidia-master.yaml | 17 ++++-- utils/matrix-logic/generate_sweep_configs.py | 19 ++++--- .../test_generate_sweep_configs.py | 24 +++++++++ 5 files changed, 110 insertions(+), 17 deletions(-) create mode 100644 .github/configs/CONFIGS.md diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md new file mode 100644 index 000000000..218e17821 --- /dev/null +++ b/.github/configs/CONFIGS.md @@ -0,0 +1,52 @@ +# Configs + +The config files in this directory are meant to be a "source of truth" for what benchmark configurations can/should be run. As such, they must follow a precise format which is described below. + +## Master Configs (AMD, NVIDIA, etc.) + +```yaml +entry-name: + image: string + model: string + model-prefix: string + runner: string + precision: string + framework: string + seq-len-configs: + - isl: int + osl: int + search-space: + - { tp: int, conc-start: int, conc-end: int } + # Optionally, specify 'ep' (expert-parallelism) and 'dp-attn' (data parallel attention) + - { tp: int, ep: int, dp-attn: bool, conc-start: int, conc-end: int } + - ... + - ... +``` +Note: while not required, `entry-name` typically takes the format `---`. 
+ +The below list describes what each field is: + +- `image`: The image used to serve the benchmark, e.g., `vllm/vllm-openai:v0.10.2` +- `model`: The model to server, e.g., `openai/gpt-oss-120b` +- `model-prefix`: The canonical InferenceMAX model prefix reference, i.e., `dsr1` for Deepseek, `gptoss` for gptoss-120b, etc. This value is used to decipher which script in `benchmarks/` should be used in order to launch the benchmark. +- `runner`: This is the runner on which to run the benchmark. This must be a valid runner (key or value) from `runners.yaml`. +- `precision`: The precision to run the benchmark. Again, this is used to find which script to run in `benchmarks/`. +- `framework`: The framework (serving runtime) to serve the benchmark, e.g., `vllm`, `sglang`, `trt`. +- `seq-len-configs`: A list of possible sequence lengths to benchmark. Each entry must have the following fields: + - `isl`: An integer representing the input sequence length, e.g., `1024` + - `osl`: An integer representing the output sequence length, e.g., `8192` + - `search-space`: A list of configurations to run with respective `isl` and `osl`, each entry must be a dict with the following fields: + - `tp`: An integer representing the tensor parallelism level that the configuration will be served at. + - `conc-start`: An integer representing the starting level of concurrency e.g., `4` + - `conc-end`: An integer representing the ending level of concurrency (inclusive) e.g., `128` + - Note: the step factor between `conc-start` and `conc-end` is 2, so if `conc-start` is 4 and `conc-end` is 128, all concurrencies `4, 8, 16, 32, ..., 128` will be run. + - (Optional) `ep`: An integer representing the expert parallelism level that the configuration will be served at. Default is 1 (no expert parallelism) when not specified. + - (Optional) `dp-attn`: A boolean representing whether or not to activate data parallel attention for the configuration. Default is false when not specified. 
+ +Notes: +- No extra fields besides the ones listed may be specified, or else the benchmarks will fail to run. +- Setting the fields above, particularly `ep` and `dp-attn`, only guarantee that the respective values will be passed as environment variables to the benchmark scripts! Actually using those environment variables is an implementation detail at the level of the benchmark Bash script. + +## Runners + +The `runners.yaml` config represents the available runners in the repository. The keys are the runner *types* (i.e., the GPUs as well as some specific combinations like `h200-trt`) whereas the value is a list of *runner nodes*. This config is used to verify the master configs. diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 55086d443..d9558f284 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1,6 +1,7 @@ -dsr1-fp4-mi355x-sgl: +dsr1-fp4-mi355x-sglang: image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 model: amd/DeepSeek-R1-0528-MXFP4-Preview + model-prefix: dsr1 runner: mi355x precision: fp4 framework: sglang @@ -19,9 +20,10 @@ dsr1-fp4-mi355x-sgl: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } -dsr1-fp8-mi300x-sgl: +dsr1-fp8-mi300x-sglang: image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 runner: mi300x precision: fp8 framework: sglang @@ -39,9 +41,10 @@ dsr1-fp8-mi300x-sgl: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } -dsr1-fp8-mi325x-sgl: +dsr1-fp8-mi325x-sglang: image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 runner: mi325x precision: fp8 framework: sglang @@ -59,9 +62,10 @@ dsr1-fp8-mi325x-sgl: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } -dsr1-fp8-mi355x-sgl: +dsr1-fp8-mi355x-sglang: image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 model: 
deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 runner: mi355x precision: fp8 framework: sglang @@ -82,6 +86,7 @@ dsr1-fp8-mi355x-sgl: gptoss-fp4-mi300x-vllm: image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 model: openai/gpt-oss-120b + model-prefix: gptoss runner: mi300x precision: fp4 framework: vllm @@ -111,6 +116,7 @@ gptoss-fp4-mi300x-vllm: gptoss-fp4-mi325x-vllm: image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 model: openai/gpt-oss-120b + model-prefix: gptoss runner: mi325x precision: fp4 framework: vllm @@ -140,6 +146,7 @@ gptoss-fp4-mi325x-vllm: gptoss-fp4-mi355x-vllm: image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 model: openai/gpt-oss-120b + model-prefix: gptoss runner: mi355x precision: fp4 framework: vllm diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9da1cd0f9..92dfb5bbd 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1,6 +1,7 @@ -dsr1-fp4-b200-sgl: +dsr1-fp4-b200-sglang: image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 model: nvidia/DeepSeek-R1-0528-FP4-V2 + model-prefix: dsr1 runner: b200 precision: fp4 framework: sglang @@ -24,6 +25,7 @@ dsr1-fp4-b200-sgl: dsr1-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 model: nvidia/DeepSeek-R1-0528-FP4-V2 + model-prefix: dsr1 runner: b200-trt precision: fp4 framework: trt @@ -70,9 +72,10 @@ dsr1-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } -dsr1-fp8-b200-sgl: +dsr1-fp8-b200-sglang: image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 runner: b200 precision: fp8 framework: sglang @@ -93,6 +96,7 @@ dsr1-fp8-b200-sgl: dsr1-fp8-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 runner: b200-trt precision: fp8 framework: trt @@ -115,9 +119,10 @@ 
dsr1-fp8-b200-trt: # If CONC > 64, then DP_ATTN=true - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } -dsr1-fp8-h200-sgl: +dsr1-fp8-h200-sglang: image: lmsysorg/sglang:v0.5.2rc2-cu126 model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 runner: h200 precision: fp8 framework: sglang @@ -138,6 +143,7 @@ dsr1-fp8-h200-sgl: dsr1-fp8-h200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 runner: h200-trt precision: fp8 framework: trt @@ -163,6 +169,7 @@ dsr1-fp8-h200-trt: gptoss-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1 model: openai/gpt-oss-120b + model-prefix: gptoss runner: b200-nvs precision: fp4 framework: trt @@ -193,6 +200,7 @@ gptoss-fp4-b200-trt: gptoss-fp4-b200-vllm: image: vllm/vllm-openai:v0.10.2 model: openai/gpt-oss-120b + model-prefix: gptoss runner: b200 precision: fp4 framework: vllm @@ -222,6 +230,7 @@ gptoss-fp4-b200-vllm: gptoss-fp4-h100-vllm: image: vllm/vllm-openai:v0.10.2 model: openai/gpt-oss-120b + model-prefix: gptoss runner: h100 precision: fp4 framework: vllm @@ -248,6 +257,7 @@ gptoss-fp4-h100-vllm: gptoss-fp4-h200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev model: openai/gpt-oss-120b + model-prefix: gptoss runner: h200-trt precision: fp4 framework: trt @@ -277,6 +287,7 @@ gptoss-fp4-h200-trt: gptoss-fp4-h200-vllm: image: vllm/vllm-openai:v0.10.2 model: openai/gpt-oss-120b + model-prefix: gptoss runner: h200 precision: fp4 framework: vllm diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 7574579af..a768554e1 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -67,6 +67,7 @@ def validate_master_configs_structure(all_config_data): required_fields = { 'image': str, 'model': str, + 'model-prefix': str, 'precision': str, 'framework': str, 'runner': str, @@ -202,7 +203,7 @@ def generate_full_sweep(args, 
all_config_data): precision = val['precision'] framework = val['framework'] runner = val['runner'] - model_code = key.split('-')[0] + model_code = val['model-prefix'] for seq_config in seq_len_configs: isl = seq_config['isl'] @@ -315,15 +316,15 @@ def generate_test_config(args, all_config_data): raise ValueError( f"Runner config file '{args.runner_config}' does not exist.") - # Extract model code from config key - model_code = args.key.split('-')[0] - val = all_config_data.get(args.key) if not val: raise ValueError( f"Specified key '{args.key}' does not exist in config files.") + # Extract model code from config + model_code = val['model-prefix'] + runner_nodes = runner_config.get(val['runner'], []) if args.runner_node not in runner_nodes: raise ValueError( @@ -447,9 +448,8 @@ def generate_runner_model_sweep_config(args, all_config_data): if val['runner'] != args.runner_type: continue - # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name - # so that it can be bubbled down to bash script benchmarks... this is probably a FIXME - model_code = key.split('-')[0] + # Get model code for exp_name + model_code = val['model-prefix'] # Find 1k1k config target_config = None @@ -561,9 +561,8 @@ def generate_runner_sweep_config(args, all_config_data): if (args.precision and val['precision'] != args.precision) or (args.framework and val['framework'] != args.framework): continue - # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name - # so that it can be bubbled down to bash script benchmarks... 
this is probably a FIXME - model_code = key.split('-')[0] + # Get model code for exp_name + model_code = val['model-prefix'] runner_nodes = runner_config.get(val['runner']) if not runner_nodes: diff --git a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix-logic/test_generate_sweep_configs.py index 36cb14cd7..cd5ff5b46 100644 --- a/utils/matrix-logic/test_generate_sweep_configs.py +++ b/utils/matrix-logic/test_generate_sweep_configs.py @@ -24,6 +24,7 @@ def sample_master_config(): "70b-fp8-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "meta-llama/Llama-3-70b", + "model-prefix": "70b", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -48,6 +49,7 @@ def sample_master_config(): "8b-fp4-trt": { "image": "nvcr.io/nvidia/tritonserver:24.01", "model": "meta-llama/Llama-3-8b", + "model-prefix": "8b", "precision": "fp4", "framework": "trt", "runner": "h100", @@ -64,6 +66,7 @@ def sample_master_config(): "gptoss-120b-fp8-vllm": { "image": "vllm/vllm-openai:latest", "model": "openai/gpt-oss-120b", + "model-prefix": "gptoss", "precision": "fp8", "framework": "vllm", "runner": "h200-trt", @@ -112,6 +115,7 @@ def invalid_master_config(): "missing-field": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", # Missing precision, framework, runner, seq-len-configs } } @@ -294,6 +298,7 @@ def test_validate_master_configs_structure_missing_field(): config = { "test-key": { "image": "test:latest", + "model-prefix": "test", # Missing other required fields } } @@ -307,6 +312,7 @@ def test_validate_master_configs_structure_wrong_type(): "test-key": { "image": 123, # Should be string "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -323,6 +329,7 @@ def test_validate_master_configs_structure_empty_seq_len_configs(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -339,6 
+346,7 @@ def test_validate_master_configs_structure_invalid_search_space(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -363,6 +371,7 @@ def test_validate_master_configs_structure_missing_search_space(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -385,6 +394,7 @@ def test_validate_master_configs_structure_search_space_not_list(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -407,6 +417,7 @@ def test_validate_master_configs_structure_extra_fields_in_search_space(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -436,6 +447,7 @@ def test_validate_master_configs_structure_missing_isl(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -457,6 +469,7 @@ def test_validate_master_configs_structure_wrong_isl_type(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -479,6 +492,7 @@ def test_validate_master_configs_structure_missing_osl(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -500,6 +514,7 @@ def test_validate_master_configs_structure_wrong_osl_type(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -522,6 +537,7 @@ def test_validate_master_configs_structure_wrong_tp_type(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", 
"precision": "fp8", "framework": "vllm", "runner": "h200", @@ -544,6 +560,7 @@ def test_validate_master_configs_structure_wrong_conc_start_type(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -566,6 +583,7 @@ def test_validate_master_configs_structure_wrong_conc_end_type(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -588,6 +606,7 @@ def test_validate_master_configs_structure_wrong_ep_type(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -610,6 +629,7 @@ def test_validate_master_configs_structure_wrong_dp_attn_type(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -784,6 +804,7 @@ def test_generate_full_sweep_seq_len_not_in_config(temp_config_files): "test-fp8-vllm": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -822,6 +843,7 @@ def test_generate_full_sweep_concurrency_overshoot(temp_config_files): "test-fp8-vllm": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -1010,6 +1032,7 @@ def test_generate_full_sweep_with_filters_concurrency_overshoot(temp_config_file "test-fp8-vllm": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -1146,6 +1169,7 @@ def test_generate_test_config_concurrency_overshoot(temp_config_files): "test-fp8-vllm": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", From 
e21692059db837348c7dafbbfbccf42a5f420a20 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:39:05 -0500 Subject: [PATCH 081/149] hash on dpa too --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 4fb327381..5a3ebfae4 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -67,7 +67,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} conc${{ inputs.conc }}' + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} dpa${{ inputs.dp-attn }} conc${{ inputs.conc }}' steps: - name: Resource cleanup run: | From 751d092031738e7f407862e0c092ff196f60eec8 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:41:01 -0500 Subject: [PATCH 082/149] debug --- .github/workflows/e2e-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index ff7ecb92b..5cdc94a5e 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -5,6 +5,7 @@ name: End-to-End Tests # cancel-in-progress: false on: + pull_request: workflow_dispatch: inputs: generate-cli-command: From d4f57a787a85ac83ebf63ed17a8ebf8e402cd66d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:44:22 -0500 Subject: [PATCH 083/149] debug --- .github/workflows/e2e-tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 5cdc94a5e..411c19a6f 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -10,8 +10,9 @@ on: inputs: generate-cli-command: description: "Command passed to generate matrix 
script" - required: true + required: false type: string + default: "filtered-sweep --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type h200 h200-trt" jobs: get-jobs: From 825aa7e5faf4cf07a6629d836e870592e0017a66 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:46:06 -0500 Subject: [PATCH 084/149] debug --- .github/workflows/e2e-tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 411c19a6f..404447b54 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -6,6 +6,9 @@ name: End-to-End Tests on: pull_request: + push: + branches: + - initial-refactor workflow_dispatch: inputs: generate-cli-command: From 232b33bb8a53fc2815f16760e17f8caa879a3a39 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:46:58 -0500 Subject: [PATCH 085/149] debug --- .github/workflows/e2e-tests.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 404447b54..ff7ecb92b 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -5,17 +5,12 @@ name: End-to-End Tests # cancel-in-progress: false on: - pull_request: - push: - branches: - - initial-refactor workflow_dispatch: inputs: generate-cli-command: description: "Command passed to generate matrix script" - required: false + required: true type: string - default: "filtered-sweep --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type h200 h200-trt" jobs: get-jobs: From d2d025ecfdab16c31f21c5ab41fe67cc996448b8 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:50:15 -0500 Subject: [PATCH 086/149] update 
hashing --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 5a3ebfae4..571b39888 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -67,7 +67,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} dpa${{ inputs.dp-attn }} conc${{ inputs.conc }}' + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' steps: - name: Resource cleanup run: | From e95af112be3c0dfd723b393eb0f193076726072a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:53:49 -0500 Subject: [PATCH 087/149] deleting extraneous file --- utils/matrix-logic/get_test_sweep_configs.py | 151 ------------------- 1 file changed, 151 deletions(-) delete mode 100644 utils/matrix-logic/get_test_sweep_configs.py diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py deleted file mode 100644 index 87ab0457b..000000000 --- a/utils/matrix-logic/get_test_sweep_configs.py +++ /dev/null @@ -1,151 +0,0 @@ -import json -import yaml -import sys -import argparse - -seq_len_stoi = { - "1k1k": (1024, 1024), - "1k8k": (1024, 8192), - "8k1k": (8192, 1024) -} - -def main(): - parser = argparse.ArgumentParser( - description='Generate benchmark matrix from a specific configuration key' - ) - parser.add_argument( - '--config-files', - nargs='+', - required=True, - help='One or more configuration files (YAML format)' - ) - parser.add_argument( - '--key', - required=True, - help='Configuration key to use' - ) - parser.add_argument( - '--seq-lens', - nargs='+', - choices=list(seq_len_stoi.keys()), - required=False, - help=f"Sequence length 
configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." - ) - parser.add_argument( - '--step-size', - type=int, - default=2, - help='Step size for concurrency values (default: 2)' - ) - - args = parser.parse_args() - - # Convert seq-lens to set of (isl, osl) tuples for filtering - seq_lens_filter = None - if args.seq_lens: - seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} - - # Load and merge all config files - all_config_data = {} - for config_file in args.config_files: - try: - with open(config_file, 'r') as f: - config_data = yaml.safe_load(f) - assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" - - # Check for duplicate keys - duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) - if duplicate_keys: - raise ValueError( - f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" - ) - - all_config_data.update(config_data) - except FileNotFoundError: - raise ValueError(f"Input file '{config_file}' does not exist.") - - # Check if the key exists - if args.key not in all_config_data: - available_keys = ', '.join(sorted(all_config_data.keys())) - raise ValueError( - f"Key '{args.key}' not found in configuration files. 
" - f"Available keys: {available_keys}" - ) - - val = all_config_data[args.key] - - # Validate required fields - seq_len_configs = val.get('seq-len-configs') - assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" - - image = val.get('image') - model = val.get('model') - precision = val.get('precision') - framework = val.get('framework') - runner = val.get('runner') - - assert None not in (image, model, precision, framework, runner), \ - f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" - - matrix_values = [] - - # Process each sequence length configuration - for seq_config in seq_len_configs: - isl = seq_config.get('isl') - osl = seq_config.get('osl') - - assert None not in (isl, osl), \ - f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" - - # Filter by sequence lengths if specified - if seq_lens_filter and (isl, osl) not in seq_lens_filter: - continue - - bmk_space = seq_config.get('bmk-space') - assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" - - for bmk in bmk_space: - tp = bmk.get('tp') - conc_start = bmk.get('conc-start') - conc_end = bmk.get('conc-end') - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') - - assert None not in (tp, conc_start, conc_end), \ - f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" - - # Generate entries for each concurrency value in the range - conc = conc_start - while conc <= conc_end: - entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc, - 'max-model-len': isl + osl, - } - - # Add optional fields if they exist - if ep is not None: - entry['ep'] = ep - if dp_attn is not None: - entry['dp-attn'] = dp_attn - - matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - print(json.dumps(matrix_values)) - 
return matrix_values - -if __name__ == "__main__": - main() \ No newline at end of file From bed5406a4e3b90fea8fcbf6450d41a593514012d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 12:39:01 -0500 Subject: [PATCH 088/149] adding gb200 --- .github/workflows/1k1k-sweep.yml | 47 ++++++++++++++++++- .../workflows/benchmark-multinode-tmpl.yml | 3 +- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index cbdc490e2..3c1cd01ae 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -87,8 +87,53 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + # This is a workaround until we can integrate GB200 into master configs. + benchmark-gb200: + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gptoss 1k1k + strategy: + fail-fast: false + matrix: + config: + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "off", + } + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "on", + } + - { + "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "dynamo-sglang", + "mtp": "off", + } + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.runner }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_1k1k + isl: 1024 + osl: 1024 + max-model-len: 2048 + mtp-mode: ${{ matrix.config.mtp }} + collect-dsr1-results: - needs: benchmark-dsr1 + needs: [benchmark-dsr1, benchmark-gb200] if: 
${{ always() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 07f5b876d..0386e7d55 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -31,8 +31,9 @@ on: required: true type: string random-range-ratio: - required: true + required: false type: string + default: '0.8' mtp-mode: required: true type: string From 475559a4d79dcb79affc51e79e80bdac1b9fafd1 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 12:40:40 -0500 Subject: [PATCH 089/149] adding gb200 pt 2 --- .github/workflows/1k1k-sweep.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 3c1cd01ae..de26a1af0 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -90,7 +90,7 @@ jobs: # This is a workaround until we can integrate GB200 into master configs. 
benchmark-gb200: uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gptoss 1k1k + name: gb200 1k1k sweep strategy: fail-fast: false matrix: From f24799bb8f2723dc7b1b92b3668e6feeb93be402 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 12:47:04 -0500 Subject: [PATCH 090/149] adding gb200 pt 3 --- .github/workflows/1k1k-sweep.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index de26a1af0..699b0baff 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -122,7 +122,7 @@ jobs: secrets: inherit with: runner: gb200 - image: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} From 5f61cd3c75875357c056c5716340ff688cbc3662 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 12:50:56 -0500 Subject: [PATCH 091/149] adding gb200 to other isl osl sweeps --- .github/workflows/1k8k-sweep.yml | 45 ++++++++++++++++++++++++++++++++ .github/workflows/8k1k-sweep.yml | 45 ++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 25fc3a362..78d9b939b 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -85,6 +85,51 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + # This is a workaround until we can integrate GB200 into master configs. 
+ benchmark-gb200: + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "off", + } + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "on", + } + - { + "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "dynamo-sglang", + "mtp": "off", + } + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_1k8k + isl: 1024 + osl: 8192 + max-model-len: 2048 + mtp-mode: ${{ matrix.config.mtp }} + collect-dsr1-results: needs: benchmark-dsr1 if: ${{ always() }} diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index c8338d533..179b542ae 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -85,6 +85,51 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + # This is a workaround until we can integrate GB200 into master configs. 
+ benchmark-gb200: + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "off", + } + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "on", + } + - { + "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "dynamo-sglang", + "mtp": "off", + } + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_8k1k + isl: 8192 + osl: 1024 + max-model-len: 2048 + mtp-mode: ${{ matrix.config.mtp }} + collect-dsr1-results: needs: benchmark-dsr1 if: ${{ always() }} From 89ebc6e099c213d67013816b00a868b46ff2cb93 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 15:09:18 -0500 Subject: [PATCH 092/149] adding gb200 to other isl osl sweeps --- .github/workflows/gb200-tests.yml | 97 +++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 .github/workflows/gb200-tests.yml diff --git a/.github/workflows/gb200-tests.yml b/.github/workflows/gb200-tests.yml new file mode 100644 index 000000000..8cc4d775f --- /dev/null +++ b/.github/workflows/gb200-tests.yml @@ -0,0 +1,97 @@ +name: End-to-End Tests + +# concurrency: +# group: benchmark-lock +# cancel-in-progress: false + +on: + push: + branches: + - initial-refactor + workflow_dispatch: + inputs: + image: + description: "Docker Image" + required: true + type: choice + 
options: + - "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1" + - "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3" + + model: + description: "Model" + required: true + type: choice + options: + - "deepseek-ai/DeepSeek-R1-0528" + - "deepseek-r1-fp4" + + precision: + description: "Precision" + required: true + type: choice + options: + - "fp4" + - "fp8" + + framework: + description: "Framework" + required: true + type: choice + options: + - "dynamo-trtllm" + - "dynamo-sglang" + + mtp: + description: "Mtp On/Off" + required: true + type: choice + options: + - "on" + - "off" + + isl: + description: "ISL" + required: true + type: string + + osl: + description: "OSL" + required: true + type: string + +jobs: + pre-run: + runs-on: ubuntu-latest + outputs: + max-model-len: ${{ steps.calc.outputs.max-model-len }} + steps: + - id: calc + shell: python + run: | + import os + import sys + try: + isl = int("${{ inputs.isl }}") + osl = int("${{ inputs.osl }}") + except ValueError: + print("Error: ISL and OSL must be integers") + sys.exit(1) + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write(f"max-model-len={isl + osl}\n") + + benchmark-gb200: + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 test + secrets: inherit + with: + runner: gb200 + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: dsr1_1k1k + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ needs.pre-run.outputs.max-model-len }} + mtp-mode: ${{ inputs.mtp }} From 04b614aafc88eca5b20ea2f6e8132210f65d7981 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 15:10:00 -0500 Subject: [PATCH 093/149] adding gb200 test --- .github/workflows/gb200-tests.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/gb200-tests.yml b/.github/workflows/gb200-tests.yml index 8cc4d775f..ff6936a56 100644 --- 
a/.github/workflows/gb200-tests.yml +++ b/.github/workflows/gb200-tests.yml @@ -5,9 +5,6 @@ name: End-to-End Tests # cancel-in-progress: false on: - push: - branches: - - initial-refactor workflow_dispatch: inputs: image: @@ -81,6 +78,7 @@ jobs: f.write(f"max-model-len={isl + osl}\n") benchmark-gb200: + needs: pre-run uses: ./.github/workflows/benchmark-multinode-tmpl.yml name: gb200 test secrets: inherit From ab052fdbf93c60de1405bae0076759820a47bb94 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 15:10:44 -0500 Subject: [PATCH 094/149] adding gb200 test --- .github/workflows/gb200-tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/gb200-tests.yml b/.github/workflows/gb200-tests.yml index ff6936a56..9de931457 100644 --- a/.github/workflows/gb200-tests.yml +++ b/.github/workflows/gb200-tests.yml @@ -1,10 +1,13 @@ -name: End-to-End Tests +name: GB200 Tests # concurrency: # group: benchmark-lock # cancel-in-progress: false on: + push: + branches: + - initial-refactor workflow_dispatch: inputs: image: From 6495caa484370dc9a23ca0196eb39f55cf7cbd9a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 15:10:52 -0500 Subject: [PATCH 095/149] adding gb200 test --- .github/workflows/gb200-tests.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/gb200-tests.yml b/.github/workflows/gb200-tests.yml index 9de931457..5fc7b6636 100644 --- a/.github/workflows/gb200-tests.yml +++ b/.github/workflows/gb200-tests.yml @@ -5,9 +5,6 @@ name: GB200 Tests # cancel-in-progress: false on: - push: - branches: - - initial-refactor workflow_dispatch: inputs: image: From 589382d7eaf8b595a90351c15818fdb2b4c16302 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 17:15:29 -0500 Subject: [PATCH 096/149] adding full sweep test --- .github/workflows/full-sweep-test.yml | 371 +++++++++++++++++++ .github/workflows/gb200-tests.yml | 6 +- 
utils/matrix-logic/generate_sweep_configs.py | 13 + 3 files changed, 385 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/full-sweep-test.yml diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml new file mode 100644 index 000000000..d5b1894b8 --- /dev/null +++ b/.github/workflows/full-sweep-test.yml @@ -0,0 +1,371 @@ +name: Test - Full Sweep + +concurrency: + group: benchmark-lock + cancel-in-progress: false + +on: + push: + branches: + - initial-refactor + workflow_dispatch: + inputs: + run_1k1k: + type: boolean + required: false + run_8k1k: + type: boolean + required: false + run_1k8k: + type: boolean + required: false + + use_h100: + type: boolean + required: false + use_h200: + type: boolean + required: false + use_b200: + type: boolean + required: false + use_mi300x: + type: boolean + required: false + use_mi325x: + type: boolean + required: false + use_mi355x: + type: boolean + required: false + use_gb200: + type: boolean + required: false + +jobs: + get-configs: + runs-on: ubuntu-latest + outputs: + dsr1-1k1k: ${{ steps.generate-configs.outputs.dsr1-1k1k }} + dsr1-1k8k: ${{ steps.generate-configs.outputs.dsr1-1k8k }} + dsr1-8k1k: ${{ steps.generate-configs.outputs.dsr1-8k1k }} + gptoss-1k1k: ${{ steps.generate-configs.outputs.gptoss-1k1k }} + gptoss-1k8k: ${{ steps.generate-configs.outputs.gptoss-1k8k }} + gptoss-8k1k: ${{ steps.generate-configs.outputs.gptoss-8k1k }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: generate-configs + run: | + pip install pydantic + + # Build runner type filter based on inputs + RUNNER_TYPES="" + if [ "${{ inputs.use_h100 }}" = "true" ]; then + RUNNER_TYPES="$RUNNER_TYPES h100" + fi + if [ "${{ inputs.use_h200 }}" = "true" ]; then + RUNNER_TYPES="$RUNNER_TYPES h200 h200-trt" + fi + if [ "${{ inputs.use_b200 }}" = "true" ]; then + RUNNER_TYPES="$RUNNER_TYPES b200 b200-trt b200-nvs" + fi + if [ "${{ inputs.use_mi300x }}" = "true" ]; then + 
RUNNER_TYPES="$RUNNER_TYPES mi300x" + fi + if [ "${{ inputs.use_mi325x }}" = "true" ]; then + RUNNER_TYPES="$RUNNER_TYPES mi325x" + fi + if [ "${{ inputs.use_mi355x }}" = "true" ]; then + RUNNER_TYPES="$RUNNER_TYPES mi355x" + fi + + # Build runner filter argument if runner types specified + RUNNER_FILTER="" + if [ -n "$RUNNER_TYPES" ]; then + RUNNER_FILTER="--runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml" + fi + + # Generate dsr1 configs + if [ "${{ inputs.run_1k1k }}" = "true" ]; then + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 $RUNNER_FILTER) + echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT + else + echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_1k8k }}" = "true" ]; then + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 $RUNNER_FILTER) + echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT + else + echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_8k1k }}" = "true" ]; then + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 $RUNNER_FILTER) + echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT + else + echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT + fi + + # Generate gptoss configs + if [ "${{ inputs.run_1k1k }}" = "true" ]; then + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files 
${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss $RUNNER_FILTER) + echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_1k8k }}" = "true" ]; then + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss $RUNNER_FILTER) + echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_8k1k }}" = "true" ]; then + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss $RUNNER_FILTER) + echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT + fi + + # DSR1 1K1K Benchmarks + benchmark-dsr1-1k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.dsr1-1k1k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-dsr1-1k1k-results: + needs: benchmark-dsr1-1k1k + if: ${{ 
always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_1k1k" + + # GPTOSS 1K1K Benchmarks + benchmark-gptoss-1k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.gptoss-1k1k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-gptoss-1k1k-results: + needs: benchmark-gptoss-1k1k + if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_1k1k" + + # DSR1 8K1K Benchmarks + benchmark-dsr1-8k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.dsr1-8k1k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.dsr1-8k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + 
collect-dsr1-8k1k-results: + needs: benchmark-dsr1-8k1k + if: ${{ always() && needs.get-configs.outputs.dsr1-8k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_8k1k" + + # GPTOSS 8K1K Benchmarks + benchmark-gptoss-8k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.gptoss-8k1k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.gptoss-8k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-gptoss-8k1k-results: + needs: benchmark-gptoss-8k1k + if: ${{ always() && needs.get-configs.outputs.gptoss-8k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_8k1k" + + # DSR1 1K8K Benchmarks + benchmark-dsr1-1k8k: + needs: get-configs + if: ${{ needs.get-configs.outputs.dsr1-1k8k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k8k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: 
${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-dsr1-1k8k-results: + needs: benchmark-dsr1-1k8k + if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_1k8k" + + # GPTOSS 1K8K Benchmarks + benchmark-gptoss-1k8k: + needs: get-configs + if: ${{ needs.get-configs.outputs.gptoss-1k8k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k8k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-gptoss-1k8k-results: + needs: benchmark-gptoss-1k8k + if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_1k8k" + + calc-success-rate: + needs: + [ + collect-dsr1-1k1k-results, + collect-dsr1-1k8k-results, + collect-dsr1-8k1k-results, + collect-gptoss-1k1k-results, + collect-gptoss-1k8k-results, + collect-gptoss-8k1k-results, + ] + if: ${{ always() }} + runs-on: ubuntu-latest + + env: + RESULTS_DIR: "results/" + STATS_FILENAME: "run_stats" + GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download results artifacts + uses: actions/download-artifact@v4 + with: + path: ${{ env.RESULTS_DIR }} + pattern: results_* + + - name: Install python dependencies + run: pip install 
PyGithub + + - name: Calculate success rate + run: python3 utils/calc_success_rate.py $STATS_FILENAME + + - uses: actions/upload-artifact@v4 + with: + name: "run-stats" + path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/gb200-tests.yml b/.github/workflows/gb200-tests.yml index 5fc7b6636..c700599d9 100644 --- a/.github/workflows/gb200-tests.yml +++ b/.github/workflows/gb200-tests.yml @@ -1,14 +1,10 @@ name: GB200 Tests -# concurrency: -# group: benchmark-lock -# cancel-in-progress: false - on: workflow_dispatch: inputs: image: - description: "Docker Image" + description: "Serving Image" required: true type: choice options: diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index a768554e1..c43a1759e 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -551,11 +551,19 @@ def generate_runner_sweep_config(args, all_config_data): raise ValueError( f"Runner config file '{args.runner_config}' does not exist.") + if not runner_config.get(args.runner_type): + raise ValueError( + f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") + + matrix_values = [] for key, val in all_config_data.items(): # Only consider configs with specified runner if not key.startswith(args.model_prefix): continue + + if not val['runner'] == args.runner_type: + continue # Optionally filter by precision and framework if (args.precision and val['precision'] != args.precision) or (args.framework and val['framework'] != args.framework): @@ -807,6 +815,11 @@ def main(): add_help=False, help='Given a model (and optionally a precision and framework), find all configurations matching the inputs, and run those configurations across all compatible runner nodes. This is meant to validate all runner nodes that should run a particular model can. 
For instance, this should be used to validate that all runners nodes that should run gptoss-120b actually do so successfully.' ) + test_config_parser.add_argument( + '--runner-type', + required=True, + help='Runner type (e.g., h200-trt, h100)' + ) test_config_parser.add_argument( '--model-prefix', required=True, From b920ec4d48b201447295a64c3a61e71c86397072 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 17:15:53 -0500 Subject: [PATCH 097/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index d5b1894b8..f7fdf9fc2 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -5,9 +5,6 @@ concurrency: cancel-in-progress: false on: - push: - branches: - - initial-refactor workflow_dispatch: inputs: run_1k1k: From d4c5dbc64871915c2ab0708c9001d208462b4f38 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:06:53 -0500 Subject: [PATCH 098/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 49 +++++++++++++++++++-------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index f7fdf9fc2..eb1abbd25 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -1,8 +1,8 @@ name: Test - Full Sweep -concurrency: - group: benchmark-lock - cancel-in-progress: false +# concurrency: +# group: benchmark-lock +# cancel-in-progress: false on: workflow_dispatch: @@ -77,30 +77,39 @@ jobs: if [ "${{ inputs.use_mi355x }}" = "true" ]; then RUNNER_TYPES="$RUNNER_TYPES mi355x" fi - - # Build runner filter argument if runner types specified - RUNNER_FILTER="" - if [ -n "$RUNNER_TYPES" ]; then - RUNNER_FILTER="--runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml" + if [ 
"${{ inputs.use_gb200 }}" = "true" ]; then + RUNNER_TYPES="$RUNNER_TYPES gb200" fi # Generate dsr1 configs if [ "${{ inputs.run_1k1k }}" = "true" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 $RUNNER_FILTER) + if [ -n "$RUNNER_TYPES" ]; then + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) + fi echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT else echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_1k8k }}" = "true" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 $RUNNER_FILTER) + if [ -n "$RUNNER_TYPES" ]; then + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files 
${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) + fi echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT else echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_8k1k }}" = "true" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 $RUNNER_FILTER) + if [ -n "$RUNNER_TYPES" ]; then + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) + fi echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT else echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT @@ -108,21 +117,33 @@ jobs: # Generate gptoss configs if [ "${{ inputs.run_1k1k }}" = "true" ]; then - GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss $RUNNER_FILTER) + if [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type 
$RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) + fi echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT else echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_1k8k }}" = "true" ]; then - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss $RUNNER_FILTER) + if [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) + fi echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT else echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_8k1k }}" = "true" ]; then - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss $RUNNER_FILTER) + if [ -n "$RUNNER_TYPES" ]; then + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep 
--config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) + fi echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT else echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT From 02deb3dae328104abe34dbce90ef321aabc29c8a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:11:19 -0500 Subject: [PATCH 099/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index eb1abbd25..bb5246a71 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -60,27 +60,30 @@ jobs: # Build runner type filter based on inputs RUNNER_TYPES="" if [ "${{ inputs.use_h100 }}" = "true" ]; then - RUNNER_TYPES="$RUNNER_TYPES h100" + RUNNER_TYPES="${RUNNER_TYPES} h100" fi if [ "${{ inputs.use_h200 }}" = "true" ]; then - RUNNER_TYPES="$RUNNER_TYPES h200 h200-trt" + RUNNER_TYPES="${RUNNER_TYPES} h200 h200-trt" fi if [ "${{ inputs.use_b200 }}" = "true" ]; then - RUNNER_TYPES="$RUNNER_TYPES b200 b200-trt b200-nvs" + RUNNER_TYPES="${RUNNER_TYPES} b200 b200-trt b200-nvs" fi if [ "${{ inputs.use_mi300x }}" = "true" ]; then - RUNNER_TYPES="$RUNNER_TYPES mi300x" + RUNNER_TYPES="${RUNNER_TYPES} mi300x" fi if [ "${{ inputs.use_mi325x }}" = "true" ]; then - RUNNER_TYPES="$RUNNER_TYPES mi325x" + RUNNER_TYPES="${RUNNER_TYPES} mi325x" fi if [ "${{ inputs.use_mi355x }}" = "true" ]; then - RUNNER_TYPES="$RUNNER_TYPES mi355x" + 
RUNNER_TYPES="${RUNNER_TYPES} mi355x" fi if [ "${{ inputs.use_gb200 }}" = "true" ]; then - RUNNER_TYPES="$RUNNER_TYPES gb200" + RUNNER_TYPES="${RUNNER_TYPES} gb200" fi + # Trim leading whitespace + RUNNER_TYPES=$(echo $RUNNER_TYPES | xargs) + # Generate dsr1 configs if [ "${{ inputs.run_1k1k }}" = "true" ]; then if [ -n "$RUNNER_TYPES" ]; then From 18c26b3a0af0dc6f76561ec594720449dfe5a271 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:11:58 -0500 Subject: [PATCH 100/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index bb5246a71..a89f61421 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -56,6 +56,7 @@ jobs: - id: generate-configs run: | pip install pydantic + set -x # Build runner type filter based on inputs RUNNER_TYPES="" From f1477e53b1e1301f070c1f2aa5ddd4d57c732cc9 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:16:09 -0500 Subject: [PATCH 101/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index a89f61421..e81a71f42 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -85,10 +85,13 @@ jobs: # Trim leading whitespace RUNNER_TYPES=$(echo $RUNNER_TYPES | xargs) + # DSR1 does not run on h100, so filter it out for dsr1 configs + DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) + # Generate dsr1 configs if [ "${{ inputs.run_1k1k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml 
${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + if [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) else DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) fi @@ -98,8 +101,8 @@ jobs: fi if [ "${{ inputs.run_1k8k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + if [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) else DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) fi @@ -109,8 +112,8 @@ jobs: fi if [ "${{ 
inputs.run_8k1k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + if [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) else DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) fi From d64d907e621cf3c4097ab1c73a2c73c8ecb35c62 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:16:27 -0500 Subject: [PATCH 102/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index e81a71f42..8edbf8712 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -56,7 +56,6 @@ jobs: - id: generate-configs run: | pip install pydantic - set -x # Build runner type filter based on inputs RUNNER_TYPES="" From d6bf37e3e5ded92de33c67b02b65dcf1c338351a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:24:16 -0500 Subject: [PATCH 103/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 8edbf8712..73a230beb 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -16,7 +16,6 @@ on: run_1k8k: type: boolean required: false - use_h100: type: boolean required: false @@ -56,7 +55,8 @@ jobs: - id: generate-configs run: | pip install pydantic - + + set -x # Build runner type filter based on inputs RUNNER_TYPES="" if [ "${{ inputs.use_h100 }}" = "true" ]; then From dba3b4cd04704f0a28304b6fb6c666e8f0e8fb8a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:35:17 -0500 Subject: [PATCH 104/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 126 ++++++++++++++------------ 1 file changed, 69 insertions(+), 57 deletions(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 73a230beb..9abfe4118 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -55,10 +55,10 @@ jobs: - id: generate-configs run: | pip install pydantic - - set -x - # Build runner type filter based on inputs + + # Build runner type filters based on inputs RUNNER_TYPES="" + if [ "${{ inputs.use_h100 }}" = "true" ]; then RUNNER_TYPES="${RUNNER_TYPES} h100" fi @@ -84,76 +84,88 @@ jobs: # Trim leading whitespace RUNNER_TYPES=$(echo $RUNNER_TYPES | xargs) - # DSR1 does not run on h100, so filter it out for dsr1 configs + # DSR1 doesn't support H100, so exclude it DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) # Generate dsr1 configs if [ "${{ inputs.run_1k1k }}" = "true" ]; then if [ -n "$DSR1_RUNNER_TYPES" ]; then DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config 
${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT else - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) + echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT fi - echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT else echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT fi - if [ "${{ inputs.run_1k8k }}" = "true" ]; then - if [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) - fi - echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT - fi + # Generate dsr1 configs + if [ "${{ inputs.run_1k1k }}" = "true" ]; then + if [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml 
${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) + fi + echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT + else + echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT + fi - if [ "${{ inputs.run_8k1k }}" = "true" ]; then - if [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) - fi - echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT - fi + if [ "${{ inputs.run_1k8k }}" = "true" ]; then + if [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) + fi + echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT + else + echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT + fi - # Generate gptoss configs - if [ "${{ inputs.run_1k1k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K1K=$(python3 
${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) - fi - echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT - fi + if [ "${{ inputs.run_8k1k }}" = "true" ]; then + if [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) + fi + echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT + else + echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT + fi - if [ "${{ inputs.run_1k8k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - GPTOSS_1K8K=$(python3 
${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) - fi - echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT - fi + # Generate gptoss configs + if [ "${{ inputs.run_1k1k }}" = "true" ]; then + if [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) + fi + echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT + fi - if [ "${{ inputs.run_8k1k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) - fi - echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-8k1k=[]" >> 
$GITHUB_OUTPUT - fi + if [ "${{ inputs.run_1k8k }}" = "true" ]; then + if [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) + fi + echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_8k1k }}" = "true" ]; then + if [ -n "$RUNNER_TYPES" ]; then + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) + fi + echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT + fi # DSR1 1K1K Benchmarks benchmark-dsr1-1k1k: From a45e4bf7df1e1e4a9bec45dc458b903b7a2e5834 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:36:12 -0500 Subject: [PATCH 105/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 9abfe4118..759980b44 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -56,6 +56,7 @@ jobs: run: | pip install pydantic + set -x # Build runner type filters based on inputs RUNNER_TYPES="" From 60233aa37c21deb54c7acf3ca5185fbdf55a920b Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:38:49 -0500 Subject: [PATCH 106/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 115 +++++++++----------------- 1 file changed, 39 insertions(+), 76 deletions(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 759980b44..4de3595ab 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -56,7 +56,6 @@ jobs: run: | pip install pydantic - set -x # Build runner type filters based on inputs RUNNER_TYPES="" @@ -88,85 +87,49 @@ jobs: # DSR1 doesn't support H100, so exclude it DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) - # Generate dsr1 configs - if [ "${{ inputs.run_1k1k }}" = "true" ]; then - if [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT - fi + # Generate dsr1 configs (only if we have valid runner types for DSR1) + if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml 
${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT else echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT fi - # Generate dsr1 configs - if [ "${{ inputs.run_1k1k }}" = "true" ]; then - if [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) - fi - echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ]; then - if [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) - fi - echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ]; then - if [ -n 
"$DSR1_RUNNER_TYPES" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) - fi - echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT - fi - - # Generate gptoss configs - if [ "${{ inputs.run_1k1k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) - fi - echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config 
${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) - fi - echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) - fi - echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT - fi + if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT + else + echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files 
${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT + else + echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT + fi + + # Generate gptoss configs (only if we have runner types selected) + if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-8k1k=[]" 
>> $GITHUB_OUTPUT + fi # DSR1 1K1K Benchmarks benchmark-dsr1-1k1k: From c1b5ddd77ba1ab3f5f07592652eb880a19907b2d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:42:49 -0500 Subject: [PATCH 107/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 4de3595ab..645af1cdb 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -55,7 +55,8 @@ jobs: - id: generate-configs run: | pip install pydantic - + + set -x # Build runner type filters based on inputs RUNNER_TYPES="" From 2cd02954309bbc837085d6d2cd80cbc6963090c5 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:51:51 -0500 Subject: [PATCH 108/149] adding full sweep test pt 2 --- .github/workflows/1k8k-sweep.yml | 4 +- .github/workflows/8k1k-sweep.yml | 4 +- .github/workflows/full-sweep-test.yml | 92 ++++++++++++++++++++++++++- 3 files changed, 94 insertions(+), 6 deletions(-) diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 78d9b939b..837033312 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -127,11 +127,11 @@ jobs: exp-name: ${{ matrix.config.model-prefix }}_1k8k isl: 1024 osl: 8192 - max-model-len: 2048 + max-model-len: 9216 mtp-mode: ${{ matrix.config.mtp }} collect-dsr1-results: - needs: benchmark-dsr1 + needs: [benchmark-dsr1, benchmark-gb200] if: ${{ always() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index 179b542ae..bc7e51e30 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -127,11 +127,11 @@ jobs: exp-name: ${{ matrix.config.model-prefix }}_8k1k isl: 8192 osl: 1024 - max-model-len: 2048 + max-model-len: 9216 mtp-mode: ${{ 
matrix.config.mtp }} collect-dsr1-results: - needs: benchmark-dsr1 + needs: [benchmark-dsr1, benchmark-gb200] if: ${{ always() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 645af1cdb..d5340dc60 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -55,7 +55,7 @@ jobs: - id: generate-configs run: | pip install pydantic - + set -x # Build runner type filters based on inputs RUNNER_TYPES="" @@ -289,8 +289,96 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + # This is a workaround until we can integrate GB200 into master configs. + benchmark-gb200-1k1k: + if: ${{ inputs.use_gb200 && inputs.run_1k1k }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: &dsr1_static_configs + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "off", + } + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "on", + } + - { + "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "dynamo-sglang", + "mtp": "off", + } + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_1k1k + isl: 1024 + osl: 1024 + max-model-len: 2048 + mtp-mode: ${{ matrix.config.mtp }} + + benchmark-gb200-1k8k: + if: ${{ inputs.use_gb200 && inputs.run_1k8k }} 
+ uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: *dsr1_static_configs + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_1k8k + isl: 1024 + osl: 8192 + max-model-len: 9216 + mtp-mode: ${{ matrix.config.mtp }} + + benchmark-gb200-8k1k: + if: ${{ inputs.use_gb200 && inputs.run_8k1k }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: *dsr1_static_configs + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_8k1k + isl: 1024 + osl: 8192 + max-model-len: 9216 + mtp-mode: ${{ matrix.config.mtp }} + collect-dsr1-1k8k-results: - needs: benchmark-dsr1-1k8k + needs: [benchmark-dsr1-1k8k, benchmark-gb200-1k1k, benchmark-gb200-1k8k, benchmark-gb200-8k1k] if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} uses: ./.github/workflows/collect-results.yml secrets: inherit From 3065c13acdb77f92066192741fcaeeaac242ba03 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 21:45:07 -0500 Subject: [PATCH 109/149] reverting title --- .github/workflows/1k1k-sweep.yml | 2 +- .github/workflows/1k8k-sweep.yml | 2 +- .github/workflows/8k1k-sweep.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 699b0baff..bc5305460 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,4 +1,4 @@ -name: "1K/1K Sweep" +name: "Full Sweep Scheduler - 1k1k" concurrency: group: benchmark-lock-1k1k diff 
--git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 837033312..da4d1daf3 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -1,4 +1,4 @@ -name: "1K/8K Sweep" +name: "Full Sweep Scheduler - 1k8k" concurrency: group: benchmark-lock-1k8k diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index bc7e51e30..fa3249da7 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -1,4 +1,4 @@ -name: "8K/1K Sweep" +name: "Full Sweep Scheduler - 8k1k" concurrency: group: benchmark-lock-8k1k From 89d6dc3d08f2b18cb9c625cdcc671748665c6789 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 22:05:28 -0500 Subject: [PATCH 110/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test-orig.yml | 460 +++++++++++++++++++++ .github/workflows/full-sweep-test.yml | 35 +- 2 files changed, 468 insertions(+), 27 deletions(-) create mode 100644 .github/workflows/full-sweep-test-orig.yml diff --git a/.github/workflows/full-sweep-test-orig.yml b/.github/workflows/full-sweep-test-orig.yml new file mode 100644 index 000000000..d5340dc60 --- /dev/null +++ b/.github/workflows/full-sweep-test-orig.yml @@ -0,0 +1,460 @@ +name: Test - Full Sweep + +# concurrency: +# group: benchmark-lock +# cancel-in-progress: false + +on: + workflow_dispatch: + inputs: + run_1k1k: + type: boolean + required: false + run_8k1k: + type: boolean + required: false + run_1k8k: + type: boolean + required: false + use_h100: + type: boolean + required: false + use_h200: + type: boolean + required: false + use_b200: + type: boolean + required: false + use_mi300x: + type: boolean + required: false + use_mi325x: + type: boolean + required: false + use_mi355x: + type: boolean + required: false + use_gb200: + type: boolean + required: false + +jobs: + get-configs: + runs-on: ubuntu-latest + outputs: + dsr1-1k1k: ${{ steps.generate-configs.outputs.dsr1-1k1k }} + dsr1-1k8k: ${{ 
steps.generate-configs.outputs.dsr1-1k8k }} + dsr1-8k1k: ${{ steps.generate-configs.outputs.dsr1-8k1k }} + gptoss-1k1k: ${{ steps.generate-configs.outputs.gptoss-1k1k }} + gptoss-1k8k: ${{ steps.generate-configs.outputs.gptoss-1k8k }} + gptoss-8k1k: ${{ steps.generate-configs.outputs.gptoss-8k1k }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: generate-configs + run: | + pip install pydantic + + set -x + # Build runner type filters based on inputs + RUNNER_TYPES="" + + if [ "${{ inputs.use_h100 }}" = "true" ]; then + RUNNER_TYPES="${RUNNER_TYPES} h100" + fi + if [ "${{ inputs.use_h200 }}" = "true" ]; then + RUNNER_TYPES="${RUNNER_TYPES} h200 h200-trt" + fi + if [ "${{ inputs.use_b200 }}" = "true" ]; then + RUNNER_TYPES="${RUNNER_TYPES} b200 b200-trt b200-nvs" + fi + if [ "${{ inputs.use_mi300x }}" = "true" ]; then + RUNNER_TYPES="${RUNNER_TYPES} mi300x" + fi + if [ "${{ inputs.use_mi325x }}" = "true" ]; then + RUNNER_TYPES="${RUNNER_TYPES} mi325x" + fi + if [ "${{ inputs.use_mi355x }}" = "true" ]; then + RUNNER_TYPES="${RUNNER_TYPES} mi355x" + fi + if [ "${{ inputs.use_gb200 }}" = "true" ]; then + RUNNER_TYPES="${RUNNER_TYPES} gb200" + fi + + # Trim leading whitespace + RUNNER_TYPES=$(echo $RUNNER_TYPES | xargs) + + # DSR1 doesn't support H100, so exclude it + DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) + + # Generate dsr1 configs (only if we have valid runner types for DSR1) + if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT + else + echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ 
inputs.run_1k8k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT + else + echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT + else + echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT + fi + + # Generate gptoss configs (only if we have runner types selected) + if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix 
gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT + fi + + # DSR1 1K1K Benchmarks + benchmark-dsr1-1k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.dsr1-1k1k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-dsr1-1k1k-results: + needs: benchmark-dsr1-1k1k + if: ${{ always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_1k1k" + + # GPTOSS 1K1K Benchmarks + benchmark-gptoss-1k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.gptoss-1k1k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: 
${{ fromJson(needs.get-configs.outputs.gptoss-1k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-gptoss-1k1k-results: + needs: benchmark-gptoss-1k1k + if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_1k1k" + + # DSR1 8K1K Benchmarks + benchmark-dsr1-8k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.dsr1-8k1k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.dsr1-8k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-dsr1-8k1k-results: + needs: benchmark-dsr1-8k1k + if: ${{ always() && needs.get-configs.outputs.dsr1-8k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_8k1k" + + # GPTOSS 8K1K Benchmarks + benchmark-gptoss-8k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.gptoss-8k1k != '[]' }} + uses: 
./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.gptoss-8k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-gptoss-8k1k-results: + needs: benchmark-gptoss-8k1k + if: ${{ always() && needs.get-configs.outputs.gptoss-8k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_8k1k" + + # DSR1 1K8K Benchmarks + benchmark-dsr1-1k8k: + needs: get-configs + if: ${{ needs.get-configs.outputs.dsr1-1k8k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k8k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + # This is a workaround until we can integrate GB200 into master configs. 
+ benchmark-gb200-1k1k: + if: ${{ inputs.use_gb200 && inputs.run_1k1k }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: &dsr1_static_configs + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "off", + } + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "on", + } + - { + "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "dynamo-sglang", + "mtp": "off", + } + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_1k1k + isl: 1024 + osl: 1024 + max-model-len: 2048 + mtp-mode: ${{ matrix.config.mtp }} + + benchmark-gb200-1k8k: + if: ${{ inputs.use_gb200 && inputs.run_1k8k }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: *dsr1_static_configs + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_1k8k + isl: 1024 + osl: 8192 + max-model-len: 9216 + mtp-mode: ${{ matrix.config.mtp }} + + benchmark-gb200-8k1k: + if: ${{ inputs.use_gb200 && inputs.run_8k1k }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: *dsr1_static_configs + 
secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_8k1k + isl: 1024 + osl: 8192 + max-model-len: 9216 + mtp-mode: ${{ matrix.config.mtp }} + + collect-dsr1-1k8k-results: + needs: [benchmark-dsr1-1k8k, benchmark-gb200-1k1k, benchmark-gb200-1k8k, benchmark-gb200-8k1k] + if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_1k8k" + + # GPTOSS 1K8K Benchmarks + benchmark-gptoss-1k8k: + needs: get-configs + if: ${{ needs.get-configs.outputs.gptoss-1k8k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k8k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-gptoss-1k8k-results: + needs: benchmark-gptoss-1k8k + if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_1k8k" + + calc-success-rate: + needs: + [ + collect-dsr1-1k1k-results, + collect-dsr1-1k8k-results, + collect-dsr1-8k1k-results, + collect-gptoss-1k1k-results, + collect-gptoss-1k8k-results, + collect-gptoss-8k1k-results, + ] + if: ${{ always() }} + runs-on: ubuntu-latest + + env: + RESULTS_DIR: "results/" + STATS_FILENAME: 
"run_stats" + GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download results artifacts + uses: actions/download-artifact@v4 + with: + path: ${{ env.RESULTS_DIR }} + pattern: results_* + + - name: Install python dependencies + run: pip install PyGithub + + - name: Calculate success rate + run: python3 utils/calc_success_rate.py $STATS_FILENAME + + - uses: actions/upload-artifact@v4 + with: + name: "run-stats" + path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index d5340dc60..0787b5c2a 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -58,32 +58,7 @@ jobs: set -x # Build runner type filters based on inputs - RUNNER_TYPES="" - - if [ "${{ inputs.use_h100 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} h100" - fi - if [ "${{ inputs.use_h200 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} h200 h200-trt" - fi - if [ "${{ inputs.use_b200 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} b200 b200-trt b200-nvs" - fi - if [ "${{ inputs.use_mi300x }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} mi300x" - fi - if [ "${{ inputs.use_mi325x }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} mi325x" - fi - if [ "${{ inputs.use_mi355x }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} mi355x" - fi - if [ "${{ inputs.use_gb200 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} gb200" - fi - - # Trim leading whitespace - RUNNER_TYPES=$(echo $RUNNER_TYPES | xargs) + RUNNER_TYPES="${{ inputs.use_h100 && 'h100' || '' }} ${{ inputs.use_h200 && 'h100' || '' }} ${{ inputs.use_h200 && 'h200 h200-trt' || '' }} ${{ inputs.use_b200 && 'b200 b200-trt' || '' }} ${{ inputs.use_mi300x && 'mi300x' || '' }} ${{ inputs.use_mi325x && 'mi325x' || '' }} ${{ inputs.use_mi355x && 'mi355x' || '' }}" # DSR1 doesn't support H100, so exclude it 
DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) @@ -378,7 +353,13 @@ jobs: mtp-mode: ${{ matrix.config.mtp }} collect-dsr1-1k8k-results: - needs: [benchmark-dsr1-1k8k, benchmark-gb200-1k1k, benchmark-gb200-1k8k, benchmark-gb200-8k1k] + needs: + [ + benchmark-dsr1-1k8k, + benchmark-gb200-1k1k, + benchmark-gb200-1k8k, + benchmark-gb200-8k1k, + ] if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} uses: ./.github/workflows/collect-results.yml secrets: inherit From 68e24620519fe9c59caeb46bea3ec7810525bf4d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 22:07:39 -0500 Subject: [PATCH 111/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test-orig.yml | 460 --------------------- 1 file changed, 460 deletions(-) delete mode 100644 .github/workflows/full-sweep-test-orig.yml diff --git a/.github/workflows/full-sweep-test-orig.yml b/.github/workflows/full-sweep-test-orig.yml deleted file mode 100644 index d5340dc60..000000000 --- a/.github/workflows/full-sweep-test-orig.yml +++ /dev/null @@ -1,460 +0,0 @@ -name: Test - Full Sweep - -# concurrency: -# group: benchmark-lock -# cancel-in-progress: false - -on: - workflow_dispatch: - inputs: - run_1k1k: - type: boolean - required: false - run_8k1k: - type: boolean - required: false - run_1k8k: - type: boolean - required: false - use_h100: - type: boolean - required: false - use_h200: - type: boolean - required: false - use_b200: - type: boolean - required: false - use_mi300x: - type: boolean - required: false - use_mi325x: - type: boolean - required: false - use_mi355x: - type: boolean - required: false - use_gb200: - type: boolean - required: false - -jobs: - get-configs: - runs-on: ubuntu-latest - outputs: - dsr1-1k1k: ${{ steps.generate-configs.outputs.dsr1-1k1k }} - dsr1-1k8k: ${{ steps.generate-configs.outputs.dsr1-1k8k }} - dsr1-8k1k: ${{ steps.generate-configs.outputs.dsr1-8k1k }} - gptoss-1k1k: ${{ steps.generate-configs.outputs.gptoss-1k1k }} - 
gptoss-1k8k: ${{ steps.generate-configs.outputs.gptoss-1k8k }} - gptoss-8k1k: ${{ steps.generate-configs.outputs.gptoss-8k1k }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - id: generate-configs - run: | - pip install pydantic - - set -x - # Build runner type filters based on inputs - RUNNER_TYPES="" - - if [ "${{ inputs.use_h100 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} h100" - fi - if [ "${{ inputs.use_h200 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} h200 h200-trt" - fi - if [ "${{ inputs.use_b200 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} b200 b200-trt b200-nvs" - fi - if [ "${{ inputs.use_mi300x }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} mi300x" - fi - if [ "${{ inputs.use_mi325x }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} mi325x" - fi - if [ "${{ inputs.use_mi355x }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} mi355x" - fi - if [ "${{ inputs.use_gb200 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} gb200" - fi - - # Trim leading whitespace - RUNNER_TYPES=$(echo $RUNNER_TYPES | xargs) - - # DSR1 doesn't support H100, so exclude it - DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) - - # Generate dsr1 configs (only if we have valid runner types for DSR1) - if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep 
--config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT - fi - - # Generate gptoss configs (only if we have runner types selected) - if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT - else - echo 
"gptoss-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT - fi - - # DSR1 1K1K Benchmarks - benchmark-dsr1-1k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-1k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-dsr1-1k1k-results: - needs: benchmark-dsr1-1k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_1k1k" - - # GPTOSS 1K1K Benchmarks - benchmark-gptoss-1k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-1k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ 
matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-gptoss-1k1k-results: - needs: benchmark-gptoss-1k1k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_1k1k" - - # DSR1 8K1K Benchmarks - benchmark-dsr1-8k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-8k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-8k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-dsr1-8k1k-results: - needs: benchmark-dsr1-8k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-8k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_8k1k" - - # GPTOSS 8K1K Benchmarks - benchmark-gptoss-8k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-8k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-8k1k) }} - secrets: inherit - with: - exp-name: ${{ 
matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-gptoss-8k1k-results: - needs: benchmark-gptoss-8k1k - if: ${{ always() && needs.get-configs.outputs.gptoss-8k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_8k1k" - - # DSR1 1K8K Benchmarks - benchmark-dsr1-1k8k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-1k8k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k8k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - # This is a workaround until we can integrate GB200 into master configs. 
- benchmark-gb200-1k1k: - if: ${{ inputs.use_gb200 && inputs.run_1k1k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep - strategy: - fail-fast: false - matrix: - config: &dsr1_static_configs - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "off", - } - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "on", - } - - { - "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", - "model": "deepseek-ai/DeepSeek-R1-0528", - "model-prefix": "dsr1", - "precision": "fp8", - "framework": "dynamo-sglang", - "mtp": "off", - } - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_1k1k - isl: 1024 - osl: 1024 - max-model-len: 2048 - mtp-mode: ${{ matrix.config.mtp }} - - benchmark-gb200-1k8k: - if: ${{ inputs.use_gb200 && inputs.run_1k8k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep - strategy: - fail-fast: false - matrix: - config: *dsr1_static_configs - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_1k8k - isl: 1024 - osl: 8192 - max-model-len: 9216 - mtp-mode: ${{ matrix.config.mtp }} - - benchmark-gb200-8k1k: - if: ${{ inputs.use_gb200 && inputs.run_8k1k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep - strategy: - fail-fast: false - matrix: - config: *dsr1_static_configs - 
secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_8k1k - isl: 1024 - osl: 8192 - max-model-len: 9216 - mtp-mode: ${{ matrix.config.mtp }} - - collect-dsr1-1k8k-results: - needs: [benchmark-dsr1-1k8k, benchmark-gb200-1k1k, benchmark-gb200-1k8k, benchmark-gb200-8k1k] - if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_1k8k" - - # GPTOSS 1K8K Benchmarks - benchmark-gptoss-1k8k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-1k8k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k8k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-gptoss-1k8k-results: - needs: benchmark-gptoss-1k8k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_1k8k" - - calc-success-rate: - needs: - [ - collect-dsr1-1k1k-results, - collect-dsr1-1k8k-results, - collect-dsr1-8k1k-results, - collect-gptoss-1k1k-results, - collect-gptoss-1k8k-results, - collect-gptoss-8k1k-results, - ] - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: 
"run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json From 04992c4cc79808de725c7a420f25b2edaddb1c82 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 22:10:47 -0500 Subject: [PATCH 112/149] reverting title --- .github/workflows/full-sweep-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 0787b5c2a..ed3b13f59 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -58,7 +58,7 @@ jobs: set -x # Build runner type filters based on inputs - RUNNER_TYPES="${{ inputs.use_h100 && 'h100' || '' }} ${{ inputs.use_h200 && 'h100' || '' }} ${{ inputs.use_h200 && 'h200 h200-trt' || '' }} ${{ inputs.use_b200 && 'b200 b200-trt' || '' }} ${{ inputs.use_mi300x && 'mi300x' || '' }} ${{ inputs.use_mi325x && 'mi325x' || '' }} ${{ inputs.use_mi355x && 'mi355x' || '' }}" + RUNNER_TYPES="${{ inputs.use_h100 && 'h100' || '' }} ${{ inputs.use_h200 && 'h200' || '' }} ${{ inputs.use_h200 && 'h200 h200-trt' || '' }} ${{ inputs.use_b200 && 'b200 b200-trt' || '' }} ${{ inputs.use_mi300x && 'mi300x' || '' }} ${{ inputs.use_mi325x && 'mi325x' || '' }} ${{ inputs.use_mi355x && 'mi355x' || '' }}" # DSR1 doesn't support H100, so exclude it DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) From f2f1a5ea16326cc1248799b8c01134f30b255701 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 31 Oct 2025 08:55:53 -0500 
Subject: [PATCH 113/149] fixing test files --- utils/matrix-logic/test_generate_sweep_configs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix-logic/test_generate_sweep_configs.py index cd5ff5b46..15c5f25a3 100644 --- a/utils/matrix-logic/test_generate_sweep_configs.py +++ b/utils/matrix-logic/test_generate_sweep_configs.py @@ -1236,6 +1236,7 @@ def test_generate_runner_sweep_config(sample_master_config, temp_config_files): class Args: model_prefix = "70b" + runner_type = "h200" precision = None framework = None runner_config = runner_file @@ -1250,6 +1251,7 @@ def test_generate_runner_sweep_config_with_filters(sample_master_config, temp_co class Args: model_prefix = "70b" + runner_type = "h200" precision = "fp8" framework = "vllm" runner_config = runner_file @@ -1265,6 +1267,7 @@ def test_generate_runner_sweep_config_no_matches(sample_master_config, temp_conf class Args: model_prefix = "nonexistent" + runner_type = "h200" precision = None framework = None runner_config = runner_file @@ -1393,6 +1396,7 @@ def test_main_runner_sweep(temp_config_files): "runner-sweep", "--config-files", master_file, "--runner-config", runner_file, + "--runner-type", "h200", "--model-prefix", "70b" ] From 9d2cbbba6e75077cda460597a5f39f8d43c4daa0 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 31 Oct 2025 08:57:40 -0500 Subject: [PATCH 114/149] fixing gha syntax error --- .github/workflows/full-sweep-test.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index ed3b13f59..bbbd574c2 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -9,33 +9,43 @@ on: inputs: run_1k1k: type: boolean + description: "When true, run 1k1k" required: false run_8k1k: type: boolean + description: "When true, run 8k1k" required: false run_1k8k: type: boolean + description: "When true, run 
1k8k" required: false use_h100: type: boolean + description: "When true, run H100" required: false use_h200: type: boolean + description: "When true, run H200" required: false use_b200: type: boolean + description: "When true, run B200" required: false use_mi300x: type: boolean + description: "When true, run MI300X" required: false use_mi325x: type: boolean + description: "When true, run MI325X" required: false use_mi355x: type: boolean + description: "When true, run MI355X" required: false use_gb200: type: boolean + description: "When true, run GB200" required: false jobs: From 7164cdef8c5bd21e5b8369f9caea24e530e934b8 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 31 Oct 2025 09:52:43 -0500 Subject: [PATCH 115/149] fixing gha syntax error --- .github/workflows/e2e-tests.yml | 1 + .github/workflows/full-sweep-test.yml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index ff7ecb92b..fef12802d 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -1,4 +1,5 @@ name: End-to-End Tests +run-name: e2e Test - ${{ github.event.inputs.generate-cli-command }} # concurrency: # group: benchmark-lock diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index bbbd574c2..9647dd21d 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -62,6 +62,8 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + # This looks complicated, but it is just calling generate_sweep_configs.py conditioned on + # discrete inputs (i.e., run_1k1k, run_h100, etc.) 
to split the test sweep into discrete jobs - id: generate-configs run: | pip install pydantic From 5eb1f90d4e32c9de5088098f0f84222008f5a5a6 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 31 Oct 2025 11:24:50 -0500 Subject: [PATCH 116/149] fixing error in multinode script --- .github/workflows/benchmark-multinode-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 0386e7d55..b4d917575 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -94,7 +94,7 @@ jobs: # Extract GPU count from filename for tp_size calculation gpus=$(echo "$result_file" | sed "s/.*_gpus\([0-9]*\)\.json/\1/") if [ -n "$gpus" ]; then - python3 utils/process_result.py ${{ inputs.runner }} $gpus ${result_file%.json} $FRAMEWORK $PRECISION $MTP_MODE + python3 utils/process_result.py ${{ inputs.runner }} $gpus 1 false ${result_file%.json} $FRAMEWORK $PRECISION $MTP_MODE fi fi done From 9318ba761044e28b6702f79e020209061c6ba5c9 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 31 Oct 2025 11:25:16 -0500 Subject: [PATCH 117/149] bug fxes --- .github/workflows/full-sweep-test.yml | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 9647dd21d..0f8771b54 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -9,43 +9,33 @@ on: inputs: run_1k1k: type: boolean - description: "When true, run 1k1k" required: false run_8k1k: type: boolean - description: "When true, run 8k1k" required: false run_1k8k: type: boolean - description: "When true, run 1k8k" required: false use_h100: type: boolean - description: "When true, run H100" required: false use_h200: type: boolean - description: "When true, run H200" required: false use_b200: type: boolean - description: 
"When true, run B200" required: false use_mi300x: type: boolean - description: "When true, run MI300X" required: false use_mi325x: type: boolean - description: "When true, run MI325X" required: false use_mi355x: type: boolean - description: "When true, run MI355X" required: false use_gb200: type: boolean - description: "When true, run GB200" required: false jobs: @@ -325,7 +315,7 @@ jobs: benchmark-gb200-1k8k: if: ${{ inputs.use_gb200 && inputs.run_1k8k }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep + name: gb200 1k8k sweep strategy: fail-fast: false matrix: @@ -346,7 +336,7 @@ jobs: benchmark-gb200-8k1k: if: ${{ inputs.use_gb200 && inputs.run_8k1k }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep + name: gb200 8k1k sweep strategy: fail-fast: false matrix: From 5a56794b517a3f77240964c472c1d5c581e9ce0b Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 31 Oct 2025 11:26:30 -0500 Subject: [PATCH 118/149] debug --- .github/workflows/1k1k-sweep.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index bc5305460..20e1f0c2d 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -5,7 +5,9 @@ concurrency: cancel-in-progress: false on: - # pull_request: + push: + branches: + - initial-refactor workflow_dispatch: # schedule: # - cron: '0 23 * * *' From 912d70d3a9bf65ce847f369d2d29c2303cb51df0 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 31 Oct 2025 11:26:36 -0500 Subject: [PATCH 119/149] debug --- .github/workflows/1k1k-sweep.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 20e1f0c2d..c1d3ff72b 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -5,9 +5,6 @@ concurrency: cancel-in-progress: false on: - push: - branches: - - initial-refactor 
workflow_dispatch: # schedule: # - cron: '0 23 * * *' From 98362f1119ee4a1435fdcae8ec5d4b28d5ef666b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 11:57:49 -0500 Subject: [PATCH 120/149] cleaning up the full sweep sched --- .github/workflows/1k1k-sweep.yml | 4 ++-- .github/workflows/1k8k-sweep.yml | 9 ++++----- .github/workflows/8k1k-sweep.yml | 9 ++++----- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index c1d3ff72b..f6ec37562 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -6,8 +6,8 @@ concurrency: on: workflow_dispatch: -# schedule: -# - cron: '0 23 * * *' + schedule: + - cron: "0 23 * * *" jobs: get-dsr1-configs: diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index da4d1daf3..82bc48817 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -1,14 +1,13 @@ name: "Full Sweep Scheduler - 1k8k" concurrency: - group: benchmark-lock-1k8k - cancel-in-progress: false + group: benchmark-lock-1k8k + cancel-in-progress: false on: - # pull_request: workflow_dispatch: -# schedule: -# - cron: '0 23 * * *' + schedule: + - cron: "0 23 * * *" jobs: get-dsr1-configs: diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index fa3249da7..8863112af 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -1,14 +1,13 @@ name: "Full Sweep Scheduler - 8k1k" concurrency: - group: benchmark-lock-8k1k - cancel-in-progress: false + group: benchmark-lock-8k1k + cancel-in-progress: false on: - # pull_request: workflow_dispatch: -# schedule: -# - cron: '0 23 * * *' + schedule: + - cron: "0 23 * * *" jobs: get-dsr1-configs: From 1eb74b9b820c2872253132a2f00407d89a6af631 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 11:58:18 -0500 Subject: [PATCH 121/149] cleaning up other workflows --- 
.github/workflows/e2e-tests.yml | 4 ---- .github/workflows/full-sweep-test.yml | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index fef12802d..1d13b3a87 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -1,10 +1,6 @@ name: End-to-End Tests run-name: e2e Test - ${{ github.event.inputs.generate-cli-command }} -# concurrency: -# group: benchmark-lock -# cancel-in-progress: false - on: workflow_dispatch: inputs: diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 0f8771b54..a2ff06d18 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -1,8 +1,8 @@ name: Test - Full Sweep -# concurrency: -# group: benchmark-lock -# cancel-in-progress: false +concurrency: + group: benchmark-lock + cancel-in-progress: false on: workflow_dispatch: From f78de57cb04361258a721a7efa80462fbd56f2c5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 14:43:14 -0500 Subject: [PATCH 122/149] docs --- .github/README.md | 116 ++++++ utils/matrix-logic/generate_sweep_configs.py | 407 ++++++++++--------- 2 files changed, 334 insertions(+), 189 deletions(-) create mode 100644 .github/README.md diff --git a/.github/README.md b/.github/README.md new file mode 100644 index 000000000..f4539dd5d --- /dev/null +++ b/.github/README.md @@ -0,0 +1,116 @@ +# How to Test Workflows + +In order to test configurations described in `.github/configs`, the primary workflow file used is `.github/workflows/e2e-tests.yml`. As input, this workflow takes in the CLI arguments for the `utils/matrix-logic/generate_sweep_configs.py` script. The usage for this script is shown below: + +``` +usage: generate_sweep_configs.py [-h] {full-sweep,test-config,runner-model-sweep,runner-sweep,custom} ... 
+ +Generate benchmark configurations from YAML config files + +positional arguments: + {full-sweep,test-config,runner-model-sweep,runner-sweep,custom} + Available commands + full-sweep Generate full sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths + test-config Given a config key, run that configuration as specified. Optionally specify --test-mode to only run one parallelism-concurrency pair for the config. + runner-model-sweep Given a runner type, find all configurations matching the type, and run that configuration on all individual runner nodes for the specified runner type. This is meant to validate + that all runner nodes work on all configurations for a runner type. For instance, to validate that all configs that specify an h200 runner successfully run across all h200 runner + nodes. + runner-sweep Given a model (and optionally a precision and framework), find all configurations matching the inputs, and run those configurations across all compatible runner nodes. This is + meant to validate all runner nodes that should run a particular model can. For instance, this should be used to validate that all runner nodes that should run gptoss-120b + actually do so successfully. + custom Enter custom values + +options: + -h, --help show this help message and exit +``` + +Instead of explaining each command at a high level, let's just walk through some common testing scenarios and describe how to run them. + +**Scenario 1**: I want to increase the concurrency from 128 to 256 in the 1k1k scenario for the `dsr1-fp4-b200-sglang` config (from `.github/configs/nvidia-master.yaml`) and then test it. 
+ +Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the following command as the text input: +``` +test-config --key dsr1-fp4-b200-sglang --seq-len 1k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` + +If we wanted to also test 1k8k or 8k1k scenarios, we would simply append `1k8k` or `8k1k` to `--seq-len`, respectively. + +Further, if we wanted to run that config on *one specific* runner node, we could specify that by appending `--runner-node` to the argument list. Note that if the specified runner node is not compatible with the specified config key (as dictated by `.github/configs/runners.yaml`), then the workflow will error: + +``` +test-config --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml --key dsr1-fp4-b200-sglang --seq-len 1k1k --runner-node mi300x-amd_0 + +ValueError: Runner node 'mi300x-amd_0' is not compatible with config 'dsr1-fp4-b200-sglang' which runs on runner type 'b200'. Available runner nodes for this config are 'b200-nb_0, b200-nb_1, b200-nvd_0, b200-nvd_1, b200-nvd_2, b200-nvd_3, b200-tg_0'. +``` + +**Scenario 2**: I just made a change to the `benchmarks/dsr1_fp8_b200_docker.sh` and I need to verify that these changes work across all B200 runners. + +Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the following command as the text input: +``` +runner-sweep --runner-type b200 --model-prefix dsr1 --precision fp8 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` + +This will run a test (just the highest available parallelism and lowest available concurrency) for each B200 runner node for each Deepseek config that runs on B200 with fp8 precision. I.e., this can be used to "sweep" across runners for a particular model to test that all runners still work with changes that have been made. 
+ +**Scenario 3**: I just upgraded the CUDA drivers on all H200 runners and need to verify that all models that use H200 still work correctly across all H200 nodes. + +Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the following command as the text input: +``` +runner-model-sweep --runner-type h200 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` + +This will run a test (just the highest available parallelism and lowest available concurrency) for each configuration that specifies the `h200` runner type, across all H200 runner nodes defined in `.github/configs/runners.yaml`. + +For example, if you have configs `dsr1-fp8-h200-sglang`, `dsr1-fp8-h200-trt`, and `gptoss-fp4-h200-vllm` that all use `runner: h200`, and you have 8 H200 nodes (`h200-cw_0`, `h200-cw_1`, etc.), this will run all 3 configs on all 8 nodes (24 total test runs). + +This is particularly useful when: +- You've made infrastructure changes to a specific runner type (driver updates, system configuration, Docker setup) +- You've added new runner nodes and want to validate they work with all existing model configurations +- You want to verify that all models remain compatible with a specific GPU type after system updates + +**Key difference from Scenario 2**: +- `runner-sweep`: Fix a **model**, sweep across runners → "Does this model work on all its runners?" +- `runner-model-sweep`: Fix a **runner type**, sweep across models → "Do all models work on this runner type?" 
+ +## Additional Use Cases with `full-sweep` + +The `full-sweep` command supports multiple filters that can be combined for targeted testing: + +**Test all gptoss configurations on B200 with 1k1k sequence lengths:** +``` +full-sweep --model-prefix gptoss --runner-type b200 --seq-lens 1k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` + +**Test all fp8 precision configs across all runners for 1k8k workloads:** +``` +full-sweep --precision fp8 --seq-lens 1k8k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml --runner-config .github/configs/runners.yaml +``` + +**Test all TRT configs on H200 runners:** +``` +full-sweep --framework trt --runner-type h200 h200-trt --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` + +**Quick smoke test of all configs (highest TP, lowest concurrency only):** +``` +full-sweep --test-mode --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml --runner-config .github/configs/runners.yaml +``` + +**Test specific model on specific hardware with specific sequence lengths:** +``` +full-sweep --model-prefix dsr1 --runner-type b200 --precision fp4 --framework sglang --seq-lens 1k1k 8k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` + +## Custom One-off Tests + +**Scenario 4**: I want to run a quick test with a custom image, model, or configuration that isn't in the config files yet. + +Use the `custom` command to specify all parameters manually: +``` +custom --runner-label b200-nb_0 --image vllm/vllm-openai:v0.11.0 --model meta-llama/Llama-3.1-70B --framework vllm --precision fp8 --exp-name llama70b_test --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` + +This runs a single 1k1k test job with your custom parameters on the specified runner node. 
Useful for: +- Testing new images before adding them to config files +- Quick validation of new models +- Experimenting with different frameworks or precisions diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index c43a1759e..bb0e22911 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -4,6 +4,33 @@ from pydantic import BaseModel, Field, ValidationError, ConfigDict from typing import List +# Field name constants +# Top-level config fields +FIELD_IMAGE = 'image' +FIELD_MODEL = 'model' +FIELD_MODEL_PREFIX = 'model-prefix' +FIELD_PRECISION = 'precision' +FIELD_FRAMEWORK = 'framework' +FIELD_RUNNER = 'runner' +FIELD_SEQ_LEN_CONFIGS = 'seq-len-configs' + +# Seq-len-config fields +FIELD_ISL = 'isl' +FIELD_OSL = 'osl' +FIELD_SEARCH_SPACE = 'search-space' + +# Search-space/benchmark fields +FIELD_TP = 'tp' +FIELD_CONC_START = 'conc-start' +FIELD_CONC_END = 'conc-end' +FIELD_EP = 'ep' +FIELD_DP_ATTN = 'dp-attn' + +# Matrix entry fields +FIELD_CONC = 'conc' +FIELD_MAX_MODEL_LEN = 'max-model-len' +FIELD_EXP_NAME = 'exp-name' + seq_len_stoi = { "1k1k": (1024, 1024), "1k8k": (1024, 8192), @@ -65,13 +92,13 @@ def validate_master_configs_structure(all_config_data): for key, val in all_config_data.items(): # Check for required top-level fields and their types required_fields = { - 'image': str, - 'model': str, - 'model-prefix': str, - 'precision': str, - 'framework': str, - 'runner': str, - 'seq-len-configs': list + FIELD_IMAGE: str, + FIELD_MODEL: str, + FIELD_MODEL_PREFIX: str, + FIELD_PRECISION: str, + FIELD_FRAMEWORK: str, + FIELD_RUNNER: str, + FIELD_SEQ_LEN_CONFIGS: list } for field, expected_type in required_fields.items(): @@ -82,42 +109,42 @@ def validate_master_configs_structure(all_config_data): raise ValueError( f"Field '{field}' must be {expected_type.__name__} for key '{key}', got {type(val[field]).__name__}") - seq_len_configs = val['seq-len-configs'] 
+ seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS] if len(seq_len_configs) == 0: raise ValueError( - f"'seq-len-configs' must be a non-empty list for key '{key}'") + f"'{FIELD_SEQ_LEN_CONFIGS}' must be a non-empty list for key '{key}'") # Validate each seq-len-config for i, seq_config in enumerate(seq_len_configs): # Check isl - if 'isl' not in seq_config or seq_config['isl'] is None: + if FIELD_ISL not in seq_config or seq_config[FIELD_ISL] is None: raise ValueError( - f"Missing 'isl' in seq-len-config[{i}] for key '{key}'") - if not isinstance(seq_config['isl'], int): + f"Missing '{FIELD_ISL}' in seq-len-config[{i}] for key '{key}'") + if not isinstance(seq_config[FIELD_ISL], int): raise ValueError( - f"'isl' must be int in seq-len-config[{i}] for key '{key}'") + f"'{FIELD_ISL}' must be int in seq-len-config[{i}] for key '{key}'") # Check osl - if 'osl' not in seq_config or seq_config['osl'] is None: + if FIELD_OSL not in seq_config or seq_config[FIELD_OSL] is None: raise ValueError( - f"Missing 'osl' in seq-len-config[{i}] for key '{key}'") - if not isinstance(seq_config['osl'], int): + f"Missing '{FIELD_OSL}' in seq-len-config[{i}] for key '{key}'") + if not isinstance(seq_config[FIELD_OSL], int): raise ValueError( - f"'osl' must be int in seq-len-config[{i}] for key '{key}'") + f"'{FIELD_OSL}' must be int in seq-len-config[{i}] for key '{key}'") - bmk_space = seq_config.get('search-space') + bmk_space = seq_config.get(FIELD_SEARCH_SPACE) if not bmk_space or not isinstance(bmk_space, list) or len(bmk_space) == 0: raise ValueError( - f"Missing or invalid 'search-space' in seq-len-config[{i}] for key '{key}'") + f"Missing or invalid '{FIELD_SEARCH_SPACE}' in seq-len-config[{i}] for key '{key}'") # Validate each benchmark in search-space for j, bmk in enumerate(bmk_space): # Define allowed fields - allowed_fields = {'tp', 'conc-start', - 'conc-end', 'ep', 'dp-attn'} - required_bmk_fields = {'tp': int, - 'conc-start': int, 'conc-end': int} - optional_bmk_fields = 
{'ep': int, 'dp-attn': bool} + allowed_fields = {FIELD_TP, FIELD_CONC_START, + FIELD_CONC_END, FIELD_EP, FIELD_DP_ATTN} + required_bmk_fields = {FIELD_TP: int, + FIELD_CONC_START: int, FIELD_CONC_END: int} + optional_bmk_fields = {FIELD_EP: int, FIELD_DP_ATTN: bool} # Check for extra fields extra_fields = set(bmk.keys()) - allowed_fields @@ -186,98 +213,98 @@ def generate_full_sweep(args, all_config_data): continue # Filter by precision if specified - if args.precision and val['precision'] not in args.precision: + if args.precision and val[FIELD_PRECISION] not in args.precision: continue # Filter by framework if specified - if args.framework and val['framework'] not in args.framework: + if args.framework and val[FIELD_FRAMEWORK] not in args.framework: continue # Filter by runner type if specified - if args.runner_type and val['runner'] not in args.runner_type: + if args.runner_type and val[FIELD_RUNNER] not in args.runner_type: continue - seq_len_configs = val['seq-len-configs'] - image = val['image'] - model = val['model'] - precision = val['precision'] - framework = val['framework'] - runner = val['runner'] - model_code = val['model-prefix'] + seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS] + image = val[FIELD_IMAGE] + model = val[FIELD_MODEL] + precision = val[FIELD_PRECISION] + framework = val[FIELD_FRAMEWORK] + runner = val[FIELD_RUNNER] + model_code = val[FIELD_MODEL_PREFIX] for seq_config in seq_len_configs: - isl = seq_config['isl'] - osl = seq_config['osl'] + isl = seq_config[FIELD_ISL] + osl = seq_config[FIELD_OSL] # Filter by sequence lengths if specified if seq_lens_filter and (isl, osl) not in seq_lens_filter: continue - bmk_space = seq_config['search-space'] + bmk_space = seq_config[FIELD_SEARCH_SPACE] if args.test_mode: # In test mode, use highest TP with lowest concurrency - highest_tp_bmk = max(bmk_space, key=lambda x: x['tp']) - tp = highest_tp_bmk['tp'] - conc = highest_tp_bmk['conc-start'] - ep = highest_tp_bmk.get('ep') - dp_attn = 
highest_tp_bmk.get('dp-attn') + highest_tp_bmk = max(bmk_space, key=lambda x: x[FIELD_TP]) + tp = highest_tp_bmk[FIELD_TP] + conc = highest_tp_bmk[FIELD_CONC_START] + ep = highest_tp_bmk.get(FIELD_EP) + dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN) seq_len_str = seq_len_to_str(isl, osl) entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'ep': 1, # Default - 'dp-attn': False, # Default - 'conc': conc, - 'max-model-len': isl + osl + 200, - 'exp-name': f"{model_code}_{seq_len_str}", + FIELD_IMAGE: image, + FIELD_MODEL: model, + FIELD_PRECISION: precision, + FIELD_FRAMEWORK: framework, + FIELD_RUNNER: runner, + FIELD_ISL: isl, + FIELD_OSL: osl, + FIELD_TP: tp, + FIELD_EP: 1, # Default + FIELD_DP_ATTN: False, # Default + FIELD_CONC: conc, + FIELD_MAX_MODEL_LEN: isl + osl + 200, + FIELD_EXP_NAME: f"{model_code}_{seq_len_str}", } if ep is not None: - entry['ep'] = ep + entry[FIELD_EP] = ep if dp_attn is not None: - entry['dp-attn'] = dp_attn + entry[FIELD_DP_ATTN] = dp_attn matrix_values.append(entry) else: # Full sweep mode for bmk in bmk_space: - tp = bmk['tp'] - conc_start = bmk['conc-start'] - conc_end = bmk['conc-end'] - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') + tp = bmk[FIELD_TP] + conc_start = bmk[FIELD_CONC_START] + conc_end = bmk[FIELD_CONC_END] + ep = bmk.get(FIELD_EP) + dp_attn = bmk.get(FIELD_DP_ATTN) conc = conc_start while conc <= conc_end: seq_len_str = seq_len_to_str(isl, osl) entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc, - 'max-model-len': isl + osl + 200, - 'ep': 1, # Default - 'dp-attn': False, # Default - 'exp-name': f"{model_code}_{seq_len_str}", + FIELD_IMAGE: image, + FIELD_MODEL: model, + FIELD_PRECISION: precision, + FIELD_FRAMEWORK: framework, + FIELD_RUNNER: runner, + FIELD_ISL: isl, + FIELD_OSL: osl, + 
FIELD_TP: tp, + FIELD_CONC: conc, + FIELD_MAX_MODEL_LEN: isl + osl + 200, + FIELD_EP: 1, # Default + FIELD_DP_ATTN: False, # Default + FIELD_EXP_NAME: f"{model_code}_{seq_len_str}", } if ep is not None: - entry['ep'] = ep + entry[FIELD_EP] = ep if dp_attn is not None: - entry['dp-attn'] = dp_attn + entry[FIELD_DP_ATTN] = dp_attn matrix_values.append(entry) @@ -323,20 +350,20 @@ def generate_test_config(args, all_config_data): f"Specified key '{args.key}' does not exist in config files.") # Extract model code from config - model_code = val['model-prefix'] + model_code = val[FIELD_MODEL_PREFIX] - runner_nodes = runner_config.get(val['runner'], []) - if args.runner_node not in runner_nodes: + runner_nodes = runner_config.get(val[FIELD_RUNNER], []) + if args.runner_node and args.runner_node not in runner_nodes: raise ValueError( - f"Runner node '{args.runner_node}' is not compatible with config '{args.key}' which runs on runner type '{val['runner']}'. Available runner nodes for this config are '{', '.join(runner_nodes)}'.") + f"Runner node '{args.runner_node}' is not compatible with config '{args.key}' which runs on runner type '{val[FIELD_RUNNER]}'. 
Available runner nodes for this config are '{', '.join(runner_nodes)}'.") - seq_len_configs = val['seq-len-configs'] - image = val['image'] - model = val['model'] - precision = val['precision'] - framework = val['framework'] + seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS] + image = val[FIELD_IMAGE] + model = val[FIELD_MODEL] + precision = val[FIELD_PRECISION] + framework = val[FIELD_FRAMEWORK] # Use default runner or specific runner node if input by user - runner = val['runner'] if not args.runner_node else args.runner_node + runner = val[FIELD_RUNNER] if not args.runner_node else args.runner_node # Convert seq-lens to set of (isl, osl) tuples for filtering seq_lens_filter = None @@ -347,71 +374,73 @@ def generate_test_config(args, all_config_data): # Process each sequence length configuration for seq_config in seq_len_configs: - isl = seq_config['isl'] - osl = seq_config['osl'] + isl = seq_config[FIELD_ISL] + osl = seq_config[FIELD_OSL] # Filter by sequence lengths if specified if seq_lens_filter and (isl, osl) not in seq_lens_filter: continue - bmk_space = seq_config['search-space'] + bmk_space = seq_config[FIELD_SEARCH_SPACE] for bmk in bmk_space: - tp = bmk['tp'] - conc_start = bmk['conc-start'] - conc_end = bmk['conc-end'] - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') + tp = bmk[FIELD_TP] + conc_start = bmk[FIELD_CONC_START] + conc_end = bmk[FIELD_CONC_END] + ep = bmk.get(FIELD_EP) + dp_attn = bmk.get(FIELD_DP_ATTN) # In test mode, only use the lowest concurrency (conc_start) if args.test_mode: entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'ep': 1, # Default, - 'dp-attn': False, # Default - 'conc': conc_start, - 'max-model-len': isl + osl, - 'exp-name': f"{model_code}_test", + FIELD_IMAGE: image, + FIELD_MODEL: model, + FIELD_PRECISION: precision, + FIELD_FRAMEWORK: framework, + FIELD_RUNNER: runner, + FIELD_ISL: isl, + FIELD_OSL: osl, + 
FIELD_TP: tp, + FIELD_EP: 1, # Default, + FIELD_DP_ATTN: False, # Default + FIELD_CONC: conc_start, + FIELD_MAX_MODEL_LEN: isl + osl, + FIELD_EXP_NAME: f"{model_code}_test", } # Add optional fields if they exist if ep is not None: - entry['ep'] = ep + entry[FIELD_EP] = ep if dp_attn is not None: - entry['dp-attn'] = dp_attn + entry[FIELD_DP_ATTN] = dp_attn matrix_values.append(entry) else: # Generate entries for each concurrency value in the range conc = conc_start while conc <= conc_end: + seq_len_str = seq_len_to_str(isl, osl) entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'ep': 1, # Default, - 'dp-attn': False, # Default - 'conc': conc, - 'max-model-len': isl + osl, + FIELD_IMAGE: image, + FIELD_MODEL: model, + FIELD_PRECISION: precision, + FIELD_FRAMEWORK: framework, + FIELD_RUNNER: runner, + FIELD_ISL: isl, + FIELD_OSL: osl, + FIELD_TP: tp, + FIELD_EP: 1, # Default, + FIELD_DP_ATTN: False, # Default + FIELD_CONC: conc, + FIELD_MAX_MODEL_LEN: isl + osl, + FIELD_EXP_NAME: f"{model_code}_{seq_len_str}", } # Add optional fields if they exist if ep is not None: - entry['ep'] = ep + entry[FIELD_EP] = ep if dp_attn is not None: - entry['dp-attn'] = dp_attn + entry[FIELD_DP_ATTN] = dp_attn matrix_values.append(entry) @@ -445,52 +474,52 @@ def generate_runner_model_sweep_config(args, all_config_data): matrix_values = [] for key, val in all_config_data.items(): # Only consider configs with specified runner - if val['runner'] != args.runner_type: + if val[FIELD_RUNNER] != args.runner_type: continue # Get model code for exp_name - model_code = val['model-prefix'] + model_code = val[FIELD_MODEL_PREFIX] # Find 1k1k config target_config = None - for config in val['seq-len-configs']: - if config['isl'] == 1024 and config['osl'] == 1024: + for config in val[FIELD_SEQ_LEN_CONFIGS]: + if config[FIELD_ISL] == 1024 and config[FIELD_OSL] == 1024: target_config = config 
break - highest_tp_bmk = max(target_config['search-space'], key=lambda x: x['tp']) + highest_tp_bmk = max(target_config[FIELD_SEARCH_SPACE], key=lambda x: x[FIELD_TP]) # Since we are just testing, pick the highest TP for this config and just test # on that TP with the lowest concurrency available - highest_tp = highest_tp_bmk['tp'] - lowest_conc = highest_tp_bmk['conc-start'] + highest_tp = highest_tp_bmk[FIELD_TP] + lowest_conc = highest_tp_bmk[FIELD_CONC_START] - ep = highest_tp_bmk.get('ep') - dp_attn = highest_tp_bmk.get('dp-attn') + ep = highest_tp_bmk.get(FIELD_EP) + dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN) for node in runner_nodes: entry = { - 'image': val['image'], - 'model': val['model'], - 'precision': val['precision'], - 'framework': val['framework'], + FIELD_IMAGE: val[FIELD_IMAGE], + FIELD_MODEL: val[FIELD_MODEL], + FIELD_PRECISION: val[FIELD_PRECISION], + FIELD_FRAMEWORK: val[FIELD_FRAMEWORK], # Add one entry for each node under specified runner type - 'runner': node, + FIELD_RUNNER: node, # Again, just use 1k1k since this is just meant to smoke test all runners - 'isl': 1024, - 'osl': 1024, - 'tp': highest_tp, - 'ep': 1, # Default, - 'dp-attn': False, # Default - 'conc': lowest_conc, - 'max-model-len': 2048, - 'exp-name': f"{model_code}_test", + FIELD_ISL: 1024, + FIELD_OSL: 1024, + FIELD_TP: highest_tp, + FIELD_EP: 1, # Default, + FIELD_DP_ATTN: False, # Default + FIELD_CONC: lowest_conc, + FIELD_MAX_MODEL_LEN: 2048, + FIELD_EXP_NAME: f"{model_code}_test", } # Add optional fields if they exist if ep is not None: - entry['ep'] = ep + entry[FIELD_EP] = ep if dp_attn is not None: - entry['dp-attn'] = dp_attn + entry[FIELD_DP_ATTN] = dp_attn matrix_values.append(entry) @@ -521,20 +550,20 @@ def generate_custom_test(args): return [ { - 'image': args.image, - 'model': args.model, - 'precision': args.precision, - 'framework': args.framework, - 'runner': args.runner_label, + FIELD_IMAGE: args.image, + FIELD_MODEL: args.model, + FIELD_PRECISION: 
args.precision, + FIELD_FRAMEWORK: args.framework, + FIELD_RUNNER: args.runner_label, # Again, just use 1k1k since this is just meant to smoke test all runners - 'isl': 1024, - 'osl': 1024, - 'tp': 8, - 'ep': 1, - 'dp-attn': False, - 'conc': 4, - 'exp-name': args.exp_name, - 'max-model-len': 2048, + FIELD_ISL: 1024, + FIELD_OSL: 1024, + FIELD_TP: 8, + FIELD_EP: 1, + FIELD_DP_ATTN: False, + FIELD_CONC: 4, + FIELD_EXP_NAME: args.exp_name, + FIELD_MAX_MODEL_LEN: 2048, } ] @@ -561,62 +590,62 @@ def generate_runner_sweep_config(args, all_config_data): # Only consider configs with specified runner if not key.startswith(args.model_prefix): continue - - if not val['runner'] == args.runner_type: + + if not val[FIELD_RUNNER] == args.runner_type: continue # Optionally filter by precision and framework - if (args.precision and val['precision'] != args.precision) or (args.framework and val['framework'] != args.framework): + if (args.precision and val[FIELD_PRECISION] != args.precision) or (args.framework and val[FIELD_FRAMEWORK] != args.framework): continue # Get model code for exp_name - model_code = val['model-prefix'] + model_code = val[FIELD_MODEL_PREFIX] - runner_nodes = runner_config.get(val['runner']) + runner_nodes = runner_config.get(val[FIELD_RUNNER]) if not runner_nodes: raise ValueError( - f"Runner '{val['runner']}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") + f"Runner '{val[FIELD_RUNNER]}' does not exist in runner config '{args.runner_config}'. 
Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") # Find 1k1k config target_config = None - for config in val['seq-len-configs']: - if config['isl'] == 1024 and config['osl'] == 1024: + for config in val[FIELD_SEQ_LEN_CONFIGS]: + if config[FIELD_ISL] == 1024 and config[FIELD_OSL] == 1024: target_config = config break - highest_tp_bmk = max(target_config['search-space'], key=lambda x: x['tp']) + highest_tp_bmk = max(target_config[FIELD_SEARCH_SPACE], key=lambda x: x[FIELD_TP]) # Since we are just testing, pick the highest TP for this config and just test # on that TP with the lowest concurrency available - highest_tp = highest_tp_bmk['tp'] - lowest_conc = highest_tp_bmk['conc-start'] + highest_tp = highest_tp_bmk[FIELD_TP] + lowest_conc = highest_tp_bmk[FIELD_CONC_START] - ep = highest_tp_bmk.get('ep') - dp_attn = highest_tp_bmk.get('dp-attn') + ep = highest_tp_bmk.get(FIELD_EP) + dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN) for node in runner_nodes: entry = { - 'image': val['image'], - 'model': val['model'], - 'precision': val['precision'], - 'framework': val['framework'], + FIELD_IMAGE: val[FIELD_IMAGE], + FIELD_MODEL: val[FIELD_MODEL], + FIELD_PRECISION: val[FIELD_PRECISION], + FIELD_FRAMEWORK: val[FIELD_FRAMEWORK], # Add one entry for each node under specified runner type - 'runner': node, + FIELD_RUNNER: node, # Again, just use 1k1k since this is just meant to smoke test all runners - 'isl': 1024, - 'osl': 1024, - 'tp': highest_tp, - 'ep': 1, # Default, - 'dp-attn': False, # Default - 'conc': lowest_conc, - 'exp-name': f"{model_code}_test", - 'max-model-len': 2048, + FIELD_ISL: 1024, + FIELD_OSL: 1024, + FIELD_TP: highest_tp, + FIELD_EP: 1, # Default, + FIELD_DP_ATTN: False, # Default + FIELD_CONC: lowest_conc, + FIELD_EXP_NAME: f"{model_code}_test", + FIELD_MAX_MODEL_LEN: 2048, } # Add optional fields if they exist if ep is not None: - entry['ep'] = ep + entry[FIELD_EP] = ep if dp_attn is not None: - entry['dp-attn'] = dp_attn + 
entry[FIELD_DP_ATTN] = dp_attn matrix_values.append(entry) From d233ea2fa50f641aad99a6246bc63b79089f560b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 14:54:52 -0500 Subject: [PATCH 123/149] remove concurrency locks --- .github/workflows/1k1k-sweep.yml | 4 ---- .github/workflows/1k8k-sweep.yml | 4 ---- .github/workflows/8k1k-sweep.yml | 4 ---- .github/workflows/full-sweep-test.yml | 4 ---- 4 files changed, 16 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index f6ec37562..0930f8a9a 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,9 +1,5 @@ name: "Full Sweep Scheduler - 1k1k" -concurrency: - group: benchmark-lock-1k1k - cancel-in-progress: false - on: workflow_dispatch: schedule: diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 82bc48817..c3bcf9662 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -1,9 +1,5 @@ name: "Full Sweep Scheduler - 1k8k" -concurrency: - group: benchmark-lock-1k8k - cancel-in-progress: false - on: workflow_dispatch: schedule: diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index 8863112af..fdb6b6112 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -1,9 +1,5 @@ name: "Full Sweep Scheduler - 8k1k" -concurrency: - group: benchmark-lock-8k1k - cancel-in-progress: false - on: workflow_dispatch: schedule: diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index a2ff06d18..3657971ac 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -1,9 +1,5 @@ name: Test - Full Sweep -concurrency: - group: benchmark-lock - cancel-in-progress: false - on: workflow_dispatch: inputs: From 4e1228b465129a20168c0fbb772bf3a14d13cfea Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 14:56:31 -0500 Subject: [PATCH 
124/149] add dpa to results filename --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 571b39888..d1acf16c7 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -127,7 +127,7 @@ jobs: - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} - RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_conc${{ env.CONC }}_${{ runner.name }} + RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_dpa_${{ env.DP_ATTENTION }}_conc${{ env.CONC }}_${{ runner.name }} run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh if [ -f "$RESULT_FILENAME.json" ]; then From d816ef46244c8a5ed135692ae46aea83d254c650 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 14:59:49 -0500 Subject: [PATCH 125/149] add back plotting --- .github/workflows/collect-results.yml | 12 ++ utils/plot_perf.py | 197 ++++++++++++++++++++++++++ 2 files changed, 209 insertions(+) create mode 100644 utils/plot_perf.py diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index 1afe9f049..c1799117e 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -35,3 +35,15 @@ jobs: with: name: results_${{ inputs.exp-name }} path: agg_${{ inputs.exp-name }}.json + + - name: Plot performance + run: | + pip install -q matplotlib + python3 utils/plot_perf.py results/ ${{ inputs.exp-name }} + - name: Upload performance graphs + uses: actions/upload-artifact@v4 + with: + name: graphs_${{ inputs.exp-name }} + path: | + tput_vs_intvty_*_${{ inputs.exp-name }}.png + tput_vs_e2el_*_${{ inputs.exp-name }}.png diff --git a/utils/plot_perf.py b/utils/plot_perf.py new file mode 100644 index 000000000..1cab81cdc --- /dev/null +++ 
b/utils/plot_perf.py @@ -0,0 +1,197 @@ +import sys +import json +from pathlib import Path +import matplotlib.pyplot as plt + + +results_dir = Path(sys.argv[1]) +exp_name = sys.argv[2] +hw_color = { + 'h100': 'lightgreen', + 'h200': 'green', # H200 VLLM + 'h200-trt': 'darkgreen', # H200 TRT-LLM + 'b200': 'black', # B200 VLLM + 'b200-trt': 'gray', # B200 TRT-LLM + 'mi300x': 'pink', + 'mi325x': 'red', + 'mi355x': 'purple', + 'gb200': 'orange', # GB200 TRT-LLM and SGlang +} + +results = [] +for result_path in results_dir.rglob(f'*.json'): + with open(result_path) as f: + result = json.load(f) + results.append(result) + + +def plot_tput_vs_e2el(precision_filter=None): + fig, ax = plt.subplots() + + # Filter results by precision if specified + filtered_results = results + if precision_filter is not None: + filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter] + + for hw_label, color in hw_color.items(): + # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_e2el'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_e2el'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) + + for result in filtered_results: + x, y = result['median_e2el'], result['tput_per_gpu'] + ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) + + ax.set_xlabel('End-to-end Latency (s)') + ax.set_ylabel('Throughput per GPU 
(tok/s)') + ax.legend(title='GPU Type') + fig.tight_layout() + + precision_suffix = f"_{precision_filter}" if precision_filter else "" + fig.savefig(f'tput_vs_e2el_{exp_name}{precision_suffix}.png', bbox_inches='tight') + plt.close(fig) + + +def plot_tput_vs_intvty(precision_filter=None): + fig, ax = plt.subplots() + + # Filter results by precision if specified + filtered_results = results + if precision_filter is not None: + filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter] + + for hw_label, color in hw_color.items(): + # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_intvty'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_intvty'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) + + for result in filtered_results: + x, y = result['median_intvty'], result['tput_per_gpu'] + ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) + + ax.set_xlabel('Interactivity (tok/s/user)') + ax.set_ylabel('Throughput per GPU (tok/s)') + ax.legend(title='GPU Type') + fig.tight_layout() + + precision_suffix = f"_{precision_filter}" if precision_filter else "" + fig.savefig(f'tput_vs_intvty_{exp_name}{precision_suffix}.png', bbox_inches='tight') + plt.close(fig) + + +def plot_tput_vs_e2el_for_model(model_results, model_name): + fig, ax = plt.subplots() + + for hw_label, color in hw_color.items(): 
+ # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_e2el'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_e2el'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) + + for result in model_results: + x, y = result['median_e2el'], result['tput_per_gpu'] + ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) + + ax.set_xlabel('End-to-end Latency (s)') + ax.set_ylabel('Throughput per GPU (tok/s)') + ax.legend(title='Hardware + Framework') + ax.set_title(f'{model_name} - All Frameworks') + fig.tight_layout() + + # Extract model identifier from model name + model_id = model_name.split('/')[-1].split('-')[0] if '/' in model_name else model_name + fig.savefig(f'tput_vs_e2el_{model_id}_{exp_name}.png', bbox_inches='tight') + plt.close(fig) + + +def plot_tput_vs_intvty_for_model(model_results, model_name): + fig, ax = plt.subplots() + + for hw_label, color in hw_color.items(): + # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_intvty'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, 
label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_intvty'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) + + for result in model_results: + x, y = result['median_intvty'], result['tput_per_gpu'] + ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) + + ax.set_xlabel('Interactivity (tok/s/user)') + ax.set_ylabel('Throughput per GPU (tok/s)') + ax.legend(title='Hardware + Framework') + ax.set_title(f'{model_name} - All Frameworks') + fig.tight_layout() + + # Extract model identifier from model name + model_id = model_name.split('/')[-1].split('-')[0] if '/' in model_name else model_name + fig.savefig(f'tput_vs_intvty_{model_id}_{exp_name}.png', bbox_inches='tight') + plt.close(fig) + + +# Create one plot per model showing all frameworks and hardware +# Group results by model family (70b, dsr1, etc.) 
instead of full model name +def get_model_family(model_name): + if '70b' in model_name.lower() or 'llama-3.3-70b' in model_name.lower(): + return '70b' + elif 'dsr1' in model_name.lower() or 'deepseek-r1' in model_name.lower(): + return 'dsr1' + else: + # Fallback to first part of model name + return model_name.split('/')[-1].split('-')[0] if '/' in model_name else model_name + +model_families = set(get_model_family(r.get('model', 'unknown')) for r in results) + +for model_family in model_families: + # Filter results for this model family + model_results = [r for r in results if get_model_family(r.get('model', 'unknown')) == model_family] + + # Create plots for this model family + plot_tput_vs_e2el_for_model(model_results, model_family) + plot_tput_vs_intvty_for_model(model_results, model_family) From 249a94c24c7be7ab43d49668ea9f1d264e55dd79 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 126/149] testing concurrency From 6589e53621fae686b97a13e24345ccbf5d0db06d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 14:42:41 -0500 Subject: [PATCH 127/149] adding more workflows --- .github/workflows/test.yml | 147 ++++++++++++++++++ utils/matrix-logic/get_test_sweep_configs.py | 151 +++++++++++++++++++ 2 files changed, 298 insertions(+) create mode 100644 .github/workflows/test.yml create mode 100644 utils/matrix-logic/get_test_sweep_configs.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 000000000..0d92952da --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,147 @@ +name: Test - Full Sweep + +concurrency: + group: benchmark-lock + cancel-in-progress: false + +on: + pull_request: + workflow_dispatch: + inputs: + name: + description: "Name of benchmark from master configs" + required: true + type: string + default: 70b-fp4-mi355x-vllm + + run_1k1k: + description: "Run ISL/OSL 1k/1k" + type: boolean + required: true + run_1k8k: + description: 
"Run ISL/OSL 1k/8k" + type: boolean + required: true + run_8k1k: + description: "Run ISL/OSL 8k/1k" + type: boolean + required: true + + runner: + description: "Specific runner node to run on" + required: false + type: choice + options: + - "h100-cr_0" + - "h100-cr_1" + - "h100-cw_0" + - "h100-cw_1" + - "h200-cw_0" + - "h200-cw_1" + - "h200-nb_0" + - "h200-nb_1" + - "h200-nb_2" + - "h200-nb_3" + - "h200-nv_0" + - "h200-nv_1" + - "h200-nv_2" + - "h200-nv_3" + - "b200-nv_0" + - "b200-nv_1" + - "b200-nb_0" + - "b200-nb_1" + - "b200-nvd_0" + - "b200-nvd_1" + - "b200-nvd_2" + - "b200-nvd_3" + - "b200-tg_0" + - "mi300x-amd_0" + - "mi300x-amd_1" + - "mi300x-amd_2" + - "mi300x-amd_3" + - "mi300x-amd_4" + - "mi300x-cr_0" + - "mi300x-oci_0" + - "mi325x-amd_0" + - "mi325x-tw_0" + - "mi325x-tw_1" + - "mi325x-tw_2" + - "mi325x-tw_3" + - "mi355x-amd_0" + - "mi355x-amd_1" + - "mi355x-amd_2" + - "mi355x-amd_3" + +jobs: + get-jobs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-jobs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ + --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ + --key ${{ inputs.name }} \ + ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + test-sweep: + needs: get-jobs + uses: ./.github/workflows/benchmark-tmpl.yml + name: test sweep - ${{ inputs.name }} + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_1k1k" + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + 
max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + calc-success-rate: + needs: test-sweep + if: ${{ always() }} + runs-on: ubuntu-latest + + env: + RESULTS_DIR: "results/" + STATS_FILENAME: "run_stats" + GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download results artifacts + uses: actions/download-artifact@v4 + with: + path: ${{ env.RESULTS_DIR }} + pattern: results_* + + - name: Install python dependencies + run: pip install PyGithub + + - name: Calculate success rate + run: python3 utils/calc_success_rate.py $STATS_FILENAME + + - uses: actions/upload-artifact@v4 + with: + name: "run-stats" + path: ${{ env.STATS_FILENAME }}.json diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py new file mode 100644 index 000000000..87ab0457b --- /dev/null +++ b/utils/matrix-logic/get_test_sweep_configs.py @@ -0,0 +1,151 @@ +import json +import yaml +import sys +import argparse + +seq_len_stoi = { + "1k1k": (1024, 1024), + "1k8k": (1024, 8192), + "8k1k": (8192, 1024) +} + +def main(): + parser = argparse.ArgumentParser( + description='Generate benchmark matrix from a specific configuration key' + ) + parser.add_argument( + '--config-files', + nargs='+', + required=True, + help='One or more configuration files (YAML format)' + ) + parser.add_argument( + '--key', + required=True, + help='Configuration key to use' + ) + parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + help=f"Sequence 
length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." + ) + parser.add_argument( + '--step-size', + type=int, + default=2, + help='Step size for concurrency values (default: 2)' + ) + + args = parser.parse_args() + + # Convert seq-lens to set of (isl, osl) tuples for filtering + seq_lens_filter = None + if args.seq_lens: + seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} + + # Load and merge all config files + all_config_data = {} + for config_file in args.config_files: + try: + with open(config_file, 'r') as f: + config_data = yaml.safe_load(f) + assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" + + # Check for duplicate keys + duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) + if duplicate_keys: + raise ValueError( + f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" + ) + + all_config_data.update(config_data) + except FileNotFoundError: + raise ValueError(f"Input file '{config_file}' does not exist.") + + # Check if the key exists + if args.key not in all_config_data: + available_keys = ', '.join(sorted(all_config_data.keys())) + raise ValueError( + f"Key '{args.key}' not found in configuration files. 
" + f"Available keys: {available_keys}" + ) + + val = all_config_data[args.key] + + # Validate required fields + seq_len_configs = val.get('seq-len-configs') + assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" + + image = val.get('image') + model = val.get('model') + precision = val.get('precision') + framework = val.get('framework') + runner = val.get('runner') + + assert None not in (image, model, precision, framework, runner), \ + f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" + + matrix_values = [] + + # Process each sequence length configuration + for seq_config in seq_len_configs: + isl = seq_config.get('isl') + osl = seq_config.get('osl') + + assert None not in (isl, osl), \ + f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" + + # Filter by sequence lengths if specified + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + + bmk_space = seq_config.get('bmk-space') + assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" + + for bmk in bmk_space: + tp = bmk.get('tp') + conc_start = bmk.get('conc-start') + conc_end = bmk.get('conc-end') + ep = bmk.get('ep') + dp_attn = bmk.get('dp-attn') + + assert None not in (tp, conc_start, conc_end), \ + f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" + + # Generate entries for each concurrency value in the range + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc, + 'max-model-len': isl + osl, + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + print(json.dumps(matrix_values)) + 
return matrix_values + +if __name__ == "__main__": + main() \ No newline at end of file From 3695ed50007215c6342906042d3ab76d8eca7ef2 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 17:05:27 -0500 Subject: [PATCH 128/149] deleting files --- .github/workflows/test.yml | 147 ------------------------------------- 1 file changed, 147 deletions(-) delete mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml deleted file mode 100644 index 0d92952da..000000000 --- a/.github/workflows/test.yml +++ /dev/null @@ -1,147 +0,0 @@ -name: Test - Full Sweep - -concurrency: - group: benchmark-lock - cancel-in-progress: false - -on: - pull_request: - workflow_dispatch: - inputs: - name: - description: "Name of benchmark from master configs" - required: true - type: string - default: 70b-fp4-mi355x-vllm - - run_1k1k: - description: "Run ISL/OSL 1k/1k" - type: boolean - required: true - run_1k8k: - description: "Run ISL/OSL 1k/8k" - type: boolean - required: true - run_8k1k: - description: "Run ISL/OSL 8k/1k" - type: boolean - required: true - - runner: - description: "Specific runner node to run on" - required: false - type: choice - options: - - "h100-cr_0" - - "h100-cr_1" - - "h100-cw_0" - - "h100-cw_1" - - "h200-cw_0" - - "h200-cw_1" - - "h200-nb_0" - - "h200-nb_1" - - "h200-nb_2" - - "h200-nb_3" - - "h200-nv_0" - - "h200-nv_1" - - "h200-nv_2" - - "h200-nv_3" - - "b200-nv_0" - - "b200-nv_1" - - "b200-nb_0" - - "b200-nb_1" - - "b200-nvd_0" - - "b200-nvd_1" - - "b200-nvd_2" - - "b200-nvd_3" - - "b200-tg_0" - - "mi300x-amd_0" - - "mi300x-amd_1" - - "mi300x-amd_2" - - "mi300x-amd_3" - - "mi300x-amd_4" - - "mi300x-cr_0" - - "mi300x-oci_0" - - "mi325x-amd_0" - - "mi325x-tw_0" - - "mi325x-tw_1" - - "mi325x-tw_2" - - "mi325x-tw_3" - - "mi355x-amd_0" - - "mi355x-amd_1" - - "mi355x-amd_2" - - "mi355x-amd_3" - -jobs: - get-jobs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ 
steps.get-jobs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - id: get-jobs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ - --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ - --key ${{ inputs.name }} \ - ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - - test-sweep: - needs: get-jobs - uses: ./.github/workflows/benchmark-tmpl.yml - name: test sweep - ${{ inputs.name }} - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "dsr1_1k1k" - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} - - calc-success-rate: - needs: test-sweep - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 
utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json From b328c7f59db27fd41f10331a6e8032401d5d9fb7 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 17:37:58 -0500 Subject: [PATCH 129/149] temp fix (#148) --- benchmarks/70b_fp4_b200_trt_slurm.sh | 75 ++++++++++++++++++++++++ benchmarks/70b_fp8_b200_trt_slurm.sh | 75 ++++++++++++++++++++++++ benchmarks/70b_fp8_h200_slurm.sh | 69 ++++++++++++++++++++++ benchmarks/70b_fp8_h200_trt_slurm.sh | 70 ++++++++++++++++++++++ benchmarks/70b_fp8_mi325x_slurm.sh | 86 ++++++++++++++++++++++++++++ 5 files changed, 375 insertions(+) create mode 100644 benchmarks/70b_fp4_b200_trt_slurm.sh create mode 100644 benchmarks/70b_fp8_b200_trt_slurm.sh create mode 100644 benchmarks/70b_fp8_h200_slurm.sh create mode 100644 benchmarks/70b_fp8_h200_trt_slurm.sh create mode 100644 benchmarks/70b_fp8_mi325x_slurm.sh diff --git a/benchmarks/70b_fp4_b200_trt_slurm.sh b/benchmarks/70b_fp4_b200_trt_slurm.sh new file mode 100644 index 000000000..ad24453b3 --- /dev/null +++ b/benchmarks/70b_fp4_b200_trt_slurm.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + + +set -x + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: 
false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_b200_trt_slurm.sh b/benchmarks/70b_fp8_b200_trt_slurm.sh new file mode 100644 index 000000000..ad24453b3 --- /dev/null +++ b/benchmarks/70b_fp8_b200_trt_slurm.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + + +set -x + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then 
+cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_slurm.sh b/benchmarks/70b_fp8_h200_slurm.sh new file mode 100644 index 000000000..094fbd19c --- /dev/null +++ b/benchmarks/70b_fp8_h200_slurm.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +set -x +hf download $MODEL +pip install datasets pandas + +# Calculate max-model-len based on ISL and OSL +if [ "$ISL" = "1024" ] && [ "$OSL" = 
"1024" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) +elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) +else + CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} +fi + +# Create config.yaml +cat > config.yaml << EOF +kv-cache-dtype: fp8 +async-scheduling: true +no-enable-prefix-caching: true +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +export TORCH_CUDA_ARCH_LIST="9.0" + +PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ + --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ + --disable-log-requests > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_trt_slurm.sh b/benchmarks/70b_fp8_h200_trt_slurm.sh new file mode 100644 index 000000000..dfb2324b9 --- /dev/null +++ b/benchmarks/70b_fp8_h200_trt_slurm.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) 
+PORT=$(( 8888 + $PORT_OFFSET )) + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh new file mode 100644 index 000000000..1febeff13 --- /dev/null +++ b/benchmarks/70b_fp8_mi325x_slurm.sh @@ -0,0 +1,86 @@ +#!/usr/bin/bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# 
MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +huggingface-cli download $MODEL + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +# Reference +# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +# Patch the aiter config script to deal +# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. +file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' +sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch + + +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). 
+set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=float16 --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests \ +> $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json From 264186fb128e2c10e8ec8dadce41696560854060 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 130/149] testing concurrency From e9e0e70d83383af48fede768ac4f7aa34ce5fd24 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 16:21:44 -0500 Subject: [PATCH 131/149] update random range ratio default --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index d1acf16c7..2eef0e18f 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -44,7 +44,7 @@ on: random-range-ratio: required: false type: string - default: '0.2' + default: '0.8' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} From 
bbc22209e03dca7a9acfe2f545503de003b723b8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 16:31:06 -0500 Subject: [PATCH 132/149] get process results vals from env vars instead of argv --- .../workflows/benchmark-multinode-tmpl.yml | 2 +- .github/workflows/benchmark-tmpl.yml | 2 +- utils/process_result.py | 22 ++++++++++--------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index b4d917575..bfbd5a1cf 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -94,7 +94,7 @@ jobs: # Extract GPU count from filename for tp_size calculation gpus=$(echo "$result_file" | sed "s/.*_gpus\([0-9]*\)\.json/\1/") if [ -n "$gpus" ]; then - python3 utils/process_result.py ${{ inputs.runner }} $gpus 1 false ${result_file%.json} $FRAMEWORK $PRECISION $MTP_MODE + TP=$gpus RESULT_FILENAME=${result_file%.json} EP_SIZE=1 DP_ATTENTION=false python3 utils/process_result.py fi fi done diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 2eef0e18f..754cbb969 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -139,7 +139,7 @@ jobs: - name: Process result run: | - python3 utils/process_result.py ${{ inputs.runner }} $TP $EP_SIZE $DP_ATTENTION $RESULT_FILENAME $FRAMEWORK $PRECISION + python3 utils/process_result.py ${{ inputs.runner }} - name: Upload result uses: actions/upload-artifact@v4 with: diff --git a/utils/process_result.py b/utils/process_result.py index a59d1f7f3..d59a61790 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -1,15 +1,17 @@ import sys import json +import os from pathlib import Path -hw = sys.argv[1] -tp_size = int(sys.argv[2]) -ep_size = int(sys.argv[3]) -dp_attention = sys.argv[4] -result_filename = sys.argv[5] -framework = sys.argv[6] -precision = sys.argv[7] +hw = 
os.environ.get('RUNNER_NAME') +tp_size = int(os.environ.get('TP')) +ep_size = int(os.environ.get('EP_SIZE')) +dp_attention = os.environ.get('DP_ATTENTION') +result_filename = os.environ.get('RESULT_FILENAME') +framework = os.environ.get('FRAMEWORK') +precision = os.environ.get('PRECISION') +mtp_mode = os.environ.get('MTP_MODE') with open(f'{result_filename}.json') as f: bmk_result = json.load(f) @@ -18,8 +20,8 @@ 'hw': hw, 'tp': tp_size, 'ep': ep_size, - 'conc': int(bmk_result['max_concurrency']), 'dp_attention': dp_attention, # true or false + 'conc': int(bmk_result['max_concurrency']), 'model': bmk_result['model_id'], 'framework': framework, 'precision': precision, @@ -27,8 +29,8 @@ 'output_tput_per_gpu': float(bmk_result['output_throughput']) / tp_size } -if len(sys.argv) == 9: # MTP - data['mtp'] = sys.argv[8] +if mtp_mode: # MTP + data['mtp'] = mtp_mode for key, value in bmk_result.items(): if key.endswith('ms'): From d5ec7dec14da7103feea08ca14601bf5975b79b4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 16:42:39 -0500 Subject: [PATCH 133/149] get process results vals from env vars instead of argv pt 2 --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 754cbb969..293e3ac49 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -139,7 +139,7 @@ jobs: - name: Process result run: | - python3 utils/process_result.py ${{ inputs.runner }} + python3 utils/process_result.py - name: Upload result uses: actions/upload-artifact@v4 with: From 6af36effae2cad1e021c3600efb73822a868d744 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 17:57:15 -0500 Subject: [PATCH 134/149] editing runners yaml --- .github/README.md | 8 ++++++++ .github/configs/runners.yaml | 1 - 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/README.md b/.github/README.md 
index f4539dd5d..69fc1069f 100644 --- a/.github/README.md +++ b/.github/README.md @@ -33,6 +33,8 @@ Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter test-config --key dsr1-fp4-b200-sglang --seq-len 1k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml ``` +Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986046399 + If we wanted to also test 1k8k or 8k1k scenarios, we would simply append `1k8k` or `8k1k` to `--seq-len`, respectively. Further, if we wanted to run that config on *one specific* runner node, we could specify that by appending `--runner-node` to the argument list. Note that if the specified runner node is not compatible with the specified config key (as dictated by `.github/configs/runners.yaml`), then the workflow will error: @@ -43,6 +45,8 @@ test-config --config-files .github/configs/nvidia-master.yaml --runner-config .g ValueError: Runner node 'mi300x-amd_0' is not compatible with config 'dsr1-fp4-b200-sglang' which runs on runner type 'b200'. Available runner nodes for this config are 'b200-nb_0, b200-nb_1, b200-nvd_0, b200-nvd_1, b200-nvd_2, b200-nvd_3, b200-tg_0'. ``` +Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986053019/job/54229839736 + **Scenario 2**: I just made a change to the `benchmarks/dsr1_fp8_b200_docker.sh` and I need to verify that these changes work across all B200 runners. 
Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the text following command as the text input: @@ -50,6 +54,8 @@ Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter runner-sweep --runner-type b200 --model-prefix dsr1 --precision fp8 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml ``` +Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986283169 + This will run a test (just the highest available parallelism and lowest available concurrency) for each B200 runner node for each Deepseek config that runs on B200 with fp8 precision. I.e., this can be used to "sweep" across runners for a particular model to test that all runners still work with changes that have been made. **Scenario 3**: I just upgraded the CUDA drivers on all H200 runners and need to verify that all models that use H200 still work correctly across all H200 nodes. @@ -59,6 +65,8 @@ Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter runner-model-sweep --runner-type h200 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml ``` +Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986292917 + This will run a test (just the highest available parallelism and lowest available concurrency) for each configuration that specifies the `h200` runner type, across all H200 runner nodes defined in `.github/configs/runners.yaml`. For example, if you have configs `dsr1-fp8-h200-sglang`, `dsr1-fp8-h200-trt`, and `gptoss-fp4-h200-vllm` that all use `runner: h200`, and you have 8 H200 nodes (`h200-cw_0`, `h200-cw_1`, etc.), this will run all 3 configs on all 8 nodes (24 total test runs). 
diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 692ade8dd..cdd865561 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -38,7 +38,6 @@ b200: - 'b200-nvd_1' - 'b200-nvd_2' - 'b200-nvd_3' -- 'b200-tg_0' mi300x: - 'mi300x-amd_0' - 'mi300x-amd_1' From cefcf15268b6ade1fc275e89854a3ce4b95e0602 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 135/149] testing concurrency From 46545a910a5befb8869ca57759b7b0f7e467bf84 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 14:42:41 -0500 Subject: [PATCH 136/149] adding more workflows --- .github/workflows/1k1k-sweep.yml | 6 +- .github/workflows/1k8k-sweep.yml | 6 +- .github/workflows/test.yml | 147 +++++++++++++++++++++++++++++++ 3 files changed, 155 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 0930f8a9a..e806f4c70 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,4 +1,8 @@ -name: "Full Sweep Scheduler - 1k1k" +name: "1K/1K Sweep" + +concurrency: + group: benchmark-lock-1k1k + cancel-in-progress: false on: workflow_dispatch: diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index c3bcf9662..f4bb4338e 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -1,9 +1,9 @@ name: "Full Sweep Scheduler - 1k8k" on: - workflow_dispatch: - schedule: - - cron: "0 23 * * *" + workflow_dispatch: + schedule: + - cron: '0 23 * * *' jobs: get-dsr1-configs: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 000000000..0d92952da --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,147 @@ +name: Test - Full Sweep + +concurrency: + group: benchmark-lock + cancel-in-progress: false + +on: + pull_request: + workflow_dispatch: + inputs: + name: + 
description: "Name of benchmark from master configs" + required: true + type: string + default: 70b-fp4-mi355x-vllm + + run_1k1k: + description: "Run ISL/OSL 1k/1k" + type: boolean + required: true + run_1k8k: + description: "Run ISL/OSL 1k/8k" + type: boolean + required: true + run_8k1k: + description: "Run ISL/OSL 8k/1k" + type: boolean + required: true + + runner: + description: "Specific runner node to run on" + required: false + type: choice + options: + - "h100-cr_0" + - "h100-cr_1" + - "h100-cw_0" + - "h100-cw_1" + - "h200-cw_0" + - "h200-cw_1" + - "h200-nb_0" + - "h200-nb_1" + - "h200-nb_2" + - "h200-nb_3" + - "h200-nv_0" + - "h200-nv_1" + - "h200-nv_2" + - "h200-nv_3" + - "b200-nv_0" + - "b200-nv_1" + - "b200-nb_0" + - "b200-nb_1" + - "b200-nvd_0" + - "b200-nvd_1" + - "b200-nvd_2" + - "b200-nvd_3" + - "b200-tg_0" + - "mi300x-amd_0" + - "mi300x-amd_1" + - "mi300x-amd_2" + - "mi300x-amd_3" + - "mi300x-amd_4" + - "mi300x-cr_0" + - "mi300x-oci_0" + - "mi325x-amd_0" + - "mi325x-tw_0" + - "mi325x-tw_1" + - "mi325x-tw_2" + - "mi325x-tw_3" + - "mi355x-amd_0" + - "mi355x-amd_1" + - "mi355x-amd_2" + - "mi355x-amd_3" + +jobs: + get-jobs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-jobs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ + --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ + --key ${{ inputs.name }} \ + ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + test-sweep: + needs: get-jobs + uses: ./.github/workflows/benchmark-tmpl.yml + name: test sweep - ${{ inputs.name }} + strategy: + 
fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_1k1k" + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + calc-success-rate: + needs: test-sweep + if: ${{ always() }} + runs-on: ubuntu-latest + + env: + RESULTS_DIR: "results/" + STATS_FILENAME: "run_stats" + GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download results artifacts + uses: actions/download-artifact@v4 + with: + path: ${{ env.RESULTS_DIR }} + pattern: results_* + + - name: Install python dependencies + run: pip install PyGithub + + - name: Calculate success rate + run: python3 utils/calc_success_rate.py $STATS_FILENAME + + - uses: actions/upload-artifact@v4 + with: + name: "run-stats" + path: ${{ env.STATS_FILENAME }}.json From e59f2d712648f91434fcc99cc14c47ba3c7711f6 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 17:05:27 -0500 Subject: [PATCH 137/149] deleting files --- .github/workflows/test.yml | 147 ------------------------------------- 1 file changed, 147 deletions(-) delete mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml deleted file mode 100644 index 0d92952da..000000000 --- a/.github/workflows/test.yml +++ /dev/null @@ -1,147 +0,0 @@ -name: Test - Full Sweep - -concurrency: - group: benchmark-lock - cancel-in-progress: false - -on: - pull_request: - workflow_dispatch: - 
inputs: - name: - description: "Name of benchmark from master configs" - required: true - type: string - default: 70b-fp4-mi355x-vllm - - run_1k1k: - description: "Run ISL/OSL 1k/1k" - type: boolean - required: true - run_1k8k: - description: "Run ISL/OSL 1k/8k" - type: boolean - required: true - run_8k1k: - description: "Run ISL/OSL 8k/1k" - type: boolean - required: true - - runner: - description: "Specific runner node to run on" - required: false - type: choice - options: - - "h100-cr_0" - - "h100-cr_1" - - "h100-cw_0" - - "h100-cw_1" - - "h200-cw_0" - - "h200-cw_1" - - "h200-nb_0" - - "h200-nb_1" - - "h200-nb_2" - - "h200-nb_3" - - "h200-nv_0" - - "h200-nv_1" - - "h200-nv_2" - - "h200-nv_3" - - "b200-nv_0" - - "b200-nv_1" - - "b200-nb_0" - - "b200-nb_1" - - "b200-nvd_0" - - "b200-nvd_1" - - "b200-nvd_2" - - "b200-nvd_3" - - "b200-tg_0" - - "mi300x-amd_0" - - "mi300x-amd_1" - - "mi300x-amd_2" - - "mi300x-amd_3" - - "mi300x-amd_4" - - "mi300x-cr_0" - - "mi300x-oci_0" - - "mi325x-amd_0" - - "mi325x-tw_0" - - "mi325x-tw_1" - - "mi325x-tw_2" - - "mi325x-tw_3" - - "mi355x-amd_0" - - "mi355x-amd_1" - - "mi355x-amd_2" - - "mi355x-amd_3" - -jobs: - get-jobs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - id: get-jobs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ - --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ - --key ${{ inputs.name }} \ - ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - - test-sweep: - needs: get-jobs - uses: ./.github/workflows/benchmark-tmpl.yml - name: test sweep - ${{ 
inputs.name }} - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "dsr1_1k1k" - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} - - calc-success-rate: - needs: test-sweep - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json From fe445a1a9dfcc5d1cb7cca8505ba3e50f8b4f766 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 138/149] testing concurrency From d1540496a2eaed5c218a1582b579230f80277861 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 139/149] testing concurrency From 880d3c8276eb8ce839776d2876f5fa43b85c7aae Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 140/149] testing concurrency From 026d16b82a25399f3c68cfa49b10c66ba3f9566c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: 
Fri, 31 Oct 2025 18:46:45 -0500 Subject: [PATCH 141/149] remove 70b --- benchmarks/70b_fp4_b200_trt_slurm.sh | 75 ------------------------ benchmarks/70b_fp8_b200_trt_slurm.sh | 75 ------------------------ benchmarks/70b_fp8_h200_slurm.sh | 69 ---------------------- benchmarks/70b_fp8_h200_trt_slurm.sh | 70 ---------------------- benchmarks/70b_fp8_mi325x_slurm.sh | 86 ---------------------------- 5 files changed, 375 deletions(-) delete mode 100644 benchmarks/70b_fp4_b200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_b200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi325x_slurm.sh diff --git a/benchmarks/70b_fp4_b200_trt_slurm.sh b/benchmarks/70b_fp4_b200_trt_slurm.sh deleted file mode 100644 index ad24453b3..000000000 --- a/benchmarks/70b_fp4_b200_trt_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server 
-mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_b200_trt_slurm.sh b/benchmarks/70b_fp8_b200_trt_slurm.sh deleted file mode 100644 index ad24453b3..000000000 --- a/benchmarks/70b_fp8_b200_trt_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF 
-else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_slurm.sh b/benchmarks/70b_fp8_h200_slurm.sh deleted file mode 100644 index 094fbd19c..000000000 --- a/benchmarks/70b_fp8_h200_slurm.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -set -x -hf download $MODEL -pip install datasets pandas - -# Calculate max-model-len based on ISL and OSL -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) -elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) -else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} -fi - -# Create 
config.yaml -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -export TORCH_CUDA_ARCH_LIST="9.0" - -PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ - --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ - --disable-log-requests > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_trt_slurm.sh b/benchmarks/70b_fp8_h200_trt_slurm.sh deleted file mode 100644 index dfb2324b9..000000000 --- a/benchmarks/70b_fp8_h200_trt_slurm.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" 
== "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh deleted file mode 100644 index 1febeff13..000000000 --- a/benchmarks/70b_fp8_mi325x_slurm.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -huggingface-cli download $MODEL - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + 
$PORT_OFFSET )) - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# Patch the aiter config script to deal -# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. -file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' -sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch - - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). 
-set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json From 4a81cd4c5619deacd89df54476af5b07cddca18d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 18:51:12 -0500 Subject: [PATCH 142/149] cleaning up after rebase --- .github/workflows/1k1k-sweep.yml | 6 +----- .github/workflows/1k8k-sweep.yml | 7 ++++--- .github/workflows/8k1k-sweep.yml | 1 + 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index e806f4c70..0930f8a9a 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,8 +1,4 @@ -name: "1K/1K Sweep" - -concurrency: - group: benchmark-lock-1k1k - cancel-in-progress: false +name: "Full Sweep Scheduler - 1k1k" on: workflow_dispatch: diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index f4bb4338e..9dacb5a9f 100644 --- 
a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -1,9 +1,9 @@ name: "Full Sweep Scheduler - 1k8k" on: - workflow_dispatch: - schedule: - - cron: '0 23 * * *' + workflow_dispatch: + schedule: + - cron: "0 23 * * *" jobs: get-dsr1-configs: @@ -16,6 +16,7 @@ jobs: - id: get-dsr1-configs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index fdb6b6112..3a0ae47c3 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -16,6 +16,7 @@ jobs: - id: get-dsr1-configs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT From cac35bc5de887b2097b0f9fb3c0e8fdabb4c2b4b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 18:54:05 -0500 Subject: [PATCH 143/149] changing name of files from XkYk to shceduler --- .../workflows/{1k1k-sweep.yml => full-sweep-1k1k-scheduler.yml} | 0 .../workflows/{1k8k-sweep.yml => full-sweep-1k8k-scheduler.yml} | 0 .../workflows/{8k1k-sweep.yml => full-sweep-8k1k-scheduler.yml} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{1k1k-sweep.yml => full-sweep-1k1k-scheduler.yml} (100%) rename .github/workflows/{1k8k-sweep.yml => full-sweep-1k8k-scheduler.yml} (100%) rename .github/workflows/{8k1k-sweep.yml => full-sweep-8k1k-scheduler.yml} (100%) diff --git a/.github/workflows/1k1k-sweep.yml 
b/.github/workflows/full-sweep-1k1k-scheduler.yml similarity index 100% rename from .github/workflows/1k1k-sweep.yml rename to .github/workflows/full-sweep-1k1k-scheduler.yml diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml similarity index 100% rename from .github/workflows/1k8k-sweep.yml rename to .github/workflows/full-sweep-1k8k-scheduler.yml diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml similarity index 100% rename from .github/workflows/8k1k-sweep.yml rename to .github/workflows/full-sweep-8k1k-scheduler.yml From b60289e52d67d54e656369e5bb44aa3c1ea3f963 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 3 Nov 2025 09:39:03 -0600 Subject: [PATCH 144/149] double check and update master configs --- .github/configs/nvidia-master.yaml | 39 +++++++++++++++--------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 92dfb5bbd..e9af1ce19 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -188,7 +188,7 @@ gptoss-fp4-b200-trt: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 8 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -208,14 +208,14 @@ gptoss-fp4-b200-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } @@ -225,7 +225,7 @@ gptoss-fp4-b200-vllm: - { 
tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: image: vllm/vllm-openai:v0.10.2 @@ -252,7 +252,7 @@ gptoss-fp4-h100-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, conc-start: 4, conc-end: 16 } gptoss-fp4-h200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev @@ -261,28 +261,29 @@ gptoss-fp4-h200-trt: runner: h200-trt precision: fp4 framework: trt + # For all sequence lengths, EP=TP, DP_ATTENTION=false seq-len-configs: - isl: 1024 osl: 1024 search-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 search-space: - - { tp: 1, ep: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 search-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 1, ep: 1, dp-attn: false, 
conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } gptoss-fp4-h200-vllm: image: vllm/vllm-openai:v0.10.2 @@ -295,14 +296,14 @@ gptoss-fp4-h200-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 1, conc-start: 4, conc-end: 4 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 search-space: - - { tp: 1, conc-start: 4, conc-end: 16 } + - { tp: 1, conc-start: 4, conc-end: 4 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } From 9fba14ae5a9137dd797cdce18ae68d50a51ace27 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 3 Nov 2025 09:44:57 -0600 Subject: [PATCH 145/149] double check and update master configs pt 2 --- .github/configs/amd-master.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index d9558f284..82251c8be 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -155,17 +155,17 @@ gptoss-fp4-mi355x-vllm: osl: 1024 search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 4, conc-start: 4, conc-end: 8 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 4, conc-start: 4, conc-end: 8 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 4, conc-start: 4, conc-end: 4 } + - { tp: 8, conc-start: 4, conc-end: 8 } From 
c33187411db4335070ae11c0b19ae8111c56832a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 3 Nov 2025 09:46:16 -0600 Subject: [PATCH 146/149] add pydantic pip install --- .github/workflows/full-sweep-1k8k-scheduler.yml | 1 + .github/workflows/full-sweep-8k1k-scheduler.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index 9dacb5a9f..a8ee10d00 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -30,6 +30,7 @@ jobs: - id: get-gptoss-configs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 3a0ae47c3..cd9cd0531 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -30,6 +30,7 @@ jobs: - id: get-gptoss-configs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT From 582e1b1702a91ba274b753779833b7b1838eff11 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 3 Nov 2025 11:19:54 -0600 Subject: [PATCH 147/149] bug fix --- .github/workflows/full-sweep-1k8k-scheduler.yml | 6 +++--- .github/workflows/full-sweep-8k1k-scheduler.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml 
b/.github/workflows/full-sweep-1k8k-scheduler.yml index a8ee10d00..4d7e5cc22 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -17,7 +17,7 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -31,7 +31,7 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-dsr1: @@ -85,7 +85,7 @@ jobs: # This is a workaround until we can integrate GB200 into master configs. 
benchmark-gb200: uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep + name: gb200 1k8k sweep strategy: fail-fast: false matrix: diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index cd9cd0531..a4a492178 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -85,7 +85,7 @@ jobs: # This is a workaround until we can integrate GB200 into master configs. benchmark-gb200: uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep + name: gb200 8k1k sweep strategy: fail-fast: false matrix: From 4b78c4abbff4fcda58247158ab0fb2f236cd7c57 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 3 Nov 2025 11:23:26 -0600 Subject: [PATCH 148/149] update cron trigger to 9:00 PM CDT --- .github/workflows/full-sweep-1k1k-scheduler.yml | 2 +- .github/workflows/full-sweep-1k8k-scheduler.yml | 2 +- .github/workflows/full-sweep-8k1k-scheduler.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 0930f8a9a..6e2128218 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -3,7 +3,7 @@ name: "Full Sweep Scheduler - 1k1k" on: workflow_dispatch: schedule: - - cron: "0 23 * * *" + - cron: "0 3 * * *" jobs: get-dsr1-configs: diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index 4d7e5cc22..b8437969e 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -3,7 +3,7 @@ name: "Full Sweep Scheduler - 1k8k" on: workflow_dispatch: schedule: - - cron: "0 23 * * *" + - cron: "0 3 * * *" jobs: get-dsr1-configs: diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 
a4a492178..bc3cd07dc 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -3,7 +3,7 @@ name: "Full Sweep Scheduler - 8k1k" on: workflow_dispatch: schedule: - - cron: "0 23 * * *" + - cron: "0 3 * * *" jobs: get-dsr1-configs: From 7c4c931a0660a760c9f2a9020737285eae1b4907 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 3 Nov 2025 12:02:08 -0600 Subject: [PATCH 149/149] runner name bug in process result python script --- .github/workflows/benchmark-multinode-tmpl.yml | 2 ++ .github/workflows/benchmark-tmpl.yml | 2 ++ utils/process_result.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index bfbd5a1cf..4b079f578 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -86,6 +86,8 @@ jobs: fi - name: Process results + env: + RUNNER_TYPE: ${{ inputs.runner }} run: | # Process each result file for result_file in ${RESULT_FILENAME}_*.json; do diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 293e3ac49..8d041bc73 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -138,6 +138,8 @@ jobs: fi - name: Process result + env: + RUNNER_TYPE: ${{ inputs.runner }} run: | python3 utils/process_result.py - name: Upload result diff --git a/utils/process_result.py b/utils/process_result.py index d59a61790..94ca30f24 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -4,7 +4,7 @@ from pathlib import Path -hw = os.environ.get('RUNNER_NAME') +hw = os.environ.get('RUNNER_TYPE') tp_size = int(os.environ.get('TP')) ep_size = int(os.environ.get('EP_SIZE')) dp_attention = os.environ.get('DP_ATTENTION')