diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index b2379df46..f1e43a5c4 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -50,6 +50,14 @@ on: disagg: required: true type: string + run-eval: + type: boolean + required: true + default: false + random-range-ratio: + required: false + type: string + default: '0.8' ref: description: "Git ref (branch/sha) to checkout" required: false @@ -74,6 +82,7 @@ env: CONC: ${{ inputs.conc }} SPEC_DECODING: ${{ inputs.spec-decoding }} DISAGG: ${{ inputs.disagg }} + RUN_EVAL: ${{ inputs.run-eval }} permissions: contents: read @@ -82,7 +91,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }} spec=${{ inputs.spec-decoding }}' + name: "${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} ${{ inputs.run-eval && 'eval ' || '' }}tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }} spec=${{ inputs.spec-decoding }}" steps: - name: Resource cleanup run: | @@ -113,7 +122,11 @@ jobs: - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} + RUNNER_TYPE: ${{ inputs.runner }} RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_dpa_${{ env.DP_ATTENTION }}_conc${{ env.CONC }}_specdecode_${{ env.SPEC_DECODING }}_${{ runner.name }} + # Suppress per-job eval markdown from being appended to the step summary. + # We'll publish a single combined eval table in the collection job instead. + GITHUB_STEP_SUMMARY: '' run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh FOUND_RESULT_FILE= @@ -137,8 +150,27 @@ jobs: RUNNER_TYPE: ${{ inputs.runner }} run: | python3 utils/process_result.py + - name: Upload result uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 with: name: bmk_${{ env.RESULT_FILENAME }} path: agg_${{ env.RESULT_FILENAME }}.json + + - name: Upload eval results (if any) + if: ${{ env.RUN_EVAL == 'true' }} + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + with: + name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }} + path: | + meta_env.json + results*.json + sample*.jsonl + if-no-files-found: ignore + + - name: Cleanup eval outputs (post-upload) + if: ${{ env.RUN_EVAL == 'true' }} + run: | + rm -f meta_env.json || true + # Remove any eval results JSONs that were moved into workspace + rm -f results*.json || true diff --git a/.github/workflows/collect-evals.yml b/.github/workflows/collect-evals.yml new file mode 100644 index 000000000..606117e79 --- /dev/null +++ b/.github/workflows/collect-evals.yml @@ -0,0 +1,46 @@ +name: Template - Collect Evals + +on: + workflow_call: + inputs: + result-prefix: + required: false + type: string + default: '' + +permissions: + contents: read + +jobs: + collect-evals: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download eval artifacts + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 + with: + path: eval_results/ + pattern: ${{ inputs.result-prefix && format('eval_{0}_*', inputs.result-prefix) || 'eval_*' }} + + - name: Summarize evals + run: | 
+ pip install tabulate + echo "## Eval Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + python3 utils/collect_eval_results.py eval_results/ ${{ inputs.result-prefix || 'all' }} >> $GITHUB_STEP_SUMMARY + + - name: Upload aggregated evals + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + with: + name: eval_results_${{ inputs.result-prefix || 'all' }} + path: agg_eval_${{ inputs.result-prefix || 'all' }}.json + + - name: Cleanup downloaded eval artifacts + if: ${{ always() }} + run: | + rm -rf eval_results/ || true diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index ccc2ce4e4..5bfbde52e 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -34,7 +34,9 @@ jobs: python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY - name: Aggregate results - run: python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }} + run: | + pip install tabulate + python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }} - name: Upload aggregated results uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 8bc48aed9..d87cba7bd 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -122,6 +122,7 @@ jobs: conc: ${{ matrix.config.conc }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} + run-eval: ${{ matrix.config.run-eval }} ref: ${{ inputs.ref }} collect-results: @@ -129,9 +130,17 @@ jobs: if: ${{ always() }} uses: ./.github/workflows/collect-results.yml secrets: inherit + with: + result-prefix: "bmk" + + collect-evals: + needs: [test-sweep-multi-node, test-sweep-single-node] + if: ${{ always() }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit calc-success-rate: - needs: collect-results + needs: [collect-results, collect-evals] if: ${{ always() }} runs-on: ubuntu-latest diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index e449942d1..224bae7f9 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -142,6 +142,7 @@ jobs: conc: ${{ matrix.config.conc }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} + run-eval: ${{ matrix.config.run-eval }} sweep-single-node-1k8k: needs: setup @@ -184,6 +185,21 @@ jobs: with: result-prefix: "bmk" + collect-evals: + needs: + [ + sweep-single-node-1k1k, + sweep-single-node-1k8k, + sweep-single-node-8k1k, + sweep-multi-node-1k1k, + sweep-multi-node-1k8k, + sweep-multi-node-8k1k, + setup, + ] + if: ${{ always() && needs.setup.result != 'skipped' }} + uses: ./.github/workflows/collect-evals.yml + secrets: inherit + upload-changelog-metadata: needs: [setup, collect-results] if: ${{ always() && needs.setup.result != 'skipped' }} diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index d4f5f46be..cafa5347f 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -122,7 +122,6 @@ run_benchmark_serving() { local use_chat_template=false local server_pid="" - # Parse arguments while [[ $# -gt 0 ]]; do case $1 in --model) @@ -183,7 +182,7 @@ run_benchmark_serving() { ;; esac done - + # Validate all required parameters if [[ -z "$model" ]]; then echo "Error: --model is required" @@ -287,3 +286,226 @@ run_benchmark_serving() { return $benchmark_exit_code } + + +# 
------------------------------ +# Eval (lm-eval-harness) helpers +# ------------------------------ + +_install_lm_eval_deps() { + python3 -m pip install -q --no-cache-dir "lm-eval[api]" || true + python3 -m pip install -q --no-cache-dir --no-deps \ + "git+https://github.com/EleutherAI/lm-evaluation-harness.git@b315ef3b05176acc9732bb7fdec116abe1ecc476" || true +} + +# Patch lm-eval filters to be robust to empty strings via sitecustomize +_patch_lm_eval() { + local patch_dir + patch_dir="$(mktemp -d)" + cat > "$patch_dir/sitecustomize.py" <<'PY' +# --- Patch LocalChatCompletion.parse_generations to handle empty content with reasoning_content --- +import re, sys, unicodedata, json +from lm_eval.filters import extraction as ex +from lm_eval.models.openai_completions import LocalChatCompletion as _LCC + +def _le_parse_generations(outputs, **kwargs): + res = [] + if not isinstance(outputs, list): + outputs = [outputs] + for out in (outputs or []): + try: + choices = out.get("choices", []) + tmp = ["" for _ in choices] + for choice in choices: + idx = choice.get("index", 0) + msg = (choice.get("message") or {}) + content = msg.get("content") + if content in (None, "", []): + content = msg.get("reasoning_content") or "" + tmp[idx] = content + except Exception: + tmp = [""] + res.extend(tmp) + return res + +# Keep staticmethod semantics +_LCC.parse_generations = staticmethod(_le_parse_generations) + +# --- Patch TemplateAPI.apply_chat_template to avoid injecting "type": "text" for TRT --- +try: + from lm_eval.models import api_models as _api_models + _TemplateAPI = _api_models.TemplateAPI + _JsonChatStr = _api_models.JsonChatStr +except Exception: + _TemplateAPI = None + _JsonChatStr = None + +if _TemplateAPI is not None and _JsonChatStr is not None: + _orig_apply_chat_template = _TemplateAPI.apply_chat_template + + def _patched_apply_chat_template( + self, + chat_history, + add_generation_prompt: bool = True, + ): + """Applies a chat template to a list of chat history between user and model.""" + if self.tokenizer_backend == "huggingface" and self.tokenized_requests: + return self.tokenizer.apply_chat_template( + chat_history, + tokenize=False, + add_generation_prompt=add_generation_prompt, + continue_final_message=not add_generation_prompt, + ) + elif self.tokenizer_backend == "remote" and self.tokenized_requests: + return chat_history + else: + # NOTE: we no longer inject `"type": "text"` when tokenizer is None / non-HF + return _JsonChatStr( + json.dumps( + [{**item} for item in chat_history], + ensure_ascii=False, + ) + ) + + _TemplateAPI.apply_chat_template = _patched_apply_chat_template +PY + export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" +} + +run_lm_eval() { + local port="${PORT:-8888}" + local task="${EVAL_TASK:-gsm8k}" + local num_fewshot="${NUM_FEWSHOT:-2}" + local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}" + local gen_max_tokens=16384 + local temperature=0 + local top_p=1 + local concurrent_requests=32 + + while [[ $# -gt 0 ]]; do + case $1 in + --port) port="$2"; shift 2 ;; + --task) task="$2"; shift 2 ;; + --num-fewshot) num_fewshot="$2"; shift 2 ;; + --results-dir) results_dir="$2"; shift 2 ;; + --gen-max-tokens) gen_max_tokens="$2"; shift 2 ;; + --temperature) temperature="$2"; shift 2 ;; + --top-p) top_p="$2"; shift 2 ;; + --concurrent-requests) concurrent_requests="$2"; shift 2 ;; + *) echo "Unknown parameter: $1"; return 1 ;; + esac + done + + _install_lm_eval_deps + _patch_lm_eval + + local openai_server_base="http://0.0.0.0:${port}" + local 
openai_chat_base="${openai_server_base}/v1/chat/completions" + export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} + MODEL_NAME=${MODEL_NAME:-$MODEL} # Prefer MODEL_NAME, else MODEL + + # Export for append_lm_eval_summary to pick up + export EVAL_RESULT_DIR="$results_dir" + + set -x + python3 -m lm_eval --model local-chat-completions --apply_chat_template \ + --tasks "utils/evals/${task}.yaml" \ + --num_fewshot "${num_fewshot}" \ + --output_path "${results_dir}" --log_samples \ + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=5,num_concurrent=${concurrent_requests},tokenized_requests=False,max_length=${gen_max_tokens}" \ + --gen_kwargs "max_tokens=${gen_max_tokens},temperature=${temperature},top_p=${top_p}" + local eval_exit=$? + set +x + return $eval_exit +} + +append_lm_eval_summary() { + local results_dir="${EVAL_RESULT_DIR}" + local task="${EVAL_TASK:-gsm8k}" + local out_dir="${results_dir}" + mkdir -p "$out_dir" || true + + # Write minimal meta for collectors that expect it + local meta_json="${out_dir}/meta_env.json" + local model_name="${MODEL_NAME:-$MODEL}" + local dp_json="false" + if [ "${DP_ATTENTION}" = "true" ]; then dp_json="true"; fi + + # Derive framework/precision from env, fallback to parsing RESULT_FILENAME + # RESULT_FILENAME format (from workflow): + # ___tp<...>_ep<...>_dpa_<...>_conc<...>_ + local fw="${FRAMEWORK:-}" + local prec="${PRECISION:-}" + if [[ -z "$fw" || -z "$prec" ]]; then + if [[ -n "${RESULT_FILENAME}" ]]; then + # Extract the two fields immediately before "_tp" + # Handles arbitrary underscores in exp_name by matching from the end + local parsed + parsed=$(echo "${RESULT_FILENAME}" | sed -n 's/.*_\([^_][^_]*\)_\([^_][^_]*\)_tp.*/\1 \2/p') + local p1="${parsed%% *}" + local p2="${parsed#* }" + if [[ -z "$prec" && -n "$p1" && "$p1" != "$parsed" ]]; then + prec="$p1" + fi + if [[ -z "$fw" && -n "$p2" && "$p2" != "$parsed" ]]; then + fw="$p2" + fi + fi + fi + cat > "${meta_json}" </dev/null) + fi + + # Best-effort cleanup of the temp directory + if [ -n "${out_dir}" ] && [ -d "${out_dir}" ]; then + rm -rf --one-file-system "${out_dir}" || rm -rf "${out_dir}" || true + fi + + echo "Moved eval artifacts to: $(pwd)" +} + +# ------------------------------ +# Unified eval entrypoint +# ------------------------------ + +run_eval() { + local framework="${EVAL_FRAMEWORK:-lm-eval}" + local forwarded=() + + while [[ $# -gt 0 ]]; do + case "$1" in + --framework) framework="$2"; shift 2 ;; + *) forwarded+=("$1"); shift ;; + esac + done + + case "$framework" in + lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" ;; + *) echo "Unknown framework '${framework}'"; return 1 ;; + esac +} diff --git a/benchmarks/dsr1_fp4_b200_docker.sh b/benchmarks/dsr1_fp4_b200_docker.sh index ba4c5a236..30e564dd9 100644 --- a/benchmarks/dsr1_fp4_b200_docker.sh +++ b/benchmarks/dsr1_fp4_b200_docker.sh @@ -58,3 +58,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_b200_slurm.sh b/benchmarks/dsr1_fp4_b200_slurm.sh index b9ce026f1..0da2913d2 100644 --- a/benchmarks/dsr1_fp4_b200_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_slurm.sh @@ -55,3 +55,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ 
--result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh index 33d819efa..dce21701c 100644 --- a/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_mtp_slurm.sh @@ -102,3 +102,10 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ \ --use-chat-template + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index 9a1c1ec67..459cff1b3 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -88,6 +88,8 @@ fi set -x MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ @@ -116,3 +118,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh b/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh index 412e90a66..a63039af3 100644 --- a/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_atom_slurm.sh @@ -64,3 +64,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_mi355x_docker.sh b/benchmarks/dsr1_fp4_mi355x_docker.sh index 1a2d1099f..ba19b64e3 100644 --- a/benchmarks/dsr1_fp4_mi355x_docker.sh +++ b/benchmarks/dsr1_fp4_mi355x_docker.sh @@ -54,3 +54,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp4_mi355x_slurm.sh b/benchmarks/dsr1_fp4_mi355x_slurm.sh index 9a2db61a7..63856676e 100644 --- a/benchmarks/dsr1_fp4_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp4_mi355x_slurm.sh @@ -55,3 +55,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_b200_docker.sh b/benchmarks/dsr1_fp8_b200_docker.sh index 
3c328b267..dd19b94a0 100644 --- a/benchmarks/dsr1_fp8_b200_docker.sh +++ b/benchmarks/dsr1_fp8_b200_docker.sh @@ -90,3 +90,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_b200_slurm.sh b/benchmarks/dsr1_fp8_b200_slurm.sh index 5406df238..da1a7f4cd 100644 --- a/benchmarks/dsr1_fp8_b200_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_slurm.sh @@ -87,3 +87,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index fdc70ad2e..1602d802b 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -58,6 +58,8 @@ fi set -x MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ @@ -86,3 +88,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 5d7c4d02e..117008a63 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -24,7 +24,7 @@ export TORCH_CUDA_ARCH_LIST="9.0" set -x if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then - PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --tokenizer-path $MODEL \ + PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ --disable-radix-cache --max-running-requests 512 --cuda-graph-max-bs 512 \ @@ -33,7 +33,7 @@ if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then --decode-log-interval 1 \ > $SERVER_LOG 2>&1 & else - PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --tokenizer-path $MODEL \ + PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ --disable-radix-cache --max-running-requests 256 --cuda-graph-max-bs 256 \ @@ -59,3 +59,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 9c2a431fc..98a6de420 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ 
-57,7 +57,9 @@ fi set -x -MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) +MAX_NUM_TOKENS=$(( (CONC + ISL + 64 + 63) / 64 * 64 )) +MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) +MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) # Launch TRT-LLM server PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ @@ -86,3 +88,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh index 40481d5d3..c7de3eec5 100644 --- a/benchmarks/dsr1_fp8_mi300x_docker.sh +++ b/benchmarks/dsr1_fp8_mi300x_docker.sh @@ -57,3 +57,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh index a27c1000e..f4e029fe5 100644 --- a/benchmarks/dsr1_fp8_mi300x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh @@ -62,3 +62,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh index aaa3c83d7..c990ef2a1 100644 --- a/benchmarks/dsr1_fp8_mi325x_docker.sh +++ b/benchmarks/dsr1_fp8_mi325x_docker.sh @@ -48,3 +48,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index 639b960d0..82f0833ff 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -51,3 +51,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh b/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh index 412e90a66..a63039af3 100644 --- a/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_atom_slurm.sh @@ -64,3 +64,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh 
b/benchmarks/dsr1_fp8_mi355x_docker.sh index 906d8060e..f6527e9b7 100644 --- a/benchmarks/dsr1_fp8_mi355x_docker.sh +++ b/benchmarks/dsr1_fp8_mi355x_docker.sh @@ -52,3 +52,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh index 4f51837fc..078a9ec48 100644 --- a/benchmarks/dsr1_fp8_mi355x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh @@ -50,3 +50,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_b200_docker.sh b/benchmarks/gptoss_fp4_b200_docker.sh index d3772a484..1a4b55a83 100644 --- a/benchmarks/gptoss_fp4_b200_docker.sh +++ b/benchmarks/gptoss_fp4_b200_docker.sh @@ -47,9 +47,12 @@ export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x -vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ ---gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ -> $SERVER_LOG 2>&1 & +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 512 \ +--disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -69,3 +72,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_b200_slurm.sh b/benchmarks/gptoss_fp4_b200_slurm.sh index 559cb5661..5bcfef9a3 100644 --- a/benchmarks/gptoss_fp4_b200_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_slurm.sh @@ -67,3 +67,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_b200_trt_docker.sh b/benchmarks/gptoss_fp4_b200_trt_docker.sh index 2829a8900..9fdeb4b1f 100644 --- a/benchmarks/gptoss_fp4_b200_trt_docker.sh +++ b/benchmarks/gptoss_fp4_b200_trt_docker.sh @@ -87,3 +87,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC )) + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 703a2a93e..954d7ca93 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -103,3 +103,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename 
"$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC )) + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index e67486d90..2fd6fc67f 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -31,7 +31,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ -> $SERVER_LOG 2>&1 & +--disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -51,3 +51,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index 73f1f0a70..1b4da9cce 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -25,15 +25,17 @@ EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) export TORCH_CUDA_ARCH_LIST="9.0" +PORT=${PORT:-8888} + export VLLM_MXFP4_USE_MARLIN=1 set -x PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---config config.yaml \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=$CONC \ - > $SERVER_LOG 2>&1 & + --config config.yaml \ + --gpu-memory-utilization=0.9 \ + --tensor-parallel-size=$TP \ + --max-num-seqs=$CONC \ + --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -53,3 +55,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 795fd846f..cfea22b9e 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -15,7 +15,6 @@ check_env_vars \ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -set -x hf download $MODEL pip install datasets pandas @@ -38,14 +37,17 @@ max-model-len: $CALCULATED_MAX_MODEL_LEN EOF SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +export TORCH_CUDA_ARCH_LIST="9.0" PORT=$(( 8888 + $PORT_OFFSET )) -export TORCH_CUDA_ARCH_LIST="9.0" export VLLM_MXFP4_USE_MARLIN=1 -PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ - --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ - > $SERVER_LOG 2>&1 & +PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ + --config config.yaml \ + --gpu-memory-utilization 0.9 \ + --tensor-parallel-size $TP \ + --max-num-seqs $CONC \ + --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -63,3 +65,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index e151ee93c..875e6ae72 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -43,7 +43,7 @@ print_iter_log: true stream_interval: 20 EOF -mpirun -n 1 --oversubscribe --allow-run-as-root \ +PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL \ --max_batch_size $CONC \ --max_num_tokens 20000 \ @@ -74,3 +74,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index e66004f9d..467a32a58 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -59,3 +59,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index ca7b7fc2f..bc385c264 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -35,6 +35,8 @@ export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +# +## Start up vllm server set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -64,3 +66,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 8b7dd5c87..054f6c377 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -58,3 +58,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index 4b8532aa6..c0c9597c2 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -44,8 +44,7 @@ vllm serve $MODEL --port $PORT \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling \ -> $SERVER_LOG 2>&1 & +--async-scheduling > $SERVER_LOG 2>&1 & 
SERVER_PID=$! @@ -63,3 +62,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh b/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh index 778a78310..85052b1bc 100644 --- a/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_atom_slurm.sh @@ -65,3 +65,9 @@ run_benchmark_serving \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index bff0be872..7c708ae62 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -55,3 +55,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 18aa13fff..1e5d87dba 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -28,6 +28,8 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 +# +## Start up vllm server set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -56,3 +58,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index a6854173c..76fcfa83b 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -31,7 +31,7 @@ docker run --rm --init --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index c6889aa1c..a0b704d03 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -32,7 +32,7 @@ docker run --rm --init --network host --name $server_name \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \ -e NCCL_GRAPH_REGISTER=0 \ -e 
TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index d1ddc26de..0174087e4 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -10,7 +10,7 @@ docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index 780e5a2f0..55fffdb7c 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -14,7 +14,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index 8fbdaee63..5bd6bd0e2 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -14,7 +14,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ --e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME \ +-e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x_docker.sh" diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py new file mode 100644 index 000000000..8b471034c --- /dev/null +++ b/utils/collect_eval_results.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python3 +import sys +import json +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple +from tabulate import tabulate + +# Import shared utilities from summarize +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from summarize import ( + load_json, MODEL, HARDWARE, FRAMEWORK, PRECISION, + TP, EP, CONC, DP_ATTENTION, TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF, + SPEC_DECODING +) + + +def find_eval_sets(root: Path) -> List[Path]: + """Return 
directories that contain a meta_env.json (one set per job). + + Structure: eval_results//meta_env.json + """ + out: List[Path] = [] + try: + for d in root.iterdir(): + if d.is_dir() and (d / 'meta_env.json').exists(): + out.append(d) + except Exception: + pass + return out + + +def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]: + """Return (lm_eval_json) if present. + + Checks immediate directory for result JSONs. + """ + immediate_jsons = list(d.glob('results*.json')) + [ + p for p in d.glob('*.json') if p.name != 'meta_env.json' + ] + + lm_path = None + le_path = None + + for p in immediate_jsons: + data = load_json(p) + if not isinstance(data, dict): + continue + + if 'lm_eval_version' in data: + # lm-eval harness - pick latest if multiple + if lm_path is None or p.stat().st_mtime > lm_path.stat().st_mtime: + lm_path = p + + return lm_path, le_path + + +def extract_lm_metrics(json_path: Path) -> List[Dict[str, Any]]: + """Extract metrics from lm-eval harness result JSON. + + Returns a list of metric dicts, one per task in the results. + + Uses explicit structure from the JSON file: + - Task names from results keys + - Metric name from configs.metric_list + - Filter names from configs.filter_list + - Values from results[task][metric,filter] + """ + data = load_json(json_path) or {} + results = data.get('results', {}) + configs = data.get('configs', {}) + + if not results: + return [] + + extracted = [] + + for task in results.keys(): + task_results = results[task] + task_config = configs.get(task, {}) + + # Base metric: from config's metric_list + metric_list = task_config.get('metric_list', []) + base_metric = metric_list[0]['metric'] if metric_list else 'exact_match' + + # Filters: from config's filter_list + filter_list = task_config.get('filter_list', []) + + strict_val, strict_se = None, None + flex_val, flex_se = None, None + accuracy_val, accuracy_se = None, None + + # Helper to get value/stderr pair for filtered metrics + def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]: + val_key = f"{base_metric},{filter_name}" + se_key = f"{base_metric}_stderr,{filter_name}" + return task_results.get(val_key), task_results.get(se_key) + + # Extract metrics based on filter_list + if not filter_list: + # No filters - check for accuracy or use base metric + if 'acc' in task_results: + accuracy_val = task_results.get('acc') + accuracy_se = task_results.get('acc_stderr') + else: + strict_val = task_results.get(base_metric) + strict_se = task_results.get(f"{base_metric}_stderr") + else: + # Extract metrics for each filter + for f in filter_list: + fname = f['name'] + if 'strict' in fname: + strict_val, strict_se = get_val_se(fname) + elif 'flex' in fname or 'extract' in fname: + flex_val, flex_se = get_val_se(fname) + + # N-samples (effective count) + n_eff = data.get('n-samples', {}).get(task, {}).get('effective') + + # Model name + model = ( + data.get('model_name') + or task_config.get('metadata', {}).get('model') + ) + + extracted.append({ + 'task': task, + 'strict': strict_val, + 'strict_se': strict_se, + 'flex': flex_val, + 'flex_se': flex_se, + 'accuracy': accuracy_val, + 'accuracy_se': accuracy_se, + 'n_eff': n_eff, + 'model': model, + 'source': str(json_path) + }) + + return extracted + + +def pct(x: Any) -> str: + """Format value as percentage.""" + try: + return f"{float(x)*100:.2f}%" + except Exception: + return 'N/A' + + +def se(x: Any) -> str: + """Format stderr as percentage with ± prefix.""" + try: + return f" ±{float(x)*100:.2f}%" + 
except Exception: + return '' + + +def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]: + """Build a result row from metadata and extracted metrics.""" + row = { + 'model': m.get('model') or meta.get('model', 'unknown'), + 'hw': meta.get('hw', 'unknown').upper(), + 'framework': meta.get('framework', 'unknown').lower(), + 'precision': meta.get('precision', 'unknown').lower(), + 'spec_decoding': meta.get('spec_decoding', 'unknown'), + 'tp': int(meta.get('tp', 1)), + 'ep': int(meta.get('ep', 1)), + 'conc': int(meta.get('conc', 0)), + 'dp_attention': str(meta.get('dp_attention', "none")).lower(), + 'task': m.get('task', 'unknown'), + 'em_strict': m.get('strict'), + 'em_strict_se': m.get('strict_se'), + 'em_flexible': m.get('flex'), + 'em_flexible_se': m.get('flex_se'), + 'n_eff': m.get('n_eff'), + 'source': m.get('source'), + } + + # Add universal score field (primary metric for unified comparison) + if m.get('strict') is not None: + row['score'] = m.get('strict') + row['score_name'] = 'em_strict' + row['score_se'] = m.get('strict_se') + elif m.get('accuracy') is not None: + row['score'] = m.get('accuracy') + row['score_name'] = 'accuracy' + row['score_se'] = m.get('accuracy_se') + else: + row['score'] = None + row['score_name'] = None + row['score_se'] = None + + return row + + +def main(): + if len(sys.argv) < 3: + print('Usage: collect_eval_results.py ') + sys.exit(1) + + root = Path(sys.argv[1]) + exp_name = sys.argv[2] + + rows: List[Dict[str, Any]] = [] + for d in find_eval_sets(root): + meta = load_json(d / 'meta_env.json') or {} + lm_path, le_path = detect_eval_jsons(d) + + # Extract metrics (prefer lm-eval) - returns list for multi-task support + if lm_path: + metrics_list = extract_lm_metrics(lm_path) + else: + continue + + if not metrics_list: + continue + + # Build row for each task in the results + for m in metrics_list: + row = build_row(meta, m) + rows.append(row) + + # Sort for stable output + rows.sort(key=lambda r: ( + r['hw'], r['framework'], r['precision'], r.get('spec_decoding', ''), r['tp'], r['ep'], r['conc'] + )) + + if not rows: + print('> No eval results found to summarize.') + else: + # Print table using tabulate + headers = [ + MODEL, HARDWARE, FRAMEWORK, PRECISION, SPEC_DECODING, TP, EP, CONC, DP_ATTENTION, + TASK, SCORE, EM_STRICT, EM_FLEXIBLE, N_EFF + ] + + table_rows = [ + [ + r['model'], + r['hw'], + r['framework'].upper(), + r['precision'].upper(), + r['spec_decoding'], + r['tp'], + r['ep'], + r['conc'], + r['dp_attention'], + r['task'], + f"{pct(r['score'])}{se(r['score_se'])}", + f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", + f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", + r['n_eff'] or '' + ] + for r in rows + ] + + print(tabulate(table_rows, headers=headers, tablefmt="github")) + + # Write JSON aggregate + out_path = Path(f'agg_eval_{exp_name}.json') + with open(out_path, 'w') as f: + json.dump(rows, f, indent=2) + + +if __name__ == '__main__': + main() diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md new file mode 100644 index 000000000..fcdcd5360 --- /dev/null +++ b/utils/evals/EVALS.md @@ -0,0 +1,26 @@ +# Evals + +## What? +Quick graded QnA which measures model performance. Examples of test suites: +- **gsm8k**: Grade school math questions +- **gpqa**: Graduate level, Google-Proof multiple choice questions +- **math500**: Math questions spanning topics like probability, algebra, trigonometry, and geometry. + +## When? +At highest concurrency for highest TP and lowest TP, per GPU per model per ISL/OSL. 
Logic is defined in `mark_eval_entries` of `utils/matrix_logic/generate_sweep_configs.py` + +## Why? +To verify how model outputs are affected by throughput optimizations. +- TP/Conc might affect model outputs +- Check kernel implementations for correctness + +## How? +- `run_eval`, defined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*` and runs EleutherAI/lm-evaluation-harness (lm-eval) against the same endpoint as the throughput benchmark. JSON results are processed and converted to a table with `utils/collect_eval_results.py`. + +## Misc +The following files are task definitions from lm-eval; see the comments in each file for more info on the changes: +- `utils/evals/math500.yaml` +- `utils/evals/gsm8k.yaml` + + + diff --git a/utils/evals/gsm8k.yaml b/utils/evals/gsm8k.yaml new file mode 100644 index 000000000..73a1f7c1e --- /dev/null +++ b/utils/evals/gsm8k.yaml @@ -0,0 +1,48 @@ +# YAML from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml +# Changed doc_to_text so model answers properly. Also see lm-evaluation-harness#3411. +tag: + - math_word_problems +task: gsm8k +dataset_path: gsm8k +dataset_name: main +output_type: generate_until +training_split: train +fewshot_split: train +test_split: test +doc_to_text: "Question: {{question}}\nEnd your answer with: #### \nAnswer:" +doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: false + regexes_to_ignore: + - "," + - "\\$" + - "(?s).*#### " + - "\\.$" +generation_kwargs: + until: + - "Question:" + - "" + - "<|im_end|>" + do_sample: false + temperature: 0.0 +repeats: 1 +num_fewshot: 5 +filter_list: + - name: "strict-match" + filter: + - function: "regex" + group_select: -1 + regex_pattern: "#### (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - name: "flexible-extract" + filter: + - function: "regex" + group_select: -1 + regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)" + - function: "take_first" +metadata: + version: 3.0 diff --git a/utils/evals/math500.yaml b/utils/evals/math500.yaml new file mode 100644 index 000000000..2e172e7f0 --- /dev/null +++ b/utils/evals/math500.yaml @@ -0,0 +1,36 @@ +# YAML from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml +# Changed regex and prompt +tag: + - math_word_problems +task: hendrycks_math_algebra +dataset_path: HuggingFaceH4/MATH-500 +process_docs: !function utils.process_docs +dataset_name: algebra +output_type: generate_until +training_split: train +test_split: test +doc_to_text: "You are solving competition math problems.\n\nFormat rules:\n- Answer in a new line that starts with `Answer: `.\n- After `Answer: `, write ONLY the answer as inline LaTeX.\n- Use ONLY ASCII LaTeX commands (e.g. \\pi, \\frac{1}{2}, -). NO Unicode symbols.\n- Do NOT wrap the answer in $, $$, \\( \\), \\[ \\], or any other delimiters.\n- Do NOT use \\displaystyle or any display-style commands. Answer only this problem, the rest are examples. 
Problem: {{problem}}\n" +process_results: !function utils.process_results +doc_to_target: "{{answer}}" +generation_kwargs: + until: + - "Problem:" + do_sample: false + temperature: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + regexes_to_ignore: + - "\\\\left" + - "\\\\right" + - "\\s+" +filter_list: + - name: "strict-match" + filter: + - function: "regex" + group_select: -1 + regex_pattern: "Answer:\\s*([^\\n]+)" + - function: "take_first" +metadata: + version: 1.0 \ No newline at end of file diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 67b406d49..b6c2cf2f2 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -1,3 +1,4 @@ +from ast import For import json import argparse import sys @@ -31,6 +32,77 @@ def seq_len_to_str(isl: int, osl: int) -> str: """ return seq_len_itos.get((isl, osl), f"{isl}_{osl}") +def mark_eval_entries(matrix_values: list[dict]) -> list[dict]: + """Eval selection policy (single-node only): + - Only consider 1k8k (isl=1024, osl=8192). + - For each unique (model, runner, framework, precision, isl, osl, spec-decoding): + - Mark highest TP with highest conc + - Mark lowest TP with highest conc + + Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated + independently. + """ + from collections import defaultdict + + # Only run evals on 1k8k + target_isl, target_osl = seq_len_stoi["1k8k"] + # Group entries by (model, runner, framework, precision, isl, osl) + # Only include entries that have a top-level TP (i.e., single-node schema). + # This avoids relying on structural hints like prefill/decode which may be + # reused by future single-node disaggregated modes. 
+ groups = defaultdict(list) + for i, entry in enumerate(matrix_values): + # Skip entries without a top-level TP field + if Fields.TP.value not in entry: + continue + + if entry.get(Fields.ISL.value) != target_isl or entry.get(Fields.OSL.value) != target_osl: + continue + + key = ( + entry[Fields.MODEL.value], + entry[Fields.RUNNER.value], + entry[Fields.FRAMEWORK.value], + entry[Fields.PRECISION.value], + entry[Fields.ISL.value], + entry[Fields.OSL.value], + entry[Fields.SPEC_DECODING.value] + ) + groups[key].append((i, entry)) + + # For each group, find highest TP/highest conc and lowest TP/highest conc + eval_indices = set() + for key, entries in groups.items(): + if not entries: + continue + + # Find min and max TP values + min_tp = min(e[Fields.TP.value] for _, e in entries) + max_tp = max(e[Fields.TP.value] for _, e in entries) + + # Find highest conc for highest TP + highest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == max_tp] + if highest_tp_entries: + max_conc_highest_tp = max(e[Fields.CONC.value] for _, e in highest_tp_entries) + for i, e in highest_tp_entries: + if e[Fields.CONC.value] == max_conc_highest_tp: + eval_indices.add(i) + + # Find highest conc for lowest TP (only if different from max_tp) + if min_tp != max_tp: + lowest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == min_tp] + if lowest_tp_entries: + max_conc_lowest_tp = max(e[Fields.CONC.value] for _, e in lowest_tp_entries) + for i, e in lowest_tp_entries: + if e[Fields.CONC.value] == max_conc_lowest_tp: + eval_indices.add(i) + + # Mark the selected entries + for i, entry in enumerate(matrix_values): + entry[Fields.RUN_EVAL.value] = i in eval_indices + + return matrix_values + def generate_full_sweep(args, all_config_data, runner_data): """Generate full sweep configurations with optional filtering. @@ -528,6 +600,17 @@ def main(): default='.github/configs/runners.yaml', help='Configuration file holding runner information (YAML format, defaults to .github/configs/runners.yaml)' ) + eval_group = parent_parser.add_mutually_exclusive_group() + eval_group.add_argument( + '--run-evals', + action='store_true', + help='When specified, run evals on a subset of configs (in addition to all configs).' + ) + eval_group.add_argument( + '--evals-only', + action='store_true', + help='When specified, run ONLY the eval subset (excludes non-eval configs).' 
+ ) # Create main parser parser = argparse.ArgumentParser( @@ -696,6 +779,13 @@ def main(): matrix_values = generate_test_config_sweep(args, all_config_data) else: parser.error(f"Unknown command: {args.command}") + + # Handle eval options (mutually exclusive) + if args.run_evals or args.evals_only: + matrix_values = mark_eval_entries(matrix_values) + # IF --evals-only is specified, filter to only eval entries + if args.evals_only: + matrix_values = [e for e in matrix_values if e.get(Fields.RUN_EVAL.value, False)] print(json.dumps(matrix_values)) return matrix_values diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index ea6aa4ece..b8a102684 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -51,6 +51,9 @@ class Fields(Enum): EXP_NAME = 'exp-name' DISAGG = 'disagg' + # Eval + RUN_EVAL = 'run-eval' + """ Below is the validation logic for the OUTPUT of utils/matrix_logic/generate_sweep_configs.py, i.e., @@ -85,6 +88,7 @@ class SingleNodeMatrixEntry(BaseModel): max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) disagg: bool + run_eval: bool = Field(alias=Fields.RUN_EVAL.value, default=False) class WorkerConfig(BaseModel): @@ -121,6 +125,7 @@ class MultiNodeMatrixEntry(BaseModel): max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) disagg: bool + run_eval: bool = Field(alias=Fields.RUN_EVAL.value, default=False) def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: diff --git a/utils/process_changelog.py b/utils/process_changelog.py index c156e2361..7d21047ff 100644 --- a/utils/process_changelog.py +++ b/utils/process_changelog.py @@ -113,6 +113,7 @@ def main(): *configs_to_run, "--config-files", *MASTER_CONFIGS, + "--run-evals" ], capture_output=True, text=True, diff --git a/utils/summarize.py b/utils/summarize.py index a46c2e02a..b4f4ce6a1 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -1,6 +1,7 @@ import sys import json from pathlib import Path +from typing import Any, Dict, Optional from tabulate import tabulate # Header constants @@ -33,95 +34,122 @@ DECODE_WORKERS = "Decode Workers" DECODE_GPUS = "Decode GPUs" -results = [] -results_dir = Path(sys.argv[1]) -for result_path in results_dir.rglob('*.json'): - with open(result_path) as f: - result = json.load(f) - results.append(result) - -single_node_results = [r for r in results if not r['is_multinode']] -multinode_results = [r for r in results if r['is_multinode']] - -# Single-node and multi-node results have different fields and therefore need to be printed separately -if single_node_results: - single_node_results.sort(key=lambda r: ( - r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc'])) - - single_node_headers = [ - MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, DP_ATTENTION, - CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU - ] - - single_node_rows = [ - [ - r['infmax_model_prefix'], - r['model'], - r['hw'].upper(), - r['framework'].upper(), - r['precision'].upper(), - r['isl'], - r['osl'], - r['tp'], - r['ep'], - r['dp_attention'], - r['conc'], - f"{r['median_ttft'] * 1000:.4f}", - f"{r['median_tpot'] * 1000:.4f}", - f"{r['median_intvty']:.4f}", - f"{r['median_e2el']:.4f}", - f"{r['tput_per_gpu']:.4f}", - f"{r['output_tput_per_gpu']:.4f}", - f"{r['input_tput_per_gpu']:.4f}", +# Eval 
constants +TASK = "Task" +SCORE = "Score" +EM_STRICT = "EM Strict" +EM_FLEXIBLE = "EM Flexible" +N_EFF = "N (eff)" +SPEC_DECODING = "Spec Decode" + + +def load_json(path: Path) -> Optional[Dict[str, Any]]: + """Load JSON file and return dict, or None on error.""" + try: + with open(path, 'r') as f: + return json.load(f) + except Exception: + return None + + +def main(): + if len(sys.argv) < 2: + print("Usage: python summarize.py ") + sys.exit(1) + + results = [] + results_dir = Path(sys.argv[1]) + for result_path in results_dir.rglob('*.json'): + result = load_json(result_path) + if result and 'is_multinode' in result: + results.append(result) + + single_node_results = [r for r in results if not r['is_multinode']] + multinode_results = [r for r in results if r['is_multinode']] + + # Single-node and multi-node results have different fields and therefore need to be printed separately + if single_node_results: + single_node_results.sort(key=lambda r: ( + r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc'])) + + single_node_headers = [ + MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, DP_ATTENTION, + CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU ] - for r in single_node_results - ] - - print("## Single-Node Results\n") - print(tabulate(single_node_rows, headers=single_node_headers, tablefmt="github")) - print("\n") - -if multinode_results: - multinode_results.sort(key=lambda r: (r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], - r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], r['conc'])) - - multinode_headers = [ - MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, - PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, PREFILL_GPUS, - DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, DECODE_GPUS, - CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU - ] - - multinode_rows = [ - [ - r['infmax_model_prefix'], - r['model'], - r['hw'].upper(), - r['framework'].upper(), - r['precision'].upper(), - r['isl'], - r['osl'], - r['prefill_tp'], - r['prefill_ep'], - r['prefill_dp_attention'], - r['prefill_num_workers'], - r['num_prefill_gpu'], - r['decode_tp'], - r['decode_ep'], - r['decode_dp_attention'], - r['decode_num_workers'], - r['num_decode_gpu'], - r['conc'], - f"{r['median_ttft'] * 1000:.4f}", - f"{r['median_tpot'] * 1000:.4f}", - f"{r['median_intvty']:.4f}", - f"{r['median_e2el']:.4f}", - f"{r['tput_per_gpu']:.4f}", - f"{r['output_tput_per_gpu']:.4f}", - f"{r['input_tput_per_gpu']:.4f}", + + single_node_rows = [ + [ + r['infmax_model_prefix'], + r['model'], + r['hw'].upper(), + r['framework'].upper(), + r['precision'].upper(), + r['isl'], + r['osl'], + r['tp'], + r['ep'], + r['dp_attention'], + r['conc'], + f"{r['median_ttft'] * 1000:.4f}", + f"{r['median_tpot'] * 1000:.4f}", + f"{r['median_intvty']:.4f}", + f"{r['median_e2el']:.4f}", + f"{r['tput_per_gpu']:.4f}", + f"{r['output_tput_per_gpu']:.4f}", + f"{r['input_tput_per_gpu']:.4f}", + ] + for r in single_node_results ] - for r in multinode_results - ] - print("## Multi-Node Results\n") - print(tabulate(multinode_rows, headers=multinode_headers, tablefmt="github")) + print("## Single-Node Results\n") + print(tabulate(single_node_rows, headers=single_node_headers, tablefmt="github")) + print("\n") + + if multinode_results: + multinode_results.sort(key=lambda r: (r['infmax_model_prefix'], r['hw'], 
r['framework'], r['precision'], r['isl'], + r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], r['conc'])) + + multinode_headers = [ + MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, + PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, PREFILL_GPUS, + DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, DECODE_GPUS, + CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU + ] + + multinode_rows = [ + [ + r['infmax_model_prefix'], + r['model'], + r['hw'].upper(), + r['framework'].upper(), + r['precision'].upper(), + r['isl'], + r['osl'], + r['prefill_tp'], + r['prefill_ep'], + r['prefill_dp_attention'], + r['prefill_num_workers'], + r['num_prefill_gpu'], + r['decode_tp'], + r['decode_ep'], + r['decode_dp_attention'], + r['decode_num_workers'], + r['num_decode_gpu'], + r['conc'], + f"{r['median_ttft'] * 1000:.4f}", + f"{r['median_tpot'] * 1000:.4f}", + f"{r['median_intvty']:.4f}", + f"{r['median_e2el']:.4f}", + f"{r['tput_per_gpu']:.4f}", + f"{r['output_tput_per_gpu']:.4f}", + f"{r['input_tput_per_gpu']:.4f}", + ] + for r in multinode_results + ] + + print("## Multi-Node Results\n") + print(tabulate(multinode_rows, headers=multinode_headers, tablefmt="github")) + + +if __name__ == "__main__": + main() \ No newline at end of file
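
Reviewer note (not part of the patch): the selection rule in `mark_eval_entries` can be hard to read inline. Below is a minimal, standalone sketch of the same idea — per group, mark the highest-concurrency config at the highest TP, plus the highest-concurrency config at the lowest TP when the two TPs differ. It uses plain string keys and groups only by model; the real code groups on model/runner/framework/precision/ISL/OSL/spec-decoding via the `Fields` enum, filters on a target ISL/OSL, and writes a `run-eval` flag back onto every entry, so treat this purely as an illustration.

```python
# Illustrative sketch only: simplified stand-in for mark_eval_entries.
from collections import defaultdict


def pick_eval_indices(matrix):
    """Per model: mark the highest-concurrency entry at the highest TP and,
    if the lowest TP differs, the highest-concurrency entry at that TP too."""
    groups = defaultdict(list)
    for i, entry in enumerate(matrix):
        if 'tp' not in entry:            # mirrors the top-level TP check in the patch
            continue
        groups[entry['model']].append((i, entry))

    selected = set()
    for entries in groups.values():
        tps = {e['tp'] for _, e in entries}
        for tp in {min(tps), max(tps)}:  # the set dedups when min == max
            same_tp = [(i, e) for i, e in entries if e['tp'] == tp]
            top_conc = max(e['conc'] for _, e in same_tp)
            selected.update(i for i, e in same_tp if e['conc'] == top_conc)
    return selected


matrix = [
    {'model': 'm', 'tp': 1, 'conc': 4},
    {'model': 'm', 'tp': 1, 'conc': 64},    # lowest TP, highest conc  -> eval
    {'model': 'm', 'tp': 8, 'conc': 4},
    {'model': 'm', 'tp': 8, 'conc': 256},   # highest TP, highest conc -> eval
]
assert pick_eval_indices(matrix) == {1, 3}
```

With `--run-evals` these marked entries run evals in addition to the full sweep; with `--evals-only` everything unmarked is dropped from the emitted matrix.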
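
A similarly small sketch of why `run_eval` gets `default=False` in the Pydantic models: matrix entries generated without `--run-evals`/`--evals-only` carry no `run-eval` key and must still validate. This assumes Pydantic v2 (`model_validate`); the toy model mirrors only the new field, not the full `SingleNodeMatrixEntry`/`MultiNodeMatrixEntry`.

```python
# Toy model mirroring only the new field; assumes Pydantic v2.
from pydantic import BaseModel, Field


class EntryWithEval(BaseModel):
    run_eval: bool = Field(alias='run-eval', default=False)


# Entries produced before this change (no 'run-eval' key) still validate and
# default to False, so downstream flag checks remain backward compatible.
assert EntryWithEval.model_validate({}).run_eval is False
assert EntryWithEval.model_validate({'run-eval': True}).run_eval is True
```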
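
Finally, the new eval header constants in `utils/summarize.py` (`TASK`, `SCORE`, `EM_STRICT`, `EM_FLEXIBLE`, `N_EFF`, `SPEC_DECODING`) are added but not exercised in the hunks shown here. One plausible use, sketched under that assumption, is a `tabulate` table in the same github format as the perf tables; the row fields and values below are hypothetical placeholders, not results from this patch.

```python
# Hypothetical sketch: how the new eval header constants might feed a table.
from tabulate import tabulate

TASK = "Task"
EM_STRICT = "EM Strict"
EM_FLEXIBLE = "EM Flexible"
N_EFF = "N (eff)"

# Placeholder row purely for formatting illustration.
eval_rows = [["gsm8k", f"{0.9121:.4f}", f"{0.9356:.4f}", 1319]]
print("## Eval Results\n")
print(tabulate(eval_rows, headers=[TASK, EM_STRICT, EM_FLEXIBLE, N_EFF], tablefmt="github"))
```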