From e332f90acb05c7aeeac38c89577d75b42956bd36 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 15:25:06 -0500 Subject: [PATCH 01/78] add agentic trace replay benchmark infrastructure Trace replay benchmarking for agentic coding workloads using real Claude Code traces. Includes: - Trace replay scripts for H200, MI355X, B200 (vLLM-based) - kv-cache-tester submodule (trace replayer + 522 anonymized traces) - AIPerf submodule (alternative synthetic benchmarking) - Pareto frontier plotting and sweep aggregation - Metrics collector (prometheus scraper + visualization) - Workload distribution analysis - GitHub Actions workflow with per-TP sweep configs - MI355X runner SCRIPT_SUFFIX support Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/multiturn-agentic-trace.yaml | 31 + .../workflows/benchmark-multiturn-tmpl.yml | 184 +++ .github/workflows/multiturn-sweep.yml | 231 +++ .gitmodules | 6 + .../multiturn_fp4_b200_trace_replay.sh | 210 +++ .../multiturn_fp8_h200_trace_replay.sh | 206 +++ .../multiturn_fp8_mi355x_trace_replay.sh | 207 +++ .../multiturn/vllm_benchmark/.gitignore | 4 + experimental/multiturn/vllm_benchmark/aiperf | 1 + .../vllm_benchmark/analysis/__init__.py | 0 .../vllm_benchmark/analysis/plot_pareto.py | 1247 +++++++++++++++++ .../vllm_benchmark/bench/__init__.py | 0 .../vllm_benchmark/bench/metrics_collector.py | 957 +++++++++++++ .../bench/run_metrics_collector.py | 124 ++ .../multiturn/vllm_benchmark/kv-cache-tester | 1 + .../multiturn/vllm_benchmark/requirements.txt | 9 + .../analyze_benchmark_distributions.py | 395 ++++++ .../scripts/collect_sweep_results.py | 340 +++++ .../scripts/plot_sweep_overview.py | 222 +++ runners/launch_mi355x-amds.sh | 4 +- 20 files changed, 4377 insertions(+), 2 deletions(-) create mode 100644 .github/configs/multiturn-agentic-trace.yaml create mode 100644 .github/workflows/benchmark-multiturn-tmpl.yml create mode 100644 .github/workflows/multiturn-sweep.yml create mode 100644 .gitmodules create mode 100755 benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh create mode 100755 benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh create mode 100755 benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh create mode 100644 experimental/multiturn/vllm_benchmark/.gitignore create mode 160000 experimental/multiturn/vllm_benchmark/aiperf create mode 100644 experimental/multiturn/vllm_benchmark/analysis/__init__.py create mode 100644 experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py create mode 100644 experimental/multiturn/vllm_benchmark/bench/__init__.py create mode 100644 experimental/multiturn/vllm_benchmark/bench/metrics_collector.py create mode 100644 experimental/multiturn/vllm_benchmark/bench/run_metrics_collector.py create mode 160000 experimental/multiturn/vllm_benchmark/kv-cache-tester create mode 100644 experimental/multiturn/vllm_benchmark/requirements.txt create mode 100644 experimental/multiturn/vllm_benchmark/scripts/analyze_benchmark_distributions.py create mode 100755 experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py create mode 100644 experimental/multiturn/vllm_benchmark/scripts/plot_sweep_overview.py diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml new file mode 100644 index 000000000..5ec98b902 --- /dev/null +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -0,0 +1,31 @@ +h200-fp8-llama70b: + tp2: + users: [2, 4, 6, 8, 10, 12, 16, 20, 24, 32] + offload: ["on", "off"] + tp4: + users: [2, 4, 6, 8, 
16, 24, 32, 40, 48, 56] + offload: ["on", "off"] + tp8: + users: [2, 4, 6, 8, 16, 32, 48, 64, 80, 128, 256] + offload: ["on", "off"] + +mi355x-fp8-llama70b: + tp2: + users: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56] + offload: ["on", "off"] + tp4: + users: [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 112, 256] + offload: ["on", "off"] + tp8: + users: [1, 2, 4, 8, 16, 32, 64, 96, 128, 160, 256, 512] + offload: ["on", "off"] + +b200-fp4-dsr1: + tp4: + ep: 4 + users: [1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 128] + offload: ["on", "off"] + tp8: + ep: 8 + users: [1, 2, 4, 8, 12, 16, 32, 64, 128] + offload: ["on", "off"] diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml new file mode 100644 index 000000000..a72034b14 --- /dev/null +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -0,0 +1,184 @@ +name: Template - Multi-Turn Benchmark +on: + workflow_call: + inputs: + runner: + required: true + type: string + image: + required: true + type: string + model: + required: true + type: string + precision: + required: false + type: string + default: 'fp4' + exp-name: + required: true + type: string + tp: + required: true + type: string + users: + required: true + type: string + offload-mode: + description: "on = prefix+offload, off = prefix only, noprefix = no prefix caching" + required: true + type: string + duration: + required: false + type: string + default: '' + request-rate: + description: "Request rate per client (Poisson, req/s). 0 = no delay." + required: false + type: string + default: '0' + total-cpu-dram-gb: + required: false + type: string + default: '300' + script-suffix: + description: "Suffix appended to benchmark script name (e.g. '_lmcache')" + required: false + type: string + default: '' + ep: + description: "Expert parallelism size (for MoE models)" + required: false + type: string + default: '0' + ref: + description: "Git ref (branch/sha) to checkout" + required: false + type: string + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_CACHE: '/mnt/hf_hub_cache/' + EXP_NAME: ${{ inputs.exp-name }} + MODEL: ${{ inputs.model }} + IMAGE: ${{ inputs.image }} + PRECISION: ${{ inputs.precision }} + FRAMEWORK: 'vllm' + TP: ${{ inputs.tp }} + EP_SIZE: ${{ inputs.ep }} + USERS: ${{ inputs.users }} + OFFLOAD_MODE: ${{ inputs.offload-mode }} + DURATION: ${{ inputs.duration }} + REQUEST_RATE: ${{ inputs.request-rate }} + TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }} + SCRIPT_SUFFIX: ${{ inputs.script-suffix }} + SPEC_DECODING: 'off' + +permissions: + contents: read + +jobs: + benchmark: + runs-on: ${{ inputs.runner }} + timeout-minutes: 180 + name: "${{ inputs.exp-name }} tp=${{ inputs.tp }} users=${{ inputs.users }} offload=${{ inputs.offload-mode }}" + steps: + - name: Resource cleanup (pre-run) + run: &resource-cleanup | + # Cleanup Docker resources + if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then + echo "[Docker] Cleaning up resources ..." 
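+          # Force-remove any leftover containers, then block below until the
+          # daemon reports none remain before continuing.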
+ docker ps -aq | xargs -r docker rm -f + docker network prune -f + while [ -n "$(docker ps -aq)" ]; do + docker ps -a + sleep 5 + done + fi + + # Cleanup SLURM resources + if command -v squeue >/dev/null 2>&1; then + if [[ "${{ runner.name }}" == mi355x-amds* || "${{ runner.name }}" == mi325x-amd* || "${{ runner.name }}" == mi300x-amds* || "${{ runner.name }}" == gb200-nv* || "${{ runner.name }}" == gb300-nv* || "${{ runner.name }}" == h100-cw* || "${{ runner.name }}" == h200-cw* || "${{ runner.name }}" == b200-nb* || "${{ runner.name }}" == h200-nb* || "${{ runner.name }}" == h100-dgxc-slurm* || "${{ runner.name }}" == h200-dgxc-slurm* || "${{ runner.name }}" == b200-dgxc-slurm* ]]; then + echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." + scancel --name="${{ runner.name }}" || true + while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do + squeue --name="${{ runner.name }}" + sleep 5 + done + else + echo "[Slurm] Cleaning up jobs for user: $USER ..." + scancel -u "$USER" || true + while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do + squeue -u "$USER" + sleep 5 + done + fi + fi + + - name: Clean stale git locks + run: find . -name 'index.lock' -delete 2>/dev/null || true + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + ref: ${{ inputs.ref || github.ref }} + submodules: true + + + - name: Launch job script + env: + RUNNER_NAME: ${{ runner.name }} + RESULT_DIR: /workspace/results + run: | + bash ./runners/launch_${RUNNER_NAME%%_*}.sh + + # The runner script doesn't propagate exit codes (scancel masks them). + # Check status.txt to determine if the benchmark actually succeeded. + if [ ! -f results/status.txt ]; then + echo "Run failed: results/status.txt not found." 
>&2 + exit 1 + fi + STATUS=$(cat results/status.txt) + if [ "$STATUS" != "SUCCESS" ]; then + echo "Run failed: status=$STATUS" >&2 + cat results/benchmark.log 2>/dev/null || true + exit 1 + fi + + - name: Upload results + if: always() + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 + with: + name: "multiturn_tp${{ inputs.tp }}_users${{ inputs.users }}_offload${{ inputs.offload-mode }}" + path: | + results/metrics_client_metrics.csv + results/metrics_server_metrics.csv + results/metrics_plots.png + results/benchmark.log + results/server.log + results/config.yaml + results/vllm_command.txt + results/benchmark_command.txt + results/benchmark_metadata.json + results/metrics_workload.png + results/responses.json + results/aiperf_artifacts/ + results/conversations.jsonl + results/workload_distribution_summary.txt + results/workload_distribution_plots.png + results/trace_replay/ + results/status.txt + if-no-files-found: ignore + + - name: Upload server logs + if: always() + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 + with: + name: "server_logs_tp${{ inputs.tp }}_users${{ inputs.users }}_offload${{ inputs.offload-mode }}" + path: results/server.log + if-no-files-found: ignore + + - name: Resource cleanup (post-run) + if: always() + run: *resource-cleanup diff --git a/.github/workflows/multiturn-sweep.yml b/.github/workflows/multiturn-sweep.yml new file mode 100644 index 000000000..5ed7bf59e --- /dev/null +++ b/.github/workflows/multiturn-sweep.yml @@ -0,0 +1,231 @@ +name: Multi-Turn Benchmark Sweep +run-name: "${{ inputs.run_name || format('Multi-Turn Sweep - tp={0} users={1} offload={2}', inputs.tp_values, inputs.user_values, inputs.offload_values) }}" + +on: + # push: + # branches: + # - experimental/multi-turn-benchmark + # paths: + # - .github/workflows/multiturn-sweep.yml + workflow_dispatch: + inputs: + run_name: + description: 'Custom run name (optional)' + required: false + default: '' + type: string + tp_values: + description: 'TP sizes (JSON array)' + required: true + default: '[1, 2, 4, 8]' + type: string + user_values: + description: 'Concurrent user counts (JSON array). Ignored if config_file is set.' + required: false + default: '[8, 16, 32, 64, 128, 256, 512, 1024, 2048]' + type: string + offload_values: + description: 'Offload modes (JSON array: on/off/noprefix). Ignored if config_file is set.' + required: false + default: '["on", "off", "noprefix"]' + type: string + config_file: + description: 'YAML config with per-TP sweep settings (e.g. .github/configs/multiturn-agentic-trace.yaml). Overrides tp/user/offload values.' + required: false + default: '' + type: string + config_key: + description: 'Top-level key in config_file to use (e.g. h200-fp8-llama70b, b200-fp4-dsr1). Required if config_file has multiple entries.' + required: false + default: '' + type: string + duration: + description: 'Benchmark duration in seconds (optional, runs to completion if omitted)' + required: false + default: '' + type: string + request_rate: + description: 'Request rate per client (Poisson, req/s). 0 = no delay.' 
+ required: false + default: '0' + type: string + total_cpu_dram_gb: + description: 'Total CPU DRAM for KV offload (GB)' + required: true + default: '100' + type: string + image: + description: 'Container image' + required: true + default: 'vllm/vllm-openai:v0.18.0' + type: string + model: + description: 'Model name' + required: true + default: 'nvidia/Llama-3.3-70B-Instruct-FP4' + type: string + precision: + description: 'Model precision (fp4, fp8, etc.) — used to select benchmark script' + required: false + default: 'fp4' + type: string + script_suffix: + description: 'Suffix for benchmark script (e.g. "_lmcache" → multiturn_fp4_b200_lmcache.sh)' + required: false + default: '' + type: string + runner: + description: 'Runner label (e.g. b200, h200-dgxc-slurm)' + required: false + default: 'b200' + type: string + ep: + description: 'Expert parallelism size (for MoE models, default 0 = disabled)' + required: false + default: '0' + type: string + ref: + description: 'Git ref (branch/sha) to checkout' + required: false + type: string + +jobs: + # --------------------------------------------------------------------------- + # Generate matrix from config file or CLI inputs + # --------------------------------------------------------------------------- + generate-matrix: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.gen.outputs.matrix }} + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + if: ${{ inputs.config_file != '' }} + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 1 + ref: ${{ inputs.ref || github.ref }} + sparse-checkout: ${{ inputs.config_file }} + + - id: gen + run: | + pip install -q pyyaml + python3 << 'PYEOF' + import json, os, sys + + config_file = "${{ inputs.config_file }}".strip() + + if config_file: + import yaml + with open(config_file) as f: + full_config = yaml.safe_load(f) + + config_key = "${{ inputs.config_key }}".strip() + + # If config_key specified, use that section; otherwise auto-detect + if config_key and config_key in full_config: + config = full_config[config_key] + elif config_key: + print(f"ERROR: config_key '{config_key}' not found. Available: {list(full_config.keys())}") + sys.exit(1) + elif len(full_config) == 1: + config = next(iter(full_config.values())) + else: + # Check if top-level keys look like tp entries (tp2, tp4, etc.) + if all(k.startswith("tp") for k in full_config): + config = full_config + else: + print(f"ERROR: Multiple entries in config, specify --config_key. 
Available: {list(full_config.keys())}") + sys.exit(1) + + includes = [] + for key, settings in config.items(): + tp = int(key.replace("tp", "")) + users = settings.get("users", []) + offloads = settings.get("offload", ["on", "off"]) + ep = settings.get("ep", 0) + for u in users: + for o in offloads: + entry = {"tp": tp, "users": u, "offload": o} + if ep > 0: + entry["ep"] = ep + includes.append(entry) + else: + tp_values = json.loads('${{ inputs.tp_values }}') + user_values = json.loads('${{ inputs.user_values }}') + offload_values = json.loads('${{ inputs.offload_values }}') + includes = [] + for tp in tp_values: + for u in user_values: + for o in offload_values: + includes.append({"tp": tp, "users": u, "offload": o}) + + matrix = {"include": includes} + print(f"Generated {len(includes)} matrix entries") + with open(os.environ["GITHUB_OUTPUT"], "a") as f: + f.write(f"matrix={json.dumps(matrix)}\n") + PYEOF + + # --------------------------------------------------------------------------- + # Matrix benchmark jobs — each cell calls the multiturn template + # --------------------------------------------------------------------------- + sweep: + needs: generate-matrix + uses: ./.github/workflows/benchmark-multiturn-tmpl.yml + name: sweep / + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }} + secrets: inherit + with: + runner: ${{ inputs.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + precision: ${{ inputs.precision }} + exp-name: "multiturn_tp${{ matrix.tp }}_users${{ matrix.users }}_offload${{ matrix.offload }}" + tp: "${{ matrix.tp }}" + users: "${{ matrix.users }}" + offload-mode: ${{ matrix.offload }} + duration: ${{ inputs.duration }} + request-rate: ${{ inputs.request_rate }} + total-cpu-dram-gb: ${{ inputs.total_cpu_dram_gb }} + script-suffix: ${{ inputs.script_suffix }} + ep: "${{ matrix.ep || inputs.ep }}" + ref: ${{ inputs.ref }} + + # --------------------------------------------------------------------------- + # Collect & aggregate results + # --------------------------------------------------------------------------- + collect: + runs-on: ubuntu-latest + needs: sweep + if: always() + name: Collect results + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 1 + ref: ${{ inputs.ref || github.ref }} + + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: pip install pandas matplotlib numpy + + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + pattern: 'multiturn_*' + path: results/ + + - name: Run aggregation + run: | + python experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py results/ aggregated/ + + - name: Upload aggregated results + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 + with: + name: multiturn_aggregated + path: aggregated/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..c45593c07 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "experimental/multiturn/vllm_benchmark/aiperf"] + path = experimental/multiturn/vllm_benchmark/aiperf + url = https://github.com/cquil11/aiperf.git +[submodule "experimental/multiturn/vllm_benchmark/kv-cache-tester"] + path = experimental/multiturn/vllm_benchmark/kv-cache-tester + url = https://github.com/cquil11/kv-cache-tester.git diff --git a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh 
b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh new file mode 100755 index 000000000..d22448892 --- /dev/null +++ b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh @@ -0,0 +1,210 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Trace replay benchmark for FP4 models on B200. +# Replays real agentic coding traces at a fixed number of concurrent users. +# Uses kv-cache-tester/trace_replay_tester.py with realistic cache patterns. +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (default 1800, benchmark duration in seconds) +# MAX_DELAY (default 60, max gap between requests in seconds) +# ADVANCE_MIN (default 0.0, min trace advancement fraction) +# ADVANCE_MAX (default 0.7, max trace advancement fraction) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" +TRACE_DIR="$KV_CACHE_TESTER_DIR/traces" + +pip install --quiet urllib3 requests 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == 
RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +# cat > "$RESULT_DIR/config.yaml" << 'EOF' +# kv-cache-dtype: fp8 +# async-scheduling: true +# max-num-batched-tokens: 8192 +# EOF + +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}' +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB +# max_seqs=$USERS + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +# VLLM_CMD+=" --max-num-seqs $max_seqs" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" +if [ "${EP_SIZE:-0}" -gt 1 ]; then + VLLM_CMD+=" --enable-expert-parallel" +fi + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" +pip install -q -r "$KV_CACHE_TESTER_DIR/requirements.txt" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! 
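+# Illustrative sanity check (not part of the benchmark flow): the collector
+# above scrapes vLLM's Prometheus endpoint, which can also be inspected
+# manually while the run is live; exact metric names vary by vLLM version.
+#   curl -s "http://localhost:$PORT/metrics" | grep '^vllm:' | head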
+echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run trace replay benchmark --------------------------------------------- +REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" +REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" +REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" +REPLAY_CMD+=" --start-users $USERS" +REPLAY_CMD+=" --max-users $USERS" +REPLAY_CMD+=" --max-ttft 9999" +REPLAY_CMD+=" --test-duration $DURATION" +REPLAY_CMD+=" --recycle" +REPLAY_CMD+=" --max-delay $MAX_DELAY" +REPLAY_CMD+=" --max-concurrent-requests 0" +REPLAY_CMD+=" --max-new-tokens-per-period 999999999" +REPLAY_CMD+=" --advance-min $ADVANCE_MIN" +REPLAY_CMD+=" --advance-max $ADVANCE_MAX" +REPLAY_CMD+=" --seed 42" +REPLAY_CMD+=" --no-color" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." +kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" diff --git a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh new file mode 100755 index 000000000..f3f967a82 --- /dev/null +++ b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh @@ -0,0 +1,206 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Trace replay benchmark for FP8 models on H200. +# Replays real agentic coding traces at a fixed number of concurrent users. +# Uses kv-cache-tester/trace_replay_tester.py with realistic cache patterns. 
+# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (default 1800, benchmark duration in seconds) +# MAX_DELAY (default 60, max gap between requests in seconds) +# ADVANCE_MIN (default 0.0, min trace advancement fraction) +# ADVANCE_MAX (default 0.7, max trace advancement fraction) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" +TRACE_DIR="$KV_CACHE_TESTER_DIR/traces" + +pip install --quiet urllib3 requests 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config 
-------------------------------------------------- +# cat > "$RESULT_DIR/config.yaml" << 'EOF' +# kv-cache-dtype: fp8 +# async-scheduling: true +# max-num-batched-tokens: 8192 +# EOF + +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB +# max_seqs=$USERS + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +# VLLM_CMD+=" --max-num-seqs $max_seqs" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" +pip install -q -r "$KV_CACHE_TESTER_DIR/requirements.txt" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! +echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run trace replay benchmark --------------------------------------------- +REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" +REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" +REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" +REPLAY_CMD+=" --start-users $USERS" +REPLAY_CMD+=" --max-users $USERS" +REPLAY_CMD+=" --max-ttft 9999" +REPLAY_CMD+=" --test-duration $DURATION" +REPLAY_CMD+=" --recycle" +REPLAY_CMD+=" --max-delay $MAX_DELAY" +REPLAY_CMD+=" --max-concurrent-requests 0" +REPLAY_CMD+=" --max-new-tokens-per-period 999999999" +REPLAY_CMD+=" --advance-min $ADVANCE_MIN" +REPLAY_CMD+=" --advance-max $ADVANCE_MAX" +REPLAY_CMD+=" --seed 42" +REPLAY_CMD+=" --no-color" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." 
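+# TERM (rather than KILL) and wait, so the collector gets a chance to finish
+# writing its metrics_*.csv outputs before the server is torn down.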
+if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." +kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" diff --git a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh new file mode 100755 index 000000000..4cf20c453 --- /dev/null +++ b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh @@ -0,0 +1,207 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Trace replay benchmark for FP8 models on MI355X. +# Replays real agentic coding traces at a fixed number of concurrent users. +# Uses kv-cache-tester/trace_replay_tester.py with realistic cache patterns. +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (default 1800, benchmark duration in seconds) +# MAX_DELAY (default 60, max gap between requests in seconds) +# ADVANCE_MIN (default 0.0, min trace advancement fraction) +# ADVANCE_MAX (default 0.7, max trace advancement fraction) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi 2>/dev/null || rocm-smi 2>/dev/null || true + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" +TRACE_DIR="$KV_CACHE_TESTER_DIR/traces" + +pip install --quiet urllib3 requests 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM 
scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +# cat > "$RESULT_DIR/config.yaml" << 'EOF' +# kv-cache-dtype: fp8 +# async-scheduling: true +# max-num-batched-tokens: 8192 +# EOF + +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB +# max_seqs=$USERS + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +# VLLM_CMD+=" --max-num-seqs $max_seqs" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +# MI355X is ROCm — no CUDA arch needed +# export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" +pip install -q -r "$KV_CACHE_TESTER_DIR/requirements.txt" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! 
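+# Note on the replay below: --start-users and --max-users are pinned to the
+# same value, so concurrency stays fixed at $USERS for the whole run, and a
+# fixed --seed keeps repeated runs comparable across offload modes.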
+echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run trace replay benchmark --------------------------------------------- +REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" +REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" +REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" +REPLAY_CMD+=" --start-users $USERS" +REPLAY_CMD+=" --max-users $USERS" +REPLAY_CMD+=" --max-ttft 9999" +REPLAY_CMD+=" --test-duration $DURATION" +REPLAY_CMD+=" --recycle" +REPLAY_CMD+=" --max-delay $MAX_DELAY" +REPLAY_CMD+=" --max-concurrent-requests 0" +REPLAY_CMD+=" --max-new-tokens-per-period 999999999" +REPLAY_CMD+=" --advance-min $ADVANCE_MIN" +REPLAY_CMD+=" --advance-max $ADVANCE_MAX" +REPLAY_CMD+=" --seed 42" +REPLAY_CMD+=" --no-color" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." +kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" diff --git a/experimental/multiturn/vllm_benchmark/.gitignore b/experimental/multiturn/vllm_benchmark/.gitignore new file mode 100644 index 000000000..a0c3ca327 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/.gitignore @@ -0,0 +1,4 @@ +*.png +*.json +*.parquet +results/ \ No newline at end of file diff --git a/experimental/multiturn/vllm_benchmark/aiperf b/experimental/multiturn/vllm_benchmark/aiperf new file mode 160000 index 000000000..373218fb3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/aiperf @@ -0,0 +1 @@ +Subproject commit 373218fb3c3d15fada9c4be6465daf8fb5a70ef6 diff --git a/experimental/multiturn/vllm_benchmark/analysis/__init__.py b/experimental/multiturn/vllm_benchmark/analysis/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py new file mode 100644 index 000000000..277bfca7f --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py @@ -0,0 +1,1247 @@ +#!/usr/bin/env python3 +""" +Plot Pareto frontiers for prefix caching modes. +Modes: on (prefix + offload), off (prefix only), noprefix (no prefix caching) +Pareto frontier: throughput vs latency trade-off. 
+ +Usage: + python plot_pareto.py + python plot_pareto.py ~/sweep_results_20260204_062339 +""" + +import json +import sys +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +from pathlib import Path + + +def _load_aiperf_jsonl(jsonl_path: Path) -> pd.DataFrame | None: + """Load per-request metrics from aiperf profile_export JSONL.""" + records = [] + with open(jsonl_path) as f: + for line in f: + line = line.strip() + if not line: + continue + entry = json.loads(line) + meta = entry.get("metadata", {}) + metrics = entry.get("metrics", {}) + + if meta.get("benchmark_phase") != "profiling": + continue + if meta.get("was_cancelled", False): + continue + + def val(key, default=0): + m = metrics.get(key) + if m is None: + return default + return m.get("value", default) if isinstance(m, dict) else m + + itl = metrics.get("inter_token_latency") + if itl and isinstance(itl, dict): + tpot_ms = itl.get("value", 0) + else: + osl = val("output_sequence_length", 1) + ttft = val("time_to_first_token", 0) + latency = val("request_latency", 0) + tpot_ms = (latency - ttft) / max(osl - 1, 1) if osl > 1 else 0 + + start_ns = meta.get("request_start_ns", 0) + start_ms = start_ns / 1e6 + + records.append({ + "start_time_ms": start_ms, + "ttft_ms": val("time_to_first_token"), + "tpot_ms": tpot_ms, + "latency_ms": val("request_latency"), + "input_num_tokens": val("input_sequence_length"), + "output_num_tokens": val("output_sequence_length"), + }) + + if not records: + return None + return pd.DataFrame(records) + + +def _load_trace_replay_csv(csv_path: Path) -> pd.DataFrame | None: + """Load per-request metrics from trace_replay detailed_results.csv.""" + df = pd.read_csv(csv_path) + if len(df) == 0: + return None + + # Filter to successful requests only + df = df[df["success"] == True].copy() + if len(df) == 0: + return None + + # Convert to the same schema as _load_aiperf_jsonl + latency_s = df["request_complete_time"] - df["request_start_time"] + records = pd.DataFrame({ + "start_time_ms": df["request_start_time"] * 1000, + "ttft_ms": df["ttft"] * 1000, + "tpot_ms": df["itl"] * 1000, + "latency_ms": latency_s * 1000, + "input_num_tokens": df["input_tokens"], + "output_num_tokens": df["output_tokens_actual"], + }) + return records + + +def load_experiment_data(exp_dir: Path) -> dict | None: + """Load and aggregate metrics from an experiment directory.""" + client_metrics_file = exp_dir / "metrics_client_metrics.csv" + server_metrics_file = exp_dir / "metrics_server_metrics.csv" + status_file = exp_dir / "status.txt" + + # Check if experiment completed successfully + if not status_file.exists(): + return None + status = status_file.read_text().strip() + if status != "SUCCESS": + return None + + # Also check for aiperf output + aiperf_jsonl = None + aiperf_artifacts = exp_dir / "aiperf_artifacts" + if aiperf_artifacts.exists(): + candidates = list(aiperf_artifacts.glob("profile_export_aiperf.jsonl")) + if not candidates: + candidates = list(aiperf_artifacts.glob("profile_export*.jsonl")) + if candidates: + aiperf_jsonl = candidates[0] + + # Check for trace replay output + trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" + + if not client_metrics_file.exists() and aiperf_jsonl is None and not trace_replay_csv.exists(): + return None + + try: + if client_metrics_file.exists(): + df = pd.read_csv(client_metrics_file) + elif aiperf_jsonl is not None: + df = _load_aiperf_jsonl(aiperf_jsonl) + elif trace_replay_csv.exists(): + df = _load_trace_replay_csv(trace_replay_csv) + 
else: + return None + + # Load server metrics for cache hit rates + gpu_hit_rate = None + cpu_hit_rate = None + if server_metrics_file.exists(): + server_df = pd.read_csv(server_metrics_file) + # Get final cumulative values + final_row = server_df.iloc[-1] + if final_row["prefix_cache_queries"] > 0: + gpu_hit_rate = 100 * final_row["prefix_cache_hits"] / final_row["prefix_cache_queries"] + if final_row["cpu_prefix_cache_queries"] > 0: + cpu_hit_rate = 100 * final_row["cpu_prefix_cache_hits"] / final_row["cpu_prefix_cache_queries"] + if len(df) == 0: + return None + + # Parse experiment name: tp{N}_bs{M}_offload{on|off} + exp_name = exp_dir.name + parts = exp_name.split("_") + tp = int(parts[0].replace("tp", "")) + bs = int(parts[1].replace("bs", "")) + offload = parts[2].replace("offload", "") + + # Calculate metrics + # Prefer benchmark_metadata.json for precise wall-clock duration + metadata_file = exp_dir / "benchmark_metadata.json" + total_time_sec = None + if metadata_file.exists(): + try: + with open(metadata_file) as f: + metadata = json.load(f) + total_time_sec = metadata.get("benchmark_runtime_sec") + except Exception: + pass + + # Fallback: derive from per-request data (first start to last finish) + if not total_time_sec or total_time_sec <= 0: + first_start_ms = df["start_time_ms"].min() + last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() + total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 + if total_time_sec <= 0: + total_time_sec = df["latency_ms"].sum() / 1000 # fallback + + num_requests = len(df) + throughput_rps = num_requests / total_time_sec if total_time_sec > 0 else 0 + + # Input token throughput (prefill) + total_input_tokens = df["input_num_tokens"].sum() + input_throughput_tps = total_input_tokens / total_time_sec if total_time_sec > 0 else 0 + + # Output token throughput (decode only) + total_output_tokens = df["output_num_tokens"].sum() + output_throughput_tps = total_output_tokens / total_time_sec if total_time_sec > 0 else 0 + + # Total token throughput (input + output) + total_tokens = total_input_tokens + total_output_tokens + total_throughput_tps = total_tokens / total_time_sec if total_time_sec > 0 else 0 + + # Normalized throughput (per GPU) + input_tps_per_gpu = input_throughput_tps / tp + output_tps_per_gpu = output_throughput_tps / tp + total_tps_per_gpu = total_throughput_tps / tp + + return { + "exp_name": exp_name, + "tp": tp, + "bs": bs, + "offload": offload, + "num_requests": num_requests, + "throughput_rps": throughput_rps, + "input_throughput_tps": input_throughput_tps, + "total_throughput_tps": total_throughput_tps, + "input_tps_per_gpu": input_tps_per_gpu, + "output_tps_per_gpu": output_tps_per_gpu, + "total_tps_per_gpu": total_tps_per_gpu, + "mean_ttft_ms": df["ttft_ms"].mean(), + "p50_ttft_ms": df["ttft_ms"].median(), + "p90_ttft_ms": df["ttft_ms"].quantile(0.9), + "p99_ttft_ms": df["ttft_ms"].quantile(0.99), + "mean_tpot_ms": df["tpot_ms"].mean(), + "p50_tpot_ms": df["tpot_ms"].median(), + "p90_tpot_ms": df["tpot_ms"].quantile(0.9), + "p99_tpot_ms": df["tpot_ms"].quantile(0.99), + "p999_tpot_ms": df["tpot_ms"].quantile(0.999), + "mean_latency_ms": df["latency_ms"].mean(), + "p50_latency_ms": df["latency_ms"].median(), + "p90_latency_ms": df["latency_ms"].quantile(0.9), + "p99_latency_ms": df["latency_ms"].quantile(0.99), + "p999_latency_ms": df["latency_ms"].quantile(0.999), + "p999_ttft_ms": df["ttft_ms"].quantile(0.999), + # Cache hit rates + "gpu_hit_rate": gpu_hit_rate, + "cpu_hit_rate": cpu_hit_rate, + } + except 
Exception as e: + print(f"Error loading {exp_dir}: {e}") + return None + + +def compute_pareto_frontier(points: list[tuple[float, float]], maximize_x: bool = False) -> list[tuple[float, float]]: + """ + Compute Pareto frontier for (x, y) points. + Y is always maximized. X is minimized by default, or maximized if maximize_x=True. + + For minimize X, maximize Y (e.g., latency vs throughput): + - Frontier goes bottom-left to top-right + - Low latency = low throughput, high latency = high throughput + + For maximize X, maximize Y (e.g., interactivity vs throughput): + - Frontier goes top-left to bottom-right + - Trade-off between the two "goods" + + Returns points sorted by X ascending for plotting. + """ + if not points: + return [] + + # Remove invalid points + points = [(x, y) for x, y in points if x > 0 and y > 0] + if not points: + return [] + + frontier = [] + sorted_points = sorted(points, key=lambda p: p[0]) + + if maximize_x: + # Maximize both X and Y: frontier goes top-left to bottom-right + # Traverse from high X to low X, keep points with increasing Y + max_y = float('-inf') + for x, y in reversed(sorted_points): + if y > max_y: + frontier.append((x, y)) + max_y = y + return sorted(frontier, key=lambda p: p[0]) + else: + # Minimize X, maximize Y: frontier goes bottom-left to top-right + # Traverse from low X to high X, keep points with increasing Y + max_y = float('-inf') + for x, y in sorted_points: + if y > max_y: + frontier.append((x, y)) + max_y = y + return frontier + + +def compute_pareto_frontier_with_metadata(df_subset: pd.DataFrame, x_col: str, y_col: str, maximize_x: bool = False) -> pd.DataFrame: + """ + Compute Pareto frontier and return the rows from the dataframe that are on the frontier. + """ + if len(df_subset) == 0: + return pd.DataFrame() + + # Get valid points + valid_mask = (df_subset[x_col] > 0) & (df_subset[y_col] > 0) + df_valid = df_subset[valid_mask].copy() + + if len(df_valid) == 0: + return pd.DataFrame() + + # Sort by x + df_sorted = df_valid.sort_values(x_col).reset_index(drop=True) + + frontier_indices = [] + max_y = float('-inf') + + if maximize_x: + # Traverse from high X to low X + for i in range(len(df_sorted) - 1, -1, -1): + y = df_sorted.iloc[i][y_col] + if y > max_y: + frontier_indices.append(i) + max_y = y + frontier_indices = frontier_indices[::-1] # Reverse to get ascending X order + else: + # Traverse from low X to high X + for i in range(len(df_sorted)): + y = df_sorted.iloc[i][y_col] + if y > max_y: + frontier_indices.append(i) + max_y = y + + return df_sorted.iloc[frontier_indices] + + +def generate_pareto_only_figure(df: pd.DataFrame, results_dir: Path): + """Generate a clean figure showing only Pareto frontier points with concurrency labels.""" + + # Compute interactivity + df = df.copy() + df["interactivity"] = 1000.0 / df["p50_tpot_ms"] + + # Get available modes and create subsets + available_modes = sorted(df["offload"].unique()) + mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"} + df_subsets = {mode: df[df["offload"] == mode] for mode in available_modes} + + # Create figure with columns for each mode + num_cols = len(available_modes) + fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18)) + fig.suptitle("Pareto Frontiers Only (with Concurrency Labels)", fontsize=14) + + # Handle single column case + if num_cols == 1: + axes = axes.reshape(-1, 1) + + # Color by TP + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + # Metrics 
configs: (row, x_col, y_col, metric_name, x_label, y_label, maximize_x) + metrics_configs = [ + (0, "p50_ttft_ms", "input_tps_per_gpu", "TTFT", "Median TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p50_latency_ms", "total_tps_per_gpu", "E2E Latency", "Median E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/TPOT)", "Output Throughput/GPU (tok/s)", True), + (3, "interactivity", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in metrics_configs: + for col, mode in enumerate(available_modes): + ax = axes[row, col] + df_subset = df_subsets[mode] + title = f"{metric_name} ({mode_titles.get(mode, mode)})" + + # Get Pareto frontier points with metadata + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + # Plot frontier line + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle='-', linewidth=2, alpha=0.5, color="black") + + # Plot points colored by TP + for tp in sorted(frontier_df["tp"].unique()): + tp_data = frontier_df[frontier_df["tp"] == tp] + ax.scatter(tp_data[x_col], tp_data[y_col], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=150, alpha=0.9, edgecolors="black", linewidths=1, + label=f"TP={tp}", zorder=5) + + # Add concurrency labels + for _, point in frontier_df.iterrows(): + ax.annotate(f"conc={point['bs']}", + (point[x_col], point[y_col]), + textcoords="offset points", + xytext=(5, 5), + fontsize=8, + alpha=0.8) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(title) + ax.grid(True, alpha=0.3) + if len(frontier_df) > 0: + ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() + + output_file = results_dir / "pareto_frontiers_clean.png" + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"Saved clean Pareto plot to {output_file}") + plt.close() + + +def generate_pareto_only_figure_p90(df: pd.DataFrame, results_dir: Path): + """Generate a clean figure showing only Pareto frontier points with p90 latencies.""" + + df = df.copy() + df["interactivity_p90"] = 1000.0 / df["p90_tpot_ms"] + + available_modes = sorted(df["offload"].unique()) + mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"} + df_subsets = {mode: df[df["offload"] == mode] for mode in available_modes} + + num_cols = len(available_modes) + fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18)) + fig.suptitle("Pareto Frontiers (P90 Latencies) with Concurrency Labels", fontsize=14) + + if num_cols == 1: + axes = axes.reshape(-1, 1) + + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + metrics_configs = [ + (0, "p90_ttft_ms", "input_tps_per_gpu", "TTFT", "P90 TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity_p90", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/P90 TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p90_latency_ms", "total_tps_per_gpu", "E2E Latency", "P90 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity_p90", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/P90 TPOT)", "Output 
Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in metrics_configs: + for col, mode in enumerate(available_modes): + ax = axes[row, col] + df_subset = df_subsets[mode] + title = f"{metric_name} ({mode_titles.get(mode, mode)})" + + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle='-', linewidth=2, alpha=0.5, color="black") + + for tp in sorted(frontier_df["tp"].unique()): + tp_data = frontier_df[frontier_df["tp"] == tp] + ax.scatter(tp_data[x_col], tp_data[y_col], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=150, alpha=0.9, edgecolors="black", linewidths=1, + label=f"TP={tp}", zorder=5) + + for _, point in frontier_df.iterrows(): + ax.annotate(f"conc={point['bs']}", + (point[x_col], point[y_col]), + textcoords="offset points", + xytext=(5, 5), + fontsize=8, + alpha=0.8) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(title) + ax.grid(True, alpha=0.3) + if len(frontier_df) > 0: + ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() + + output_file = results_dir / "pareto_frontiers_clean_p90.png" + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"Saved clean P90 Pareto plot to {output_file}") + plt.close() + + +def generate_pareto_overlay_figure_p90(df: pd.DataFrame, results_dir: Path): + """Generate a figure with all prefix cache modes overlaid using p90 latencies.""" + + df = df.copy() + df["interactivity_p90"] = 1000.0 / df["p90_tpot_ms"] + + available_modes = df["offload"].unique() + + mode_styles = { + "on": ("-", "black", "black", (5, 8), "normal"), + "off": ("--", "none", "gray", (5, -12), "italic"), + "noprefix": (":", "red", "red", (5, -25), "oblique"), + } + mode_labels = { + "on": "Prefix+Offload", + "off": "Prefix Only", + "noprefix": "No Prefix", + } + + fig, axes = plt.subplots(4, 1, figsize=(10, 18)) + fig.suptitle("Pareto Frontiers (P90 Latencies): Mode Comparison", fontsize=14) + + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + plot_configs = [ + (0, "p90_ttft_ms", "input_tps_per_gpu", "TTFT vs Input Throughput/GPU", "P90 TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity_p90", "total_tps_per_gpu", "Interactivity vs Total Throughput/GPU", "Interactivity (1000/P90 TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p90_latency_ms", "total_tps_per_gpu", "E2E Latency vs Total Throughput/GPU", "P90 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity_p90", "output_tps_per_gpu", "Output Throughput vs Interactivity", "Interactivity (1000/P90 TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, title, x_label, y_label, maximize_x in plot_configs: + ax = axes[row] + + for mode in ["on", "off", "noprefix"]: + if mode not in available_modes: + continue + + df_subset = df[df["offload"] == mode] + linestyle, marker_edge, line_color, label_offset, font_style = mode_styles[mode] + + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle=linestyle, linewidth=2, alpha=0.6, color=line_color, + label=f"Pareto ({mode_labels[mode]})") + + for tp in sorted(frontier_df["tp"].unique()): + tp_data = frontier_df[frontier_df["tp"] == tp] + label = 
f"TP={tp}" if mode == "on" else None + ax.scatter(tp_data[x_col], tp_data[y_col], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=150, alpha=0.9, edgecolors=marker_edge, linewidths=1.5, + label=label, zorder=5) + + for _, point in frontier_df.iterrows(): + ax.annotate(f"conc={point['bs']}", + (point[x_col], point[y_col]), + textcoords="offset points", + xytext=label_offset, + fontsize=7, + alpha=0.7, + style=font_style) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(title) + ax.grid(True, alpha=0.3) + ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() + + output_file = results_dir / "pareto_frontiers_overlay_p90.png" + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"Saved overlay P90 Pareto plot to {output_file}") + plt.close() + + +def generate_pareto_only_figure_p99(df: pd.DataFrame, results_dir: Path): + """Generate a clean figure showing only Pareto frontier points with p99 latencies.""" + + # Compute interactivity using p99 + df = df.copy() + df["interactivity_p99"] = 1000.0 / df["p99_tpot_ms"] + + # Get available modes and create subsets + available_modes = sorted(df["offload"].unique()) + mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"} + df_subsets = {mode: df[df["offload"] == mode] for mode in available_modes} + + # Create figure with columns for each mode + num_cols = len(available_modes) + fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18)) + fig.suptitle("Pareto Frontiers (P99 Latencies) with Concurrency Labels", fontsize=14) + + # Handle single column case + if num_cols == 1: + axes = axes.reshape(-1, 1) + + # Color by TP + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + # Metrics configs: (row, x_col, y_col, metric_name, x_label, y_label, maximize_x) + metrics_configs = [ + (0, "p99_ttft_ms", "input_tps_per_gpu", "TTFT", "P99 TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity_p99", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/P99 TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p99_latency_ms", "total_tps_per_gpu", "E2E Latency", "P99 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity_p99", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/P99 TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in metrics_configs: + for col, mode in enumerate(available_modes): + ax = axes[row, col] + df_subset = df_subsets[mode] + title = f"{metric_name} ({mode_titles.get(mode, mode)})" + + # Get Pareto frontier points with metadata + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + # Plot frontier line + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle='-', linewidth=2, alpha=0.5, color="black") + + # Plot points colored by TP + for tp in sorted(frontier_df["tp"].unique()): + tp_data = frontier_df[frontier_df["tp"] == tp] + ax.scatter(tp_data[x_col], tp_data[y_col], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=150, alpha=0.9, edgecolors="black", linewidths=1, + label=f"TP={tp}", zorder=5) + + # Add concurrency labels + for _, point in frontier_df.iterrows(): + ax.annotate(f"conc={point['bs']}", + (point[x_col], point[y_col]), + textcoords="offset points", + xytext=(5, 5), + fontsize=8, + alpha=0.8) + + 
ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(title) + ax.grid(True, alpha=0.3) + if len(frontier_df) > 0: + ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() + + output_file = results_dir / "pareto_frontiers_clean_p99.png" + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"Saved clean P99 Pareto plot to {output_file}") + plt.close() + + +def generate_pareto_overlay_figure_p99(df: pd.DataFrame, results_dir: Path): + """Generate a figure with all prefix cache modes overlaid using p99 latencies.""" + + # Compute interactivity using p99 + df = df.copy() + df["interactivity_p99"] = 1000.0 / df["p99_tpot_ms"] + + # Get available modes + available_modes = df["offload"].unique() + + # Mode styles + mode_styles = { + "on": ("-", "black", "black", (5, 8), "normal"), + "off": ("--", "none", "gray", (5, -12), "italic"), + "noprefix": (":", "red", "red", (5, -25), "oblique"), + } + mode_labels = { + "on": "Prefix+Offload", + "off": "Prefix Only", + "noprefix": "No Prefix", + } + + # Create 4x1 figure + fig, axes = plt.subplots(4, 1, figsize=(10, 18)) + fig.suptitle("Pareto Frontiers (P99 Latencies): Mode Comparison", fontsize=14) + + # Color by TP + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + # Plot configs + plot_configs = [ + (0, "p99_ttft_ms", "input_tps_per_gpu", "TTFT vs Input Throughput/GPU", "P99 TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity_p99", "total_tps_per_gpu", "Interactivity vs Total Throughput/GPU", "Interactivity (1000/P99 TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p99_latency_ms", "total_tps_per_gpu", "E2E Latency vs Total Throughput/GPU", "P99 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity_p99", "output_tps_per_gpu", "Output Throughput vs Interactivity", "Interactivity (1000/P99 TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, title, x_label, y_label, maximize_x in plot_configs: + ax = axes[row] + + for mode in ["on", "off", "noprefix"]: + if mode not in available_modes: + continue + + df_subset = df[df["offload"] == mode] + linestyle, marker_edge, line_color, label_offset, font_style = mode_styles[mode] + + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle=linestyle, linewidth=2, alpha=0.6, color=line_color, + label=f"Pareto ({mode_labels[mode]})") + + for tp in sorted(frontier_df["tp"].unique()): + tp_data = frontier_df[frontier_df["tp"] == tp] + label = f"TP={tp}" if mode == "on" else None + ax.scatter(tp_data[x_col], tp_data[y_col], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=150, alpha=0.9, edgecolors=marker_edge, linewidths=1.5, + label=label, zorder=5) + + for _, point in frontier_df.iterrows(): + ax.annotate(f"conc={point['bs']}", + (point[x_col], point[y_col]), + textcoords="offset points", + xytext=label_offset, + fontsize=7, + alpha=0.7, + style=font_style) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(title) + ax.grid(True, alpha=0.3) + ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() + + output_file = results_dir / "pareto_frontiers_overlay_p99.png" + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"Saved overlay P99 Pareto plot to {output_file}") + plt.close() + + +def 
generate_pareto_only_figure_p999(df: pd.DataFrame, results_dir: Path): + """Generate a clean figure showing only Pareto frontier points with p99.9 latencies.""" + + df = df.copy() + df["interactivity_p999"] = 1000.0 / df["p999_tpot_ms"] + + available_modes = sorted(df["offload"].unique()) + mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"} + df_subsets = {mode: df[df["offload"] == mode] for mode in available_modes} + + num_cols = len(available_modes) + fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18)) + fig.suptitle("Pareto Frontiers (P99.9 Latencies) with Concurrency Labels", fontsize=14) + + if num_cols == 1: + axes = axes.reshape(-1, 1) + + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + metrics_configs = [ + (0, "p999_ttft_ms", "input_tps_per_gpu", "TTFT", "P99.9 TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity_p999", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/P99.9 TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p999_latency_ms", "total_tps_per_gpu", "E2E Latency", "P99.9 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity_p999", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/P99.9 TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in metrics_configs: + for col, mode in enumerate(available_modes): + ax = axes[row, col] + df_subset = df_subsets[mode] + title = f"{metric_name} ({mode_titles.get(mode, mode)})" + + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle='-', linewidth=2, alpha=0.5, color="black") + + for tp in sorted(frontier_df["tp"].unique()): + tp_data = frontier_df[frontier_df["tp"] == tp] + ax.scatter(tp_data[x_col], tp_data[y_col], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=150, alpha=0.9, edgecolors="black", linewidths=1, + label=f"TP={tp}", zorder=5) + + for _, point in frontier_df.iterrows(): + ax.annotate(f"conc={point['bs']}", + (point[x_col], point[y_col]), + textcoords="offset points", + xytext=(5, 5), + fontsize=8, + alpha=0.8) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(title) + ax.grid(True, alpha=0.3) + if len(frontier_df) > 0: + ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() + + output_file = results_dir / "pareto_frontiers_clean_p999.png" + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"Saved clean P99.9 Pareto plot to {output_file}") + plt.close() + + +def generate_pareto_overlay_figure_p999(df: pd.DataFrame, results_dir: Path): + """Generate a figure with all prefix cache modes overlaid using p99.9 latencies.""" + + df = df.copy() + df["interactivity_p999"] = 1000.0 / df["p999_tpot_ms"] + + available_modes = df["offload"].unique() + + mode_styles = { + "on": ("-", "black", "black", (5, 8), "normal"), + "off": ("--", "none", "gray", (5, -12), "italic"), + "noprefix": (":", "red", "red", (5, -25), "oblique"), + } + mode_labels = { + "on": "Prefix+Offload", + "off": "Prefix Only", + "noprefix": "No Prefix", + } + + fig, axes = plt.subplots(4, 1, figsize=(10, 18)) + fig.suptitle("Pareto Frontiers (P99.9 Latencies): Mode Comparison", fontsize=14) + + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: 
"^", 8: "D"} + + plot_configs = [ + (0, "p999_ttft_ms", "input_tps_per_gpu", "TTFT vs Input Throughput/GPU", "P99.9 TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity_p999", "total_tps_per_gpu", "Interactivity vs Total Throughput/GPU", "Interactivity (1000/P99.9 TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p999_latency_ms", "total_tps_per_gpu", "E2E Latency vs Total Throughput/GPU", "P99.9 E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity_p999", "output_tps_per_gpu", "Output Throughput vs Interactivity", "Interactivity (1000/P99.9 TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, title, x_label, y_label, maximize_x in plot_configs: + ax = axes[row] + + for mode in ["on", "off", "noprefix"]: + if mode not in available_modes: + continue + + df_subset = df[df["offload"] == mode] + linestyle, marker_edge, line_color, label_offset, font_style = mode_styles[mode] + + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle=linestyle, linewidth=2, alpha=0.6, color=line_color, + label=f"Pareto ({mode_labels[mode]})") + + for tp in sorted(frontier_df["tp"].unique()): + tp_data = frontier_df[frontier_df["tp"] == tp] + label = f"TP={tp}" if mode == "on" else None + ax.scatter(tp_data[x_col], tp_data[y_col], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=150, alpha=0.9, edgecolors=marker_edge, linewidths=1.5, + label=label, zorder=5) + + for _, point in frontier_df.iterrows(): + ax.annotate(f"conc={point['bs']}", + (point[x_col], point[y_col]), + textcoords="offset points", + xytext=label_offset, + fontsize=7, + alpha=0.7, + style=font_style) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(title) + ax.grid(True, alpha=0.3) + ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() + + output_file = results_dir / "pareto_frontiers_overlay_p999.png" + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"Saved overlay P99.9 Pareto plot to {output_file}") + plt.close() + + +def generate_combined_pareto_figure(df: pd.DataFrame, results_dir: Path, + percentile: str = "p50"): + """Generate a combined Pareto frontier across ALL offload modes. + + Points are colored by TP and edge-styled by offload mode so the viewer + can see both the overall optimal frontier and which config each point + comes from. + + percentile: one of "p50", "p90", "p99", "p999" + """ + from matplotlib.lines import Line2D + + pct = percentile # e.g. 
"p50" + pct_label = {"p50": "Median", "p90": "P90", "p99": "P99", "p999": "P99.9"}[pct] + suffix = f"_{pct}" + + df = df.copy() + interactivity_col = f"interactivity{suffix}" + df[interactivity_col] = 1000.0 / df[f"{pct}_tpot_ms"] + + fig, axes = plt.subplots(4, 1, figsize=(10, 18)) + fig.suptitle(f"Combined Pareto Frontier — {pct_label} SLA (All Configs)", fontsize=14) + + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + mode_edge = { + "on": {"edgecolors": "black", "linewidths": 1.8}, + "off": {"edgecolors": "gray", "linewidths": 1.2}, + "noprefix": {"edgecolors": "#cc0000", "linewidths": 1.2}, + } + mode_short = {"on": "P+O", "off": "P", "noprefix": "NP"} + + metrics_configs = [ + (0, f"{pct}_ttft_ms", "input_tps_per_gpu", "TTFT", f"{pct_label} TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, interactivity_col, "total_tps_per_gpu", "Interactivity", f"Interactivity (1000/{pct_label} TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, f"{pct}_latency_ms", "total_tps_per_gpu", "E2E Latency", f"{pct_label} E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, interactivity_col, "output_tps_per_gpu", "Output Throughput", f"Interactivity (1000/{pct_label} TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in metrics_configs: + ax = axes[row] + + # # All-data scatter (faded background) + # for tp in sorted(df["tp"].unique()): + # tp_data = df[df["tp"] == tp] + # ax.scatter(tp_data[x_col], tp_data[y_col], + # c=tp_colors.get(tp, "purple"), + # marker=tp_markers.get(tp, "x"), + # s=40, alpha=0.15, linewidths=0.3, + # edgecolors="gray") + + # Combined Pareto frontier + frontier_df = compute_pareto_frontier_with_metadata(df, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle='-', linewidth=2, alpha=0.5, color="black", + label="Pareto Frontier", zorder=4) + + for _, pt in frontier_df.iterrows(): + tp = pt["tp"] + mode = pt["offload"] + edge_kw = mode_edge.get(mode, {"edgecolors": "black", "linewidths": 1}) + ax.scatter(pt[x_col], pt[y_col], + c=tp_colors.get(tp, "purple"), + marker=tp_markers.get(tp, "x"), + s=160, alpha=0.9, zorder=5, + **edge_kw) + + for _, pt in frontier_df.iterrows(): + ax.annotate( + f"conc={int(pt['bs'])} {mode_short.get(pt['offload'], '')}", + (pt[x_col], pt[y_col]), + textcoords="offset points", xytext=(5, 5), + fontsize=7, alpha=0.85) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(f"{metric_name} — All Configs Combined") + ax.grid(True, alpha=0.3) + + handles = [Line2D([0], [0], color="black", lw=2, label="Pareto Frontier")] + for tp in sorted(df["tp"].unique()): + handles.append(Line2D([0], [0], marker=tp_markers[tp], color="w", + markerfacecolor=tp_colors[tp], markersize=8, + markeredgecolor="black", label=f"TP={tp}")) + handles.append(Line2D([0], [0], marker="o", color="w", markerfacecolor="w", + markersize=8, markeredgecolor="black", markeredgewidth=1.8, + label="Edge: P+Offload")) + handles.append(Line2D([0], [0], marker="o", color="w", markerfacecolor="w", + markersize=8, markeredgecolor="gray", markeredgewidth=1.2, + label="Edge: Prefix Only")) + handles.append(Line2D([0], [0], marker="o", color="w", markerfacecolor="w", + markersize=8, markeredgecolor="#cc0000", markeredgewidth=1.2, + label="Edge: No Prefix")) + ax.legend(handles=handles, fontsize=7, + loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() 
+ fname = f"pareto_frontiers_combined{suffix}.png" + output_file = results_dir / fname + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"Saved combined {pct_label} Pareto plot to {output_file}") + plt.close() + + +def generate_pareto_overlay_figure(df: pd.DataFrame, results_dir: Path): + """Generate a figure with all prefix cache modes overlaid for direct comparison.""" + + # Compute interactivity + df = df.copy() + df["interactivity"] = 1000.0 / df["p50_tpot_ms"] + + # Get available modes + available_modes = df["offload"].unique() + + # Mode styles: (linestyle, marker_edge, line_color, label_offset, font_style) + mode_styles = { + "on": ("-", "black", "black", (5, 8), "normal"), # Prefix + Offload + "off": ("--", "none", "gray", (5, -12), "italic"), # Prefix only + "noprefix": (":", "red", "red", (5, -25), "oblique"), # No prefix caching + } + mode_labels = { + "on": "Prefix+Offload", + "off": "Prefix Only", + "noprefix": "No Prefix", + } + + # Create 4x1 figure + fig, axes = plt.subplots(4, 1, figsize=(10, 18)) + fig.suptitle("Pareto Frontiers: Prefix Caching Mode Comparison", fontsize=14) + + # Color by TP + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + # Plot configs: (row, x_col, y_col, title, x_label, y_label, maximize_x) + plot_configs = [ + (0, "p50_ttft_ms", "input_tps_per_gpu", "TTFT vs Input Throughput/GPU", "Median TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity", "total_tps_per_gpu", "Interactivity vs Total Throughput/GPU", "Interactivity (1000/TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p50_latency_ms", "total_tps_per_gpu", "E2E Latency vs Total Throughput/GPU", "Median E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity", "output_tps_per_gpu", "Output Throughput vs Interactivity", "Interactivity (1000/TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, title, x_label, y_label, maximize_x in plot_configs: + ax = axes[row] + + # Plot all available modes + for mode in ["on", "off", "noprefix"]: + if mode not in available_modes: + continue + + df_subset = df[df["offload"] == mode] + linestyle, marker_edge, line_color, label_offset, font_style = mode_styles[mode] + + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + # Plot frontier line + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle=linestyle, linewidth=2, alpha=0.6, color=line_color, + label=f"Pareto ({mode_labels[mode]})") + + # Plot points colored by TP + for tp in sorted(frontier_df["tp"].unique()): + tp_data = frontier_df[frontier_df["tp"] == tp] + # Only add TP to legend once (for first mode) + label = f"TP={tp}" if mode == "on" else None + ax.scatter(tp_data[x_col], tp_data[y_col], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=150, alpha=0.9, edgecolors=marker_edge, linewidths=1.5, + label=label, zorder=5) + + # Add concurrency labels + for _, point in frontier_df.iterrows(): + ax.annotate(f"conc={point['bs']}", + (point[x_col], point[y_col]), + textcoords="offset points", + xytext=label_offset, + fontsize=7, + alpha=0.7, + style=font_style) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(title) + ax.grid(True, alpha=0.3) + ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() + + output_file = results_dir / "pareto_frontiers_overlay.png" + plt.savefig(output_file, dpi=150, 
bbox_inches='tight') + print(f"Saved overlay Pareto plot to {output_file}") + plt.close() + + +def main(results_dir: Path): + # Load all experiments + experiments = [] + for exp_dir in results_dir.iterdir(): + if exp_dir.is_dir() and exp_dir.name.startswith("tp"): + data = load_experiment_data(exp_dir) + if data: + experiments.append(data) + + if not experiments: + print("No experiment data found!") + return + + df = pd.DataFrame(experiments) + print(f"Loaded {len(df)} experiments") + print(df[["exp_name", "tp", "bs", "offload", "input_tps_per_gpu", "total_tps_per_gpu", "p50_ttft_ms"]].to_string()) + + # Compute interactivity = 1000 / TPOT (tokens per second for decode) + df["interactivity"] = 1000.0 / df["p50_tpot_ms"] + + # Get available modes and create subsets + available_modes = sorted(df["offload"].unique()) + mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"} + df_subsets = {mode: df[df["offload"] == mode] for mode in available_modes} + + # Create figure with columns for each mode + num_cols = len(available_modes) + fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18)) + fig.suptitle("Pareto Frontiers: Throughput/GPU vs Latency (All Points)", fontsize=14) + + # Handle single column case + if num_cols == 1: + axes = axes.reshape(-1, 1) + + # Color by TP + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + # Metrics configs: (row, x_col, y_col, metric_name, x_label, y_label, maximize_x) + metrics_configs = [ + (0, "p50_ttft_ms", "input_tps_per_gpu", "TTFT", "Median TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p50_latency_ms", "total_tps_per_gpu", "E2E Latency", "Median E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in metrics_configs: + for col, mode in enumerate(available_modes): + ax = axes[row, col] + df_subset = df_subsets[mode] + title = f"{metric_name} ({mode_titles.get(mode, mode)})" + + # Compute and plot Pareto frontier + points = list(zip(df_subset[x_col], df_subset[y_col])) + frontier = compute_pareto_frontier(points, maximize_x=maximize_x) + + if frontier: + fx, fy = zip(*frontier) + ax.plot(fx, fy, linestyle='-', linewidth=2, alpha=0.8, color="black", label="Pareto frontier") + + # Plot points colored by TP + for tp in sorted(df_subset["tp"].unique()): + tp_data = df_subset[df_subset["tp"] == tp] + ax.scatter(tp_data[x_col], tp_data[y_col], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=100, alpha=0.8, edgecolors="black", linewidths=0.5, + label=f"TP={tp}") + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(title) + ax.grid(True, alpha=0.3) + ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() + + output_file = results_dir / "pareto_frontiers.png" + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"\nSaved plot to {output_file}") + plt.close() + + # Also save summary CSV + summary_file = results_dir / "experiment_summary.csv" + df.to_csv(summary_file, index=False) +
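# experiment_summary.csv holds one row per experiment (exp_name, tp, bs, offload mode, per-GPU throughputs, latency percentiles) and can be reloaded for ad-hoc queries, e.g. (illustrative only): + #   pd.read_csv(results_dir / "experiment_summary.csv").groupby(["tp", "offload"])["total_tps_per_gpu"].max() +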
print(f"Saved summary to {summary_file}") + + # Generate clean Pareto-only figure + generate_pareto_only_figure(df, results_dir) + + # Generate combined Pareto frontier (all configs pooled) for each SLA percentile + for pct in ("p50", "p90", "p99", "p999"): + generate_combined_pareto_figure(df, results_dir, percentile=pct) + + # Generate overlay figure (on vs off comparison) + generate_pareto_overlay_figure(df, results_dir) + + # Generate P90 versions + generate_pareto_only_figure_p90(df, results_dir) + generate_pareto_overlay_figure_p90(df, results_dir) + + # Generate P99 versions + generate_pareto_only_figure_p99(df, results_dir) + generate_pareto_overlay_figure_p99(df, results_dir) + + # Generate P99.9 versions + generate_pareto_only_figure_p999(df, results_dir) + generate_pareto_overlay_figure_p999(df, results_dir) + + # Generate cache hit rate plot + generate_cache_hit_rate_figure(df, results_dir) + + +def generate_cache_hit_rate_figure(df: pd.DataFrame, results_dir: Path): + """Generate plot showing throughput vs cache hit rates (GPU and CPU).""" + + # Get available modes + available_modes = sorted(df["offload"].unique()) + mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"} + + # Create 2x3 figure (GPU hit rate row, CPU hit rate row, columns for each mode) + num_cols = len(available_modes) + fig, axes = plt.subplots(2, num_cols, figsize=(6 * num_cols, 10)) + fig.suptitle("Cache Hit Rate vs Throughput", fontsize=14) + + # Handle single column case + if num_cols == 1: + axes = axes.reshape(-1, 1) + + # Color by TP + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + # Plot configs: (row, hit_rate_col, title_prefix) + hit_rate_configs = [ + (0, "gpu_hit_rate", "GPU"), + (1, "cpu_hit_rate", "CPU"), + ] + + for row, hit_rate_col, hit_type in hit_rate_configs: + for col, mode in enumerate(available_modes): + ax = axes[row, col] + df_subset = df[df["offload"] == mode].dropna(subset=[hit_rate_col]) + + if len(df_subset) == 0: + ax.text(0.5, 0.5, "No data", ha='center', va='center', transform=ax.transAxes) + ax.set_title(f"{hit_type} Hit Rate ({mode_titles.get(mode, mode)})") + continue + + # Plot points colored by TP + for tp in sorted(df_subset["tp"].unique()): + tp_data = df_subset[df_subset["tp"] == tp] + ax.scatter(tp_data[hit_rate_col], tp_data["total_tps_per_gpu"], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=100, alpha=0.8, edgecolors="black", linewidths=0.5, + label=f"TP={tp}") + + # Add concurrency labels + for _, point in df_subset.iterrows(): + ax.annotate(f"bs={int(point['bs'])}", + (point[hit_rate_col], point["total_tps_per_gpu"]), + textcoords="offset points", + xytext=(5, 5), + fontsize=7, + alpha=0.7) + + ax.set_xlabel(f"{hit_type} Cache Hit Rate (%)") + ax.set_ylabel("Total Throughput/GPU (tok/s)") + ax.set_title(f"{hit_type} Hit Rate ({mode_titles.get(mode, mode)})") + ax.set_xlim(-5, 105) + ax.grid(True, alpha=0.3) + ax.legend(fontsize=8, loc="lower right") + + plt.tight_layout() + + output_file = results_dir / "cache_hit_rates.png" + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"Saved cache hit rate plot to {output_file}") + plt.close() + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python plot_pareto.py ") + print("Example: python plot_pareto.py ~/sweep_results_20260204_062339") + sys.exit(1) + + results_dir = Path(sys.argv[1]).expanduser() + if not results_dir.exists(): + print(f"Error: {results_dir} does not 
exist") + sys.exit(1) + + main(results_dir) diff --git a/experimental/multiturn/vllm_benchmark/bench/__init__.py b/experimental/multiturn/vllm_benchmark/bench/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py new file mode 100644 index 000000000..c129f38b8 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py @@ -0,0 +1,957 @@ +""" +Metrics collector for vLLM server during benchmarks. +Polls /metrics endpoint and generates visualizations. +""" + +import asyncio +import csv +import re +import subprocess +import threading +import time +from dataclasses import dataclass, field +from pathlib import Path + +import aiohttp +import matplotlib.pyplot as plt + + +@dataclass +class GpuTransferSnapshot: + timestamp: float + gpu_id: int = 0 + tx_pci: float = 0.0 # PCIe TX (MB/s) + rx_pci: float = 0.0 # PCIe RX (MB/s) + + +class GpuTransferCollector: + """DEPRECATED: Collects GPU transfer stats using nvidia-smi dmon. + + Replaced by vLLM's native kv_offload metrics (vllm:kv_offload_total_bytes_total, + vllm:kv_offload_total_time_total) which are more precise and don't require + spawning a subprocess. + """ + + def __init__(self, gpu_id: int = 0, poll_interval: int = 1): + self.gpu_id = gpu_id + self.poll_interval = poll_interval + self.snapshots: list[GpuTransferSnapshot] = [] + self._process: subprocess.Popen | None = None + self._thread: threading.Thread | None = None + self._running = False + + def _parse_line(self, line: str) -> GpuTransferSnapshot | None: + """Parse a line of nvidia-smi dmon CSV output. + + Format: gpu, rxpci, txpci (values in MB/s) + Example: 0, 406, 32013 + """ + line = line.strip() + if not line or line.startswith('#'): # Skip header/comments + return None + + parts = [p.strip() for p in line.split(',')] + if len(parts) < 3: + return None + + try: + return GpuTransferSnapshot( + timestamp=time.time(), + gpu_id=int(parts[0]), + rx_pci=float(parts[1]) if parts[1] != '-' else 0.0, + tx_pci=float(parts[2]) if parts[2] != '-' else 0.0, + ) + except (ValueError, IndexError): + return None + + def _reader_thread(self) -> None: + """Background thread to read nvidia-smi output.""" + if self._process is None: + return + + for line in iter(self._process.stdout.readline, ''): + if not self._running: + break + snapshot = self._parse_line(line) + if snapshot and snapshot.gpu_id == self.gpu_id: + self.snapshots.append(snapshot) + + def start(self) -> None: + """Start collecting GPU transfer stats.""" + if self._running: + return + + self._running = True + self.snapshots = [] + + try: + self._process = subprocess.Popen( + [ + 'nvidia-smi', 'dmon', + '-i', str(self.gpu_id), + '-s', 't', + '-d', str(self.poll_interval), + '--format', 'csv', + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + self._thread = threading.Thread(target=self._reader_thread, daemon=True) + self._thread.start() + except FileNotFoundError: + print("nvidia-smi not found, GPU transfer monitoring disabled") + self._running = False + + def stop(self) -> None: + """Stop collecting GPU transfer stats.""" + self._running = False + if self._process: + self._process.terminate() + try: + self._process.wait(timeout=2) + except subprocess.TimeoutExpired: + self._process.kill() + self._process = None + + if self._thread: + self._thread.join(timeout=2) + self._thread = None + + +@dataclass +class MetricsSnapshot: + timestamp: 
float + kv_cache_usage: float = 0.0 + cpu_kv_cache_usage: float = 0.0 + num_requests_running: int = 0 + num_requests_waiting: int = 0 + prefix_cache_hits: int = 0 + prefix_cache_queries: int = 0 + cpu_prefix_cache_hits: int = 0 + cpu_prefix_cache_queries: int = 0 + prompt_tokens: int = 0 + generation_tokens: int = 0 + num_preemptions: int = 0 + request_success: int = 0 + # KV offload transfer metrics (cumulative) + kv_offload_bytes_gpu_to_cpu: float = 0.0 + kv_offload_bytes_cpu_to_gpu: float = 0.0 + kv_offload_time_gpu_to_cpu: float = 0.0 + kv_offload_time_cpu_to_gpu: float = 0.0 + # Prompt tokens by source (cumulative) + prompt_tokens_local_compute: int = 0 + prompt_tokens_local_cache_hit: int = 0 + prompt_tokens_external_kv_transfer: int = 0 + # Prefill KV computed tokens (cumulative sum from histogram) + prefill_kv_computed_tokens_sum: int = 0 + prefill_kv_computed_tokens_count: int = 0 + + +@dataclass +class MetricsCollector: + base_url: str + poll_interval: float = 1.0 + snapshots: list[MetricsSnapshot] = field(default_factory=list) + _running: bool = False + _task: asyncio.Task | None = None + gpu_transfer_collector: GpuTransferCollector | None = None + gpu_id: int = 0 + + def _parse_metrics(self, text: str) -> MetricsSnapshot: + """Parse Prometheus metrics text format.""" + snapshot = MetricsSnapshot(timestamp=time.time()) + + # Helper to extract gauge/counter value + def get_value(pattern: str, default: float = 0.0) -> float: + match = re.search(pattern, text) + if match: + return float(match.group(1)) + return default + + # KV cache usage (0-1 scale) + snapshot.kv_cache_usage = get_value( + r'vllm:gpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)' + ) + # Fallback to old metric name if new one not found + if snapshot.kv_cache_usage == 0.0: + snapshot.kv_cache_usage = get_value( + r'vllm:kv_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)' + ) + + # CPU/offloaded KV cache usage + snapshot.cpu_kv_cache_usage = get_value( + r'vllm:cpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)' + ) + + # Running/waiting requests + snapshot.num_requests_running = int(get_value( + r'vllm:num_requests_running\{[^}]*\}\s+([\d.e+-]+)' + )) + snapshot.num_requests_waiting = int(get_value( + r'vllm:num_requests_waiting\{[^}]*\}\s+([\d.e+-]+)' + )) + + # Prefix cache (cumulative counters) - GPU + snapshot.prefix_cache_hits = int(get_value( + r'vllm:prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)' + )) + snapshot.prefix_cache_queries = int(get_value( + r'vllm:prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)' + )) + + # Prefix cache - external/offloaded (KV connector cross-instance cache) + snapshot.cpu_prefix_cache_hits = int(get_value( + r'vllm:external_prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)' + )) + snapshot.cpu_prefix_cache_queries = int(get_value( + r'vllm:external_prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)' + )) + + # Token counters + snapshot.prompt_tokens = int(get_value( + r'vllm:prompt_tokens_total\{[^}]*\}\s+([\d.e+-]+)' + )) + snapshot.generation_tokens = int(get_value( + r'vllm:generation_tokens_total\{[^}]*\}\s+([\d.e+-]+)' + )) + + # Preemptions + snapshot.num_preemptions = int(get_value( + r'vllm:num_preemptions_total\{[^}]*\}\s+([\d.e+-]+)' + )) + + # Request success (sum all finish reasons) + for match in re.finditer( + r'vllm:request_success_total\{[^}]*finished_reason="[^"]*"[^}]*\}\s+([\d.e+-]+)', + text + ): + snapshot.request_success += int(float(match.group(1))) + + # KV offload bytes transferred (cumulative counters by direction) + snapshot.kv_offload_bytes_gpu_to_cpu = get_value( + 
r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)' + ) + snapshot.kv_offload_bytes_cpu_to_gpu = get_value( + r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)' + ) + + # KV offload time (cumulative, seconds) + snapshot.kv_offload_time_gpu_to_cpu = get_value( + r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)' + ) + snapshot.kv_offload_time_cpu_to_gpu = get_value( + r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)' + ) + + # Prompt tokens by source (cumulative) + snapshot.prompt_tokens_local_compute = int(get_value( + r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_compute"[^}]*\}\s+([\d.e+-]+)' + )) + snapshot.prompt_tokens_local_cache_hit = int(get_value( + r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_cache_hit"[^}]*\}\s+([\d.e+-]+)' + )) + snapshot.prompt_tokens_external_kv_transfer = int(get_value( + r'vllm:prompt_tokens_by_source_total\{[^}]*source="external_kv_transfer"[^}]*\}\s+([\d.e+-]+)' + )) + + # Prefill KV computed tokens (histogram sum and count) + snapshot.prefill_kv_computed_tokens_sum = int(get_value( + r'vllm:request_prefill_kv_computed_tokens_sum\{[^}]*\}\s+([\d.e+-]+)' + )) + snapshot.prefill_kv_computed_tokens_count = int(get_value( + r'vllm:request_prefill_kv_computed_tokens_count\{[^}]*\}\s+([\d.e+-]+)' + )) + + return snapshot + + async def _poll_loop(self) -> None: + """Background polling loop.""" + metrics_url = f"{self.base_url}/metrics" + async with aiohttp.ClientSession() as session: + while self._running: + try: + async with session.get(metrics_url, timeout=aiohttp.ClientTimeout(total=5)) as resp: + if resp.status == 200: + text = await resp.text() + snapshot = self._parse_metrics(text) + self.snapshots.append(snapshot) + except Exception as e: + print(f"Metrics poll error: {e}") + + await asyncio.sleep(self.poll_interval) + + def start(self) -> None: + """Start background metrics collection.""" + if self._running: + return + self._running = True + self.snapshots = [] + self._task = asyncio.create_task(self._poll_loop()) + + async def stop(self) -> None: + """Stop metrics collection.""" + self._running = False + if self._task: + self._task.cancel() + try: + await self._task + except asyncio.CancelledError: + pass + + def generate_plots( + self, + output_prefix: str = "metrics", + client_metrics: list | None = None, + ) -> None: + """Generate visualization plots from collected metrics. + + Args: + output_prefix: Prefix for output file names + client_metrics: Optional list of RequestStats from benchmark clients + """ + if len(self.snapshots) < 2: + print("Not enough data points for plots") + return + + # Convert to relative time (seconds from start) + start_time = self.snapshots[0].timestamp + times = [(s.timestamp - start_time) for s in self.snapshots] + + # Create figure with subplots + num_rows = 6 if client_metrics else 4 + fig, axes = plt.subplots(num_rows, 2, figsize=(14, 4 * num_rows)) + fig.suptitle("vLLM Server Metrics During Benchmark", fontsize=14) + + # 1. 
KV Cache Usage vs Time + ax = axes[0, 0] + kv_usage = [min(s.kv_cache_usage * 100, 100.0) for s in self.snapshots] + ax.scatter(times, kv_usage, alpha=0.15, s=2, c='blue') + kv_window = min(50, len(kv_usage) // 10) if len(kv_usage) > 10 else 1 + if kv_window > 1: + rolling_kv = [ + sum(kv_usage[max(0, i - kv_window):i + 1]) / len(kv_usage[max(0, i - kv_window):i + 1]) + for i in range(len(kv_usage)) + ] + ax.plot(times, rolling_kv, 'b-', label=f'GPU (avg n={kv_window})', linewidth=2) + else: + ax.plot(times, kv_usage, 'b-', label='GPU', linewidth=2) + # Add external cache if available + cpu_kv_usage = [s.cpu_kv_cache_usage * 100 for s in self.snapshots] + if any(v > 0 for v in cpu_kv_usage): + ax.plot(times, cpu_kv_usage, 'r--', label='External', linewidth=1.5) + ax.legend(fontsize=8) + ax.set_xlabel("Time (s)") + ax.set_ylabel("KV Cache Usage (%)") + ax.set_title("KV Cache Utilization Over Time") + ax.set_ylim(0, 105) + ax.grid(True, alpha=0.3) + + # 2. Running & Waiting Requests vs Time (smoothed + total) + ax = axes[0, 1] + running = [s.num_requests_running for s in self.snapshots] + waiting = [s.num_requests_waiting for s in self.snapshots] + total_queue = [r + w for r, w in zip(running, waiting)] + q_window = min(30, len(running) // 10) if len(running) > 10 else 1 + if q_window > 1: + rolling_running = [ + sum(running[max(0, i - q_window):i + 1]) / len(running[max(0, i - q_window):i + 1]) + for i in range(len(running)) + ] + rolling_waiting = [ + sum(waiting[max(0, i - q_window):i + 1]) / len(waiting[max(0, i - q_window):i + 1]) + for i in range(len(waiting)) + ] + rolling_total = [ + sum(total_queue[max(0, i - q_window):i + 1]) / len(total_queue[max(0, i - q_window):i + 1]) + for i in range(len(total_queue)) + ] + ax.plot(times, rolling_running, 'g-', label=f'Running (avg n={q_window})', linewidth=1.5) + ax.plot(times, rolling_waiting, 'r-', label=f'Waiting (avg n={q_window})', linewidth=1.5) + ax.plot(times, rolling_total, 'b-', label=f'Total (avg n={q_window})', linewidth=1.5) + else: + ax.plot(times, running, 'g-', label='Running', linewidth=1.5) + ax.plot(times, waiting, 'r-', label='Waiting', linewidth=1.5) + ax.plot(times, total_queue, 'b-', label='Total', linewidth=1.5) + ax.set_xlabel("Time (s)") + ax.set_ylabel("Requests") + ax.set_title("Request Queue Depth") + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3) + + # 3. 
Cache Hit Rate vs Time (computed from deltas between polling intervals) + ax = axes[1, 0] + gpu_hit_rates = [] + ext_hit_rates = [] + combined_hit_rates = [] + has_ext_cache = any(s.cpu_prefix_cache_queries > 0 for s in self.snapshots) + for i in range(1, len(self.snapshots)): + # GPU (HBM) cache hit rate for this interval + gpu_delta_hits = self.snapshots[i].prefix_cache_hits - self.snapshots[i-1].prefix_cache_hits + gpu_delta_queries = self.snapshots[i].prefix_cache_queries - self.snapshots[i-1].prefix_cache_queries + if gpu_delta_queries > 0: + gpu_hit_rates.append(100.0 * gpu_delta_hits / gpu_delta_queries) + else: + gpu_hit_rates.append(gpu_hit_rates[-1] if gpu_hit_rates else 0) + + # External cache hit rate for this interval + if has_ext_cache: + ext_delta_hits = self.snapshots[i].cpu_prefix_cache_hits - self.snapshots[i-1].cpu_prefix_cache_hits + ext_delta_queries = self.snapshots[i].cpu_prefix_cache_queries - self.snapshots[i-1].cpu_prefix_cache_queries + if ext_delta_queries > 0: + ext_hit_rates.append(100.0 * ext_delta_hits / ext_delta_queries) + else: + ext_hit_rates.append(ext_hit_rates[-1] if ext_hit_rates else 0) + + # Combined hit rate: (gpu_hits + ext_hits) / (gpu_queries + ext_queries) + total_hits = gpu_delta_hits + ext_delta_hits + total_queries = gpu_delta_queries + ext_delta_queries + if total_queries > 0: + combined_hit_rates.append(100.0 * total_hits / total_queries) + else: + combined_hit_rates.append(combined_hit_rates[-1] if combined_hit_rates else 0) + + # Rolling window size + window = min(50, len(gpu_hit_rates) // 10) if len(gpu_hit_rates) > 10 else 1 + + # Scatter plot for GPU (HBM) cache hit rate + ax.scatter(times[1:], gpu_hit_rates, alpha=0.3, s=5, c='purple', label='GPU (HBM)') + if window > 1: + rolling_gpu = [ + sum(gpu_hit_rates[max(0, i - window):i + 1]) / len(gpu_hit_rates[max(0, i - window):i + 1]) + for i in range(len(gpu_hit_rates)) + ] + ax.plot(times[1:], rolling_gpu, 'purple', linewidth=1.5, label=f'GPU avg (n={window})') + + # External cache scatter + rolling (if available) + if has_ext_cache and ext_hit_rates: + ax.scatter(times[1:], ext_hit_rates, alpha=0.3, s=5, c='orange', label='External') + if window > 1: + rolling_ext = [ + sum(ext_hit_rates[max(0, i - window):i + 1]) / len(ext_hit_rates[max(0, i - window):i + 1]) + for i in range(len(ext_hit_rates)) + ] + ax.plot(times[1:], rolling_ext, 'orange', linewidth=1.5, label=f'External avg (n={window})') + + # Combined/total hit rate (only if external exists) + ax.scatter(times[1:], combined_hit_rates, alpha=0.2, s=3, c='green', label='Combined') + if window > 1: + rolling_combined = [ + sum(combined_hit_rates[max(0, i - window):i + 1]) / len(combined_hit_rates[max(0, i - window):i + 1]) + for i in range(len(combined_hit_rates)) + ] + ax.plot(times[1:], rolling_combined, 'green', linewidth=2, label=f'Combined avg (n={window})') + + ax.legend(loc='best', fontsize=8) + ax.set_xlabel("Time (s)") + ax.set_ylabel("Hit Rate (%)") + ax.set_title("Prefix Cache Hit Rate Per Interval (tokens hit / tokens queried)") + ax.set_ylim(0, 105) + ax.grid(True, alpha=0.3) + + # 4. 
Throughput vs Time (tokens/sec) with rolling average — decode + total + ax = axes[1, 1] + decode_throughputs = [] + total_throughputs = [] + for i in range(1, len(self.snapshots)): + delta_gen = self.snapshots[i].generation_tokens - self.snapshots[i-1].generation_tokens + delta_prompt = self.snapshots[i].prompt_tokens - self.snapshots[i-1].prompt_tokens + delta_time = self.snapshots[i].timestamp - self.snapshots[i-1].timestamp + if delta_time > 0: + decode_throughputs.append(delta_gen / delta_time) + total_throughputs.append((delta_gen + delta_prompt) / delta_time) + else: + decode_throughputs.append(0) + total_throughputs.append(0) + # Cumulative running average total throughput (total tokens / elapsed time) + cumulative_total_avg = [] + t0 = self.snapshots[0].timestamp + tokens0 = self.snapshots[0].generation_tokens + self.snapshots[0].prompt_tokens + for i in range(1, len(self.snapshots)): + elapsed = self.snapshots[i].timestamp - t0 + total_tokens = (self.snapshots[i].generation_tokens + self.snapshots[i].prompt_tokens) - tokens0 + cumulative_total_avg.append(total_tokens / elapsed if elapsed > 0 else 0) + + window = min(30, len(decode_throughputs) // 10) if len(decode_throughputs) > 10 else 1 + if window > 1: + rolling_decode = [ + sum(decode_throughputs[max(0, i - window):i + 1]) / len(decode_throughputs[max(0, i - window):i + 1]) + for i in range(len(decode_throughputs)) + ] + rolling_total = [ + sum(total_throughputs[max(0, i - window):i + 1]) / len(total_throughputs[max(0, i - window):i + 1]) + for i in range(len(total_throughputs)) + ] + ax.plot(times[1:], rolling_total, 'steelblue', linewidth=1.5, label=f'Total (avg n={window})') + ax.plot(times[1:], rolling_decode, 'orange', linewidth=1.5, label=f'Decode (avg n={window})') + ax.legend(fontsize=8) + else: + ax.plot(times[1:], total_throughputs, 'steelblue', linewidth=1, alpha=0.8, label='Total') + ax.plot(times[1:], decode_throughputs, 'orange', linewidth=1, alpha=0.8, label='Decode') + ax.legend(fontsize=8) + ax.plot(times[1:], cumulative_total_avg, 'red', linewidth=2, label='Total Running Avg') + ax.legend(fontsize=8) + ax.set_xlabel("Time (s)") + ax.set_ylabel("Tokens/sec") + ax.set_title("Throughput (Total & Decode)") + ax.grid(True, alpha=0.3) + + # 5. 
KV Offload Transfer Rate (from vLLM metrics) + ax = axes[2, 0] + gpu_to_cpu_rates = [] + cpu_to_gpu_rates = [] + for i in range(1, len(self.snapshots)): + dt = self.snapshots[i].timestamp - self.snapshots[i-1].timestamp + if dt > 0: + delta_g2c = self.snapshots[i].kv_offload_bytes_gpu_to_cpu - self.snapshots[i-1].kv_offload_bytes_gpu_to_cpu + delta_c2g = self.snapshots[i].kv_offload_bytes_cpu_to_gpu - self.snapshots[i-1].kv_offload_bytes_cpu_to_gpu + gpu_to_cpu_rates.append(delta_g2c / dt / 1e6) # MB/s + cpu_to_gpu_rates.append(delta_c2g / dt / 1e6) # MB/s + else: + gpu_to_cpu_rates.append(0) + cpu_to_gpu_rates.append(0) + if any(r > 0 for r in gpu_to_cpu_rates) or any(r > 0 for r in cpu_to_gpu_rates): + ax.scatter(times[1:], gpu_to_cpu_rates, alpha=0.15, s=3, c='blue') + ax.scatter(times[1:], cpu_to_gpu_rates, alpha=0.15, s=3, c='red') + xfer_window = min(30, len(gpu_to_cpu_rates) // 10) if len(gpu_to_cpu_rates) > 10 else 1 + if xfer_window > 1: + rolling_g2c = [ + sum(gpu_to_cpu_rates[max(0, i - xfer_window):i + 1]) / len(gpu_to_cpu_rates[max(0, i - xfer_window):i + 1]) + for i in range(len(gpu_to_cpu_rates)) + ] + rolling_c2g = [ + sum(cpu_to_gpu_rates[max(0, i - xfer_window):i + 1]) / len(cpu_to_gpu_rates[max(0, i - xfer_window):i + 1]) + for i in range(len(cpu_to_gpu_rates)) + ] + ax.plot(times[1:], rolling_g2c, 'b-', linewidth=1.5, label=f'GPU→CPU (avg n={xfer_window})') + ax.plot(times[1:], rolling_c2g, 'r-', linewidth=1.5, label=f'CPU→GPU (avg n={xfer_window})') + else: + ax.plot(times[1:], gpu_to_cpu_rates, 'b-', linewidth=1, alpha=0.8, label='GPU→CPU') + ax.plot(times[1:], cpu_to_gpu_rates, 'r-', linewidth=1, alpha=0.8, label='CPU→GPU') + ax.legend(fontsize=8) + ax.set_xlabel("Time (s)") + ax.set_ylabel("Transfer Rate (MB/s)") + ax.set_title("KV Offload Transfer Rate") + ax.grid(True, alpha=0.3) + + # 6. Prompt Token Sources Over Time (cumulative percentage) + ax = axes[2, 1] + initial = self.snapshots[0] + cum_compute_pct = [] + cum_cache_pct = [] + cum_ext_pct = [] + for s in self.snapshots: + c = s.prompt_tokens_local_compute - initial.prompt_tokens_local_compute + h = s.prompt_tokens_local_cache_hit - initial.prompt_tokens_local_cache_hit + e = s.prompt_tokens_external_kv_transfer - initial.prompt_tokens_external_kv_transfer + total = c + h + e + if total > 0: + cum_compute_pct.append(100.0 * c / total) + cum_cache_pct.append(100.0 * h / total) + cum_ext_pct.append(100.0 * e / total) + else: + cum_compute_pct.append(0) + cum_cache_pct.append(0) + cum_ext_pct.append(0) + if any(v > 0 for v in cum_compute_pct): + ax.stackplot(times, cum_compute_pct, cum_cache_pct, cum_ext_pct, + labels=['Prefill', 'HBM Cache Hit', 'Offload Cache Hit'], + colors=['coral', 'steelblue', 'mediumseagreen'], alpha=0.8) + ax.legend(fontsize=8, loc='lower left') + ax.set_xlabel("Time (s)") + ax.set_ylabel("% of Prefill Tokens") + ax.set_title("Cumulative Prefill Token Source Breakdown") + ax.set_ylim(0, 105) + ax.grid(True, alpha=0.3) + + # 7. 
Cumulative KV Offload Transfers + initial = self.snapshots[0] + # GPU → CPU cumulative + ax = axes[3, 0] + cum_g2c = [(s.kv_offload_bytes_gpu_to_cpu - initial.kv_offload_bytes_gpu_to_cpu) / 1e9 + for s in self.snapshots] + if any(v > 0 for v in cum_g2c): + ax.plot(times, cum_g2c, 'b-', linewidth=1.5) + ax.fill_between(times, cum_g2c, alpha=0.2, color='blue') + ax.set_xlabel("Time (s)") + ax.set_ylabel("Cumulative Transfer (GB)") + ax.set_title("KV Offload: GPU → CPU (Cumulative)") + ax.grid(True, alpha=0.3) + + # CPU → GPU cumulative + ax = axes[3, 1] + cum_c2g = [(s.kv_offload_bytes_cpu_to_gpu - initial.kv_offload_bytes_cpu_to_gpu) / 1e9 + for s in self.snapshots] + if any(v > 0 for v in cum_c2g): + ax.plot(times, cum_c2g, 'r-', linewidth=1.5) + ax.fill_between(times, cum_c2g, alpha=0.2, color='red') + ax.set_xlabel("Time (s)") + ax.set_ylabel("Cumulative Transfer (GB)") + ax.set_title("KV Offload: CPU → GPU (Cumulative)") + ax.grid(True, alpha=0.3) + + # 8 & 9. Client metrics plots (TTFT and Latency vs Time) + if client_metrics and len(client_metrics) > 0: + # Sort by start time + sorted_metrics = sorted(client_metrics, key=lambda x: x.start_time_ms) + # Convert to relative time (seconds from first request) + first_start = sorted_metrics[0].start_time_ms + request_times = [(m.start_time_ms - first_start) / 1000.0 for m in sorted_metrics] + ttfts = [m.ttft_ms for m in sorted_metrics] + latencies = [m.latency_ms for m in sorted_metrics] + + # 8. TTFT vs Time + ax = axes[4, 0] + ax.scatter(request_times, ttfts, alpha=0.3, s=5, c='blue') + # Add rolling average + window = min(50, len(ttfts) // 10) if len(ttfts) > 10 else 1 + if window > 1: + rolling_ttft = [ + sum(ttfts[max(0, i - window):i + 1]) / len(ttfts[max(0, i - window):i + 1]) + for i in range(len(ttfts)) + ] + ax.plot(request_times, rolling_ttft, 'r-', linewidth=1.5, label=f'Rolling avg (n={window})') + ax.legend() + ax.set_xlabel("Time (s)") + ax.set_ylabel("TTFT (ms)") + ax.set_title("Time to First Token vs Time") + ax.grid(True, alpha=0.3) + + # 9. Latency vs Time + ax = axes[4, 1] + ax.scatter(request_times, latencies, alpha=0.3, s=5, c='green') + # Add rolling average + if window > 1: + rolling_latency = [ + sum(latencies[max(0, i - window):i + 1]) / len(latencies[max(0, i - window):i + 1]) + for i in range(len(latencies)) + ] + ax.plot(request_times, rolling_latency, 'r-', linewidth=1.5, label=f'Rolling avg (n={window})') + ax.legend() + ax.set_xlabel("Time (s)") + ax.set_ylabel("Latency (ms)") + ax.set_title("Request Latency vs Time") + ax.grid(True, alpha=0.3) + + # 10. Interactivity (1/TPOT = tokens/sec) vs Time + ax = axes[5, 0] + # Filter out zero TPOT values to avoid division by zero + tpots = [m.tpot_ms for m in sorted_metrics] + interactivity = [1000.0 / t if t > 0 else 0 for t in tpots] # Convert to tokens/sec + ax.scatter(request_times, interactivity, alpha=0.3, s=5, c='purple') + # Add rolling average + if window > 1: + rolling_inter = [ + sum(interactivity[max(0, i - window):i + 1]) / len(interactivity[max(0, i - window):i + 1]) + for i in range(len(interactivity)) + ] + ax.plot(request_times, rolling_inter, 'r-', linewidth=1.5, label=f'Rolling avg (n={window})') + ax.legend() + ax.set_xlabel("Time (s)") + ax.set_ylabel("Interactivity (tokens/sec)") + ax.set_title("Decode Speed (1/TPOT) vs Time") + ax.grid(True, alpha=0.3) + + # 11. 
Preemptions over time + ax = axes[5, 1] + preemption_rates = [] + for i in range(1, len(self.snapshots)): + dt = self.snapshots[i].timestamp - self.snapshots[i-1].timestamp + delta = self.snapshots[i].num_preemptions - self.snapshots[i-1].num_preemptions + preemption_rates.append(delta / dt if dt > 0 else 0) + if any(r > 0 for r in preemption_rates): + ax.scatter(times[1:], preemption_rates, alpha=0.15, s=3, c='red') + preempt_window = min(30, len(preemption_rates) // 10) if len(preemption_rates) > 10 else 1 + if preempt_window > 1: + rolling_preempt = [ + sum(preemption_rates[max(0, i - preempt_window):i + 1]) / len(preemption_rates[max(0, i - preempt_window):i + 1]) + for i in range(len(preemption_rates)) + ] + ax.plot(times[1:], rolling_preempt, 'r-', linewidth=1.5, label=f'Rolling avg (n={preempt_window})') + # Cumulative on secondary axis + ax2 = ax.twinx() + cumulative = [self.snapshots[i].num_preemptions - self.snapshots[0].num_preemptions + for i in range(1, len(self.snapshots))] + ax2.plot(times[1:], cumulative, 'b--', linewidth=1, alpha=0.5, label='Cumulative') + ax2.set_ylabel("Cumulative Preemptions", color='blue') + ax2.tick_params(axis='y', labelcolor='blue') + ax.set_xlabel("Time (s)") + ax.set_ylabel("Preemptions/sec", color='red') + ax.tick_params(axis='y', labelcolor='red') + ax.set_title("Preemptions Over Time") + ax.grid(True, alpha=0.3) + + plt.tight_layout() + plt.savefig(f"{output_prefix}_plots.png", dpi=150) + print(f"Saved plots to {output_prefix}_plots.png") + plt.close() + + # Also generate a summary + self._print_summary() + + def _print_summary(self) -> None: + """Print summary statistics.""" + if len(self.snapshots) < 2: + return + + duration = self.snapshots[-1].timestamp - self.snapshots[0].timestamp + total_gen_tokens = self.snapshots[-1].generation_tokens - self.snapshots[0].generation_tokens + total_prompt_tokens = self.snapshots[-1].prompt_tokens - self.snapshots[0].prompt_tokens + + final = self.snapshots[-1] + initial = self.snapshots[0] + + print("\n" + "="*60) + print("METRICS SUMMARY") + print("="*60) + print(f"Duration: {duration:.1f}s") + print(f"Total prompt tokens: {total_prompt_tokens:,}") + print(f"Total generation tokens: {total_gen_tokens:,}") + print(f"Avg generation throughput: {total_gen_tokens/duration:.1f} tok/s") + print(f"Peak KV cache usage: {max(s.kv_cache_usage for s in self.snapshots)*100:.1f}%") + print(f"Peak running requests: {max(s.num_requests_running for s in self.snapshots)}") + print(f"Peak waiting requests: {max(s.num_requests_waiting for s in self.snapshots)}") + print(f"Total preemptions: {final.num_preemptions - initial.num_preemptions}") + + if final.prefix_cache_queries > initial.prefix_cache_queries: + delta_hits = final.prefix_cache_hits - initial.prefix_cache_hits + delta_queries = final.prefix_cache_queries - initial.prefix_cache_queries + hit_rate = 100.0 * delta_hits / delta_queries + print(f"Overall GPU cache hit rate: {hit_rate:.1f}%") + print(f" - Cache hits: {delta_hits:,} tokens") + print(f" - Cache queries: {delta_queries:,} tokens") + + # External/offloaded cache stats if available + if final.cpu_prefix_cache_queries > initial.cpu_prefix_cache_queries: + cpu_delta_hits = final.cpu_prefix_cache_hits - initial.cpu_prefix_cache_hits + cpu_delta_queries = final.cpu_prefix_cache_queries - initial.cpu_prefix_cache_queries + cpu_hit_rate = 100.0 * cpu_delta_hits / cpu_delta_queries + print(f"Overall external cache hit rate: {cpu_hit_rate:.1f}%") + print(f" - Cache hits: {cpu_delta_hits:,} tokens") + print(f" - 
Cache queries: {cpu_delta_queries:,} tokens") + + # Prompt tokens by source + total_compute = final.prompt_tokens_local_compute - initial.prompt_tokens_local_compute + total_cache_hit = final.prompt_tokens_local_cache_hit - initial.prompt_tokens_local_cache_hit + total_ext = final.prompt_tokens_external_kv_transfer - initial.prompt_tokens_external_kv_transfer + total_by_source = total_compute + total_cache_hit + total_ext + if total_by_source > 0: + print(f"Prompt token sources:") + print(f" - Prefill: {total_compute:>12,} ({100*total_compute/total_by_source:.1f}%)") + print(f" - HBM cache hit: {total_cache_hit:>12,} ({100*total_cache_hit/total_by_source:.1f}%)") + print(f" - Offload cache hit: {total_ext:>12,} ({100*total_ext/total_by_source:.1f}%)") + + # KV offload transfer stats + g2c_bytes = final.kv_offload_bytes_gpu_to_cpu - initial.kv_offload_bytes_gpu_to_cpu + c2g_bytes = final.kv_offload_bytes_cpu_to_gpu - initial.kv_offload_bytes_cpu_to_gpu + g2c_time = final.kv_offload_time_gpu_to_cpu - initial.kv_offload_time_gpu_to_cpu + c2g_time = final.kv_offload_time_cpu_to_gpu - initial.kv_offload_time_cpu_to_gpu + if g2c_bytes > 0 or c2g_bytes > 0: + print(f"KV offload transfers:") + print(f" GPU→CPU: {g2c_bytes/1e9:.2f} GB in {g2c_time:.2f}s ({g2c_bytes/g2c_time/1e9:.1f} GB/s)" if g2c_time > 0 else f" GPU→CPU: {g2c_bytes/1e9:.2f} GB") + print(f" CPU→GPU: {c2g_bytes/1e9:.2f} GB in {c2g_time:.2f}s ({c2g_bytes/c2g_time/1e9:.1f} GB/s)" if c2g_time > 0 else f" CPU→GPU: {c2g_bytes/1e9:.2f} GB") + + # Prefill KV computed tokens + delta_kv_sum = final.prefill_kv_computed_tokens_sum - initial.prefill_kv_computed_tokens_sum + delta_kv_count = final.prefill_kv_computed_tokens_count - initial.prefill_kv_computed_tokens_count + if delta_kv_count > 0: + print(f"Prefill KV computed tokens (excluding cached):") + print(f" Total: {delta_kv_sum:,} tokens across {delta_kv_count:,} requests") + print(f" Avg per request: {delta_kv_sum/delta_kv_count:.0f} tokens") + + print("="*60 + "\n") + + def export_csv( + self, + output_prefix: str = "metrics", + client_metrics: list | None = None, + ) -> None: + """Export all time series data to CSV files. + + Args: + output_prefix: Prefix for output file names + client_metrics: Optional list of RequestStats from benchmark clients + + Generates: + - {output_prefix}_server_metrics.csv: vLLM server metrics over time + - {output_prefix}_gpu_transfer.csv: GPU PCIe transfer stats + - {output_prefix}_client_metrics.csv: Per-request client metrics (if provided) + """ + output_dir = Path(output_prefix).parent + if output_dir and not output_dir.exists(): + output_dir.mkdir(parents=True, exist_ok=True) + + # 1. 
Export server metrics (from /metrics endpoint) + if self.snapshots: + server_csv = f"{output_prefix}_server_metrics.csv" + start_time = self.snapshots[0].timestamp + + with open(server_csv, 'w', newline='') as f: + writer = csv.writer(f) + # Header + writer.writerow([ + 'timestamp_sec', + 'relative_time_sec', + 'kv_cache_usage_pct', + 'cpu_kv_cache_usage_pct', + 'num_requests_running', + 'num_requests_waiting', + 'prefix_cache_hits', + 'prefix_cache_queries', + 'cpu_prefix_cache_hits', + 'cpu_prefix_cache_queries', + 'prompt_tokens_total', + 'generation_tokens_total', + 'num_preemptions_total', + 'request_success_total', + # KV offload metrics + 'kv_offload_bytes_gpu_to_cpu', + 'kv_offload_bytes_cpu_to_gpu', + 'kv_offload_time_gpu_to_cpu', + 'kv_offload_time_cpu_to_gpu', + # Prompt tokens by source + 'prompt_tokens_local_compute', + 'prompt_tokens_local_cache_hit', + 'prompt_tokens_external_kv_transfer', + # Prefill KV computed + 'prefill_kv_computed_tokens_sum', + 'prefill_kv_computed_tokens_count', + # Computed per-interval metrics + 'interval_cache_hit_rate_pct', + 'interval_throughput_tok_per_sec', + ]) + + for i, s in enumerate(self.snapshots): + relative_time = s.timestamp - start_time + + # Compute per-interval metrics + cache_hit_rate = 0.0 + throughput = 0.0 + if i > 0: + prev = self.snapshots[i - 1] + delta_hits = s.prefix_cache_hits - prev.prefix_cache_hits + delta_queries = s.prefix_cache_queries - prev.prefix_cache_queries + if delta_queries > 0: + cache_hit_rate = 100.0 * delta_hits / delta_queries + + delta_gen = s.generation_tokens - prev.generation_tokens + delta_time = s.timestamp - prev.timestamp + if delta_time > 0: + throughput = delta_gen / delta_time + + writer.writerow([ + f"{s.timestamp:.3f}", + f"{relative_time:.3f}", + f"{s.kv_cache_usage * 100:.2f}", + f"{s.cpu_kv_cache_usage * 100:.2f}", + s.num_requests_running, + s.num_requests_waiting, + s.prefix_cache_hits, + s.prefix_cache_queries, + s.cpu_prefix_cache_hits, + s.cpu_prefix_cache_queries, + s.prompt_tokens, + s.generation_tokens, + s.num_preemptions, + s.request_success, + f"{s.kv_offload_bytes_gpu_to_cpu:.0f}", + f"{s.kv_offload_bytes_cpu_to_gpu:.0f}", + f"{s.kv_offload_time_gpu_to_cpu:.6f}", + f"{s.kv_offload_time_cpu_to_gpu:.6f}", + s.prompt_tokens_local_compute, + s.prompt_tokens_local_cache_hit, + s.prompt_tokens_external_kv_transfer, + s.prefill_kv_computed_tokens_sum, + s.prefill_kv_computed_tokens_count, + f"{cache_hit_rate:.2f}", + f"{throughput:.2f}", + ]) + + print(f"Exported server metrics to {server_csv}") + + # 2. 
Export GPU transfer stats (DEPRECATED - kept for backward compat) + if self.gpu_transfer_collector and self.gpu_transfer_collector.snapshots: + gpu_csv = f"{output_prefix}_gpu_transfer.csv" + gpu_snaps = self.gpu_transfer_collector.snapshots + gpu_start = gpu_snaps[0].timestamp + + with open(gpu_csv, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'timestamp_sec', + 'relative_time_sec', + 'gpu_id', + 'tx_pci_mb_per_sec', + 'rx_pci_mb_per_sec', + 'cumulative_tx_gb', + 'cumulative_rx_gb', + ]) + + cumulative_tx = 0.0 + cumulative_rx = 0.0 + for i, s in enumerate(gpu_snaps): + relative_time = s.timestamp - gpu_start + if i > 0: + dt = s.timestamp - gpu_snaps[i - 1].timestamp + cumulative_tx += s.tx_pci * dt / 1024 # MB to GB + cumulative_rx += s.rx_pci * dt / 1024 + + writer.writerow([ + f"{s.timestamp:.3f}", + f"{relative_time:.3f}", + s.gpu_id, + f"{s.tx_pci:.2f}", + f"{s.rx_pci:.2f}", + f"{cumulative_tx:.4f}", + f"{cumulative_rx:.4f}", + ]) + + print(f"Exported GPU transfer metrics to {gpu_csv}") + + # 3. Export client metrics (per-request stats) + if client_metrics and len(client_metrics) > 0: + client_csv = f"{output_prefix}_client_metrics.csv" + sorted_metrics = sorted(client_metrics, key=lambda x: x.start_time_ms) + first_start = sorted_metrics[0].start_time_ms + + with open(client_csv, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'start_time_ms', + 'relative_time_sec', + 'ttft_ms', + 'tpot_ms', + 'latency_ms', + 'input_num_turns', + 'input_num_tokens', + 'output_num_tokens', + 'output_num_chunks', + 'output_num_first_chunk_tokens', + 'approx_cached_percent', + 'conversation_id', + 'client_id', + 'interactivity_tok_per_sec', + ]) + + for m in sorted_metrics: + relative_time = (m.start_time_ms - first_start) / 1000.0 + interactivity = 1000.0 / m.tpot_ms if m.tpot_ms > 0 else 0 + + writer.writerow([ + f"{m.start_time_ms:.3f}", + f"{relative_time:.3f}", + f"{m.ttft_ms:.3f}", + f"{m.tpot_ms:.3f}", + f"{m.latency_ms:.3f}", + m.input_num_turns, + m.input_num_tokens, + m.output_num_tokens, + m.output_num_chunks, + m.output_num_first_chunk_tokens, + f"{m.approx_cached_percent:.2f}", + m.conversation_id, + m.client_id, + f"{interactivity:.2f}", + ]) + + print(f"Exported client metrics to {client_csv}") diff --git a/experimental/multiturn/vllm_benchmark/bench/run_metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/run_metrics_collector.py new file mode 100644 index 000000000..ddf605324 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/bench/run_metrics_collector.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +Standalone metrics collector for vLLM server. + +Polls the vLLM /metrics endpoint and generates server-side plots. +Designed to run alongside any benchmark client (aiperf, custom, etc.). 
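+On shutdown (Ctrl+C, SIGTERM, or when --duration elapses) it writes
+{output_prefix}_plots.png and {output_prefix}_server_metrics.csv via
+generate_plots() and export_csv().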
+ +Usage: + # Start collecting, run your benchmark, then Ctrl+C or kill to stop: + python -m bench.run_metrics_collector \ + --url http://localhost:8888 \ + --output-prefix results/metrics \ + --duration 600 + + # Or run in background and signal when done: + python -m bench.run_metrics_collector \ + --url http://localhost:8888 \ + --output-prefix results/metrics \ + --pid-file /tmp/metrics_collector.pid +""" + +import argparse +import asyncio +import os +import signal +import sys + +from bench.metrics_collector import MetricsCollector + + +async def run(args): + collector = MetricsCollector( + base_url=args.url, + poll_interval=args.poll_interval, + ) + + collector.start() + print(f"Metrics collector started (polling {args.url}/metrics every {args.poll_interval}s)") + + if args.pid_file: + with open(args.pid_file, "w") as f: + f.write(str(os.getpid())) + print(f"PID written to {args.pid_file}") + + # Set up graceful shutdown + stop_event = asyncio.Event() + + def handle_signal(*_): + print("\nStopping metrics collector...") + stop_event.set() + + loop = asyncio.get_event_loop() + for sig in (signal.SIGINT, signal.SIGTERM): + loop.add_signal_handler(sig, handle_signal) + + # Wait for duration or signal + if args.duration: + try: + await asyncio.wait_for(stop_event.wait(), timeout=args.duration) + except asyncio.TimeoutError: + print(f"Duration limit reached ({args.duration}s)") + else: + await stop_event.wait() + + await collector.stop() + + # Generate outputs + if len(collector.snapshots) < 2: + print("Not enough data points collected") + sys.exit(1) + + print(f"Collected {len(collector.snapshots)} snapshots") + + # Generate plots (without client metrics — server-only) + collector.generate_plots(output_prefix=args.output_prefix) + + # Export CSV + collector.export_csv(output_prefix=args.output_prefix) + + # Clean up PID file + if args.pid_file and os.path.exists(args.pid_file): + os.remove(args.pid_file) + + print("Done") + + +def main(): + parser = argparse.ArgumentParser( + description="Standalone vLLM metrics collector" + ) + parser.add_argument( + "--url", "-u", + default="http://localhost:8888", + help="vLLM server base URL (default: http://localhost:8888)", + ) + parser.add_argument( + "--output-prefix", "-o", + default="metrics", + help="Output file prefix (default: metrics)", + ) + parser.add_argument( + "--poll-interval", + type=float, + default=1.0, + help="Polling interval in seconds (default: 1.0)", + ) + parser.add_argument( + "--duration", "-d", + type=float, + default=None, + help="Max collection duration in seconds (default: unlimited, stop with signal)", + ) + parser.add_argument( + "--pid-file", + default=None, + help="Write PID to this file for external signaling", + ) + args = parser.parse_args() + + asyncio.run(run(args)) + + +if __name__ == "__main__": + main() diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester new file mode 160000 index 000000000..a41ee2261 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -0,0 +1 @@ +Subproject commit a41ee2261b743328be84c472b7b97112d046e62f diff --git a/experimental/multiturn/vllm_benchmark/requirements.txt b/experimental/multiturn/vllm_benchmark/requirements.txt new file mode 100644 index 000000000..f4a9625fb --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/requirements.txt @@ -0,0 +1,9 @@ +numpy>=1.24 +pandas>=2.0.0 +aiohttp>=3.10 +transformers>=4.46 +xlsxwriter>=3.2.1 +tqdm>=4.66 +datasets +tiktoken +matplotlib diff --git 
a/experimental/multiturn/vllm_benchmark/scripts/analyze_benchmark_distributions.py b/experimental/multiturn/vllm_benchmark/scripts/analyze_benchmark_distributions.py new file mode 100644 index 000000000..aa4b639ca --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/analyze_benchmark_distributions.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python3 +"""Analyze ISL/OSL/turn distributions from AIPerf benchmark results. + +Reads profile_export.jsonl and produces summary stats + distribution plots +to verify the benchmark workload matches the intended Qwen trace profile. + +Usage: + python analyze_benchmark_distributions.py path/to/aiperf_artifacts/ -o output_dir/ +""" + +from __future__ import annotations + +import argparse +import json +import math +from collections import Counter, defaultdict +from pathlib import Path + + +def load_records(artifacts_dir: Path) -> list[dict]: + """Load per-request records from profile_export.jsonl.""" + jsonl_path = artifacts_dir / "profile_export.jsonl" + records = [] + with open(jsonl_path) as f: + for line in f: + line = line.strip() + if line: + records.append(json.loads(line)) + return records + + +def load_trace_replay_records(trace_replay_dir: Path) -> list[dict]: + """Load per-request records from trace_replay detailed_results.csv. + + Converts to the same format as AIPerf JSONL records so the analyze() + function can process both formats identically. + """ + import csv + import sys + csv.field_size_limit(sys.maxsize) + + csv_path = trace_replay_dir / "detailed_results.csv" + records = [] + with open(csv_path) as f: + reader = csv.DictReader(f) + for row in reader: + if row.get("success") != "True": + continue + records.append({ + "metadata": { + "x_correlation_id": row["trace_id"], + "conversation_id": row["trace_id"], + "turn_index": int(row["request_idx"]), + "benchmark_phase": "profiling", + }, + "metrics": { + "input_sequence_length": {"value": int(row["input_tokens"])}, + "output_sequence_length": {"value": int(row["output_tokens_actual"])}, + }, + }) + return records + + +def analyze(records: list[dict], output_dir: Path) -> None: + """Run distribution analysis and save results.""" + output_dir.mkdir(parents=True, exist_ok=True) + + # Group by conversation + convos: dict[str, list[dict]] = defaultdict(list) + for r in records: + metrics = r.get("metrics", {}) + if "input_sequence_length" not in metrics or "output_sequence_length" not in metrics: + continue + # Use x_correlation_id (unique per session) not conversation_id (template, reused) + cid = r["metadata"].get("x_correlation_id") or r["metadata"]["conversation_id"] + ti = r["metadata"]["turn_index"] + isl = metrics["input_sequence_length"]["value"] + osl = metrics["output_sequence_length"]["value"] + convos[cid].append({"turn": ti, "isl": isl, "osl": osl}) + + # Sort turns within each conversation + for v in convos.values(): + v.sort(key=lambda x: x["turn"]) + + # Turn count distribution + turn_counts = Counter(len(v) for v in convos.values()) + total_convos = len(convos) + total_requests = len(records) + + lines = [] + lines.append("=" * 70) + lines.append("BENCHMARK WORKLOAD DISTRIBUTION ANALYSIS") + lines.append("=" * 70) + lines.append(f"Total conversations: {total_convos:,}") + lines.append(f"Total requests: {total_requests:,}") + lines.append(f"Avg turns/conv: {total_requests / total_convos:.2f}") + lines.append("") + + lines.append("TURN COUNT DISTRIBUTION:") + lines.append(f" {'Turns':>5s} {'Count':>6s} {'Pct':>6s} Target") + target = {1: 59, 2: 20, 3: 10, 4: 5, 5: 3, 6: 2, 
7: 1} + for k in sorted(turn_counts.keys()): + pct = 100 * turn_counts[k] / total_convos + tgt = f"{target.get(k, 0):.0f}%" if k in target else "" + lines.append(f" {k:5d} {turn_counts[k]:6,} {pct:5.1f}% {tgt}") + + # ISL/OSL by turn index + lines.append("") + lines.append("ISL BY TURN INDEX:") + lines.append( + f" {'Turn':>4s} {'N':>6s} {'Mean':>8s} {'Median':>8s} {'Std':>8s} {'P5':>8s} {'P95':>8s}" + ) + max_turn = max(t["turn"] for v in convos.values() for t in v) + for ti in range(max_turn + 1): + vals = sorted(t["isl"] for v in convos.values() for t in v if t["turn"] == ti) + if not vals: + continue + n = len(vals) + mean = sum(vals) / n + std = math.sqrt(sum((v - mean) ** 2 for v in vals) / n) + median = vals[n // 2] + p5 = vals[int(n * 0.05)] + p95 = vals[int(n * 0.95)] + lines.append( + f" {ti:4d} {n:6,} {mean:8.0f} {median:8.0f} {std:8.0f} {p5:8.0f} {p95:8.0f}" + ) + + lines.append("") + lines.append("OSL BY TURN INDEX:") + lines.append( + f" {'Turn':>4s} {'N':>6s} {'Mean':>8s} {'Median':>8s} {'Std':>8s} {'P5':>8s} {'P95':>8s}" + ) + for ti in range(max_turn + 1): + vals = sorted(t["osl"] for v in convos.values() for t in v if t["turn"] == ti) + if not vals: + continue + n = len(vals) + mean = sum(vals) / n + std = math.sqrt(sum((v - mean) ** 2 for v in vals) / n) + median = vals[n // 2] + p5 = vals[int(n * 0.05)] + p95 = vals[int(n * 0.95)] + lines.append( + f" {ti:4d} {n:6,} {mean:8.0f} {median:8.0f} {std:8.0f} {p5:8.0f} {p95:8.0f}" + ) + + # Overall ISL/OSL stats + all_isl = sorted(t["isl"] for v in convos.values() for t in v) + all_osl = sorted(t["osl"] for v in convos.values() for t in v) + n = len(all_isl) + isl_mean = sum(all_isl) / n + osl_mean = sum(all_osl) / n + lines.append("") + lines.append("ALL REQUESTS ISL:") + lines.append( + f" n={n:,} mean={isl_mean:.0f} median={all_isl[n//2]} " + f"p5={all_isl[int(n*0.05)]} p95={all_isl[int(n*0.95)]}" + ) + lines.append("ALL REQUESTS OSL:") + lines.append( + f" n={n:,} mean={osl_mean:.0f} median={all_osl[n//2]} " + f"p5={all_osl[int(n*0.05)]} p95={all_osl[int(n*0.95)]}" + ) + + # Per-conversation stats + conv_max_isl = sorted(max(t["isl"] for t in v) for v in convos.values()) + conv_total_osl = sorted(sum(t["osl"] for t in v) for v in convos.values()) + nc = len(conv_max_isl) + lines.append("") + lines.append("PER-CONVERSATION MAX ISL (final context size):") + lines.append( + f" n={nc:,} mean={sum(conv_max_isl)/nc:.0f} median={conv_max_isl[nc//2]} " + f"p5={conv_max_isl[int(nc*0.05)]} p95={conv_max_isl[int(nc*0.95)]}" + ) + lines.append("PER-CONVERSATION TOTAL OSL:") + lines.append( + f" n={nc:,} mean={sum(conv_total_osl)/nc:.0f} median={conv_total_osl[nc//2]} " + f"p5={conv_total_osl[int(nc*0.05)]} p95={conv_total_osl[int(nc*0.95)]}" + ) + + # ISL context growth (shows accumulation across turns) + lines.append("") + lines.append("ISL CONTEXT GROWTH (sample multi-turn conversations):") + multi = [(cid, v) for cid, v in convos.items() if len(v) >= 3][:10] + for cid, turns in multi: + isls = " -> ".join(str(t["isl"]) for t in turns) + lines.append(f" {cid}: {isls}") + + lines.append("=" * 70) + + summary_text = "\n".join(lines) + print(summary_text) + + # Save summary + (output_dir / "workload_distribution_summary.txt").write_text(summary_text) + + # Try to generate plots (matplotlib may not be available) + try: + _generate_plots(convos, records, output_dir) + except ImportError: + print("matplotlib not available, skipping plots") + + +def _generate_plots( + convos: dict[str, list[dict]], records: list[dict], output_dir: Path +) 
-> None: + """Generate distribution plots.""" + import matplotlib + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + + fig, axes = plt.subplots(3, 3, figsize=(18, 15)) + fig.suptitle("Benchmark Workload Distribution Analysis", fontsize=14) + + # (0,0) Turn count distribution + ax = axes[0, 0] + turn_counts = Counter(len(v) for v in convos.values()) + turns = sorted(turn_counts.keys()) + counts = [turn_counts[t] for t in turns] + total = sum(counts) + bars = ax.bar(turns, [100 * c / total for c in counts], edgecolor="black", alpha=0.7) + for bar, t in zip(bars, turns): + ax.text( + bar.get_x() + bar.get_width() / 2, + bar.get_height(), + f"{bar.get_height():.0f}%", + ha="center", + va="bottom", + fontsize=8, + ) + ax.set_xlabel("Number of Turns") + ax.set_ylabel("% of Conversations") + ax.set_title(f"Turn Count Distribution (n={total:,})") + ax.grid(True, alpha=0.3, axis="y") + + # (0,1) All requests ISL histogram + ax = axes[0, 1] + all_isl = [t["isl"] for v in convos.values() for t in v] + clip = int(sorted(all_isl)[int(len(all_isl) * 0.99)] * 1.2) + ax.hist([v for v in all_isl if v <= clip], bins=80, edgecolor="black", alpha=0.7, color="steelblue") + all_isl_sorted = sorted(all_isl) + median_isl = all_isl_sorted[len(all_isl) // 2] + mean_isl = sum(all_isl) / len(all_isl) + ax.axvline(median_isl, color="red", linestyle="--", label=f"Median: {median_isl:,}") + ax.axvline(mean_isl, color="orange", linestyle="--", label=f"Mean: {mean_isl:,.0f}") + ax.set_xlabel("Input Sequence Length") + ax.set_ylabel("Count") + ax.set_title(f"All Requests ISL (n={len(all_isl):,})") + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3, axis="y") + + # (0,2) All requests OSL histogram + ax = axes[0, 2] + all_osl = [t["osl"] for v in convos.values() for t in v] + clip = min(3000, int(sorted(all_osl)[int(len(all_osl) * 0.99)] * 1.2)) + ax.hist([v for v in all_osl if v <= clip], bins=80, edgecolor="black", alpha=0.7, color="coral") + all_osl_sorted = sorted(all_osl) + median_osl = all_osl_sorted[len(all_osl) // 2] + mean_osl = sum(all_osl) / len(all_osl) + ax.axvline(median_osl, color="red", linestyle="--", label=f"Median: {median_osl:,}") + ax.axvline(mean_osl, color="orange", linestyle="--", label=f"Mean: {mean_osl:,.0f}") + ax.set_xlabel("Output Sequence Length") + ax.set_ylabel("Count") + ax.set_title(f"All Requests OSL (n={len(all_osl):,})") + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3, axis="y") + + # (1,0) Average new prefill tokens by turn index (ISL delta per turn) + ax = axes[1, 0] + # Collect deltas grouped by turn index + deltas_by_turn: dict[int, list[int]] = defaultdict(list) + for v in convos.values(): + for i, t in enumerate(v): + if i == 0: + deltas_by_turn[t["turn"]].append(t["isl"]) + else: + deltas_by_turn[t["turn"]].append(max(0, t["isl"] - v[i - 1]["isl"])) + if deltas_by_turn: + turn_indices = sorted(deltas_by_turn.keys()) + means = [sum(deltas_by_turn[ti]) / len(deltas_by_turn[ti]) for ti in turn_indices] + ns = [len(deltas_by_turn[ti]) for ti in turn_indices] + ax.plot(turn_indices, means, marker="o", markersize=3, linewidth=1, color="mediumseagreen") + ax.fill_between(turn_indices, 0, means, alpha=0.2, color="mediumseagreen") + # Label first and last points + if len(turn_indices) > 0: + ax.annotate(f"{means[0]:,.0f}", (turn_indices[0], means[0]), fontsize=7, ha="left", va="bottom") + if len(turn_indices) > 1: + ax.annotate(f"{means[-1]:,.0f}\n(n={ns[-1]})", (turn_indices[-1], means[-1]), fontsize=7, ha="right", va="bottom") + # Overall mean/median across all deltas + 
all_deltas = [d for dlist in deltas_by_turn.values() for d in dlist] + if all_deltas: + overall_mean = sum(all_deltas) / len(all_deltas) + all_deltas_sorted = sorted(all_deltas) + overall_median = all_deltas_sorted[len(all_deltas) // 2] + ax.axhline(overall_mean, color="orange", linestyle="--", linewidth=1, label=f"Mean: {overall_mean:,.0f}") + ax.axhline(overall_median, color="red", linestyle="--", linewidth=1, label=f"Median: {overall_median:,}") + ax.legend(fontsize=7) + ax.set_xlabel("Turn Index") + ax.set_ylabel("Mean New Prefill Tokens") + ax.set_title("Avg New Prefill Tokens by Turn") + ax.grid(True, alpha=0.3) + + # (1,1) ISL vs OSL scatter + ax = axes[1, 1] + ax.scatter(all_isl, all_osl, alpha=0.15, s=3, c="purple") + ax.set_xlabel("ISL (tokens)") + ax.set_ylabel("OSL (tokens)") + ax.set_title("ISL vs OSL (all requests)") + ax.grid(True, alpha=0.3) + + # (1,2) Per-conversation max ISL vs num turns scatter + ax = axes[1, 2] + conv_turns = [len(v) for v in convos.values()] + conv_max_isl_list = [max(t["isl"] for t in v) for v in convos.values()] + ax.scatter(conv_turns, conv_max_isl_list, alpha=0.3, s=8, c="steelblue") + ax.set_xlabel("Number of Turns") + ax.set_ylabel("Max ISL (tokens)") + ax.set_title("Final Context Size vs Turn Count") + ax.grid(True, alpha=0.3) + + # (2,0) Per-conversation max ISL (final context size per conversation) + ax = axes[2, 0] + conv_max_isl = [max(t["isl"] for t in v) for v in convos.values()] + clip = int(sorted(conv_max_isl)[int(len(conv_max_isl) * 0.99)] * 1.2) + ax.hist([v for v in conv_max_isl if v <= clip], bins=60, edgecolor="black", alpha=0.7, color="steelblue") + conv_max_isl_sorted = sorted(conv_max_isl) + median_max = conv_max_isl_sorted[len(conv_max_isl) // 2] + mean_max = sum(conv_max_isl) / len(conv_max_isl) + ax.axvline(median_max, color="red", linestyle="--", label=f"Median: {median_max:,}") + ax.axvline(mean_max, color="orange", linestyle="--", label=f"Mean: {mean_max:,.0f}") + ax.set_xlabel("Max ISL per Conversation (tokens)") + ax.set_ylabel("Count") + ax.set_title(f"Per-Conversation Final Context Size (n={len(conv_max_isl):,})") + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3, axis="y") + + # (3,1) Per-conversation total OSL (sum of all output tokens across turns) + ax = axes[2, 1] + conv_total_osl = [sum(t["osl"] for t in v) for v in convos.values()] + clip = int(sorted(conv_total_osl)[int(len(conv_total_osl) * 0.99)] * 1.2) + ax.hist([v for v in conv_total_osl if v <= clip], bins=60, edgecolor="black", alpha=0.7, color="coral") + conv_total_osl_sorted = sorted(conv_total_osl) + median_tosl = conv_total_osl_sorted[len(conv_total_osl) // 2] + mean_tosl = sum(conv_total_osl) / len(conv_total_osl) + ax.axvline(median_tosl, color="red", linestyle="--", label=f"Median: {median_tosl:,}") + ax.axvline(mean_tosl, color="orange", linestyle="--", label=f"Mean: {mean_tosl:,.0f}") + ax.set_xlabel("Total OSL per Conversation (tokens)") + ax.set_ylabel("Count") + ax.set_title(f"Per-Conversation Total Output Tokens (n={len(conv_total_osl):,})") + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3, axis="y") + + # (2,2) is empty — already placed scatter at (1,2) + axes[2, 2].axis("off") + + plt.tight_layout() + out = output_dir / "workload_distribution_plots.png" + plt.savefig(out, dpi=150, bbox_inches="tight") + plt.close() + print(f"Saved plots to {out}") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Analyze benchmark workload distributions" + ) + parser.add_argument("artifacts_dir", help="Path to aiperf_artifacts/ 
or trace_replay/ directory") + parser.add_argument( + "-o", "--output", default=None, help="Output directory (default: same as artifacts_dir)" + ) + args = parser.parse_args() + + artifacts_dir = Path(args.artifacts_dir) + output_dir = Path(args.output) if args.output else artifacts_dir + + # Auto-detect format + trace_replay_csv = artifacts_dir / "detailed_results.csv" + aiperf_jsonl = artifacts_dir / "profile_export.jsonl" + + if trace_replay_csv.exists(): + records = load_trace_replay_records(artifacts_dir) + print(f"Loaded {len(records):,} records from {artifacts_dir} (trace replay)") + elif aiperf_jsonl.exists(): + records = load_records(artifacts_dir) + print(f"Loaded {len(records):,} records from {artifacts_dir} (AIPerf)") + else: + print(f"No recognized data files in {artifacts_dir}") + return + + analyze(records, output_dir) + + +if __name__ == "__main__": + main() diff --git a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py new file mode 100755 index 000000000..fc02b1865 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python3 +""" +Collect and aggregate multi-turn benchmark sweep results from GitHub Actions +artifacts. + +Expects a directory of artifact subdirectories named: + multiturn_tp{N}_users{M}_offload{mode}/ +each containing metrics CSVs, status.txt, etc. + +Produces: + - summary.csv with per-experiment aggregated metrics + - Pareto frontier plots (via plot_pareto.py) + +Usage: + python collect_sweep_results.py +""" + +import json +import sys +from pathlib import Path + +import pandas as pd +import numpy as np + + +def _load_custom_client_csv(client_csv: Path, exp_dir: Path) -> pd.DataFrame | None: + """Load per-request metrics from custom benchmark client CSV.""" + df = pd.read_csv(client_csv) + if len(df) == 0: + return None + # Columns expected: start_time_ms, ttft_ms, tpot_ms, latency_ms, + # input_num_tokens, output_num_tokens, ... + return df + + +def _load_aiperf_jsonl(jsonl_path: Path) -> pd.DataFrame | None: + """Load per-request metrics from aiperf profile_export JSONL. + + Converts aiperf's per-record format into the same column schema + used by the custom benchmark client CSV. 
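+
+    Output columns: start_time_ms, ttft_ms, tpot_ms, latency_ms,
+    input_num_tokens, output_num_tokens.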
+ """ + records = [] + with open(jsonl_path) as f: + for line in f: + line = line.strip() + if not line: + continue + entry = json.loads(line) + meta = entry.get("metadata", {}) + metrics = entry.get("metrics", {}) + + # Skip non-profiling records or cancelled requests + if meta.get("benchmark_phase") != "profiling": + continue + if meta.get("was_cancelled", False): + continue + + # Extract values (aiperf stores metrics as {value, unit} dicts) + def val(key, default=0): + m = metrics.get(key) + if m is None: + return default + return m.get("value", default) if isinstance(m, dict) else m + + # Compute TPOT from ITL if available + itl = metrics.get("inter_token_latency") + if itl and isinstance(itl, dict): + tpot_ms = itl.get("value", 0) + else: + # Fallback: (latency - ttft) / (output_tokens - 1) + osl = val("output_sequence_length", 1) + ttft = val("time_to_first_token", 0) + latency = val("request_latency", 0) + tpot_ms = (latency - ttft) / max(osl - 1, 1) if osl > 1 else 0 + + # Convert request_start_ns to ms (epoch) + start_ns = meta.get("request_start_ns", 0) + start_ms = start_ns / 1e6 + + records.append({ + "start_time_ms": start_ms, + "ttft_ms": val("time_to_first_token"), + "tpot_ms": tpot_ms, + "latency_ms": val("request_latency"), + "input_num_tokens": val("input_sequence_length"), + "output_num_tokens": val("output_sequence_length"), + }) + + if not records: + return None + + return pd.DataFrame(records) + + +def _load_trace_replay_csv(csv_path: Path) -> pd.DataFrame | None: + """Load per-request metrics from trace_replay detailed_results.csv.""" + df = pd.read_csv(csv_path) + if len(df) == 0: + return None + + # Filter to successful requests only + df = df[df["success"] == True].copy() + if len(df) == 0: + return None + + # Convert to the same schema as _load_aiperf_jsonl + latency_s = df["request_complete_time"] - df["request_start_time"] + return pd.DataFrame({ + "start_time_ms": df["request_start_time"] * 1000, + "ttft_ms": df["ttft"] * 1000, + "tpot_ms": df["itl"] * 1000, + "latency_ms": latency_s * 1000, + "input_num_tokens": df["input_tokens"], + "output_num_tokens": df["output_tokens_actual"], + }) + + +def load_experiment(exp_dir: Path) -> dict | None: + """Load metrics from a single experiment artifact directory.""" + client_csv = exp_dir / "metrics_client_metrics.csv" + server_csv = exp_dir / "metrics_server_metrics.csv" + status_file = exp_dir / "status.txt" + + if not status_file.exists(): + return None + status = status_file.read_text().strip() + + # Also check for aiperf output + aiperf_jsonl = None + aiperf_artifacts = exp_dir / "aiperf_artifacts" + if aiperf_artifacts.exists(): + candidates = list(aiperf_artifacts.glob("profile_export_aiperf.jsonl")) + if not candidates: + candidates = list(aiperf_artifacts.glob("profile_export*.jsonl")) + if candidates: + aiperf_jsonl = candidates[0] + + # Check for trace replay output + trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" + + if not client_csv.exists() and aiperf_jsonl is None and not trace_replay_csv.exists(): + return None + + # Parse experiment name from directory: multiturn_tp{N}_users{M}_offload{mode} + # or just tp{N}_users{M}_offload{mode} + name = exp_dir.name + if name.startswith("multiturn_"): + name = name[len("multiturn_"):] + + try: + parts = name.split("_") + tp = int(parts[0].replace("tp", "")) + users = int(parts[1].replace("users", "").replace("bs", "")) + offload = parts[2].replace("offload", "") + except (IndexError, ValueError): + print(f"Warning: cannot parse experiment 
name '{exp_dir.name}', skipping") + return None + + result = { + "exp_name": name, + "tp": tp, + "users": users, + "offload": offload, + "status": status, + } + + if status != "SUCCESS": + return result + + try: + # Determine data source: custom client CSV, aiperf JSONL, or trace replay CSV + if client_csv.exists(): + df = _load_custom_client_csv(client_csv, exp_dir) + elif aiperf_jsonl is not None: + df = _load_aiperf_jsonl(aiperf_jsonl) + elif trace_replay_csv.exists(): + df = _load_trace_replay_csv(trace_replay_csv) + else: + return result + + if df is None or len(df) == 0: + return result + + # Prefer benchmark_metadata.json for precise wall-clock duration + metadata_file = exp_dir / "benchmark_metadata.json" + total_time_sec = None + if metadata_file.exists(): + try: + with open(metadata_file) as f: + metadata = json.load(f) + total_time_sec = metadata.get("benchmark_runtime_sec") + except Exception: + pass + + # Fallback: derive from per-request data (first start to last finish) + if not total_time_sec or total_time_sec <= 0: + first_start_ms = df["start_time_ms"].min() + last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() + total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 + if total_time_sec <= 0: + total_time_sec = df["latency_ms"].sum() / 1000 + + num_requests = len(df) + result.update({ + "num_requests": num_requests, + "throughput_rps": num_requests / total_time_sec if total_time_sec > 0 else 0, + "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "output_throughput_tps": df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "total_throughput_tps": (df["input_num_tokens"].sum() + df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0, + "mean_ttft_ms": df["ttft_ms"].mean(), + "p50_ttft_ms": df["ttft_ms"].median(), + "p90_ttft_ms": df["ttft_ms"].quantile(0.9), + "p99_ttft_ms": df["ttft_ms"].quantile(0.99), + "mean_tpot_ms": df["tpot_ms"].mean(), + "p50_tpot_ms": df["tpot_ms"].median(), + "p90_tpot_ms": df["tpot_ms"].quantile(0.9), + "p99_tpot_ms": df["tpot_ms"].quantile(0.99), + "mean_latency_ms": df["latency_ms"].mean(), + "p50_latency_ms": df["latency_ms"].median(), + "p90_latency_ms": df["latency_ms"].quantile(0.9), + "p99_latency_ms": df["latency_ms"].quantile(0.99), + }) + + # Cache hit rates from server metrics + if server_csv.exists(): + try: + sdf = pd.read_csv(server_csv) + if len(sdf) > 0: + final = sdf.iloc[-1] + if final.get("prefix_cache_queries", 0) > 0: + result["gpu_hit_rate"] = 100 * final["prefix_cache_hits"] / final["prefix_cache_queries"] + if final.get("cpu_prefix_cache_queries", 0) > 0: + result["cpu_hit_rate"] = 100 * final["cpu_prefix_cache_hits"] / final["cpu_prefix_cache_queries"] + except Exception as e: + print(f"Warning: failed to load server metrics for {exp_dir.name}: {e}") + + except Exception as e: + print(f"Warning: failed to load client metrics for {exp_dir.name}: {e}") + + return result + + +def run_pareto_analysis(results_dir: Path, output_dir: Path) -> None: + """Run plot_pareto.py if available, restructuring artifacts to match its + expected layout (subdirs named tp{N}_users{M}_offload{mode}).""" + # plot_pareto.py expects direct subdirectories with experiment names + # The artifact download gives us multiturn_tp{N}_users{M}_offload{mode}/ + # We create symlinks with the canonical names + pareto_input = output_dir / "pareto_input" + pareto_input.mkdir(parents=True, exist_ok=True) + + for subdir in 
sorted(results_dir.iterdir()): + if not subdir.is_dir(): + continue + name = subdir.name + if name.startswith("multiturn_"): + name = name[len("multiturn_"):] + # plot_pareto.py expects "bs" not "users" in directory names + name = name.replace("_users", "_bs") + link = pareto_input / name + if not link.exists(): + link.symlink_to(subdir.resolve()) + + # Try to import and run plot_pareto + analysis_dir = Path(__file__).resolve().parent.parent / "analysis" + sys.path.insert(0, str(analysis_dir)) + try: + import plot_pareto # type: ignore + plot_pareto.main(pareto_input) + + # Move any generated plots to output dir + for f in pareto_input.glob("*.png"): + f.rename(output_dir / f.name) + for f in pareto_input.glob("*.pdf"): + f.rename(output_dir / f.name) + except Exception as e: + print(f"Warning: plot_pareto analysis failed: {e}") + print("Continuing with summary CSV only.") + + +def main() -> None: + if len(sys.argv) < 3: + print(f"Usage: {sys.argv[0]} ") + sys.exit(1) + + artifacts_dir = Path(sys.argv[1]) + output_dir = Path(sys.argv[2]) + output_dir.mkdir(parents=True, exist_ok=True) + + if not artifacts_dir.is_dir(): + print(f"Error: {artifacts_dir} is not a directory") + sys.exit(1) + + # Load all experiments + experiments = [] + for subdir in sorted(artifacts_dir.iterdir()): + if not subdir.is_dir(): + continue + result = load_experiment(subdir) + if result is not None: + experiments.append(result) + + if not experiments: + print("No experiments found.") + sys.exit(0) + + # Write summary CSV + summary_path = output_dir / "summary.csv" + df = pd.DataFrame(experiments) + df.to_csv(summary_path, index=False) + print(f"Summary written to {summary_path} ({len(experiments)} experiments)") + + # Print status summary + success = sum(1 for e in experiments if e.get("status") == "SUCCESS") + failed = sum(1 for e in experiments if e.get("status") == "FAILED") + other = len(experiments) - success - failed + print(f" SUCCESS: {success}, FAILED: {failed}, OTHER: {other}") + + # Run Pareto analysis + run_pareto_analysis(artifacts_dir, output_dir) + + # Run overview plots (throughput vs concurrency, workload consistency) + try: + from plot_sweep_overview import plot_throughput_vs_concurrency, plot_workload_consistency + pareto_input = output_dir / "pareto_input" + summary_csv = pareto_input / "experiment_summary.csv" + if summary_csv.exists(): + overview_df = pd.read_csv(summary_csv) + plot_throughput_vs_concurrency(overview_df, output_dir) + plot_workload_consistency(pareto_input, output_dir) + else: + print("Warning: No experiment_summary.csv found, skipping overview plots") + except Exception as e: + print(f"Warning: Overview plots failed: {e}") + + print(f"Aggregated results saved to {output_dir}") + + +if __name__ == "__main__": + main() diff --git a/experimental/multiturn/vllm_benchmark/scripts/plot_sweep_overview.py b/experimental/multiturn/vllm_benchmark/scripts/plot_sweep_overview.py new file mode 100644 index 000000000..1fd04bdc0 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/plot_sweep_overview.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python3 +"""Generate overview plots for sweep results. 
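+Can be run standalone or imported by collect_sweep_results.py, which calls
+plot_throughput_vs_concurrency() and plot_workload_consistency() directly.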
+ +Produces: +- throughput_vs_concurrency.png: Throughput & cache hit rate vs concurrent sessions per TP +- workload_consistency.png: ISL distribution box plots per experiment to verify consistent workload + +Usage: + python plot_sweep_overview.py [] +""" + +import csv +import sys +from collections import defaultdict +from pathlib import Path + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + + +def plot_throughput_vs_concurrency(df: pd.DataFrame, output_dir: Path) -> None: + """Throughput and cache hit rate vs concurrent sessions, per TP.""" + tps = sorted(df["tp"].unique()) + n = len(tps) + if n == 0: + return + + fig, axes = plt.subplots(2, n, figsize=(7 * n, 10)) + if n == 1: + axes = axes.reshape(2, 1) + fig.suptitle("Throughput & Cache Hit Rate vs Concurrent Sessions", fontsize=15) + + for idx, tp in enumerate(tps): + tp_df = df[df["tp"] == tp].sort_values("bs") + off = tp_df[tp_df["offload"] == "off"].sort_values("bs") + on = tp_df[tp_df["offload"] == "on"].sort_values("bs") + + # --- Top row: Throughput --- + ax = axes[0, idx] + if len(off) > 0: + ax.plot(off["bs"], off["total_tps_per_gpu"], "o-", color="#d62728", + linewidth=2.5, markersize=7, label="Offload OFF") + if len(on) > 0: + ax.plot(on["bs"], on["total_tps_per_gpu"], "s-", color="#2ca02c", + linewidth=2.5, markersize=7, label="Offload ON") + + # Annotate max gain + if len(off) > 0 and len(on) > 0: + merged = pd.merge(off[["bs", "total_tps_per_gpu"]], on[["bs", "total_tps_per_gpu"]], + on="bs", suffixes=("_off", "_on")) + if len(merged) > 0: + merged["gain_pct"] = ((merged["total_tps_per_gpu_on"] - merged["total_tps_per_gpu_off"]) + / merged["total_tps_per_gpu_off"] * 100) + max_row = merged.loc[merged["gain_pct"].idxmax()] + if max_row["gain_pct"] > 20: + ax.annotate(f"+{max_row['gain_pct']:.0f}%", + xy=(max_row["bs"], max_row["total_tps_per_gpu_on"]), + xytext=(0, 15), textcoords="offset points", + fontsize=11, fontweight="bold", color="green", ha="center") + + ax.set_xlabel("Concurrent Sessions", fontsize=10) + ax.set_ylabel("Throughput/GPU (tok/s)", fontsize=10) + ax.set_title(f"TP{tp} — Throughput", fontsize=13, fontweight="bold") + max_tput = df["total_tps_per_gpu"].max() + ax.set_ylim(0, max_tput * 1.15 if max_tput > 0 else 15000) + ax.legend(fontsize=9) + ax.grid(True, alpha=0.2) + + # --- Bottom row: Cache hit rate --- + ax = axes[1, idx] + if len(off) > 0: + ax.plot(off["bs"], off["gpu_hit_rate"], "o-", color="#d62728", + linewidth=2, markersize=6, label="GPU Hit — OFF") + if len(on) > 0: + ax.plot(on["bs"], on["gpu_hit_rate"], "s-", color="#2ca02c", + linewidth=2, markersize=6, label="GPU Hit — ON") + cpu_hit = on["cpu_hit_rate"].fillna(0) + if cpu_hit.max() > 1: + ax.plot(on["bs"], cpu_hit, "v--", color="#9467bd", + linewidth=2, markersize=6, label="CPU Hit — ON") + + ax.set_xlabel("Concurrent Sessions", fontsize=10) + ax.set_ylabel("Cache Hit Rate (%)", fontsize=10) + ax.set_title(f"TP{tp} — Cache Hit Rate", fontsize=13, fontweight="bold") + ax.set_ylim(0, 105) + ax.legend(fontsize=9) + ax.grid(True, alpha=0.2) + + plt.tight_layout() + out = output_dir / "throughput_vs_concurrency.png" + plt.savefig(out, dpi=150, bbox_inches="tight") + plt.close() + print(f"Saved {out}") + + +def plot_workload_consistency(pareto_input_dir: Path, output_dir: Path) -> None: + """ISL distribution box plots per experiment to verify consistent workload.""" + csv.field_size_limit(sys.maxsize) + + tps = set() + data_by_tp: dict[int, list[tuple[int, str, 
list[float]]]] = defaultdict(list) + + for exp_dir in sorted(pareto_input_dir.iterdir()): + if not exp_dir.is_dir() or not exp_dir.name.startswith("tp"): + continue + if "offloadon" in exp_dir.name: + continue # Only use offload-off for consistency check + + parts = exp_dir.name.split("_") + try: + tp = int(parts[0].replace("tp", "")) + bs = int(parts[1].replace("bs", "")) + except (IndexError, ValueError): + continue + + tps.add(tp) + + # Try trace replay CSV + csv_path = exp_dir / "trace_replay" / "detailed_results.csv" + if not csv_path.exists(): + # Try aiperf JSONL + continue + + isls = [] + try: + with open(csv_path) as f: + reader = csv.DictReader(f) + for row in reader: + if row.get("success") == "True": + isls.append(int(row["input_tokens"]) / 1000) # k tokens + except Exception: + continue + + if isls: + data_by_tp[tp].append((bs, exp_dir.name, isls)) + + if not data_by_tp: + print("No workload data found for consistency plot") + return + + sorted_tps = sorted(data_by_tp.keys()) + n = len(sorted_tps) + + fig, axes = plt.subplots(1, n, figsize=(7 * n, 6)) + if n == 1: + axes = [axes] + fig.suptitle("Workload Consistency — ISL Distribution Per Experiment (Offload OFF)", fontsize=14) + + for idx, tp in enumerate(sorted_tps): + ax = axes[idx] + entries = sorted(data_by_tp[tp], key=lambda x: x[0]) + + box_data = [e[2] for e in entries] + labels = [str(e[0]) for e in entries] + means = [np.mean(e[2]) for e in entries] + + bp = ax.boxplot(box_data, tick_labels=labels, patch_artist=True, + showfliers=False, widths=0.6, + medianprops=dict(color="red", linewidth=2)) + for patch in bp["boxes"]: + patch.set_facecolor("steelblue") + patch.set_alpha(0.6) + + ax.plot(range(1, len(means) + 1), means, "o--", color="orange", linewidth=2, + markersize=6, label=f"Mean ({np.mean(means):.0f}k ± {np.std(means):.0f}k)", zorder=5) + + overall_mean = np.mean(means) + overall_std = np.std(means) + ax.axhspan(overall_mean - overall_std, overall_mean + overall_std, + alpha=0.1, color="orange", label="±1σ band") + ax.axhline(overall_mean, color="orange", linestyle=":", alpha=0.5) + + ax.set_xlabel("Concurrent Sessions", fontsize=11) + ax.set_ylabel("ISL (k tokens)", fontsize=11) + ax.set_title(f"TP{tp}", fontsize=13, fontweight="bold") + ax.legend(fontsize=9) + ax.grid(True, alpha=0.2, axis="y") + ax.set_ylim(0, 140) + + plt.tight_layout() + out = output_dir / "workload_consistency.png" + plt.savefig(out, dpi=150, bbox_inches="tight") + plt.close() + print(f"Saved {out}") + + +def main(): + if len(sys.argv) < 2: + print(f"Usage: {sys.argv[0]} []") + sys.exit(1) + + pareto_input_dir = Path(sys.argv[1]) + output_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else pareto_input_dir.parent + output_dir.mkdir(parents=True, exist_ok=True) + + # Load experiment summary + summary_csv = pareto_input_dir / "experiment_summary.csv" + if not summary_csv.exists(): + # Try parent + summary_csv = output_dir / "summary.csv" + if not summary_csv.exists(): + print(f"No summary CSV found in {pareto_input_dir} or {output_dir}") + return + + df = pd.read_csv(summary_csv) + + # Ensure required columns exist + required = ["tp", "bs", "offload", "total_tps_per_gpu", "gpu_hit_rate"] + missing = [c for c in required if c not in df.columns] + if missing: + print(f"Missing columns in summary: {missing}") + return + + plot_throughput_vs_concurrency(df, output_dir) + plot_workload_consistency(pareto_input_dir, output_dir) + + +if __name__ == "__main__": + main() diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 
52e28e9b8..ac91177ca 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -159,7 +159,7 @@ else LOCK_FILE="${SQUASH_FILE}.lock" set -x - salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=180 --no-shell --job-name="$RUNNER_NAME" + salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)" @@ -188,7 +188,7 @@ else --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi355x${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi355x${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh scancel $JOB_ID From 28991ebac6d1e51c63ffc136d42f40d9d59e2ae7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 15:27:35 -0500 Subject: [PATCH 02/78] remove deprecated GpuTransferCollector from metrics collector Replaced by vLLM's native kv_offload metrics. Removes subprocess/threading imports and ~100 lines of dead code. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../vllm_benchmark/bench/metrics_collector.py | 105 ------------------ 1 file changed, 105 deletions(-) diff --git a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py index c129f38b8..064795f51 100644 --- a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py +++ b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py @@ -6,8 +6,6 @@ import asyncio import csv import re -import subprocess -import threading import time from dataclasses import dataclass, field from pathlib import Path @@ -16,109 +14,6 @@ import matplotlib.pyplot as plt -@dataclass -class GpuTransferSnapshot: - timestamp: float - gpu_id: int = 0 - tx_pci: float = 0.0 # PCIe TX (MB/s) - rx_pci: float = 0.0 # PCIe RX (MB/s) - - -class GpuTransferCollector: - """DEPRECATED: Collects GPU transfer stats using nvidia-smi dmon. - - Replaced by vLLM's native kv_offload metrics (vllm:kv_offload_total_bytes_total, - vllm:kv_offload_total_time_total) which are more precise and don't require - spawning a subprocess. - """ - - def __init__(self, gpu_id: int = 0, poll_interval: int = 1): - self.gpu_id = gpu_id - self.poll_interval = poll_interval - self.snapshots: list[GpuTransferSnapshot] = [] - self._process: subprocess.Popen | None = None - self._thread: threading.Thread | None = None - self._running = False - - def _parse_line(self, line: str) -> GpuTransferSnapshot | None: - """Parse a line of nvidia-smi dmon CSV output. 
- - Format: gpu, rxpci, txpci (values in MB/s) - Example: 0, 406, 32013 - """ - line = line.strip() - if not line or line.startswith('#'): # Skip header/comments - return None - - parts = [p.strip() for p in line.split(',')] - if len(parts) < 3: - return None - - try: - return GpuTransferSnapshot( - timestamp=time.time(), - gpu_id=int(parts[0]), - rx_pci=float(parts[1]) if parts[1] != '-' else 0.0, - tx_pci=float(parts[2]) if parts[2] != '-' else 0.0, - ) - except (ValueError, IndexError): - return None - - def _reader_thread(self) -> None: - """Background thread to read nvidia-smi output.""" - if self._process is None: - return - - for line in iter(self._process.stdout.readline, ''): - if not self._running: - break - snapshot = self._parse_line(line) - if snapshot and snapshot.gpu_id == self.gpu_id: - self.snapshots.append(snapshot) - - def start(self) -> None: - """Start collecting GPU transfer stats.""" - if self._running: - return - - self._running = True - self.snapshots = [] - - try: - self._process = subprocess.Popen( - [ - 'nvidia-smi', 'dmon', - '-i', str(self.gpu_id), - '-s', 't', - '-d', str(self.poll_interval), - '--format', 'csv', - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) - self._thread = threading.Thread(target=self._reader_thread, daemon=True) - self._thread.start() - except FileNotFoundError: - print("nvidia-smi not found, GPU transfer monitoring disabled") - self._running = False - - def stop(self) -> None: - """Stop collecting GPU transfer stats.""" - self._running = False - if self._process: - self._process.terminate() - try: - self._process.wait(timeout=2) - except subprocess.TimeoutExpired: - self._process.kill() - self._process = None - - if self._thread: - self._thread.join(timeout=2) - self._thread = None - - @dataclass class MetricsSnapshot: timestamp: float From 695ec2e03f62e9d0e523cb084f6c72297d3447a8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 16:50:32 -0500 Subject: [PATCH 03/78] modularize metrics collector with backend auto-detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add VLLMMetricsParser and SGLangMetricsParser with shared MetricsSnapshot. Backend is auto-detected from metrics prefix (vllm: vs sglang:) on first poll. sglang metrics mapped: - token_usage / num_used_tokens → kv_cache_usage - num_running_reqs → num_requests_running - num_queue_reqs → num_requests_waiting - cache_hit_rate × prompt_tokens → prefix_cache_hits/queries - num_retracted_reqs → num_preemptions - realtime_tokens_total mode=prefill_compute/prefill_cache → token source Co-Authored-By: Claude Opus 4.6 (1M context) --- .../vllm_benchmark/bench/metrics_collector.py | 235 ++++++++++-------- 1 file changed, 129 insertions(+), 106 deletions(-) diff --git a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py index 064795f51..6091318c0 100644 --- a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py +++ b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py @@ -1,6 +1,7 @@ """ -Metrics collector for vLLM server during benchmarks. +Metrics collector for inference servers during benchmarks. Polls /metrics endpoint and generates visualizations. +Supports vLLM and sglang backends (auto-detected from metrics prefix). 
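+
+A minimal usage sketch (assumes the server exposes Prometheus metrics at
+http://localhost:8888/metrics and that stop() is awaited from async code;
+paths and port are illustrative):
+
+    collector = MetricsCollector(base_url="http://localhost:8888", poll_interval=1.0)
+    collector.start()                  # begin background polling of /metrics
+    ...                                # run the benchmark client
+    await collector.stop()             # coroutine: stop polling
+    collector.generate_plots(output_prefix="results/metrics")
+    collector.export_csv(output_prefix="results/metrics")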
""" import asyncio @@ -9,6 +10,7 @@ import time from dataclasses import dataclass, field from pathlib import Path +from typing import Protocol import aiohttp import matplotlib.pyplot as plt @@ -43,123 +45,144 @@ class MetricsSnapshot: prefill_kv_computed_tokens_count: int = 0 -@dataclass -class MetricsCollector: - base_url: str - poll_interval: float = 1.0 - snapshots: list[MetricsSnapshot] = field(default_factory=list) - _running: bool = False - _task: asyncio.Task | None = None - gpu_transfer_collector: GpuTransferCollector | None = None - gpu_id: int = 0 +# ============================================================================= +# Metrics Parsers — one per backend +# ============================================================================= + +def _get_value(text: str, pattern: str, default: float = 0.0) -> float: + """Extract a gauge/counter value from Prometheus text using a regex.""" + match = re.search(pattern, text) + return float(match.group(1)) if match else default - def _parse_metrics(self, text: str) -> MetricsSnapshot: - """Parse Prometheus metrics text format.""" - snapshot = MetricsSnapshot(timestamp=time.time()) - # Helper to extract gauge/counter value - def get_value(pattern: str, default: float = 0.0) -> float: - match = re.search(pattern, text) - if match: - return float(match.group(1)) - return default +class VLLMMetricsParser: + """Parse vLLM Prometheus metrics (prefix: vllm:).""" + + def parse(self, text: str) -> MetricsSnapshot: + snapshot = MetricsSnapshot(timestamp=time.time()) + g = lambda p, d=0.0: _get_value(text, p, d) # KV cache usage (0-1 scale) - snapshot.kv_cache_usage = get_value( - r'vllm:gpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)' - ) - # Fallback to old metric name if new one not found + snapshot.kv_cache_usage = g(r'vllm:gpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)') if snapshot.kv_cache_usage == 0.0: - snapshot.kv_cache_usage = get_value( - r'vllm:kv_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)' - ) - - # CPU/offloaded KV cache usage - snapshot.cpu_kv_cache_usage = get_value( - r'vllm:cpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)' - ) - - # Running/waiting requests - snapshot.num_requests_running = int(get_value( - r'vllm:num_requests_running\{[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.num_requests_waiting = int(get_value( - r'vllm:num_requests_waiting\{[^}]*\}\s+([\d.e+-]+)' - )) - - # Prefix cache (cumulative counters) - GPU - snapshot.prefix_cache_hits = int(get_value( - r'vllm:prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.prefix_cache_queries = int(get_value( - r'vllm:prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)' - )) - - # Prefix cache - external/offloaded (KV connector cross-instance cache) - snapshot.cpu_prefix_cache_hits = int(get_value( - r'vllm:external_prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.cpu_prefix_cache_queries = int(get_value( - r'vllm:external_prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)' - )) - - # Token counters - snapshot.prompt_tokens = int(get_value( - r'vllm:prompt_tokens_total\{[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.generation_tokens = int(get_value( - r'vllm:generation_tokens_total\{[^}]*\}\s+([\d.e+-]+)' - )) - - # Preemptions - snapshot.num_preemptions = int(get_value( - r'vllm:num_preemptions_total\{[^}]*\}\s+([\d.e+-]+)' - )) - - # Request success (sum all finish reasons) + snapshot.kv_cache_usage = g(r'vllm:kv_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)') + + snapshot.cpu_kv_cache_usage = g(r'vllm:cpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)') + + 
snapshot.num_requests_running = int(g(r'vllm:num_requests_running\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.num_requests_waiting = int(g(r'vllm:num_requests_waiting\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.prefix_cache_hits = int(g(r'vllm:prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.prefix_cache_queries = int(g(r'vllm:prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.cpu_prefix_cache_hits = int(g(r'vllm:external_prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.cpu_prefix_cache_queries = int(g(r'vllm:external_prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.prompt_tokens = int(g(r'vllm:prompt_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.generation_tokens = int(g(r'vllm:generation_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.num_preemptions = int(g(r'vllm:num_preemptions_total\{[^}]*\}\s+([\d.e+-]+)')) + for match in re.finditer( - r'vllm:request_success_total\{[^}]*finished_reason="[^"]*"[^}]*\}\s+([\d.e+-]+)', - text + r'vllm:request_success_total\{[^}]*finished_reason="[^"]*"[^}]*\}\s+([\d.e+-]+)', text ): snapshot.request_success += int(float(match.group(1))) - # KV offload bytes transferred (cumulative counters by direction) - snapshot.kv_offload_bytes_gpu_to_cpu = get_value( - r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)' - ) - snapshot.kv_offload_bytes_cpu_to_gpu = get_value( - r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)' - ) - - # KV offload time (cumulative, seconds) - snapshot.kv_offload_time_gpu_to_cpu = get_value( - r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)' - ) - snapshot.kv_offload_time_cpu_to_gpu = get_value( - r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)' - ) - - # Prompt tokens by source (cumulative) - snapshot.prompt_tokens_local_compute = int(get_value( - r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_compute"[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.prompt_tokens_local_cache_hit = int(get_value( - r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_cache_hit"[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.prompt_tokens_external_kv_transfer = int(get_value( - r'vllm:prompt_tokens_by_source_total\{[^}]*source="external_kv_transfer"[^}]*\}\s+([\d.e+-]+)' - )) - - # Prefill KV computed tokens (histogram sum and count) - snapshot.prefill_kv_computed_tokens_sum = int(get_value( - r'vllm:request_prefill_kv_computed_tokens_sum\{[^}]*\}\s+([\d.e+-]+)' - )) - snapshot.prefill_kv_computed_tokens_count = int(get_value( - r'vllm:request_prefill_kv_computed_tokens_count\{[^}]*\}\s+([\d.e+-]+)' - )) + snapshot.kv_offload_bytes_gpu_to_cpu = g(r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)') + snapshot.kv_offload_bytes_cpu_to_gpu = g(r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)') + snapshot.kv_offload_time_gpu_to_cpu = g(r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)') + snapshot.kv_offload_time_cpu_to_gpu = g(r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)') + + snapshot.prompt_tokens_local_compute = int(g(r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_compute"[^}]*\}\s+([\d.e+-]+)')) + snapshot.prompt_tokens_local_cache_hit = int(g(r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_cache_hit"[^}]*\}\s+([\d.e+-]+)')) + 
snapshot.prompt_tokens_external_kv_transfer = int(g(r'vllm:prompt_tokens_by_source_total\{[^}]*source="external_kv_transfer"[^}]*\}\s+([\d.e+-]+)')) + + snapshot.prefill_kv_computed_tokens_sum = int(g(r'vllm:request_prefill_kv_computed_tokens_sum\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.prefill_kv_computed_tokens_count = int(g(r'vllm:request_prefill_kv_computed_tokens_count\{[^}]*\}\s+([\d.e+-]+)')) return snapshot + +class SGLangMetricsParser: + """Parse sglang Prometheus metrics (prefix: sglang:).""" + + def parse(self, text: str) -> MetricsSnapshot: + snapshot = MetricsSnapshot(timestamp=time.time()) + g = lambda p, d=0.0: _get_value(text, p, d) + + # KV cache usage — sglang reports token_usage as a ratio (0-1) + snapshot.kv_cache_usage = g(r'sglang:token_usage\{[^}]*\}\s+([\d.e+-]+)') + # Fallback: compute from num_used_tokens / max_total_num_tokens + if snapshot.kv_cache_usage == 0.0: + used = g(r'sglang:num_used_tokens\{[^}]*\}\s+([\d.e+-]+)') + total = g(r'sglang:max_total_num_tokens\{[^}]*\}\s+([\d.e+-]+)') + if total > 0: + snapshot.kv_cache_usage = used / total + + snapshot.num_requests_running = int(g(r'sglang:num_running_reqs\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.num_requests_waiting = int(g(r'sglang:num_queue_reqs\{[^}]*\}\s+([\d.e+-]+)')) + + # sglang exposes cache_hit_rate as a direct gauge (0-1) + # We convert to cumulative-style by tracking hits/queries from token sources + cache_hit_rate = g(r'sglang:cache_hit_rate\{[^}]*\}\s+([\d.e+-]+)') + prompt_tokens = int(g(r'sglang:prompt_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.prompt_tokens = prompt_tokens + # Approximate cumulative cache hits from rate × total prompts + if prompt_tokens > 0 and cache_hit_rate > 0: + snapshot.prefix_cache_queries = prompt_tokens + snapshot.prefix_cache_hits = int(prompt_tokens * cache_hit_rate) + + snapshot.generation_tokens = int(g(r'sglang:generation_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + + # Preemptions — sglang calls them "retractions" + snapshot.num_preemptions = int(g(r'sglang:num_retracted_reqs\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.request_success = int(g(r'sglang:num_requests_total\{[^}]*\}\s+([\d.e+-]+)')) + + # Token source breakdown from realtime_tokens_total + snapshot.prompt_tokens_local_compute = int(g( + r'sglang:realtime_tokens_total\{[^}]*mode="prefill_compute"[^}]*\}\s+([\d.e+-]+)')) + snapshot.prompt_tokens_local_cache_hit = int(g( + r'sglang:realtime_tokens_total\{[^}]*mode="prefill_cache"[^}]*\}\s+([\d.e+-]+)')) + + return snapshot + + +def detect_backend(text: str) -> str: + """Auto-detect backend from metrics text.""" + if 'vllm:' in text: + return 'vllm' + elif 'sglang:' in text: + return 'sglang' + return 'unknown' + + +def get_parser(backend: str): + """Get the appropriate parser for the backend.""" + if backend == 'sglang': + return SGLangMetricsParser() + return VLLMMetricsParser() # default + + +@dataclass +class MetricsCollector: + base_url: str + poll_interval: float = 1.0 + snapshots: list[MetricsSnapshot] = field(default_factory=list) + _running: bool = False + _task: asyncio.Task | None = None + _parser: VLLMMetricsParser | SGLangMetricsParser | None = None + _backend: str = "" + + def _parse_metrics(self, text: str) -> MetricsSnapshot: + """Parse Prometheus metrics text, auto-detecting backend on first call.""" + if self._parser is None: + self._backend = detect_backend(text) + self._parser = get_parser(self._backend) + if self._backend != 'unknown': + print(f"Auto-detected metrics backend: {self._backend}") + return self._parser.parse(text) + async def 
_poll_loop(self) -> None: """Background polling loop.""" metrics_url = f"{self.base_url}/metrics" From 6a41d49a2345207899e5f8c30e48078abccb25b2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 16:51:19 -0500 Subject: [PATCH 04/78] remove unused Protocol import --- experimental/multiturn/vllm_benchmark/bench/metrics_collector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py index 6091318c0..7bcdf31a4 100644 --- a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py +++ b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py @@ -10,7 +10,6 @@ import time from dataclasses import dataclass, field from pathlib import Path -from typing import Protocol import aiohttp import matplotlib.pyplot as plt From c137677e1f0d5b90617d3578ae99f404ceb2a55c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 17:09:47 -0500 Subject: [PATCH 05/78] add LMCache agentic trace benchmark for H100 Replays SWE-bench/GAIA/WildClaw traces from sammshen/lmcache-agentic-traces via AIPerf with mooncake_trace format. Downloads and converts traces at runtime. Supports concurrency sweep with offload on/off. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../multiturn_fp8_h100_lmcache_aiperf.sh | 230 ++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100755 benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh new file mode 100755 index 000000000..fb02a79a1 --- /dev/null +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -0,0 +1,230 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# LMCache agentic trace benchmark for FP8 models on H100 using AIPerf. +# Replays SWE-bench/GAIA/WildClaw agentic traces via mooncake_trace format. 
+# Dataset: https://huggingface.co/datasets/sammshen/lmcache-agentic-traces +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (if set, runs for this many seconds; otherwise runs to completion) +# REQUEST_RATE (default: 0 = no rate limit, concurrency-burst mode) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} +REQUEST_RATE=${REQUEST_RATE:-0} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +AIPERF_DIR="$MULTITURN_DIR/aiperf" +TRACE_FILE="$RESULT_DIR/lmcache_traces.jsonl" + +pip install --quiet urllib3 requests orjson datasets 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +# ---- Convert LMCache traces to mooncake format ----------------------------- +echo "Downloading and converting LMCache traces..." 
+python3 -c " +import json, os +try: + from datasets import load_dataset + ds = load_dataset('sammshen/lmcache-agentic-traces', split='train') + out_path = '$TRACE_FILE' + sessions = set() + with open(out_path, 'w') as f: + for row in ds: + entry = { + 'session_id': row['session_id'], + 'messages': row['input'], + 'output_length': row['output_length'], + } + f.write(json.dumps(entry) + '\n') + sessions.add(row['session_id']) + print(f'Converted {len(ds)} iterations from {len(sessions)} sessions to {out_path}') +except Exception as e: + print(f'ERROR converting traces: {e}') + exit(1) +" + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" + +echo "Installing aiperf in isolated venv..." +python3 -m venv /tmp/aiperf-venv --system-site-packages +/tmp/aiperf-venv/bin/pip install -q -e "$AIPERF_DIR" 2>&1 | tail -10 +AIPERF_BIN="/tmp/aiperf-venv/bin/aiperf" + +/tmp/aiperf-venv/bin/python -c "import aiperf; print('aiperf installed OK')" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! 
+echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run AIPerf benchmark ---------------------------------------------------- +export AIPERF_LOG_CONVERSATIONS="$RESULT_DIR/conversations.jsonl" + +AIPERF_CMD="$AIPERF_BIN profile" +AIPERF_CMD+=" --model $MODEL" +AIPERF_CMD+=" --url http://localhost:$PORT" +AIPERF_CMD+=" --endpoint-type chat" +AIPERF_CMD+=" --streaming" +AIPERF_CMD+=" --input-file $TRACE_FILE" +AIPERF_CMD+=" --custom-dataset-type mooncake_trace" +AIPERF_CMD+=" --concurrency $USERS" +if [ "$REQUEST_RATE" != "0" ]; then + AIPERF_CMD+=" --request-rate $REQUEST_RATE" +fi +if [ -n "${DURATION:-}" ]; then + AIPERF_CMD+=" --benchmark-duration $DURATION" + AIPERF_CMD+=" --benchmark-grace-period 0" +fi +AIPERF_CMD+=" --request-timeout-seconds $REQUEST_TIMEOUT" +AIPERF_CMD+=" --output-artifact-dir $RESULT_DIR/aiperf_artifacts" +AIPERF_CMD+=" --extra-inputs ignore_eos:true" +AIPERF_CMD+=" --export-level records" +AIPERF_CMD+=" --ui-type simple" +AIPERF_CMD+=" --random-seed 42" + +echo "$AIPERF_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $AIPERF_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/aiperf_artifacts" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." 
+kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" From ee767671f52da38c31d355ab359b9a0d8000d532 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 17:15:20 -0500 Subject: [PATCH 06/78] add H100 LMCache trace sweep config Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/multiturn-agentic-trace.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index 5ec98b902..e19780a21 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -20,6 +20,17 @@ mi355x-fp8-llama70b: users: [1, 2, 4, 8, 16, 32, 64, 96, 128, 160, 256, 512] offload: ["on", "off"] +h100-fp8-llama70b-lmcache: + tp2: + users: [2, 4, 6, 8, 10, 12, 16, 20] + offload: ["on", "off"] + tp4: + users: [2, 4, 8, 12, 16, 20, 24, 32, 40] + offload: ["on", "off"] + tp8: + users: [2, 4, 8, 16, 24, 32, 48, 64] + offload: ["on", "off"] + b200-fp4-dsr1: tp4: ep: 4 From 839ba0f8de99ef541ca8c652a6bfe087479e5a02 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 17:30:54 -0500 Subject: [PATCH 07/78] fix LMCache benchmark: use fixed-schedule replay, remove ignore_eos - Add --fixed-schedule to replay at exact trace timestamps - Remove --extra-inputs ignore_eos:true (let model stop naturally) - Remove unused REQUEST_RATE logic Co-Authored-By: Claude Opus 4.6 (1M context) --- .../single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index fb02a79a1..53d2c03b1 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -11,7 +11,6 @@ set -x # Optional: # PORT (default 8888), REQUEST_TIMEOUT (default 3600) # DURATION (if set, runs for this many seconds; otherwise runs to completion) -# REQUEST_RATE (default: 0 = no rate limit, concurrency-burst mode) source "$(dirname "$0")/../benchmark_lib.sh" @@ -25,7 +24,6 @@ check_env_vars \ PORT=${PORT:-8888} REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} -REQUEST_RATE=${REQUEST_RATE:-0} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -183,17 +181,14 @@ AIPERF_CMD+=" --endpoint-type chat" AIPERF_CMD+=" --streaming" AIPERF_CMD+=" --input-file $TRACE_FILE" AIPERF_CMD+=" --custom-dataset-type mooncake_trace" +AIPERF_CMD+=" --fixed-schedule" AIPERF_CMD+=" --concurrency $USERS" -if [ "$REQUEST_RATE" != "0" ]; then - AIPERF_CMD+=" --request-rate $REQUEST_RATE" -fi if [ -n "${DURATION:-}" ]; then AIPERF_CMD+=" --benchmark-duration $DURATION" AIPERF_CMD+=" --benchmark-grace-period 0" fi AIPERF_CMD+=" --request-timeout-seconds $REQUEST_TIMEOUT" AIPERF_CMD+=" --output-artifact-dir $RESULT_DIR/aiperf_artifacts" -AIPERF_CMD+=" --extra-inputs ignore_eos:true" AIPERF_CMD+=" --export-level records" AIPERF_CMD+=" --ui-type simple" AIPERF_CMD+=" --random-seed 42" From fc8e3cf02d7975931233bcd43589030ab036d829 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 17:36:00 -0500 Subject: [PATCH 08/78] remove --fixed-schedule: use concurrency mode per Samuel's recommendation --- benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 1 - 1 file changed, 1 deletion(-) diff --git 
a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 53d2c03b1..ff10f0252 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -181,7 +181,6 @@ AIPERF_CMD+=" --endpoint-type chat" AIPERF_CMD+=" --streaming" AIPERF_CMD+=" --input-file $TRACE_FILE" AIPERF_CMD+=" --custom-dataset-type mooncake_trace" -AIPERF_CMD+=" --fixed-schedule" AIPERF_CMD+=" --concurrency $USERS" if [ -n "${DURATION:-}" ]; then AIPERF_CMD+=" --benchmark-duration $DURATION" From 6bbbfa989d23789385897fb015b2271a89390293 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 21:23:45 -0500 Subject: [PATCH 09/78] update yaml --- .github/configs/multiturn-agentic-trace.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index e19780a21..500a6705e 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -22,13 +22,13 @@ mi355x-fp8-llama70b: h100-fp8-llama70b-lmcache: tp2: - users: [2, 4, 6, 8, 10, 12, 16, 20] + users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 64] offload: ["on", "off"] tp4: - users: [2, 4, 8, 12, 16, 20, 24, 32, 40] + users: [1, 2, 4, 8, 12, 16, 20, 24, 32, 40, 64, 128] offload: ["on", "off"] tp8: - users: [2, 4, 8, 16, 24, 32, 48, 64] + users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256] offload: ["on", "off"] b200-fp4-dsr1: From a2e4fe64351a31f378eb535e903555995b9f9341 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 21:51:24 -0500 Subject: [PATCH 10/78] fix H100 runner: add SCRIPT_SUFFIX support Co-Authored-By: Claude Opus 4.6 (1M context) --- runners/launch_h100-cw.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 49a42e981..28e89e0cb 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -31,7 +31,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh +bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100${SCRIPT_SUFFIX:-}.sh rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID From fee02780917b7755aec804fcea39dc940160ddaf Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 22:45:33 -0500 Subject: [PATCH 11/78] fix: mkdir RESULT_DIR before trace conversion --- benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index ff10f0252..1bec35577 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -81,6 +81,8 @@ with open(sys.argv[1], 'w') as f: PYEOF fi +mkdir -p "$RESULT_DIR" + # ---- Convert LMCache traces to mooncake format ----------------------------- echo "Downloading and converting LMCache traces..." 
python3 -c " From 769532c3985bd24714d65dfdf3ad6e3651c9b60c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 23:15:28 -0500 Subject: [PATCH 12/78] add H200 LMCache trace benchmark and config Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/multiturn-agentic-trace.yaml | 11 + .../multiturn_fp8_h200_lmcache_aiperf.sh | 226 ++++++++++++++++++ 2 files changed, 237 insertions(+) create mode 100755 benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index 500a6705e..bb0e568d3 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -20,6 +20,17 @@ mi355x-fp8-llama70b: users: [1, 2, 4, 8, 16, 32, 64, 96, 128, 160, 256, 512] offload: ["on", "off"] +h200-fp8-llama70b-lmcache: + tp2: + users: [2, 4, 6, 8, 10, 12, 16, 20, 24, 32] + offload: ["on", "off"] + tp4: + users: [4, 8, 16, 24, 32, 40, 48, 56] + offload: ["on", "off"] + tp8: + users: [2, 4, 8, 16, 32, 48, 64, 80] + offload: ["on", "off"] + h100-fp8-llama70b-lmcache: tp2: users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 64] diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh new file mode 100755 index 000000000..9a0c89e5a --- /dev/null +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -0,0 +1,226 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# LMCache agentic trace benchmark for FP8 models on H200 using AIPerf. +# Replays SWE-bench/GAIA/WildClaw agentic traces via mooncake_trace format. +# Dataset: https://huggingface.co/datasets/sammshen/lmcache-agentic-traces +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (if set, runs for this many seconds; otherwise runs to completion) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +AIPERF_DIR="$MULTITURN_DIR/aiperf" +TRACE_FILE="$RESULT_DIR/lmcache_traces.jsonl" + +pip install --quiet urllib3 requests orjson datasets 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + 
+# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +mkdir -p "$RESULT_DIR" + +# ---- Convert LMCache traces to mooncake format ----------------------------- +echo "Downloading and converting LMCache traces..." +python3 -c " +import json, os +try: + from datasets import load_dataset + ds = load_dataset('sammshen/lmcache-agentic-traces', split='train') + out_path = '$TRACE_FILE' + sessions = set() + with open(out_path, 'w') as f: + for row in ds: + entry = { + 'session_id': row['session_id'], + 'messages': row['input'], + 'output_length': row['output_length'], + } + f.write(json.dumps(entry) + '\n') + sessions.add(row['session_id']) + print(f'Converted {len(ds)} iterations from {len(sessions)} sessions to {out_path}') +except Exception as e: + print(f'ERROR converting traces: {e}') + exit(1) +" + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" + +echo "Installing aiperf in isolated venv..." 
+python3 -m venv /tmp/aiperf-venv --system-site-packages +/tmp/aiperf-venv/bin/pip install -q -e "$AIPERF_DIR" 2>&1 | tail -10 +AIPERF_BIN="/tmp/aiperf-venv/bin/aiperf" + +/tmp/aiperf-venv/bin/python -c "import aiperf; print('aiperf installed OK')" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! +echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run AIPerf benchmark ---------------------------------------------------- +export AIPERF_LOG_CONVERSATIONS="$RESULT_DIR/conversations.jsonl" + +AIPERF_CMD="$AIPERF_BIN profile" +AIPERF_CMD+=" --model $MODEL" +AIPERF_CMD+=" --url http://localhost:$PORT" +AIPERF_CMD+=" --endpoint-type chat" +AIPERF_CMD+=" --streaming" +AIPERF_CMD+=" --input-file $TRACE_FILE" +AIPERF_CMD+=" --custom-dataset-type mooncake_trace" +AIPERF_CMD+=" --concurrency $USERS" +if [ -n "${DURATION:-}" ]; then + AIPERF_CMD+=" --benchmark-duration $DURATION" + AIPERF_CMD+=" --benchmark-grace-period 0" +fi +AIPERF_CMD+=" --request-timeout-seconds $REQUEST_TIMEOUT" +AIPERF_CMD+=" --output-artifact-dir $RESULT_DIR/aiperf_artifacts" +AIPERF_CMD+=" --export-level records" +AIPERF_CMD+=" --ui-type simple" +AIPERF_CMD+=" --random-seed 42" + +echo "$AIPERF_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $AIPERF_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/aiperf_artifacts" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." 
+kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" From 02876afda83786ab96df394708356f99076d9fe0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 23:15:47 -0500 Subject: [PATCH 13/78] update yaml --- .github/configs/multiturn-agentic-trace.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index bb0e568d3..63892d202 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -22,13 +22,13 @@ mi355x-fp8-llama70b: h200-fp8-llama70b-lmcache: tp2: - users: [2, 4, 6, 8, 10, 12, 16, 20, 24, 32] + users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 64] offload: ["on", "off"] tp4: - users: [4, 8, 16, 24, 32, 40, 48, 56] + users: [1, 2, 4, 8, 12, 16, 20, 24, 32, 40, 64, 128] offload: ["on", "off"] tp8: - users: [2, 4, 8, 16, 32, 48, 64, 80] + users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256] offload: ["on", "off"] h100-fp8-llama70b-lmcache: From 2134fd8664effdb5066834c2e81a5c53a50ce3fd Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 23:19:08 -0500 Subject: [PATCH 14/78] fix H200-nb runner: add SCRIPT_SUFFIX support --- runners/launch_h200-nb.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 9d157a858..8c75700df 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -19,4 +19,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh From ab2812a8eaea1d52c1d08e37383d7649308ca613 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 1 Apr 2026 23:23:08 -0500 Subject: [PATCH 15/78] fix all H200 runners: add SCRIPT_SUFFIX support --- runners/launch_h200-cw.sh | 2 +- runners/launch_h200-dgxc-slurm.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 657f84792..c4bdad736 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -44,7 +44,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 9b3b771a5..e09eaeeed 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -258,7 +258,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h200$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt')$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp').sh + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h200$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt')$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp')${SCRIPT_SUFFIX:-}.sh scancel $JOB_ID From 
5aa993f5eef7ecf3625bb861c04530e976d2a1a0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 08:20:01 -0500 Subject: [PATCH 16/78] fix all runners: add SCRIPT_SUFFIX support Co-Authored-By: Claude Opus 4.6 (1M context) --- runners/launch_b200-dgxc-slurm.sh | 2 +- runners/launch_b200-dgxc.sh | 2 +- runners/launch_b200-nb.sh | 2 +- runners/launch_gb200-nv.sh | 2 +- runners/launch_h100-cr.sh | 2 +- runners/launch_h100-dgxc-slurm.sh | 2 +- runners/launch_mi300x-amds.sh | 2 +- runners/launch_mi325x-amd.sh | 2 +- runners/launch_mi355x-amds.sh | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 0d1bd40cc..d2ad4bc5d 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -234,5 +234,5 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh fi diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f8c614936..8243fd6d0 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -36,7 +36,7 @@ docker run --rm --init --network host --name $server_name \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ -benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" +benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh" # Try graceful first docker stop -t 90 "$server_name" || true diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index c321ee0f9..eda4b17ba 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -17,4 +17,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh \ No newline at end of file +bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}${SCRIPT_SUFFIX:-}.sh \ No newline at end of file diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index f8f0ef26e..8d20ea162 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -63,7 +63,7 @@ if [[ $FRAMEWORK == "dynamo-sglang" && -z "$CONFIG_FILE" ]]; then else BENCHMARK_SUBDIR="single_node" fi - bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}" + bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME%.sh}${SCRIPT_SUFFIX:-}.sh" # Wait for all jobs to complete echo "Waiting for all jobs to complete..." 
while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 5100419b9..7539d99db 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -15,4 +15,4 @@ docker run --rm --network=host --name=$server_name \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ -benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_h100.sh" +benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_h100${SCRIPT_SUFFIX:-}.sh" diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index bb0335955..98af3caf2 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -247,7 +247,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100${SCRIPT_SUFFIX:-}.sh scancel $JOB_ID diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index b654c515a..8b9896e00 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -35,6 +35,6 @@ srun --jobid=$JOB_ID \ --container-remap-root \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi300x.sh +bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi300x${SCRIPT_SUFFIX:-}.sh scancel $JOB_ID \ No newline at end of file diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 67f93a309..e6c3ca4e4 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -35,6 +35,6 @@ srun --jobid=$JOB_ID \ --container-remap-root \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi325x.sh +bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi325x${SCRIPT_SUFFIX:-}.sh scancel $JOB_ID diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index ac91177ca..2069774ba 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -57,7 +57,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then else BENCHMARK_SUBDIR="single_node" fi - JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}") + JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME%.sh}${SCRIPT_SUFFIX:-}.sh") # Wait for job to complete LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out" From d5dd15103276a358988792f7d8d41c37b5ff07d0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 08:35:56 -0500 Subject: [PATCH 17/78] reduce multiturn artifact size: upload only files needed for post-processing Drops ~18GB per artifact by excluding inputs.json, conversations.jsonl, responses.json, GPU telemetry, raw records, and full aiperf_artifacts/. Only uploads the specific files used by collect_sweep_results.py and plot_pareto.py. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/benchmark-multiturn-tmpl.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index a72034b14..20777d0eb 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -156,18 +156,17 @@ jobs: results/metrics_server_metrics.csv results/metrics_plots.png results/benchmark.log - results/server.log results/config.yaml results/vllm_command.txt results/benchmark_command.txt results/benchmark_metadata.json results/metrics_workload.png - results/responses.json - results/aiperf_artifacts/ - results/conversations.jsonl + results/aiperf_artifacts/profile_export_aiperf.csv + results/aiperf_artifacts/profile_export_aiperf.json + results/aiperf_artifacts/profile_export.jsonl results/workload_distribution_summary.txt results/workload_distribution_plots.png - results/trace_replay/ + results/trace_replay/detailed_results.csv results/status.txt if-no-files-found: ignore From bd4ec30ec4d83fefc403828b61db0fe599c00aab Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 10:04:42 -0500 Subject: [PATCH 18/78] add exclusive --- runners/launch_h100-dgxc-slurm.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 98af3caf2..b3190577a 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -242,6 +242,7 @@ else fi srun --jobid=$JOB_ID \ + --exclusive \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ From a12cc9d2498c2571b98e9bb4239a3c2c047901f4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 10:04:57 -0500 Subject: [PATCH 19/78] add exclusive --- runners/launch_b200-dgxc-slurm.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index d2ad4bc5d..3ff289e61 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -229,6 +229,7 @@ else fi srun --jobid=$JOB_ID \ + --exclusive \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ From af49d11635ee979f36b9550bfcf56199671a8ce3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 11:04:40 -0500 Subject: [PATCH 20/78] add exclusive --- runners/launch_h100-dgxc-slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index b3190577a..124c8de6e 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -3,7 +3,7 @@ # System-specific configuration for H100 DGXC Slurm cluster SLURM_PARTITION="hpc-gpu-1" SLURM_ACCOUNT="customer" -SLURM_EXCLUDED_NODELIST="hpc-gpu-1-7" +SLURM_EXCLUDED_NODELIST="hpc-gpu-1-1,hpc-gpu-1-7,hpc-gpu-1-18" set -x From 48ef44d54c63823ff127c214e0785dc4f70cafb2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 11:14:32 -0500 Subject: [PATCH 21/78] use aiperf summary CSV instead of per-record JSONL for post-processing The profile_export.jsonl with 233K records was ~10GB per artifact. Switch collect_sweep_results.py and plot_pareto.py to read from the pre-computed profile_export_aiperf.csv (~4KB) instead. Remove the JSONL from the artifact upload. 
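The summary CSV mixes per-metric percentile rows with scalar rows, so the loaders split it before reading values. A minimal sketch of that split, assuming pandas and the column layout described in the loader below (the read_summary helper name is illustrative):

    import pandas as pd

    def read_summary(csv_path: str) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Split profile_export_aiperf.csv into per-metric and scalar sections."""
        df = pd.read_csv(csv_path)
        # Per-metric rows carry avg/p50/p90/p99 columns; scalar rows leave "avg" empty.
        per_metric = df[df["avg"].notna()].set_index("Metric")
        scalars = df[df["avg"].isna() & df["Metric"].notna()].set_index("Metric")
        return per_metric, scalars

    # Example usage for one run directory:
    # per_metric, scalars = read_summary("results/aiperf_artifacts/profile_export_aiperf.csv")
    # p99_ttft = float(per_metric.loc["Time to First Token (ms)", "p99"])
    # n_requests = float(scalars.loc["Request Count", "min"])  # per the loader, the scalar Value lands in the "min" column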
Existing client CSV and trace_replay paths are unchanged. Also exclude low-FreeMem H100 nodes (1, 7, 18) to avoid cudaMallocHost/mlock failures during vLLM CPU KV cache allocation. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../workflows/benchmark-multiturn-tmpl.yml | 1 - .../vllm_benchmark/analysis/plot_pareto.py | 172 +++++++------ .../scripts/collect_sweep_results.py | 242 ++++++++++-------- 3 files changed, 223 insertions(+), 192 deletions(-) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index 20777d0eb..7c1d5ce0d 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -163,7 +163,6 @@ jobs: results/metrics_workload.png results/aiperf_artifacts/profile_export_aiperf.csv results/aiperf_artifacts/profile_export_aiperf.json - results/aiperf_artifacts/profile_export.jsonl results/workload_distribution_summary.txt results/workload_distribution_plots.png results/trace_replay/detailed_results.csv diff --git a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py index 277bfca7f..7da67c8a4 100644 --- a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py +++ b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py @@ -17,53 +17,69 @@ from pathlib import Path -def _load_aiperf_jsonl(jsonl_path: Path) -> pd.DataFrame | None: - """Load per-request metrics from aiperf profile_export JSONL.""" - records = [] - with open(jsonl_path) as f: - for line in f: - line = line.strip() - if not line: - continue - entry = json.loads(line) - meta = entry.get("metadata", {}) - metrics = entry.get("metrics", {}) - - if meta.get("benchmark_phase") != "profiling": - continue - if meta.get("was_cancelled", False): - continue - - def val(key, default=0): - m = metrics.get(key) - if m is None: - return default - return m.get("value", default) if isinstance(m, dict) else m - - itl = metrics.get("inter_token_latency") - if itl and isinstance(itl, dict): - tpot_ms = itl.get("value", 0) - else: - osl = val("output_sequence_length", 1) - ttft = val("time_to_first_token", 0) - latency = val("request_latency", 0) - tpot_ms = (latency - ttft) / max(osl - 1, 1) if osl > 1 else 0 - - start_ns = meta.get("request_start_ns", 0) - start_ms = start_ns / 1e6 - - records.append({ - "start_time_ms": start_ms, - "ttft_ms": val("time_to_first_token"), - "tpot_ms": tpot_ms, - "latency_ms": val("request_latency"), - "input_num_tokens": val("input_sequence_length"), - "output_num_tokens": val("output_sequence_length"), - }) - - if not records: +def _load_aiperf_summary_csv(csv_path: Path, exp_dir: Path, tp: int, + gpu_hit_rate: float | None, + cpu_hit_rate: float | None) -> dict | None: + """Load aggregate metrics directly from aiperf's profile_export_aiperf.csv.""" + df = pd.read_csv(csv_path) + if len(df) == 0: return None - return pd.DataFrame(records) + + per_metric = df[df["avg"].notna()].set_index("Metric") + scalars = df[df["avg"].isna() & df["Metric"].notna()].set_index("Metric") + + def metric_stat(metric_name, stat): + if metric_name in per_metric.index: + return float(per_metric.loc[metric_name, stat]) + return 0 + + def scalar_val(metric_name): + if metric_name in scalars.index: + return float(scalars.loc[metric_name, "min"]) + return 0 + + exp_name = exp_dir.name + parts = exp_name.split("_") + tp_parsed = int(parts[0].replace("tp", "")) + bs = int(parts[1].replace("bs", "")) + offload = 
parts[2].replace("offload", "") + + num_requests = int(scalar_val("Request Count")) + throughput_rps = scalar_val("Request Throughput (requests/sec)") + output_throughput_tps = scalar_val("Output Token Throughput (tokens/sec)") + total_throughput_tps = scalar_val("Total Token Throughput (tokens/sec)") + input_throughput_tps = total_throughput_tps - output_throughput_tps + + return { + "exp_name": exp_name, + "tp": tp_parsed, + "bs": bs, + "offload": offload, + "num_requests": num_requests, + "throughput_rps": throughput_rps, + "input_throughput_tps": input_throughput_tps, + "total_throughput_tps": total_throughput_tps, + "input_tps_per_gpu": input_throughput_tps / tp_parsed, + "output_tps_per_gpu": output_throughput_tps / tp_parsed, + "total_tps_per_gpu": total_throughput_tps / tp_parsed, + "mean_ttft_ms": metric_stat("Time to First Token (ms)", "avg"), + "p50_ttft_ms": metric_stat("Time to First Token (ms)", "p50"), + "p90_ttft_ms": metric_stat("Time to First Token (ms)", "p90"), + "p99_ttft_ms": metric_stat("Time to First Token (ms)", "p99"), + "mean_tpot_ms": metric_stat("Inter Token Latency (ms)", "avg"), + "p50_tpot_ms": metric_stat("Inter Token Latency (ms)", "p50"), + "p90_tpot_ms": metric_stat("Inter Token Latency (ms)", "p90"), + "p99_tpot_ms": metric_stat("Inter Token Latency (ms)", "p99"), + "p999_tpot_ms": metric_stat("Inter Token Latency (ms)", "p99"), # p999 not available, use p99 + "mean_latency_ms": metric_stat("Request Latency (ms)", "avg"), + "p50_latency_ms": metric_stat("Request Latency (ms)", "p50"), + "p90_latency_ms": metric_stat("Request Latency (ms)", "p90"), + "p99_latency_ms": metric_stat("Request Latency (ms)", "p99"), + "p999_latency_ms": metric_stat("Request Latency (ms)", "p99"), # p999 not available, use p99 + "p999_ttft_ms": metric_stat("Time to First Token (ms)", "p99"), # p999 not available, use p99 + "gpu_hit_rate": gpu_hit_rate, + "cpu_hit_rate": cpu_hit_rate, + } def _load_trace_replay_csv(csv_path: Path) -> pd.DataFrame | None: @@ -103,43 +119,46 @@ def load_experiment_data(exp_dir: Path) -> dict | None: if status != "SUCCESS": return None - # Also check for aiperf output - aiperf_jsonl = None + # Check for aiperf summary CSV (preferred) + aiperf_summary_csv = None aiperf_artifacts = exp_dir / "aiperf_artifacts" if aiperf_artifacts.exists(): - candidates = list(aiperf_artifacts.glob("profile_export_aiperf.jsonl")) - if not candidates: - candidates = list(aiperf_artifacts.glob("profile_export*.jsonl")) - if candidates: - aiperf_jsonl = candidates[0] + candidate = aiperf_artifacts / "profile_export_aiperf.csv" + if candidate.exists(): + aiperf_summary_csv = candidate # Check for trace replay output trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" - if not client_metrics_file.exists() and aiperf_jsonl is None and not trace_replay_csv.exists(): + if not client_metrics_file.exists() and aiperf_summary_csv is None and not trace_replay_csv.exists(): return None try: - if client_metrics_file.exists(): - df = pd.read_csv(client_metrics_file) - elif aiperf_jsonl is not None: - df = _load_aiperf_jsonl(aiperf_jsonl) - elif trace_replay_csv.exists(): - df = _load_trace_replay_csv(trace_replay_csv) - else: - return None - # Load server metrics for cache hit rates gpu_hit_rate = None cpu_hit_rate = None if server_metrics_file.exists(): server_df = pd.read_csv(server_metrics_file) - # Get final cumulative values final_row = server_df.iloc[-1] if final_row["prefix_cache_queries"] > 0: gpu_hit_rate = 100 * final_row["prefix_cache_hits"] / 
final_row["prefix_cache_queries"] if final_row["cpu_prefix_cache_queries"] > 0: cpu_hit_rate = 100 * final_row["cpu_prefix_cache_hits"] / final_row["cpu_prefix_cache_queries"] + + # Use aiperf summary CSV directly if available + if aiperf_summary_csv is not None and not client_metrics_file.exists(): + exp_name = exp_dir.name + parts = exp_name.split("_") + tp = int(parts[0].replace("tp", "")) + return _load_aiperf_summary_csv(aiperf_summary_csv, exp_dir, tp, gpu_hit_rate, cpu_hit_rate) + + if client_metrics_file.exists(): + df = pd.read_csv(client_metrics_file) + elif trace_replay_csv.exists(): + df = _load_trace_replay_csv(trace_replay_csv) + else: + return None + if len(df) == 0: return None @@ -151,7 +170,6 @@ def load_experiment_data(exp_dir: Path) -> dict | None: offload = parts[2].replace("offload", "") # Calculate metrics - # Prefer benchmark_metadata.json for precise wall-clock duration metadata_file = exp_dir / "benchmark_metadata.json" total_time_sec = None if metadata_file.exists(): @@ -162,33 +180,20 @@ def load_experiment_data(exp_dir: Path) -> dict | None: except Exception: pass - # Fallback: derive from per-request data (first start to last finish) if not total_time_sec or total_time_sec <= 0: first_start_ms = df["start_time_ms"].min() last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 if total_time_sec <= 0: - total_time_sec = df["latency_ms"].sum() / 1000 # fallback + total_time_sec = df["latency_ms"].sum() / 1000 num_requests = len(df) throughput_rps = num_requests / total_time_sec if total_time_sec > 0 else 0 - - # Input token throughput (prefill) total_input_tokens = df["input_num_tokens"].sum() input_throughput_tps = total_input_tokens / total_time_sec if total_time_sec > 0 else 0 - - # Output token throughput (decode only) total_output_tokens = df["output_num_tokens"].sum() output_throughput_tps = total_output_tokens / total_time_sec if total_time_sec > 0 else 0 - - # Total token throughput (input + output) - total_tokens = total_input_tokens + total_output_tokens - total_throughput_tps = total_tokens / total_time_sec if total_time_sec > 0 else 0 - - # Normalized throughput (per GPU) - input_tps_per_gpu = input_throughput_tps / tp - output_tps_per_gpu = output_throughput_tps / tp - total_tps_per_gpu = total_throughput_tps / tp + total_throughput_tps = (total_input_tokens + total_output_tokens) / total_time_sec if total_time_sec > 0 else 0 return { "exp_name": exp_name, @@ -199,9 +204,9 @@ def load_experiment_data(exp_dir: Path) -> dict | None: "throughput_rps": throughput_rps, "input_throughput_tps": input_throughput_tps, "total_throughput_tps": total_throughput_tps, - "input_tps_per_gpu": input_tps_per_gpu, - "output_tps_per_gpu": output_tps_per_gpu, - "total_tps_per_gpu": total_tps_per_gpu, + "input_tps_per_gpu": input_throughput_tps / tp, + "output_tps_per_gpu": output_throughput_tps / tp, + "total_tps_per_gpu": total_throughput_tps / tp, "mean_ttft_ms": df["ttft_ms"].mean(), "p50_ttft_ms": df["ttft_ms"].median(), "p90_ttft_ms": df["ttft_ms"].quantile(0.9), @@ -217,7 +222,6 @@ def load_experiment_data(exp_dir: Path) -> dict | None: "p99_latency_ms": df["latency_ms"].quantile(0.99), "p999_latency_ms": df["latency_ms"].quantile(0.999), "p999_ttft_ms": df["ttft_ms"].quantile(0.999), - # Cache hit rates "gpu_hit_rate": gpu_hit_rate, "cpu_hit_rate": cpu_hit_rate, } diff --git a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py 
b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py index fc02b1865..9910fb8ff 100755 --- a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py +++ b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py @@ -33,63 +33,52 @@ def _load_custom_client_csv(client_csv: Path, exp_dir: Path) -> pd.DataFrame | N return df -def _load_aiperf_jsonl(jsonl_path: Path) -> pd.DataFrame | None: - """Load per-request metrics from aiperf profile_export JSONL. +def _load_aiperf_summary_csv(csv_path: Path) -> dict | None: + """Load aggregate metrics directly from aiperf's profile_export_aiperf.csv. - Converts aiperf's per-record format into the same column schema - used by the custom benchmark client CSV. + Returns a dict with pre-computed metrics matching the result schema, + or None if the file can't be parsed. """ - records = [] - with open(jsonl_path) as f: - for line in f: - line = line.strip() - if not line: - continue - entry = json.loads(line) - meta = entry.get("metadata", {}) - metrics = entry.get("metrics", {}) - - # Skip non-profiling records or cancelled requests - if meta.get("benchmark_phase") != "profiling": - continue - if meta.get("was_cancelled", False): - continue - - # Extract values (aiperf stores metrics as {value, unit} dicts) - def val(key, default=0): - m = metrics.get(key) - if m is None: - return default - return m.get("value", default) if isinstance(m, dict) else m - - # Compute TPOT from ITL if available - itl = metrics.get("inter_token_latency") - if itl and isinstance(itl, dict): - tpot_ms = itl.get("value", 0) - else: - # Fallback: (latency - ttft) / (output_tokens - 1) - osl = val("output_sequence_length", 1) - ttft = val("time_to_first_token", 0) - latency = val("request_latency", 0) - tpot_ms = (latency - ttft) / max(osl - 1, 1) if osl > 1 else 0 - - # Convert request_start_ns to ms (epoch) - start_ns = meta.get("request_start_ns", 0) - start_ms = start_ns / 1e6 - - records.append({ - "start_time_ms": start_ms, - "ttft_ms": val("time_to_first_token"), - "tpot_ms": tpot_ms, - "latency_ms": val("request_latency"), - "input_num_tokens": val("input_sequence_length"), - "output_num_tokens": val("output_sequence_length"), - }) - - if not records: + df = pd.read_csv(csv_path) + if len(df) == 0: return None - return pd.DataFrame(records) + # The CSV has two sections: + # 1. Per-metric rows with columns: Metric, avg, min, max, sum, p1..p99, std + # 2. 
Scalar rows with columns: Metric, Value + # Split by finding rows where only Metric and Value are populated + per_metric = df[df["avg"].notna()].set_index("Metric") + scalars = df[df["avg"].isna() & df["Metric"].notna()].set_index("Metric") + + def metric_stat(metric_name, stat): + if metric_name in per_metric.index: + return float(per_metric.loc[metric_name, stat]) + return 0 + + def scalar_val(metric_name): + if metric_name in scalars.index: + return float(scalars.loc[metric_name, "min"]) # "min" column holds Value + return 0 + + return { + "num_requests": int(scalar_val("Request Count")), + "throughput_rps": scalar_val("Request Throughput (requests/sec)"), + "output_throughput_tps": scalar_val("Output Token Throughput (tokens/sec)"), + "total_throughput_tps": scalar_val("Total Token Throughput (tokens/sec)"), + "input_throughput_tps": scalar_val("Total Token Throughput (tokens/sec)") - scalar_val("Output Token Throughput (tokens/sec)"), + "mean_ttft_ms": metric_stat("Time to First Token (ms)", "avg"), + "p50_ttft_ms": metric_stat("Time to First Token (ms)", "p50"), + "p90_ttft_ms": metric_stat("Time to First Token (ms)", "p90"), + "p99_ttft_ms": metric_stat("Time to First Token (ms)", "p99"), + "mean_tpot_ms": metric_stat("Inter Token Latency (ms)", "avg"), + "p50_tpot_ms": metric_stat("Inter Token Latency (ms)", "p50"), + "p90_tpot_ms": metric_stat("Inter Token Latency (ms)", "p90"), + "p99_tpot_ms": metric_stat("Inter Token Latency (ms)", "p99"), + "mean_latency_ms": metric_stat("Request Latency (ms)", "avg"), + "p50_latency_ms": metric_stat("Request Latency (ms)", "p50"), + "p90_latency_ms": metric_stat("Request Latency (ms)", "p90"), + "p99_latency_ms": metric_stat("Request Latency (ms)", "p99"), + } def _load_trace_replay_csv(csv_path: Path) -> pd.DataFrame | None: @@ -125,20 +114,18 @@ def load_experiment(exp_dir: Path) -> dict | None: return None status = status_file.read_text().strip() - # Also check for aiperf output - aiperf_jsonl = None + # Check for aiperf summary CSV (preferred) or per-record JSONL (fallback) + aiperf_summary_csv = None aiperf_artifacts = exp_dir / "aiperf_artifacts" if aiperf_artifacts.exists(): - candidates = list(aiperf_artifacts.glob("profile_export_aiperf.jsonl")) - if not candidates: - candidates = list(aiperf_artifacts.glob("profile_export*.jsonl")) - if candidates: - aiperf_jsonl = candidates[0] + candidate = aiperf_artifacts / "profile_export_aiperf.csv" + if candidate.exists(): + aiperf_summary_csv = candidate # Check for trace replay output trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" - if not client_csv.exists() and aiperf_jsonl is None and not trace_replay_csv.exists(): + if not client_csv.exists() and aiperf_summary_csv is None and not trace_replay_csv.exists(): return None # Parse experiment name from directory: multiturn_tp{N}_users{M}_offload{mode} @@ -168,59 +155,100 @@ def load_experiment(exp_dir: Path) -> dict | None: return result try: - # Determine data source: custom client CSV, aiperf JSONL, or trace replay CSV + # Determine data source: custom client CSV, aiperf summary CSV, or trace replay CSV if client_csv.exists(): df = _load_custom_client_csv(client_csv, exp_dir) - elif aiperf_jsonl is not None: - df = _load_aiperf_jsonl(aiperf_jsonl) + if df is None or len(df) == 0: + return result + + # Prefer benchmark_metadata.json for precise wall-clock duration + metadata_file = exp_dir / "benchmark_metadata.json" + total_time_sec = None + if metadata_file.exists(): + try: + with open(metadata_file) as f: + metadata 
= json.load(f) + total_time_sec = metadata.get("benchmark_runtime_sec") + except Exception: + pass + + if not total_time_sec or total_time_sec <= 0: + first_start_ms = df["start_time_ms"].min() + last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() + total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 + if total_time_sec <= 0: + total_time_sec = df["latency_ms"].sum() / 1000 + + num_requests = len(df) + result.update({ + "num_requests": num_requests, + "throughput_rps": num_requests / total_time_sec if total_time_sec > 0 else 0, + "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "output_throughput_tps": df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "total_throughput_tps": (df["input_num_tokens"].sum() + df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0, + "mean_ttft_ms": df["ttft_ms"].mean(), + "p50_ttft_ms": df["ttft_ms"].median(), + "p90_ttft_ms": df["ttft_ms"].quantile(0.9), + "p99_ttft_ms": df["ttft_ms"].quantile(0.99), + "mean_tpot_ms": df["tpot_ms"].mean(), + "p50_tpot_ms": df["tpot_ms"].median(), + "p90_tpot_ms": df["tpot_ms"].quantile(0.9), + "p99_tpot_ms": df["tpot_ms"].quantile(0.99), + "mean_latency_ms": df["latency_ms"].mean(), + "p50_latency_ms": df["latency_ms"].median(), + "p90_latency_ms": df["latency_ms"].quantile(0.9), + "p99_latency_ms": df["latency_ms"].quantile(0.99), + }) + elif aiperf_summary_csv is not None: + aiperf_metrics = _load_aiperf_summary_csv(aiperf_summary_csv) + if aiperf_metrics is None: + return result + result.update(aiperf_metrics) elif trace_replay_csv.exists(): df = _load_trace_replay_csv(trace_replay_csv) + if df is None or len(df) == 0: + return result + + metadata_file = exp_dir / "benchmark_metadata.json" + total_time_sec = None + if metadata_file.exists(): + try: + with open(metadata_file) as f: + metadata = json.load(f) + total_time_sec = metadata.get("benchmark_runtime_sec") + except Exception: + pass + + if not total_time_sec or total_time_sec <= 0: + first_start_ms = df["start_time_ms"].min() + last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() + total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 + if total_time_sec <= 0: + total_time_sec = df["latency_ms"].sum() / 1000 + + num_requests = len(df) + result.update({ + "num_requests": num_requests, + "throughput_rps": num_requests / total_time_sec if total_time_sec > 0 else 0, + "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "output_throughput_tps": df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "total_throughput_tps": (df["input_num_tokens"].sum() + df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0, + "mean_ttft_ms": df["ttft_ms"].mean(), + "p50_ttft_ms": df["ttft_ms"].median(), + "p90_ttft_ms": df["ttft_ms"].quantile(0.9), + "p99_ttft_ms": df["ttft_ms"].quantile(0.99), + "mean_tpot_ms": df["tpot_ms"].mean(), + "p50_tpot_ms": df["tpot_ms"].median(), + "p90_tpot_ms": df["tpot_ms"].quantile(0.9), + "p99_tpot_ms": df["tpot_ms"].quantile(0.99), + "mean_latency_ms": df["latency_ms"].mean(), + "p50_latency_ms": df["latency_ms"].median(), + "p90_latency_ms": df["latency_ms"].quantile(0.9), + "p99_latency_ms": df["latency_ms"].quantile(0.99), + }) else: return result - if df is None or len(df) == 0: - return result - - # Prefer benchmark_metadata.json for precise wall-clock duration - metadata_file = exp_dir / "benchmark_metadata.json" - 
total_time_sec = None - if metadata_file.exists(): - try: - with open(metadata_file) as f: - metadata = json.load(f) - total_time_sec = metadata.get("benchmark_runtime_sec") - except Exception: - pass - - # Fallback: derive from per-request data (first start to last finish) - if not total_time_sec or total_time_sec <= 0: - first_start_ms = df["start_time_ms"].min() - last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() - total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 - if total_time_sec <= 0: - total_time_sec = df["latency_ms"].sum() / 1000 - - num_requests = len(df) - result.update({ - "num_requests": num_requests, - "throughput_rps": num_requests / total_time_sec if total_time_sec > 0 else 0, - "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, - "output_throughput_tps": df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, - "total_throughput_tps": (df["input_num_tokens"].sum() + df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0, - "mean_ttft_ms": df["ttft_ms"].mean(), - "p50_ttft_ms": df["ttft_ms"].median(), - "p90_ttft_ms": df["ttft_ms"].quantile(0.9), - "p99_ttft_ms": df["ttft_ms"].quantile(0.99), - "mean_tpot_ms": df["tpot_ms"].mean(), - "p50_tpot_ms": df["tpot_ms"].median(), - "p90_tpot_ms": df["tpot_ms"].quantile(0.9), - "p99_tpot_ms": df["tpot_ms"].quantile(0.99), - "mean_latency_ms": df["latency_ms"].mean(), - "p50_latency_ms": df["latency_ms"].median(), - "p90_latency_ms": df["latency_ms"].quantile(0.9), - "p99_latency_ms": df["latency_ms"].quantile(0.99), - }) - # Cache hit rates from server metrics if server_csv.exists(): try: From 4f106b8fdc9e27b30ca843eaf699510204e28216 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 11:26:03 -0500 Subject: [PATCH 22/78] debug --- benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 1bec35577..926bda021 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -14,6 +14,10 @@ set -x source "$(dirname "$0")/../benchmark_lib.sh" +export CUDA_LAUNCH_BLOCKING=1 + +ulimit -a + check_env_vars \ MODEL \ TP \ From cfb25fb509e7a87b0d8f8dadb4b60821f06eb072 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 12:31:49 -0500 Subject: [PATCH 23/78] fix LMCache traces: convert system role to developer for vLLM v0.18+ vLLM v0.18.0 follows the newer OpenAI API spec where the 'system' message role was renamed to 'developer'. The LMCache traces use 'system', causing 100% 400 Bad Request errors. Also drop the 15GB profile_export_aiperf.json from artifact uploads. 
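For reference, a minimal repro sketch of the failure mode described above (not part of this change). It assumes an OpenAI-compatible server already running on localhost:8888 and the requests library; the port and model name are placeholders for whatever the benchmark is serving:

    import requests

    payload = {
        "model": "PLACEHOLDER/model-name",
        "messages": [
            {"role": "system", "content": "You are a coding agent."},
            {"role": "user", "content": "List the files in the repo."},
        ],
        "max_tokens": 8,
    }
    resp = requests.post("http://localhost:8888/v1/chat/completions",
                         json=payload, timeout=30)
    # A 400 with a role-validation error would point at the 'system' role;
    # a 200 would mean the rejection comes from something else in the trace payload.
    print(resp.status_code, resp.text[:200])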
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/benchmark-multiturn-tmpl.yml | 1 - .../single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 9 ++++++++- .../single_node/multiturn_fp8_h200_lmcache_aiperf.sh | 9 ++++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index 7c1d5ce0d..f366564d3 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -162,7 +162,6 @@ jobs: results/benchmark_metadata.json results/metrics_workload.png results/aiperf_artifacts/profile_export_aiperf.csv - results/aiperf_artifacts/profile_export_aiperf.json results/workload_distribution_summary.txt results/workload_distribution_plots.png results/trace_replay/detailed_results.csv diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 926bda021..1d1c3154d 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -98,9 +98,16 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: + # vLLM v0.18+ follows the newer OpenAI API spec where 'system' role + # was renamed to 'developer'. Convert to avoid 400 validation errors. + messages = [] + for msg in row['input']: + if msg.get('role') == 'system': + msg = {**msg, 'role': 'developer'} + messages.append(msg) entry = { 'session_id': row['session_id'], - 'messages': row['input'], + 'messages': messages, 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index 9a0c89e5a..03fd4402e 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -94,9 +94,16 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: + # vLLM v0.18+ follows the newer OpenAI API spec where 'system' role + # was renamed to 'developer'. Convert to avoid 400 validation errors. + messages = [] + for msg in row['input']: + if msg.get('role') == 'system': + msg = {**msg, 'role': 'developer'} + messages.append(msg) entry = { 'session_id': row['session_id'], - 'messages': row['input'], + 'messages': messages, 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') From ede9bde6e081eff22aad6683a9472d8babb2be86 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 12:33:59 -0500 Subject: [PATCH 24/78] revert system->developer role conversion in LMCache traces Co-Authored-By: Claude Opus 4.6 (1M context) --- .../single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 9 +-------- .../single_node/multiturn_fp8_h200_lmcache_aiperf.sh | 9 +-------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 1d1c3154d..926bda021 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -98,16 +98,9 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: - # vLLM v0.18+ follows the newer OpenAI API spec where 'system' role - # was renamed to 'developer'. Convert to avoid 400 validation errors. 
- messages = [] - for msg in row['input']: - if msg.get('role') == 'system': - msg = {**msg, 'role': 'developer'} - messages.append(msg) entry = { 'session_id': row['session_id'], - 'messages': messages, + 'messages': row['input'], 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index 03fd4402e..9a0c89e5a 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -94,16 +94,9 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: - # vLLM v0.18+ follows the newer OpenAI API spec where 'system' role - # was renamed to 'developer'. Convert to avoid 400 validation errors. - messages = [] - for msg in row['input']: - if msg.get('role') == 'system': - msg = {**msg, 'role': 'developer'} - messages.append(msg) entry = { 'session_id': row['session_id'], - 'messages': messages, + 'messages': row['input'], 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') From a7ac440570908ca5f64e71b06b83fec3ea2da444 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 12:34:47 -0500 Subject: [PATCH 25/78] fix MetricsCollector missing gpu_transfer_collector attribute Co-Authored-By: Claude Opus 4.6 (1M context) --- experimental/multiturn/vllm_benchmark/bench/metrics_collector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py index 7bcdf31a4..b38653ea8 100644 --- a/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py +++ b/experimental/multiturn/vllm_benchmark/bench/metrics_collector.py @@ -172,6 +172,7 @@ class MetricsCollector: _task: asyncio.Task | None = None _parser: VLLMMetricsParser | SGLangMetricsParser | None = None _backend: str = "" + gpu_transfer_collector: object = None def _parse_metrics(self, text: str) -> MetricsSnapshot: """Parse Prometheus metrics text, auto-detecting backend on first call.""" From db87b95fc7eb55e19ec318569e1771369fa6ac28 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 12:44:27 -0500 Subject: [PATCH 26/78] fix LMCache traces: strip null fields to pass vLLM Pydantic validation The LMCache traces include explicit null values for optional fields (tool_calls, tool_call_id, name) on every message. vLLM's strict Pydantic validation rejects these, causing 100% HTTP 400 errors. 
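Illustration of the cleanup on a single hypothetical trace message (the optional field names come from the dataset as described above; the values are made up):

    raw_msg = {
        "role": "user",
        "content": "Run the tests",
        "tool_calls": None,
        "tool_call_id": None,
        "name": None,
    }
    # Drop keys whose value is an explicit null before sending to the server.
    cleaned = {k: v for k, v in raw_msg.items() if v is not None}
    assert cleaned == {"role": "user", "content": "Run the tests"}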
Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 5 ++++- benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 926bda021..034df4d89 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -98,9 +98,12 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: + # Strip None fields — vLLM's Pydantic validation rejects explicit nulls + # for optional fields like tool_calls, tool_call_id, name + messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] entry = { 'session_id': row['session_id'], - 'messages': row['input'], + 'messages': messages, 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index 9a0c89e5a..c4d26dd7e 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -94,9 +94,12 @@ try: sessions = set() with open(out_path, 'w') as f: for row in ds: + # Strip None fields — vLLM's Pydantic validation rejects explicit nulls + # for optional fields like tool_calls, tool_call_id, name + messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] entry = { 'session_id': row['session_id'], - 'messages': row['input'], + 'messages': messages, 'output_length': row['output_length'], } f.write(json.dumps(entry) + '\n') From 07ce85de133bf608d48bd635b3816458a2e5db53 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 13:13:29 -0500 Subject: [PATCH 27/78] use hf download for LMCache traces instead of datasets.load_dataset Avoids flaky streaming downloads that fail mid-transfer. The dataset is now cached via hf download (same as model weights) and read from the local parquet files. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../multiturn_fp8_h100_lmcache_aiperf.sh | 58 ++++++++++++------- .../multiturn_fp8_h200_lmcache_aiperf.sh | 58 ++++++++++++------- 2 files changed, 72 insertions(+), 44 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 034df4d89..ae666c37b 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -87,31 +87,45 @@ fi mkdir -p "$RESULT_DIR" -# ---- Convert LMCache traces to mooncake format ----------------------------- -echo "Downloading and converting LMCache traces..." +# ---- Download and convert LMCache traces to mooncake format ---------------- +echo "Downloading LMCache traces..." +hf download sammshen/lmcache-agentic-traces --repo-type dataset + +echo "Converting LMCache traces to mooncake format..." 
python3 -c " -import json, os -try: +import json, glob, os +hf_cache = os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')) +# Find the downloaded parquet/jsonl files in the HF cache +candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.parquet'), recursive=True) +if not candidates: + candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.jsonl'), recursive=True) +if not candidates: + # Fallback: use datasets library to load from cache from datasets import load_dataset ds = load_dataset('sammshen/lmcache-agentic-traces', split='train') - out_path = '$TRACE_FILE' - sessions = set() - with open(out_path, 'w') as f: - for row in ds: - # Strip None fields — vLLM's Pydantic validation rejects explicit nulls - # for optional fields like tool_calls, tool_call_id, name - messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] - entry = { - 'session_id': row['session_id'], - 'messages': messages, - 'output_length': row['output_length'], - } - f.write(json.dumps(entry) + '\n') - sessions.add(row['session_id']) - print(f'Converted {len(ds)} iterations from {len(sessions)} sessions to {out_path}') -except Exception as e: - print(f'ERROR converting traces: {e}') - exit(1) + rows = list(ds) +else: + import pyarrow.parquet as pq + rows = [] + for f in sorted(candidates): + table = pq.read_table(f) + rows.extend(table.to_pylist()) + print(f'Loaded {len(rows)} rows from {len(candidates)} cached files') + +out_path = '$TRACE_FILE' +sessions = set() +with open(out_path, 'w') as f: + for row in rows: + # Strip None fields — vLLM's Pydantic validation rejects explicit nulls + messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + entry = { + 'session_id': row['session_id'], + 'messages': messages, + 'output_length': row['output_length'], + } + f.write(json.dumps(entry) + '\n') + sessions.add(row['session_id']) +print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') " SERVER_LOG="$RESULT_DIR/server.log" diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index c4d26dd7e..56232cf58 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -83,31 +83,45 @@ fi mkdir -p "$RESULT_DIR" -# ---- Convert LMCache traces to mooncake format ----------------------------- -echo "Downloading and converting LMCache traces..." +# ---- Download and convert LMCache traces to mooncake format ---------------- +echo "Downloading LMCache traces..." +hf download sammshen/lmcache-agentic-traces --repo-type dataset + +echo "Converting LMCache traces to mooncake format..." 
python3 -c " -import json, os -try: +import json, glob, os +hf_cache = os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')) +# Find the downloaded parquet/jsonl files in the HF cache +candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.parquet'), recursive=True) +if not candidates: + candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.jsonl'), recursive=True) +if not candidates: + # Fallback: use datasets library to load from cache from datasets import load_dataset ds = load_dataset('sammshen/lmcache-agentic-traces', split='train') - out_path = '$TRACE_FILE' - sessions = set() - with open(out_path, 'w') as f: - for row in ds: - # Strip None fields — vLLM's Pydantic validation rejects explicit nulls - # for optional fields like tool_calls, tool_call_id, name - messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] - entry = { - 'session_id': row['session_id'], - 'messages': messages, - 'output_length': row['output_length'], - } - f.write(json.dumps(entry) + '\n') - sessions.add(row['session_id']) - print(f'Converted {len(ds)} iterations from {len(sessions)} sessions to {out_path}') -except Exception as e: - print(f'ERROR converting traces: {e}') - exit(1) + rows = list(ds) +else: + import pyarrow.parquet as pq + rows = [] + for f in sorted(candidates): + table = pq.read_table(f) + rows.extend(table.to_pylist()) + print(f'Loaded {len(rows)} rows from {len(candidates)} cached files') + +out_path = '$TRACE_FILE' +sessions = set() +with open(out_path, 'w') as f: + for row in rows: + # Strip None fields — vLLM's Pydantic validation rejects explicit nulls + messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + entry = { + 'session_id': row['session_id'], + 'messages': messages, + 'output_length': row['output_length'], + } + f.write(json.dumps(entry) + '\n') + sessions.add(row['session_id']) +print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') " SERVER_LOG="$RESULT_DIR/server.log" From 195ca66d90e2dd14412bbeee38fa1ee612949832 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 15:00:04 -0500 Subject: [PATCH 28/78] add B200 FP4 multiturn benchmark script using aiperf Based on H100 aiperf script with B200-specific changes: - TORCH_CUDA_ARCH_LIST=10.0 (Blackwell) - B200 compilation config (FULL_DECODE_ONLY cudagraphs, custom ops) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../multiturn_fp4_b200_lmcache_aiperf.sh | 248 ++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100755 benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh diff --git a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh new file mode 100755 index 000000000..2e8164f3f --- /dev/null +++ b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh @@ -0,0 +1,248 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# LMCache agentic trace benchmark for FP4 models on B200 using AIPerf. +# Replays SWE-bench/GAIA/WildClaw agentic traces via mooncake_trace format. 
+# Dataset: https://huggingface.co/datasets/sammshen/lmcache-agentic-traces +# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# DURATION (if set, runs for this many seconds; otherwise runs to completion) + +source "$(dirname "$0")/../benchmark_lib.sh" + +export CUDA_LAUNCH_BLOCKING=1 + +ulimit -a + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +AIPERF_DIR="$MULTITURN_DIR/aiperf" +TRACE_FILE="$RESULT_DIR/lmcache_traces.jsonl" + +pip install --quiet urllib3 requests orjson datasets 2>/dev/null || true + +# Patch vLLM bug: local_cache_hit counter can go negative under high load +# (causes "Counters can only be incremented by non-negative amounts" crash) +STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") +if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then + echo "Patching vLLM stats.py: $STATS_FILE" + python3 -c " +import re, sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', + 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +" "$STATS_FILE" +fi + +# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) +# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) +SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") +if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then + echo "Patching vLLM scheduler.py: $SCHED_FILE" + python3 << 'PYEOF' "$SCHED_FILE" +import sys +with open(sys.argv[1]) as f: + src = f.read() +src = src.replace( + 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', +) +src = src.replace( + 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', + 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', +) +with open(sys.argv[1], 'w') as f: + f.write(src) +PYEOF +fi + +mkdir -p "$RESULT_DIR" + +# ---- Download and convert LMCache traces to mooncake format ---------------- +echo "Downloading LMCache traces..." +hf download sammshen/lmcache-agentic-traces --repo-type dataset + +echo "Converting LMCache traces to mooncake format..." 
+python3 -c " +import json, glob, os +hf_cache = os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')) +# Find the downloaded parquet/jsonl files in the HF cache +candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.parquet'), recursive=True) +if not candidates: + candidates = glob.glob(os.path.join(hf_cache, 'datasets--sammshen--lmcache-agentic-traces', '**', '*.jsonl'), recursive=True) +if not candidates: + # Fallback: use datasets library to load from cache + from datasets import load_dataset + ds = load_dataset('sammshen/lmcache-agentic-traces', split='train') + rows = list(ds) +else: + import pyarrow.parquet as pq + rows = [] + for f in sorted(candidates): + table = pq.read_table(f) + rows.extend(table.to_pylist()) + print(f'Loaded {len(rows)} rows from {len(candidates)} cached files') + +out_path = '$TRACE_FILE' +sessions = set() +with open(out_path, 'w') as f: + for row in rows: + # Strip None fields — vLLM's Pydantic validation rejects explicit nulls + messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + entry = { + 'session_id': row['session_id'], + 'messages': messages, + 'output_length': row['output_length'], + } + f.write(json.dumps(entry) + '\n') + sessions.add(row['session_id']) +print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') +" + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}' +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" + +if [ "$OFFLOAD_MODE" = "on" ]; then + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" + +echo "Installing aiperf in isolated venv..." +python3 -m venv /tmp/aiperf-venv --system-site-packages +/tmp/aiperf-venv/bin/pip install -q -e "$AIPERF_DIR" 2>&1 | tail -10 +AIPERF_BIN="/tmp/aiperf-venv/bin/aiperf" + +/tmp/aiperf-venv/bin/python -c "import aiperf; print('aiperf installed OK')" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." 
+python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! +echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run AIPerf benchmark ---------------------------------------------------- +export AIPERF_LOG_CONVERSATIONS="$RESULT_DIR/conversations.jsonl" + +AIPERF_CMD="$AIPERF_BIN profile" +AIPERF_CMD+=" --model $MODEL" +AIPERF_CMD+=" --url http://localhost:$PORT" +AIPERF_CMD+=" --endpoint-type chat" +AIPERF_CMD+=" --streaming" +AIPERF_CMD+=" --input-file $TRACE_FILE" +AIPERF_CMD+=" --custom-dataset-type mooncake_trace" +AIPERF_CMD+=" --concurrency $USERS" +if [ -n "${DURATION:-}" ]; then + AIPERF_CMD+=" --benchmark-duration $DURATION" + AIPERF_CMD+=" --benchmark-grace-period 0" +fi +AIPERF_CMD+=" --request-timeout-seconds $REQUEST_TIMEOUT" +AIPERF_CMD+=" --output-artifact-dir $RESULT_DIR/aiperf_artifacts" +AIPERF_CMD+=" --export-level records" +AIPERF_CMD+=" --ui-type simple" +AIPERF_CMD+=" --random-seed 42" + +echo "$AIPERF_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $AIPERF_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/aiperf_artifacts" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." 
+kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" From 09e6ec1c9746f86563e178aebde9681e04899cae Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 15:09:13 -0500 Subject: [PATCH 29/78] add entry for b200 ds --- .github/configs/multiturn-agentic-trace.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index 63892d202..f371c5625 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -42,12 +42,12 @@ h100-fp8-llama70b-lmcache: users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256] offload: ["on", "off"] -b200-fp4-dsr1: +b200-fp4-dsr1-weka-trace: tp4: ep: 4 - users: [1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 128] + users: [1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 128, 256] offload: ["on", "off"] tp8: ep: 8 - users: [1, 2, 4, 8, 12, 16, 32, 64, 128] + users: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256, 512] offload: ["on", "off"] From 951326a2b5cf281c7b057b18b75558eb01a70b20 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 15:09:47 -0500 Subject: [PATCH 30/78] add expert parallel support to B200 FP4 aiperf script Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh index 2e8164f3f..5acba8a73 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh @@ -146,6 +146,10 @@ VLLM_CMD+=" --config $RESULT_DIR/config.yaml" VLLM_CMD+=" --gpu-memory-utilization 0.9" VLLM_CMD+=" --tensor-parallel-size $TP" +if [ "${EP_SIZE:-0}" -gt 1 ]; then + VLLM_CMD+=" --enable-expert-parallel" +fi + if [ "$OFFLOAD_MODE" = "on" ]; then VLLM_CMD+=" --kv_offloading_backend native" VLLM_CMD+=" --kv_offloading_size $offload_size" From 0100fa1bc6ed4326c73de918088feddef542471c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 15:32:48 -0500 Subject: [PATCH 31/78] skip LMCache trace entries with empty messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dataset was updated (24K → 74K rows) and now includes entries with empty message lists, causing aiperf MooncakeTrace validation to fail. 
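A quick standalone way to audit a converted trace file for this condition (illustrative only; the path is a placeholder for $TRACE_FILE, and after this change the count should be zero):

    import json

    total = empty = 0
    with open("lmcache_traces.jsonl") as f:
        for line in f:
            rec = json.loads(line)
            total += 1
            if not rec.get("messages"):
                empty += 1
    print(f"{empty}/{total} entries have empty message lists")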
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../single_node/multiturn_fp4_b200_lmcache_aiperf.sh | 8 +++++++- .../single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 8 +++++++- .../single_node/multiturn_fp8_h200_lmcache_aiperf.sh | 8 +++++++- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh index 5acba8a73..0df4efb0c 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh @@ -114,10 +114,14 @@ else: out_path = '$TRACE_FILE' sessions = set() +skipped = 0 with open(out_path, 'w') as f: for row in rows: # Strip None fields — vLLM's Pydantic validation rejects explicit nulls messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + if not messages: + skipped += 1 + continue entry = { 'session_id': row['session_id'], 'messages': messages, @@ -125,7 +129,9 @@ with open(out_path, 'w') as f: } f.write(json.dumps(entry) + '\n') sessions.add(row['session_id']) -print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') +if skipped: + print(f'Skipped {skipped} entries with empty messages') +print(f'Converted {len(rows) - skipped} iterations from {len(sessions)} sessions to {out_path}') " SERVER_LOG="$RESULT_DIR/server.log" diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index ae666c37b..b81105d5b 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -114,10 +114,14 @@ else: out_path = '$TRACE_FILE' sessions = set() +skipped = 0 with open(out_path, 'w') as f: for row in rows: # Strip None fields — vLLM's Pydantic validation rejects explicit nulls messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + if not messages: + skipped += 1 + continue entry = { 'session_id': row['session_id'], 'messages': messages, @@ -125,7 +129,9 @@ with open(out_path, 'w') as f: } f.write(json.dumps(entry) + '\n') sessions.add(row['session_id']) -print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') +if skipped: + print(f'Skipped {skipped} entries with empty messages') +print(f'Converted {len(rows) - skipped} iterations from {len(sessions)} sessions to {out_path}') " SERVER_LOG="$RESULT_DIR/server.log" diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index 56232cf58..e3acd1bb0 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -110,10 +110,14 @@ else: out_path = '$TRACE_FILE' sessions = set() +skipped = 0 with open(out_path, 'w') as f: for row in rows: # Strip None fields — vLLM's Pydantic validation rejects explicit nulls messages = [{k: v for k, v in msg.items() if v is not None} for msg in row['input']] + if not messages: + skipped += 1 + continue entry = { 'session_id': row['session_id'], 'messages': messages, @@ -121,7 +125,9 @@ with open(out_path, 'w') as f: } f.write(json.dumps(entry) + '\n') sessions.add(row['session_id']) -print(f'Converted {len(rows)} iterations from {len(sessions)} sessions to {out_path}') +if skipped: + print(f'Skipped {skipped} entries with empty messages') +print(f'Converted {len(rows) - skipped} iterations 
from {len(sessions)} sessions to {out_path}') " SERVER_LOG="$RESULT_DIR/server.log" From 110dfa4803fcdf6d529baecfb5ca6598bdc8516b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 17:56:15 -0500 Subject: [PATCH 32/78] fix: prioritize aiperf summary CSV over malformed client CSV Both collect_sweep_results.py and plot_pareto.py were trying to load metrics_client_metrics.csv first, which fails with "Expected 15 fields, saw 19" on aiperf runs. Now aiperf summary CSV is checked first. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../vllm_benchmark/analysis/plot_pareto.py | 4 ++-- .../scripts/collect_sweep_results.py | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py index 7da67c8a4..081c98ebd 100644 --- a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py +++ b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py @@ -145,8 +145,8 @@ def load_experiment_data(exp_dir: Path) -> dict | None: if final_row["cpu_prefix_cache_queries"] > 0: cpu_hit_rate = 100 * final_row["cpu_prefix_cache_hits"] / final_row["cpu_prefix_cache_queries"] - # Use aiperf summary CSV directly if available - if aiperf_summary_csv is not None and not client_metrics_file.exists(): + # Use aiperf summary CSV directly if available (preferred over client CSV) + if aiperf_summary_csv is not None: exp_name = exp_dir.name parts = exp_name.split("_") tp = int(parts[0].replace("tp", "")) diff --git a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py index 9910fb8ff..28f115f47 100755 --- a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py +++ b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py @@ -155,8 +155,13 @@ def load_experiment(exp_dir: Path) -> dict | None: return result try: - # Determine data source: custom client CSV, aiperf summary CSV, or trace replay CSV - if client_csv.exists(): + # Determine data source: aiperf summary CSV (preferred), custom client CSV, or trace replay CSV + if aiperf_summary_csv is not None: + aiperf_metrics = _load_aiperf_summary_csv(aiperf_summary_csv) + if aiperf_metrics is None: + return result + result.update(aiperf_metrics) + elif client_csv.exists(): df = _load_custom_client_csv(client_csv, exp_dir) if df is None or len(df) == 0: return result @@ -199,11 +204,6 @@ def load_experiment(exp_dir: Path) -> dict | None: "p90_latency_ms": df["latency_ms"].quantile(0.9), "p99_latency_ms": df["latency_ms"].quantile(0.99), }) - elif aiperf_summary_csv is not None: - aiperf_metrics = _load_aiperf_summary_csv(aiperf_summary_csv) - if aiperf_metrics is None: - return result - result.update(aiperf_metrics) elif trace_replay_csv.exists(): df = _load_trace_replay_csv(trace_replay_csv) if df is None or len(df) == 0: From c64e644b4b1af0a0bd6c7eb0a364d455ec02db71 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 2 Apr 2026 17:58:44 -0500 Subject: [PATCH 33/78] fix aiperf CSV parser: handle multi-section format with different column counts The profile_export_aiperf.csv has 3 sections (per-metric stats, scalar values, GPU metrics) with different column counts. pd.read_csv choked on the GPU section (19 cols vs 14). Parse manually by splitting on column count changes. 
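Sketch of the parsing idea on a toy input (column names and widths are illustrative, not the real export): rows matching the header width are per-metric stats, two-field rows are scalars, and the first row with any other width marks the start of a later section:

    rows = [
        "Metric,avg,min,max,p50,p90,p99",
        "Time to First Token (ms),120,80,900,100,300,800",
        "Inter Token Latency (ms),9,5,40,8,15,30",
        "Request Count,5000",
        "Request Throughput (requests/sec),41.2",
        "GPU0,NVIDIA H200,42.0,310.5,1.0",  # later section, different width
    ]
    header = rows[0].split(",")
    per_metric, scalars = {}, {}
    for row in rows[1:]:
        parts = row.split(",")
        if len(parts) == len(header):
            per_metric[parts[0]] = dict(zip(header, parts))
        elif len(parts) == 2:
            scalars[parts[0]] = parts[1]
        else:
            break
    print(sorted(per_metric), scalars)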
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../vllm_benchmark/analysis/plot_pareto.py | 35 +++++++++++---- .../scripts/collect_sweep_results.py | 43 +++++++++++++------ 2 files changed, 58 insertions(+), 20 deletions(-) diff --git a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py index 081c98ebd..90b7ed1f8 100644 --- a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py +++ b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py @@ -21,21 +21,40 @@ def _load_aiperf_summary_csv(csv_path: Path, exp_dir: Path, tp: int, gpu_hit_rate: float | None, cpu_hit_rate: float | None) -> dict | None: """Load aggregate metrics directly from aiperf's profile_export_aiperf.csv.""" - df = pd.read_csv(csv_path) - if len(df) == 0: + # The CSV has multiple sections with different column counts. + # Read raw lines and split into per-metric and scalar sections. + lines = csv_path.read_text().strip().split('\n') + if len(lines) < 2: return None - per_metric = df[df["avg"].notna()].set_index("Metric") - scalars = df[df["avg"].isna() & df["Metric"].notna()].set_index("Metric") + header = lines[0].split(',') + per_metric = {} + scalars = {} + for line in lines[1:]: + if not line.strip(): + continue + parts = line.split(',') + if len(parts) == len(header): + per_metric[parts[0]] = {h: parts[i] for i, h in enumerate(header)} + elif len(parts) == 2: + scalars[parts[0]] = parts[1] + else: + break def metric_stat(metric_name, stat): - if metric_name in per_metric.index: - return float(per_metric.loc[metric_name, stat]) + if metric_name in per_metric: + try: + return float(per_metric[metric_name].get(stat, 0)) + except (ValueError, TypeError): + return 0 return 0 def scalar_val(metric_name): - if metric_name in scalars.index: - return float(scalars.loc[metric_name, "min"]) + if metric_name in scalars: + try: + return float(scalars[metric_name]) + except (ValueError, TypeError): + return 0 return 0 exp_name = exp_dir.name diff --git a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py index 28f115f47..89cf990f3 100755 --- a/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py +++ b/experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py @@ -39,25 +39,44 @@ def _load_aiperf_summary_csv(csv_path: Path) -> dict | None: Returns a dict with pre-computed metrics matching the result schema, or None if the file can't be parsed. """ - df = pd.read_csv(csv_path) - if len(df) == 0: + # The CSV has multiple sections with different column counts. + # Read raw lines and split into per-metric and scalar sections. + lines = csv_path.read_text().strip().split('\n') + if len(lines) < 2: return None - # The CSV has two sections: - # 1. Per-metric rows with columns: Metric, avg, min, max, sum, p1..p99, std - # 2. 
Scalar rows with columns: Metric, Value - # Split by finding rows where only Metric and Value are populated - per_metric = df[df["avg"].notna()].set_index("Metric") - scalars = df[df["avg"].isna() & df["Metric"].notna()].set_index("Metric") + # Section 1: per-metric stats (header + data rows with 14 columns) + header = lines[0].split(',') + per_metric = {} + scalars = {} + for line in lines[1:]: + if not line.strip(): + continue + parts = line.split(',') + if len(parts) == len(header): + # Per-metric row + per_metric[parts[0]] = {h: parts[i] for i, h in enumerate(header)} + elif len(parts) == 2: + # Scalar row (Metric, Value) + scalars[parts[0]] = parts[1] + else: + # Different section (GPU metrics) — stop + break def metric_stat(metric_name, stat): - if metric_name in per_metric.index: - return float(per_metric.loc[metric_name, stat]) + if metric_name in per_metric: + try: + return float(per_metric[metric_name].get(stat, 0)) + except (ValueError, TypeError): + return 0 return 0 def scalar_val(metric_name): - if metric_name in scalars.index: - return float(scalars.loc[metric_name, "min"]) # "min" column holds Value + if metric_name in scalars: + try: + return float(scalars[metric_name]) + except (ValueError, TypeError): + return 0 return 0 return { From 43abe6b1fcb4d13dff22d3ebb545f47bb8fd959f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 3 Apr 2026 10:11:20 -0500 Subject: [PATCH 34/78] update aiperf submodule: fix raw_messages input token counting Points to inferencemax/fix-raw-messages-token-counting which fixes client-side ISL undercounting for mooncake trace workloads. Co-Authored-By: Claude Opus 4.6 (1M context) --- experimental/multiturn/vllm_benchmark/aiperf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/multiturn/vllm_benchmark/aiperf b/experimental/multiturn/vllm_benchmark/aiperf index 373218fb3..d0f2290cc 160000 --- a/experimental/multiturn/vllm_benchmark/aiperf +++ b/experimental/multiturn/vllm_benchmark/aiperf @@ -1 +1 @@ -Subproject commit 373218fb3c3d15fada9c4be6465daf8fb5a70ef6 +Subproject commit d0f2290ccc9d29de0b3ee786684afdf278f4f0a9 From 4754c40a95ab326bff51300570c2c4982453482f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 3 Apr 2026 11:01:59 -0500 Subject: [PATCH 35/78] add --use-server-token-count to lmcache aiperf benchmarks Use server-reported token counts instead of client-side tokenization, which undercounts input tokens for raw_messages (mooncake trace) workloads. 
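The underlying assumption is that the server's own usage accounting is authoritative. A standalone cross-check against an OpenAI-compatible endpoint (port and model are placeholders; not part of this change) is to read usage from a non-streaming completion:

    import requests

    resp = requests.post(
        "http://localhost:8888/v1/chat/completions",
        json={
            "model": "PLACEHOLDER/model-name",
            "messages": [{"role": "user", "content": "hello"}],
            "max_tokens": 4,
        },
        timeout=60,
    ).json()
    usage = resp.get("usage", {})
    # Server-side prompt_tokens reflects the applied chat template, which a
    # client-side re-tokenization of raw messages can undercount.
    print(usage.get("prompt_tokens"), usage.get("completion_tokens"))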
Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh | 1 + benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 1 + benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh | 1 + 3 files changed, 3 insertions(+) diff --git a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh index 0df4efb0c..2320acaed 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh @@ -213,6 +213,7 @@ AIPERF_CMD+=" --model $MODEL" AIPERF_CMD+=" --url http://localhost:$PORT" AIPERF_CMD+=" --endpoint-type chat" AIPERF_CMD+=" --streaming" +AIPERF_CMD+=" --use-server-token-count" AIPERF_CMD+=" --input-file $TRACE_FILE" AIPERF_CMD+=" --custom-dataset-type mooncake_trace" AIPERF_CMD+=" --concurrency $USERS" diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index b81105d5b..754599b61 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -208,6 +208,7 @@ AIPERF_CMD+=" --model $MODEL" AIPERF_CMD+=" --url http://localhost:$PORT" AIPERF_CMD+=" --endpoint-type chat" AIPERF_CMD+=" --streaming" +AIPERF_CMD+=" --use-server-token-count" AIPERF_CMD+=" --input-file $TRACE_FILE" AIPERF_CMD+=" --custom-dataset-type mooncake_trace" AIPERF_CMD+=" --concurrency $USERS" diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index e3acd1bb0..e02f00b95 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -204,6 +204,7 @@ AIPERF_CMD+=" --model $MODEL" AIPERF_CMD+=" --url http://localhost:$PORT" AIPERF_CMD+=" --endpoint-type chat" AIPERF_CMD+=" --streaming" +AIPERF_CMD+=" --use-server-token-count" AIPERF_CMD+=" --input-file $TRACE_FILE" AIPERF_CMD+=" --custom-dataset-type mooncake_trace" AIPERF_CMD+=" --concurrency $USERS" From 648d522d14040b6a928400d86f05386e93bece4b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 3 Apr 2026 11:21:13 -0500 Subject: [PATCH 36/78] update kv-cache-tester submodule: fix reasoning token counting Points to inferencemax/fix-reasoning-token-counting which counts delta.reasoning_content for metrics (TTFT/TPOT/OSL) without adding reasoning tokens to conversation history. Co-Authored-By: Claude Opus 4.6 (1M context) --- experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index a41ee2261..197b20136 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit a41ee2261b743328be84c472b7b97112d046e62f +Subproject commit 197b20136b5486b3e5d3140ec92b0ef3f4e0a3ec From 63c26ffab4479ac7d6d3a703c64d9fd70f20e0ce Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 3 Apr 2026 11:46:44 -0500 Subject: [PATCH 37/78] add p50 (median) clean and overlay pareto frontier plots Adds generate_pareto_only_figure_p50 and generate_pareto_overlay_figure_p50 for consistency with the existing p90/p99/p999 plot variants. 
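These functions lean on compute_pareto_frontier_with_metadata, which is not shown in this diff. A minimal illustration of that kind of frontier selection (a sketch, not the repo's helper) keeps the points that are not dominated on the (latency, throughput) plane:

    import pandas as pd

    def pareto_frontier(df, x_col, y_col, maximize_x=False):
        # Walk from best x to worst, keeping rows whose y beats everything
        # seen so far (i.e. non-dominated points).
        ordered = df.sort_values(x_col, ascending=not maximize_x)
        best_y = float("-inf")
        keep = []
        for idx, row in ordered.iterrows():
            if row[y_col] > best_y:
                keep.append(idx)
                best_y = row[y_col]
        return df.loc[keep]

    demo = pd.DataFrame({
        "p50_ttft_ms": [50, 80, 120, 200],
        "input_tps_per_gpu": [900, 1500, 1400, 2100],
    })
    print(pareto_frontier(demo, "p50_ttft_ms", "input_tps_per_gpu"))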
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../vllm_benchmark/analysis/plot_pareto.py | 153 ++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py index 90b7ed1f8..99b24c7c1 100644 --- a/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py +++ b/experimental/multiturn/vllm_benchmark/analysis/plot_pareto.py @@ -414,6 +414,155 @@ def generate_pareto_only_figure(df: pd.DataFrame, results_dir: Path): plt.close() +def generate_pareto_only_figure_p50(df: pd.DataFrame, results_dir: Path): + """Generate a clean figure showing only Pareto frontier points with median (p50) latencies.""" + + df = df.copy() + df["interactivity"] = 1000.0 / df["p50_tpot_ms"] + + available_modes = sorted(df["offload"].unique()) + mode_titles = {"on": "Prefix+Offload", "off": "Prefix Only", "noprefix": "No Prefix"} + df_subsets = {mode: df[df["offload"] == mode] for mode in available_modes} + + num_cols = len(available_modes) + fig, axes = plt.subplots(4, num_cols, figsize=(6 * num_cols, 18)) + fig.suptitle("Pareto Frontiers (Median Latencies) with Concurrency Labels", fontsize=14) + + if num_cols == 1: + axes = axes.reshape(-1, 1) + + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + metrics_configs = [ + (0, "p50_ttft_ms", "input_tps_per_gpu", "TTFT", "Median TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity", "total_tps_per_gpu", "Interactivity", "Interactivity (1000/Median TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p50_latency_ms", "total_tps_per_gpu", "E2E Latency", "Median E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity", "output_tps_per_gpu", "Output Throughput", "Interactivity (1000/Median TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, metric_name, x_label, y_label, maximize_x in metrics_configs: + for col, mode in enumerate(available_modes): + ax = axes[row, col] + df_subset = df_subsets[mode] + title = f"{metric_name} ({mode_titles.get(mode, mode)})" + + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle='-', linewidth=2, alpha=0.5, color="black") + + for tp in sorted(frontier_df["tp"].unique()): + tp_data = frontier_df[frontier_df["tp"] == tp] + ax.scatter(tp_data[x_col], tp_data[y_col], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=150, alpha=0.9, edgecolors="black", linewidths=1, + label=f"TP={tp}", zorder=5) + + for _, point in frontier_df.iterrows(): + ax.annotate(f"conc={point['bs']}", + (point[x_col], point[y_col]), + textcoords="offset points", + xytext=(5, 5), + fontsize=8, + alpha=0.8) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(title) + ax.grid(True, alpha=0.3) + if len(frontier_df) > 0: + ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() + + output_file = results_dir / "pareto_frontiers_clean_p50.png" + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"Saved clean Median Pareto plot to {output_file}") + plt.close() + + +def generate_pareto_overlay_figure_p50(df: pd.DataFrame, results_dir: Path): + """Generate a figure with all prefix cache modes overlaid using median (p50) latencies.""" + + df = df.copy() + df["interactivity"] = 1000.0 / df["p50_tpot_ms"] + 
+ available_modes = df["offload"].unique() + + mode_styles = { + "on": ("-", "black", "black", (5, 8), "normal"), + "off": ("--", "none", "gray", (5, -12), "italic"), + "noprefix": (":", "red", "red", (5, -25), "oblique"), + } + mode_labels = { + "on": "Prefix+Offload", + "off": "Prefix Only", + "noprefix": "No Prefix", + } + + fig, axes = plt.subplots(4, 1, figsize=(10, 18)) + fig.suptitle("Pareto Frontiers (Median Latencies): Mode Comparison", fontsize=14) + + tp_colors = {1: "blue", 2: "green", 4: "orange", 8: "red"} + tp_markers = {1: "o", 2: "s", 4: "^", 8: "D"} + + plot_configs = [ + (0, "p50_ttft_ms", "input_tps_per_gpu", "TTFT vs Input Throughput/GPU", "Median TTFT (ms)", "Input Throughput/GPU (tok/s)", False), + (1, "interactivity", "total_tps_per_gpu", "Interactivity vs Total Throughput/GPU", "Interactivity (1000/Median TPOT)", "Total Throughput/GPU (tok/s)", True), + (2, "p50_latency_ms", "total_tps_per_gpu", "E2E Latency vs Total Throughput/GPU", "Median E2E Latency (ms)", "Total Throughput/GPU (tok/s)", False), + (3, "interactivity", "output_tps_per_gpu", "Output Throughput vs Interactivity", "Interactivity (1000/Median TPOT)", "Output Throughput/GPU (tok/s)", True), + ] + + for row, x_col, y_col, title, x_label, y_label, maximize_x in plot_configs: + ax = axes[row] + + for mode in ["on", "off", "noprefix"]: + if mode not in available_modes: + continue + + df_subset = df[df["offload"] == mode] + linestyle, marker_edge, line_color, label_offset, font_style = mode_styles[mode] + + frontier_df = compute_pareto_frontier_with_metadata(df_subset, x_col, y_col, maximize_x) + + if len(frontier_df) > 0: + ax.plot(frontier_df[x_col], frontier_df[y_col], + linestyle=linestyle, linewidth=2, alpha=0.6, color=line_color, + label=f"Pareto ({mode_labels[mode]})") + + for tp in sorted(frontier_df["tp"].unique()): + tp_data = frontier_df[frontier_df["tp"] == tp] + label = f"TP={tp}" if mode == "on" else None + ax.scatter(tp_data[x_col], tp_data[y_col], + c=tp_colors.get(tp, "purple"), marker=tp_markers.get(tp, "x"), + s=150, alpha=0.9, edgecolors=marker_edge, linewidths=1.5, + label=label, zorder=5) + + for _, point in frontier_df.iterrows(): + ax.annotate(f"conc={point['bs']}", + (point[x_col], point[y_col]), + textcoords="offset points", + xytext=label_offset, + fontsize=7, + alpha=0.7, + style=font_style) + + ax.set_xlabel(x_label) + ax.set_ylabel(y_label) + ax.set_title(title) + ax.grid(True, alpha=0.3) + ax.legend(fontsize=8, loc="lower right" if not maximize_x else "upper right") + + plt.tight_layout() + + output_file = results_dir / "pareto_frontiers_overlay_p50.png" + plt.savefig(output_file, dpi=150, bbox_inches='tight') + print(f"Saved overlay Median Pareto plot to {output_file}") + plt.close() + + def generate_pareto_only_figure_p90(df: pd.DataFrame, results_dir: Path): """Generate a clean figure showing only Pareto frontier points with p90 latencies.""" @@ -1172,6 +1321,10 @@ def main(results_dir: Path): # Generate overlay figure (on vs off comparison) generate_pareto_overlay_figure(df, results_dir) + # Generate P50 (Median) versions + generate_pareto_only_figure_p50(df, results_dir) + generate_pareto_overlay_figure_p50(df, results_dir) + # Generate P90 versions generate_pareto_only_figure_p90(df, results_dir) generate_pareto_overlay_figure_p90(df, results_dir) From aa8276c56f9424184cca509de076721115f1c446 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 3 Apr 2026 11:55:11 -0500 Subject: [PATCH 38/78] expand lmcache concurrency sweep to find GPU cache overflow points Add 
higher concurrency levels for H200 lmcache benchmarks to map out the regime where KV offloading improves TTFT and throughput: - TP=2: add bs=48,80,96,128 (cliff observed at bs=64) - TP=4: add bs=80,96,160,192 (cliff between bs=64 and bs=128) - TP=8: add bs=384,512,768,1024 (no cliff observed up to bs=256) Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/multiturn-agentic-trace.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index f371c5625..4f480aaab 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -22,13 +22,13 @@ mi355x-fp8-llama70b: h200-fp8-llama70b-lmcache: tp2: - users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 64] + users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 48, 64, 80, 96, 128] offload: ["on", "off"] tp4: - users: [1, 2, 4, 8, 12, 16, 20, 24, 32, 40, 64, 128] + users: [1, 2, 4, 8, 12, 16, 20, 24, 32, 40, 64, 80, 96, 128, 160, 192] offload: ["on", "off"] tp8: - users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256] + users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256, 384, 512, 768, 1024] offload: ["on", "off"] h100-fp8-llama70b-lmcache: From 4781db8da73a4067a9848d25f703c72b39d992cc Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 3 Apr 2026 11:58:49 -0500 Subject: [PATCH 39/78] prune dominated concurrency points from lmcache sweep Remove concurrency levels that are clearly dominated on all pareto frontiers based on prior results, reducing sweep from 92 to 74 experiments while adding higher concurrency to find cache overflow. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/multiturn-agentic-trace.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index 4f480aaab..be6bc625d 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -22,13 +22,13 @@ mi355x-fp8-llama70b: h200-fp8-llama70b-lmcache: tp2: - users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 48, 64, 80, 96, 128] + users: [2, 4, 8, 12, 16, 20, 32, 48, 64, 80, 96, 128] offload: ["on", "off"] tp4: - users: [1, 2, 4, 8, 12, 16, 20, 24, 32, 40, 64, 80, 96, 128, 160, 192] + users: [2, 4, 8, 12, 20, 32, 64, 80, 96, 128, 160, 192] offload: ["on", "off"] tp8: - users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256, 384, 512, 768, 1024] + users: [2, 4, 8, 16, 32, 48, 64, 128, 256, 384, 512, 768, 1024] offload: ["on", "off"] h100-fp8-llama70b-lmcache: From 3cb9983efa1581602c9817c1bfb08f949d4d1e60 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 3 Apr 2026 11:59:24 -0500 Subject: [PATCH 40/78] Revert "prune dominated concurrency points from lmcache sweep" This reverts commit 4781db8da73a4067a9848d25f703c72b39d992cc. 
--- .github/configs/multiturn-agentic-trace.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index be6bc625d..4f480aaab 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -22,13 +22,13 @@ mi355x-fp8-llama70b: h200-fp8-llama70b-lmcache: tp2: - users: [2, 4, 8, 12, 16, 20, 32, 48, 64, 80, 96, 128] + users: [1, 2, 4, 6, 8, 10, 12, 16, 20, 32, 48, 64, 80, 96, 128] offload: ["on", "off"] tp4: - users: [2, 4, 8, 12, 20, 32, 64, 80, 96, 128, 160, 192] + users: [1, 2, 4, 8, 12, 16, 20, 24, 32, 40, 64, 80, 96, 128, 160, 192] offload: ["on", "off"] tp8: - users: [2, 4, 8, 16, 32, 48, 64, 128, 256, 384, 512, 768, 1024] + users: [1, 2, 4, 8, 16, 24, 32, 48, 64, 128, 256, 384, 512, 768, 1024] offload: ["on", "off"] h100-fp8-llama70b-lmcache: From 16918631ebdcd93c1220b14c287969a200a68442 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 6 Apr 2026 12:27:13 -0500 Subject: [PATCH 41/78] remove excluded nodes from H100 DGXC SLURM config Clear the SLURM_EXCLUDED_NODELIST to allow scheduling on all hpc-gpu-1-* nodes. Co-Authored-By: Claude Opus 4.6 (1M context) --- runners/launch_h100-dgxc-slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 124c8de6e..d0d288aec 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -3,7 +3,7 @@ # System-specific configuration for H100 DGXC Slurm cluster SLURM_PARTITION="hpc-gpu-1" SLURM_ACCOUNT="customer" -SLURM_EXCLUDED_NODELIST="hpc-gpu-1-1,hpc-gpu-1-7,hpc-gpu-1-18" +SLURM_EXCLUDED_NODELIST="" set -x From a9069da00006302c170cb71328d5d22a2b943aa5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 8 Apr 2026 10:32:21 -0500 Subject: [PATCH 42/78] point kv-cache-tester submodule to upstream (callanjfox) at latest HEAD Fix branch was merged upstream and extended with additional reasoning token fixes for cache_rate_tester and working_set_tester. Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitmodules | 2 +- experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index c45593c07..1a7386038 100644 --- a/.gitmodules +++ b/.gitmodules @@ -3,4 +3,4 @@ url = https://github.com/cquil11/aiperf.git [submodule "experimental/multiturn/vllm_benchmark/kv-cache-tester"] path = experimental/multiturn/vllm_benchmark/kv-cache-tester - url = https://github.com/cquil11/kv-cache-tester.git + url = https://github.com/callanjfox/kv-cache-tester.git diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index 197b20136..404866515 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit 197b20136b5486b3e5d3140ec92b0ef3f4e0a3ec +Subproject commit 4048665151c0fa4039a1a936ddd3d1a902140097 From e66e75a6bf918eac1645b76a7685a01e2fe86e5f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 13 Apr 2026 13:52:28 -0500 Subject: [PATCH 43/78] Update kv-cache-tester submodule to neon-trace-support branch Adds Neon DB trace generator and 100 trace files from our own Claude Code proxy data. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index 404866515..559928a9b 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit 4048665151c0fa4039a1a936ddd3d1a902140097 +Subproject commit 559928a9b2330c35c40e329c4320378109b00ff0 From 037dd102dc7e6f190556c0b98c8b96d4bd5e5016 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 13 Apr 2026 13:53:59 -0500 Subject: [PATCH 44/78] Switch multiturn benchmark to Neon traces and add LFS support - Default TRACE_DIR now points to traces_neon/ (from our own proxy data) instead of the old callanjfox traces with s/n pairing - TRACE_DIR is configurable via env var to use old traces if needed - Add git lfs pull step to workflow for submodule LFS files Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/benchmark-multiturn-tmpl.yml | 4 ++++ benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index f366564d3..506ab0999 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -124,7 +124,11 @@ jobs: fetch-depth: 0 ref: ${{ inputs.ref || github.ref }} submodules: true + lfs: true + - name: Pull LFS files in submodules + run: | + git -C experimental/multiturn/vllm_benchmark/kv-cache-tester lfs pull || true - name: Launch job script env: diff --git a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh index d22448892..b3b5a80d3 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh @@ -10,6 +10,7 @@ set -x # MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR # Optional: # PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# TRACE_DIR (default: kv-cache-tester/traces_neon) # DURATION (default 1800, benchmark duration in seconds) # MAX_DELAY (default 60, max gap between requests in seconds) # ADVANCE_MIN (default 0.0, min trace advancement fraction) @@ -44,7 +45,7 @@ nvidia-smi # ---- Paths ----------------------------------------------------------------- MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" -TRACE_DIR="$KV_CACHE_TESTER_DIR/traces" +TRACE_DIR="${TRACE_DIR:-$KV_CACHE_TESTER_DIR/traces_neon}" pip install --quiet urllib3 requests 2>/dev/null || true From 7b51764533a4c0151e6c35c4df0b6d6c8db59cdd Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 13 Apr 2026 13:59:22 -0500 Subject: [PATCH 45/78] Remove LFS, update submodule to non-LFS trace commit LFS caused auth issues in CI (submodule PAT scope). Traces are now committed as regular files (oversized trace_0045 excluded). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/benchmark-multiturn-tmpl.yml | 5 ----- experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index 506ab0999..b283da930 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -124,11 +124,6 @@ jobs: fetch-depth: 0 ref: ${{ inputs.ref || github.ref }} submodules: true - lfs: true - - - name: Pull LFS files in submodules - run: | - git -C experimental/multiturn/vllm_benchmark/kv-cache-tester lfs pull || true - name: Launch job script env: diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index 559928a9b..0880f6d69 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit 559928a9b2330c35c40e329c4320378109b00ff0 +Subproject commit 0880f6d695d7c31bdbcfda34eeedba8c9b55539d From 8f4c069ae92f30f39f073dc3aa9ae97ad21cb9b4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 14 Apr 2026 10:52:21 -0500 Subject: [PATCH 46/78] no exclusive --- runners/launch_h100-cw.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 28e89e0cb..2c8efaca7 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -7,7 +7,7 @@ LOCK_FILE="${SQUASH_FILE}.lock" set -x -JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then echo "ERROR: salloc failed to allocate a job" From aebb6fec216cddce8e3149cf59b120401f6d1f10 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 14 Apr 2026 11:00:15 -0500 Subject: [PATCH 47/78] no exclusive dgxc --- runners/launch_h100-cw.sh | 2 +- runners/launch_h100-dgxc-slurm.sh | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 2c8efaca7..28e89e0cb 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -7,7 +7,7 @@ LOCK_FILE="${SQUASH_FILE}.lock" set -x -JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then echo "ERROR: salloc failed to allocate a job" diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index d0d288aec..d92a3e978 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -231,7 +231,7 @@ else HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/" SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - salloc --exclude="$SLURM_EXCLUDED_NODELIST" --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" + salloc 
--partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" @@ -242,7 +242,6 @@ else fi srun --jobid=$JOB_ID \ - --exclusive \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ From ea1013d376ad668bf4793280c7158d3a5b2681e4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 14 Apr 2026 11:24:24 -0500 Subject: [PATCH 48/78] increase aiperf configure timeout for H100 lmcache benchmark The 14GB LMCache dataset mmap takes >5 minutes on some nodes, exceeding the default 300s PROFILE_CONFIGURE_TIMEOUT. Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 754599b61..111fd7413 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -202,6 +202,7 @@ sleep 2 # ---- Run AIPerf benchmark ---------------------------------------------------- export AIPERF_LOG_CONVERSATIONS="$RESULT_DIR/conversations.jsonl" +export AIPERF_SERVICE_PROFILE_CONFIGURE_TIMEOUT=1800 AIPERF_CMD="$AIPERF_BIN profile" AIPERF_CMD+=" --model $MODEL" From 674793dd18cf452b5e73af5a5e5c595532d8eccd Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 14 Apr 2026 12:19:52 -0500 Subject: [PATCH 49/78] update kv-cache-tester submodule: add cumulative assessment metrics Co-Authored-By: Claude Opus 4.6 (1M context) --- experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index 0880f6d69..25f9a34c3 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit 0880f6d695d7c31bdbcfda34eeedba8c9b55539d +Subproject commit 25f9a34c3e594f5b6fef68ea55c003e61d62dd31 From 80e324cfd9366a22b71f5f02efdd3e1874bee9a6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 14 Apr 2026 12:31:44 -0500 Subject: [PATCH 50/78] switch to simple KV offloading, remove vLLM patches - Replace native offloading with SimpleCPUOffloadConnector (VLLM_USE_SIMPLE_KV_OFFLOAD=1 + --no-disable-hybrid-kv-cache-manager) for ~10% better throughput and TPOT per vllm-project/vllm#37160 - Remove local_cache_hit and scheduler.py monkey-patches (fixed in vLLM 0.19.0+), replace with version check warning - Add AIPERF_SERVICE_PROFILE_CONFIGURE_TIMEOUT=1800 to H200 and B200 (H100 already had it) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../multiturn_fp4_b200_lmcache_aiperf.sh | 50 ++++--------------- .../multiturn_fp8_h100_lmcache_aiperf.sh | 49 ++++-------------- .../multiturn_fp8_h200_lmcache_aiperf.sh | 50 ++++--------------- 3 files changed, 32 insertions(+), 117 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh index 2320acaed..5e4fc86b1 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_lmcache_aiperf.sh @@ -45,44 +45,14 @@ 
TRACE_FILE="$RESULT_DIR/lmcache_traces.jsonl" pip install --quiet urllib3 requests orjson datasets 2>/dev/null || true -# Patch vLLM bug: local_cache_hit counter can go negative under high load -# (causes "Counters can only be incremented by non-negative amounts" crash) -STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") -if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then - echo "Patching vLLM stats.py: $STATS_FILE" - python3 -c " -import re, sys -with open(sys.argv[1]) as f: - src = f.read() -src = src.replace( - 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', - 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', -) -with open(sys.argv[1], 'w') as f: - f.write(src) -" "$STATS_FILE" -fi - -# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) -# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) -SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") -if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then - echo "Patching vLLM scheduler.py: $SCHED_FILE" - python3 << 'PYEOF' "$SCHED_FILE" -import sys -with open(sys.argv[1]) as f: - src = f.read() -src = src.replace( - 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', - 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', -) -src = src.replace( - 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', - 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', -) -with open(sys.argv[1], 'w') as f: - f.write(src) -PYEOF +# Check vLLM version — patches for local_cache_hit and scheduler stale KV +# transfer are fixed in 0.19.0+ +VLLM_VERSION=$(python3 -c "import vllm; print(vllm.__version__)" 2>/dev/null || echo "unknown") +echo "vLLM version: $VLLM_VERSION" +if python3 -c "from packaging.version import Version; exit(0 if Version('${VLLM_VERSION}') >= Version('0.19.0') else 1)" 2>/dev/null; then + echo "vLLM >= 0.19.0: no patches needed" +else + echo "WARNING: vLLM $VLLM_VERSION < 0.19.0 — local_cache_hit and scheduler patches are no longer applied. Upgrade to 0.19.0+ for stability." 
fi mkdir -p "$RESULT_DIR" @@ -157,9 +127,10 @@ if [ "${EP_SIZE:-0}" -gt 1 ]; then fi if [ "$OFFLOAD_MODE" = "on" ]; then + export VLLM_USE_SIMPLE_KV_OFFLOAD=1 VLLM_CMD+=" --kv_offloading_backend native" VLLM_CMD+=" --kv_offloading_size $offload_size" - VLLM_CMD+=" --disable-hybrid-kv-cache-manager" + VLLM_CMD+=" --no-disable-hybrid-kv-cache-manager" elif [ "$OFFLOAD_MODE" = "noprefix" ]; then VLLM_CMD+=" --no-enable-prefix-caching" fi @@ -207,6 +178,7 @@ sleep 2 # ---- Run AIPerf benchmark ---------------------------------------------------- export AIPERF_LOG_CONVERSATIONS="$RESULT_DIR/conversations.jsonl" +export AIPERF_SERVICE_PROFILE_CONFIGURE_TIMEOUT=1800 AIPERF_CMD="$AIPERF_BIN profile" AIPERF_CMD+=" --model $MODEL" diff --git a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh index 111fd7413..a2e96d1d5 100755 --- a/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h100_lmcache_aiperf.sh @@ -45,44 +45,14 @@ TRACE_FILE="$RESULT_DIR/lmcache_traces.jsonl" pip install --quiet urllib3 requests orjson datasets 2>/dev/null || true -# Patch vLLM bug: local_cache_hit counter can go negative under high load -# (causes "Counters can only be incremented by non-negative amounts" crash) -STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") -if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then - echo "Patching vLLM stats.py: $STATS_FILE" - python3 -c " -import re, sys -with open(sys.argv[1]) as f: - src = f.read() -src = src.replace( - 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', - 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', -) -with open(sys.argv[1], 'w') as f: - f.write(src) -" "$STATS_FILE" -fi - -# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) -# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) -SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") -if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then - echo "Patching vLLM scheduler.py: $SCHED_FILE" - python3 << 'PYEOF' "$SCHED_FILE" -import sys -with open(sys.argv[1]) as f: - src = f.read() -src = src.replace( - 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', - 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', -) -src = src.replace( - 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', - 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', -) -with open(sys.argv[1], 'w') as f: - f.write(src) -PYEOF +# Check vLLM version — patches for local_cache_hit and scheduler stale KV +# transfer are fixed in 0.19.0+ +VLLM_VERSION=$(python3 -c "import vllm; print(vllm.__version__)" 2>/dev/null || echo "unknown") +echo 
"vLLM version: $VLLM_VERSION" +if python3 -c "from packaging.version import Version; exit(0 if Version('${VLLM_VERSION}') >= Version('0.19.0') else 1)" 2>/dev/null; then + echo "vLLM >= 0.19.0: no patches needed" +else + echo "WARNING: vLLM $VLLM_VERSION < 0.19.0 — local_cache_hit and scheduler patches are no longer applied. Upgrade to 0.19.0+ for stability." fi mkdir -p "$RESULT_DIR" @@ -152,9 +122,10 @@ VLLM_CMD+=" --gpu-memory-utilization 0.9" VLLM_CMD+=" --tensor-parallel-size $TP" if [ "$OFFLOAD_MODE" = "on" ]; then + export VLLM_USE_SIMPLE_KV_OFFLOAD=1 VLLM_CMD+=" --kv_offloading_backend native" VLLM_CMD+=" --kv_offloading_size $offload_size" - VLLM_CMD+=" --disable-hybrid-kv-cache-manager" + VLLM_CMD+=" --no-disable-hybrid-kv-cache-manager" elif [ "$OFFLOAD_MODE" = "noprefix" ]; then VLLM_CMD+=" --no-enable-prefix-caching" fi diff --git a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh index e02f00b95..43d72fa0f 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_lmcache_aiperf.sh @@ -41,44 +41,14 @@ TRACE_FILE="$RESULT_DIR/lmcache_traces.jsonl" pip install --quiet urllib3 requests orjson datasets 2>/dev/null || true -# Patch vLLM bug: local_cache_hit counter can go negative under high load -# (causes "Counters can only be incremented by non-negative amounts" crash) -STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") -if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then - echo "Patching vLLM stats.py: $STATS_FILE" - python3 -c " -import re, sys -with open(sys.argv[1]) as f: - src = f.read() -src = src.replace( - 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', - 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', -) -with open(sys.argv[1], 'w') as f: - f.write(src) -" "$STATS_FILE" -fi - -# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) -# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) -SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") -if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then - echo "Patching vLLM scheduler.py: $SCHED_FILE" - python3 << 'PYEOF' "$SCHED_FILE" -import sys -with open(sys.argv[1]) as f: - src = f.read() -src = src.replace( - 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', - 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', -) -src = src.replace( - 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', - 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', -) -with open(sys.argv[1], 'w') as f: - f.write(src) -PYEOF +# Check vLLM version — patches for local_cache_hit and scheduler stale KV +# transfer are fixed 
in 0.19.0+ +VLLM_VERSION=$(python3 -c "import vllm; print(vllm.__version__)" 2>/dev/null || echo "unknown") +echo "vLLM version: $VLLM_VERSION" +if python3 -c "from packaging.version import Version; exit(0 if Version('${VLLM_VERSION}') >= Version('0.19.0') else 1)" 2>/dev/null; then + echo "vLLM >= 0.19.0: no patches needed" +else + echo "WARNING: vLLM $VLLM_VERSION < 0.19.0 — local_cache_hit and scheduler patches are no longer applied. Upgrade to 0.19.0+ for stability." fi mkdir -p "$RESULT_DIR" @@ -148,9 +118,10 @@ VLLM_CMD+=" --gpu-memory-utilization 0.9" VLLM_CMD+=" --tensor-parallel-size $TP" if [ "$OFFLOAD_MODE" = "on" ]; then + export VLLM_USE_SIMPLE_KV_OFFLOAD=1 VLLM_CMD+=" --kv_offloading_backend native" VLLM_CMD+=" --kv_offloading_size $offload_size" - VLLM_CMD+=" --disable-hybrid-kv-cache-manager" + VLLM_CMD+=" --no-disable-hybrid-kv-cache-manager" elif [ "$OFFLOAD_MODE" = "noprefix" ]; then VLLM_CMD+=" --no-enable-prefix-caching" fi @@ -198,6 +169,7 @@ sleep 2 # ---- Run AIPerf benchmark ---------------------------------------------------- export AIPERF_LOG_CONVERSATIONS="$RESULT_DIR/conversations.jsonl" +export AIPERF_SERVICE_PROFILE_CONFIGURE_TIMEOUT=1800 AIPERF_CMD="$AIPERF_BIN profile" AIPERF_CMD+=" --model $MODEL" From 11245a987a3863d1a350bd52de56f1b903697c6d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 14 Apr 2026 12:37:57 -0500 Subject: [PATCH 51/78] switch trace replay scripts to simple KV offloading, remove patches Same changes as the aiperf scripts: replace native offloading with SimpleCPUOffloadConnector, remove monkey-patches fixed in vLLM 0.19.0+. Applies to: B200 trace replay, H200 trace replay, MI355X trace replay. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../multiturn_fp4_b200_trace_replay.sh | 49 ++++--------------- .../multiturn_fp8_h200_trace_replay.sh | 49 ++++--------------- .../multiturn_fp8_mi355x_trace_replay.sh | 49 ++++--------------- 3 files changed, 30 insertions(+), 117 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh index b3b5a80d3..1c97b7827 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh @@ -49,44 +49,14 @@ TRACE_DIR="${TRACE_DIR:-$KV_CACHE_TESTER_DIR/traces_neon}" pip install --quiet urllib3 requests 2>/dev/null || true -# Patch vLLM bug: local_cache_hit counter can go negative under high load -# (causes "Counters can only be incremented by non-negative amounts" crash) -STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") -if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then - echo "Patching vLLM stats.py: $STATS_FILE" - python3 -c " -import re, sys -with open(sys.argv[1]) as f: - src = f.read() -src = src.replace( - 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', - 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', -) -with open(sys.argv[1], 'w') as f: - f.write(src) -" "$STATS_FILE" -fi - -# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) -# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) -SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 
2>/dev/null || echo "") -if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then - echo "Patching vLLM scheduler.py: $SCHED_FILE" - python3 << 'PYEOF' "$SCHED_FILE" -import sys -with open(sys.argv[1]) as f: - src = f.read() -src = src.replace( - 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', - 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', -) -src = src.replace( - 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', - 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', -) -with open(sys.argv[1], 'w') as f: - f.write(src) -PYEOF +# Check vLLM version — patches for local_cache_hit and scheduler stale KV +# transfer are fixed in 0.19.0+ +VLLM_VERSION=$(python3 -c "import vllm; print(vllm.__version__)" 2>/dev/null || echo "unknown") +echo "vLLM version: $VLLM_VERSION" +if python3 -c "from packaging.version import Version; exit(0 if Version('${VLLM_VERSION}') >= Version('0.19.0') else 1)" 2>/dev/null; then + echo "vLLM >= 0.19.0: no patches needed" +else + echo "WARNING: vLLM $VLLM_VERSION < 0.19.0 — local_cache_hit and scheduler patches are no longer applied. Upgrade to 0.19.0+ for stability." fi SERVER_LOG="$RESULT_DIR/server.log" @@ -119,9 +89,10 @@ if [ "${EP_SIZE:-0}" -gt 1 ]; then fi if [ "$OFFLOAD_MODE" = "on" ]; then + export VLLM_USE_SIMPLE_KV_OFFLOAD=1 VLLM_CMD+=" --kv_offloading_backend native" VLLM_CMD+=" --kv_offloading_size $offload_size" - VLLM_CMD+=" --disable-hybrid-kv-cache-manager" + VLLM_CMD+=" --no-disable-hybrid-kv-cache-manager" elif [ "$OFFLOAD_MODE" = "noprefix" ]; then VLLM_CMD+=" --no-enable-prefix-caching" fi diff --git a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh index f3f967a82..404cd9368 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh @@ -48,44 +48,14 @@ TRACE_DIR="$KV_CACHE_TESTER_DIR/traces" pip install --quiet urllib3 requests 2>/dev/null || true -# Patch vLLM bug: local_cache_hit counter can go negative under high load -# (causes "Counters can only be incremented by non-negative amounts" crash) -STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") -if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then - echo "Patching vLLM stats.py: $STATS_FILE" - python3 -c " -import re, sys -with open(sys.argv[1]) as f: - src = f.read() -src = src.replace( - 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', - 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', -) -with open(sys.argv[1], 'w') as f: - f.write(src) -" "$STATS_FILE" -fi - -# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) -# (causes "AssertionError: assert req_id in self.requests" crash under KV offloading) -SCHED_FILE=$(python3 -c "import vllm; import os; 
print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") -if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then - echo "Patching vLLM scheduler.py: $SCHED_FILE" - python3 << 'PYEOF' "$SCHED_FILE" -import sys -with open(sys.argv[1]) as f: - src = f.read() -src = src.replace( - 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', - 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', -) -src = src.replace( - 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', - 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', -) -with open(sys.argv[1], 'w') as f: - f.write(src) -PYEOF +# Check vLLM version — patches for local_cache_hit and scheduler stale KV +# transfer are fixed in 0.19.0+ +VLLM_VERSION=$(python3 -c "import vllm; print(vllm.__version__)" 2>/dev/null || echo "unknown") +echo "vLLM version: $VLLM_VERSION" +if python3 -c "from packaging.version import Version; exit(0 if Version('${VLLM_VERSION}') >= Version('0.19.0') else 1)" 2>/dev/null; then + echo "vLLM >= 0.19.0: no patches needed" +else + echo "WARNING: vLLM $VLLM_VERSION < 0.19.0 — local_cache_hit and scheduler patches are no longer applied. Upgrade to 0.19.0+ for stability." fi SERVER_LOG="$RESULT_DIR/server.log" @@ -114,9 +84,10 @@ VLLM_CMD+=" --gpu-memory-utilization 0.9" VLLM_CMD+=" --tensor-parallel-size $TP" if [ "$OFFLOAD_MODE" = "on" ]; then + export VLLM_USE_SIMPLE_KV_OFFLOAD=1 VLLM_CMD+=" --kv_offloading_backend native" VLLM_CMD+=" --kv_offloading_size $offload_size" - VLLM_CMD+=" --disable-hybrid-kv-cache-manager" + VLLM_CMD+=" --no-disable-hybrid-kv-cache-manager" elif [ "$OFFLOAD_MODE" = "noprefix" ]; then VLLM_CMD+=" --no-enable-prefix-caching" fi diff --git a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh index 4cf20c453..68128d37f 100755 --- a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh @@ -48,44 +48,14 @@ TRACE_DIR="$KV_CACHE_TESTER_DIR/traces" pip install --quiet urllib3 requests 2>/dev/null || true -# Patch vLLM bug: local_cache_hit counter can go negative under high load -# (causes "Counters can only be incremented by non-negative amounts" crash) -STATS_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'metrics', 'stats.py'))" 2>/dev/null || echo "") -if [ -n "$STATS_FILE" ] && [ -f "$STATS_FILE" ] && grep -q 'self.local_cache_hit += (' "$STATS_FILE"; then - echo "Patching vLLM stats.py: $STATS_FILE" - python3 -c " -import re, sys -with open(sys.argv[1]) as f: - src = f.read() -src = src.replace( - 'self.local_cache_hit += (\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', - 'self.local_cache_hit += max(0,\n num_cached_tokens + recomputed - num_external_computed_tokens\n )', -) -with open(sys.argv[1], 'w') as f: - f.write(src) -" "$STATS_FILE" -fi - -# Patch vLLM bug: stale KV transfer callback after request cleanup (PR #37859) -# (causes "AssertionError: assert 
req_id in self.requests" crash under KV offloading) -SCHED_FILE=$(python3 -c "import vllm; import os; print(os.path.join(os.path.dirname(vllm.__file__), 'v1', 'core', 'sched', 'scheduler.py'))" 2>/dev/null || echo "") -if [ -n "$SCHED_FILE" ] && [ -f "$SCHED_FILE" ] && grep -q 'assert req_id in self.requests' "$SCHED_FILE"; then - echo "Patching vLLM scheduler.py: $SCHED_FILE" - python3 << 'PYEOF' "$SCHED_FILE" -import sys -with open(sys.argv[1]) as f: - src = f.read() -src = src.replace( - 'assert req_id in self.requests\n req = self.requests[req_id]\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', - 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished recving KV transfer for unknown request %s", req_id)\n self.finished_recving_kv_req_ids.discard(req_id)\n continue\n if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:', -) -src = src.replace( - 'assert req_id in self.requests\n self._free_blocks(self.requests[req_id])', - 'req = self.requests.get(req_id)\n if req is None:\n logger.debug("Ignoring finished sending KV transfer for unknown request %s", req_id)\n continue\n self._free_blocks(req)', -) -with open(sys.argv[1], 'w') as f: - f.write(src) -PYEOF +# Check vLLM version — patches for local_cache_hit and scheduler stale KV +# transfer are fixed in 0.19.0+ +VLLM_VERSION=$(python3 -c "import vllm; print(vllm.__version__)" 2>/dev/null || echo "unknown") +echo "vLLM version: $VLLM_VERSION" +if python3 -c "from packaging.version import Version; exit(0 if Version('${VLLM_VERSION}') >= Version('0.19.0') else 1)" 2>/dev/null; then + echo "vLLM >= 0.19.0: no patches needed" +else + echo "WARNING: vLLM $VLLM_VERSION < 0.19.0 — local_cache_hit and scheduler patches are no longer applied. Upgrade to 0.19.0+ for stability." 
fi SERVER_LOG="$RESULT_DIR/server.log" @@ -114,9 +84,10 @@ VLLM_CMD+=" --gpu-memory-utilization 0.9" VLLM_CMD+=" --tensor-parallel-size $TP" if [ "$OFFLOAD_MODE" = "on" ]; then + export VLLM_USE_SIMPLE_KV_OFFLOAD=1 VLLM_CMD+=" --kv_offloading_backend native" VLLM_CMD+=" --kv_offloading_size $offload_size" - VLLM_CMD+=" --disable-hybrid-kv-cache-manager" + VLLM_CMD+=" --no-disable-hybrid-kv-cache-manager" elif [ "$OFFLOAD_MODE" = "noprefix" ]; then VLLM_CMD+=" --no-enable-prefix-caching" fi From 38fc1fb0c028d51e40b6133d7f267cb7a20f6b62 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 14 Apr 2026 12:39:28 -0500 Subject: [PATCH 52/78] add explicit --timing-strategy think-only to trace replay scripts Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh | 1 + benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh | 1 + benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh | 1 + 3 files changed, 3 insertions(+) diff --git a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh index 1c97b7827..e0bfc25d7 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh @@ -136,6 +136,7 @@ sleep 2 REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" --timing-strategy think-only" REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" REPLAY_CMD+=" --start-users $USERS" REPLAY_CMD+=" --max-users $USERS" diff --git a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh index 404cd9368..cb2597a63 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh @@ -131,6 +131,7 @@ sleep 2 REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" --timing-strategy think-only" REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" REPLAY_CMD+=" --start-users $USERS" REPLAY_CMD+=" --max-users $USERS" diff --git a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh index 68128d37f..31b7691d0 100755 --- a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh @@ -132,6 +132,7 @@ sleep 2 REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" --timing-strategy think-only" REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" REPLAY_CMD+=" --start-users $USERS" REPLAY_CMD+=" --max-users $USERS" From b54ce74ae1fcceba7af1e059d32a444e9f4f68d3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 14 Apr 2026 14:39:13 -0500 Subject: [PATCH 53/78] update kv-cache-tester: add debug log for streaming delta attrs Co-Authored-By: Claude Opus 4.6 (1M context) --- experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index 25f9a34c3..0ab42bf93 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ 
b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit 25f9a34c3e594f5b6fef68ea55c003e61d62dd31 +Subproject commit 0ab42bf9339f0f2b792a644f1f4cfaa5a65f0758 From b4bc408b86f0255173fdb970e1456d9c33833acf Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 14 Apr 2026 15:19:16 -0500 Subject: [PATCH 54/78] add --ignore-eos option for trace replay benchmarks Passes ignore_eos=true to vLLM via extra_body when IGNORE_EOS=true, forcing exact output token count from traces. Plumbed through: - kv-cache-tester: --ignore-eos CLI flag - trace replay scripts: conditional on IGNORE_EOS env var - GH Actions: ignore_eos workflow dispatch input Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/benchmark-multiturn-tmpl.yml | 6 ++++++ .github/workflows/multiturn-sweep.yml | 6 ++++++ benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh | 3 +++ benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh | 3 +++ benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh | 3 +++ experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +- 6 files changed, 22 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index b283da930..ee39beada 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -55,6 +55,11 @@ on: description: "Git ref (branch/sha) to checkout" required: false type: string + ignore-eos: + description: "Ignore EOS token and force exact output_tokens from trace" + required: false + type: string + default: 'false' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} @@ -73,6 +78,7 @@ env: TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }} SCRIPT_SUFFIX: ${{ inputs.script-suffix }} SPEC_DECODING: 'off' + IGNORE_EOS: ${{ inputs.ignore-eos }} permissions: contents: read diff --git a/.github/workflows/multiturn-sweep.yml b/.github/workflows/multiturn-sweep.yml index 5ed7bf59e..bf3e6af1e 100644 --- a/.github/workflows/multiturn-sweep.yml +++ b/.github/workflows/multiturn-sweep.yml @@ -88,6 +88,11 @@ on: description: 'Git ref (branch/sha) to checkout' required: false type: string + ignore_eos: + description: 'Ignore EOS token and force exact output_tokens from trace (true/false)' + required: false + default: 'false' + type: string jobs: # --------------------------------------------------------------------------- @@ -191,6 +196,7 @@ jobs: script-suffix: ${{ inputs.script_suffix }} ep: "${{ matrix.ep || inputs.ep }}" ref: ${{ inputs.ref }} + ignore-eos: ${{ inputs.ignore_eos }} # --------------------------------------------------------------------------- # Collect & aggregate results diff --git a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh index e0bfc25d7..21c34e2e2 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh @@ -150,6 +150,9 @@ REPLAY_CMD+=" --advance-min $ADVANCE_MIN" REPLAY_CMD+=" --advance-max $ADVANCE_MAX" REPLAY_CMD+=" --seed 42" REPLAY_CMD+=" --no-color" +if [ "${IGNORE_EOS:-false}" = "true" ]; then + REPLAY_CMD+=" --ignore-eos" +fi echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" diff --git a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh index cb2597a63..5bd76de34 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh +++ 
b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh @@ -145,6 +145,9 @@ REPLAY_CMD+=" --advance-min $ADVANCE_MIN" REPLAY_CMD+=" --advance-max $ADVANCE_MAX" REPLAY_CMD+=" --seed 42" REPLAY_CMD+=" --no-color" +if [ "${IGNORE_EOS:-false}" = "true" ]; then + REPLAY_CMD+=" --ignore-eos" +fi echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" diff --git a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh index 31b7691d0..f2a9700d4 100755 --- a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh @@ -146,6 +146,9 @@ REPLAY_CMD+=" --advance-min $ADVANCE_MIN" REPLAY_CMD+=" --advance-max $ADVANCE_MAX" REPLAY_CMD+=" --seed 42" REPLAY_CMD+=" --no-color" +if [ "${IGNORE_EOS:-false}" = "true" ]; then + REPLAY_CMD+=" --ignore-eos" +fi echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index 0ab42bf93..261cbcf1f 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit 0ab42bf9339f0f2b792a644f1f4cfaa5a65f0758 +Subproject commit 261cbcf1f4bfea6742e2894610b98fd2c35a1f58 From f35c1fe73870bb059e46790cfa8dd15f0f961e3a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 14 Apr 2026 15:45:26 -0500 Subject: [PATCH 55/78] pin checkout to trigger commit SHA, not branch HEAD Use github.sha instead of github.ref so in-flight sweep jobs don't pick up new commits pushed to the branch mid-run. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/benchmark-multiturn-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index ee39beada..9ed704612 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -128,7 +128,7 @@ jobs: with: token: ${{ secrets.REPO_PAT }} fetch-depth: 0 - ref: ${{ inputs.ref || github.ref }} + ref: ${{ inputs.ref || github.sha }} submodules: true - name: Launch job script From cbfffd0f310b40d80d91f032da2a82dacdc4d738 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 15 Apr 2026 09:15:26 -0500 Subject: [PATCH 56/78] add FP4 MI355X trace replay benchmark script Based on B200 FP4 trace replay, adapted for MI355X (ROCm): - rocm-smi fallback for GPU detection - No CUDA arch or NVIDIA-specific compilation config - Simple KV offloading, version warning, ignore-eos support Co-Authored-By: Claude Opus 4.6 (1M context) --- .../multiturn_fp4_mi355x_trace_replay.sh | 177 ++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100755 benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh diff --git a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh new file mode 100755 index 000000000..1bc450d99 --- /dev/null +++ b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh @@ -0,0 +1,177 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Trace replay benchmark for FP4 models on MI355X. +# Replays real agentic coding traces at a fixed number of concurrent users. +# Uses kv-cache-tester/trace_replay_tester.py with realistic cache patterns. 
+# +# Required env vars: +# MODEL, TP, USERS, OFFLOAD_MODE, TOTAL_CPU_DRAM_GB, RESULT_DIR +# Optional: +# PORT (default 8888), REQUEST_TIMEOUT (default 3600) +# TRACE_DIR (default: kv-cache-tester/traces_neon) +# DURATION (default 1800, benchmark duration in seconds) +# MAX_DELAY (default 60, max gap between requests in seconds) +# ADVANCE_MIN (default 0.0, min trace advancement fraction) +# ADVANCE_MAX (default 0.7, max trace advancement fraction) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + USERS \ + OFFLOAD_MODE \ + TOTAL_CPU_DRAM_GB \ + RESULT_DIR + +PORT=${PORT:-8888} +REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-3600} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ---- Download model -------------------------------------------------------- +hf download "$MODEL" + +nvidia-smi 2>/dev/null || rocm-smi 2>/dev/null || true + +# ---- Paths ----------------------------------------------------------------- +MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark +KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" +TRACE_DIR="${TRACE_DIR:-$KV_CACHE_TESTER_DIR/traces_neon}" + +pip install --quiet urllib3 requests 2>/dev/null || true + +# Check vLLM version — patches for local_cache_hit and scheduler stale KV +# transfer are fixed in 0.19.0+ +VLLM_VERSION=$(python3 -c "import vllm; print(vllm.__version__)" 2>/dev/null || echo "unknown") +echo "vLLM version: $VLLM_VERSION" +if python3 -c "from packaging.version import Version; exit(0 if Version('${VLLM_VERSION}') >= Version('0.19.0') else 1)" 2>/dev/null; then + echo "vLLM >= 0.19.0: no patches needed" +else + echo "WARNING: vLLM $VLLM_VERSION < 0.19.0 — local_cache_hit and scheduler patches are no longer applied. Upgrade to 0.19.0+ for stability." +fi + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# ---- Generate vLLM config -------------------------------------------------- +cat > "$RESULT_DIR/config.yaml" << 'EOF' +kv-cache-dtype: fp8 +async-scheduling: true +EOF + +# ---- Build vLLM command ----------------------------------------------------- +offload_size=$TOTAL_CPU_DRAM_GB + +VLLM_CMD="vllm serve $MODEL --host 0.0.0.0 --port $PORT" +VLLM_CMD+=" --config $RESULT_DIR/config.yaml" +VLLM_CMD+=" --gpu-memory-utilization 0.9" +VLLM_CMD+=" --tensor-parallel-size $TP" +if [ "${EP_SIZE:-0}" -gt 1 ]; then + VLLM_CMD+=" --enable-expert-parallel" +fi + +if [ "$OFFLOAD_MODE" = "on" ]; then + export VLLM_USE_SIMPLE_KV_OFFLOAD=1 + VLLM_CMD+=" --kv_offloading_backend native" + VLLM_CMD+=" --kv_offloading_size $offload_size" + VLLM_CMD+=" --no-disable-hybrid-kv-cache-manager" +elif [ "$OFFLOAD_MODE" = "noprefix" ]; then + VLLM_CMD+=" --no-enable-prefix-caching" +fi + +echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" + +# ---- Start vLLM server ------------------------------------------------------ +echo "Starting vllm server..." +# MI355X is ROCm — no CUDA arch needed +export PYTHONNOUSERSITE=1 + +$VLLM_CMD > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready \ + --port "$PORT" \ + --server-log "$SERVER_LOG" \ + --server-pid "$SERVER_PID" + +# ---- Install dependencies --------------------------------------------------- +set -x +pip install -q -r "$MULTITURN_DIR/requirements.txt" +pip install -q -r "$KV_CACHE_TESTER_DIR/requirements.txt" +set +x + +# ---- Start server metrics collector ----------------------------------------- +export PYTHONPATH="$MULTITURN_DIR:${PYTHONPATH:-}" + +echo "Starting server metrics collector..." +python3 -m bench.run_metrics_collector \ + --url "http://localhost:$PORT" \ + --output-prefix "$RESULT_DIR/metrics" \ + --pid-file "$RESULT_DIR/metrics_collector.pid" & +METRICS_PID=$! +echo "Metrics collector PID: $METRICS_PID" + +sleep 2 + +# ---- Run trace replay benchmark --------------------------------------------- +REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" +REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" +REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" --timing-strategy think-only" +REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" +REPLAY_CMD+=" --start-users $USERS" +REPLAY_CMD+=" --max-users $USERS" +REPLAY_CMD+=" --max-ttft 9999" +REPLAY_CMD+=" --test-duration $DURATION" +REPLAY_CMD+=" --recycle" +REPLAY_CMD+=" --max-delay $MAX_DELAY" +REPLAY_CMD+=" --max-concurrent-requests 0" +REPLAY_CMD+=" --max-new-tokens-per-period 999999999" +REPLAY_CMD+=" --advance-min $ADVANCE_MIN" +REPLAY_CMD+=" --advance-max $ADVANCE_MAX" +REPLAY_CMD+=" --seed 42" +REPLAY_CMD+=" --no-color" +if [ "${IGNORE_EOS:-false}" = "true" ]; then + REPLAY_CMD+=" --ignore-eos" +fi + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +if $REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"; then + echo "SUCCESS" > "$RESULT_DIR/status.txt" + echo "Benchmark completed successfully" +else + echo "FAILED" > "$RESULT_DIR/status.txt" + echo "Benchmark failed" +fi +set +x + +# ---- Analyze workload distributions ----------------------------------------- +echo "Analyzing workload distributions..." +python3 "$MULTITURN_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true + +# ---- Stop metrics collector ------------------------------------------------- +echo "Stopping metrics collector..." +if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then + kill -TERM "$METRICS_PID" 2>/dev/null || true + wait "$METRICS_PID" 2>/dev/null || true +fi + +# ---- Cleanup ----------------------------------------------------------------- +echo "Stopping vllm server..." +kill "$SERVER_PID" 2>/dev/null || true +wait "$SERVER_PID" 2>/dev/null || true + +echo "Experiment finished at $(date)" From ca79d3a58b7f97b2aebddc4b6f7aced6076af5c1 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 15 Apr 2026 10:42:54 -0500 Subject: [PATCH 57/78] fix MI355X FP4 trace replay: disable AITER MoE to avoid GPU memory fault AITER ck_moe_stage1 kernel crashes with MXFP4 + expert-parallel on MI355X (vllm-project/vllm#35637). Disable AITER MoE while keeping AITER attention, and add MEC firmware scratch reclaim guard. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../multiturn_fp4_mi355x_trace_replay.sh | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh index 1bc450d99..3e843d376 100755 --- a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh @@ -90,9 +90,22 @@ fi echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" +# ---- ROCm / AITER tuning ---------------------------------------------------- +# Keep AITER for attention perf, but disable AITER MoE kernels — +# ck_moe_stage1 crashes with MXFP4 + expert-parallel on MI355X. +# See: https://github.com/vllm-project/vllm/issues/35637 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_MOE=0 + +# If MEC FW < 177, RCCL cannot reclaim scratch memory — disable to avoid crashes. +# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=$(rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}') +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + # ---- Start vLLM server ------------------------------------------------------ echo "Starting vllm server..." -# MI355X is ROCm — no CUDA arch needed export PYTHONNOUSERSITE=1 $VLLM_CMD > "$SERVER_LOG" 2>&1 & From e86b5d562b48ab61cc7870220fd3bf4c83c6dfd7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 15 Apr 2026 10:45:54 -0500 Subject: [PATCH 58/78] mi355x dsr1 config --- .github/configs/multiturn-agentic-trace.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index 4f480aaab..11824e8c9 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -51,3 +51,8 @@ b200-fp4-dsr1-weka-trace: ep: 8 users: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256, 512] offload: ["on", "off"] + +mi355x-fp4-dsr1-weka-trace: + tp8: + users: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256, 512] + offload: ["on", "off"] From 7749e303c1d904adbecd758c7af3f8c65889125e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 15 Apr 2026 11:06:58 -0500 Subject: [PATCH 59/78] drop SimpleCPUOffloadConnector on ROCm, update H200 trace dir VLLM_USE_SIMPLE_KV_OFFLOAD=1 routes to SimpleCPUOffloadConnector which imports cuda.bindings (NVIDIA-only, PR vllm-project/vllm#37160). Remove it from MI355X scripts so native offloading uses the ROCm-safe OffloadingConnector. Also update H200 trace dir to use traces_neon with env-var override to match the other trace replay scripts. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh | 2 +- benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh | 2 +- benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh index 3e843d376..74f1e5168 100755 --- a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh @@ -80,7 +80,7 @@ if [ "${EP_SIZE:-0}" -gt 1 ]; then fi if [ "$OFFLOAD_MODE" = "on" ]; then - export VLLM_USE_SIMPLE_KV_OFFLOAD=1 + # SimpleCPUOffloadConnector uses cuda.bindings (NVIDIA-only), skip on ROCm VLLM_CMD+=" --kv_offloading_backend native" VLLM_CMD+=" --kv_offloading_size $offload_size" VLLM_CMD+=" --no-disable-hybrid-kv-cache-manager" diff --git a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh index 5bd76de34..f33d0076b 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh @@ -44,7 +44,7 @@ nvidia-smi # ---- Paths ----------------------------------------------------------------- MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" -TRACE_DIR="$KV_CACHE_TESTER_DIR/traces" +TRACE_DIR="${TRACE_DIR:-$KV_CACHE_TESTER_DIR/traces_neon}" pip install --quiet urllib3 requests 2>/dev/null || true diff --git a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh index f2a9700d4..7393ff321 100755 --- a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh @@ -84,7 +84,7 @@ VLLM_CMD+=" --gpu-memory-utilization 0.9" VLLM_CMD+=" --tensor-parallel-size $TP" if [ "$OFFLOAD_MODE" = "on" ]; then - export VLLM_USE_SIMPLE_KV_OFFLOAD=1 + # SimpleCPUOffloadConnector uses cuda.bindings (NVIDIA-only), skip on ROCm VLLM_CMD+=" --kv_offloading_backend native" VLLM_CMD+=" --kv_offloading_size $offload_size" VLLM_CMD+=" --no-disable-hybrid-kv-cache-manager" From 78d7388bb9cdc7fdfc9616129e1cbd3951038109 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 15 Apr 2026 11:18:01 -0500 Subject: [PATCH 60/78] update kv-cache-tester: merge traces-ratelimiting branch Brings in curated v8 trace set, rate limiting metrics (goodput, effective TTFT, SLO tracking), and updated trace data. Co-Authored-By: Claude Opus 4.6 (1M context) --- experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index 261cbcf1f..04ef8a3ed 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit 261cbcf1f4bfea6742e2894610b98fd2c35a1f58 +Subproject commit 04ef8a3ed6c5ba6f8a3654179fb8d62b4aa005a1 From 56bf004423a5ef131d4d750f50a4819af8ec5285 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 15 Apr 2026 11:46:36 -0500 Subject: [PATCH 61/78] fix CW salloc: specify GPU type in GRES request Nodes define GRES with GPU subtypes (gpu:h100:8, gpu:h200:8), so salloc must request gpu:h100:N / gpu:h200:N instead of generic gpu:N. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- runners/launch_h100-cw.sh | 2 +- runners/launch_h200-cw.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 28e89e0cb..10d6d796f 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -7,7 +7,7 @@ LOCK_FILE="${SQUASH_FILE}.lock" set -x -JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:h100:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then echo "ERROR: salloc failed to allocate a job" diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index c4bdad736..a75205bc4 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -13,7 +13,7 @@ LOCK_FILE="${SQUASH_FILE}.lock" set -x -JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:h200:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then echo "ERROR: salloc failed to allocate a job" From 6b91c3fa6074bab8f68870d75b000d4dc3e827fc Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 15 Apr 2026 11:50:21 -0500 Subject: [PATCH 62/78] add trace_dir workflow input for trace directory override MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plumbs TRACE_DIR through sweep workflow → template → benchmark script. Accepts relative dir name (e.g. 'traces') or absolute path. Defaults to traces_neon when empty. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/benchmark-multiturn-tmpl.yml | 6 ++++++ .github/workflows/multiturn-sweep.yml | 6 ++++++ benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh | 7 ++++++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index 9ed704612..4785f9c50 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -60,6 +60,11 @@ on: required: false type: string default: 'false' + trace-dir: + description: "Override trace directory (relative to kv-cache-tester dir)" + required: false + type: string + default: '' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} @@ -79,6 +84,7 @@ env: SCRIPT_SUFFIX: ${{ inputs.script-suffix }} SPEC_DECODING: 'off' IGNORE_EOS: ${{ inputs.ignore-eos }} + TRACE_DIR: ${{ inputs.trace-dir }} permissions: contents: read diff --git a/.github/workflows/multiturn-sweep.yml b/.github/workflows/multiturn-sweep.yml index bf3e6af1e..aca23ecea 100644 --- a/.github/workflows/multiturn-sweep.yml +++ b/.github/workflows/multiturn-sweep.yml @@ -93,6 +93,11 @@ on: required: false default: 'false' type: string + trace_dir: + description: 'Override trace directory (e.g. traces, traces_neon). Relative to kv-cache-tester dir.' 
+ required: false + default: '' + type: string jobs: # --------------------------------------------------------------------------- @@ -197,6 +202,7 @@ jobs: ep: "${{ matrix.ep || inputs.ep }}" ref: ${{ inputs.ref }} ignore-eos: ${{ inputs.ignore_eos }} + trace-dir: ${{ inputs.trace_dir }} # --------------------------------------------------------------------------- # Collect & aggregate results diff --git a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh index 21c34e2e2..db3167ae2 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh @@ -45,7 +45,12 @@ nvidia-smi # ---- Paths ----------------------------------------------------------------- MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" -TRACE_DIR="${TRACE_DIR:-$KV_CACHE_TESTER_DIR/traces_neon}" +# TRACE_DIR can be absolute, relative to kv-cache-tester, or empty (default: traces_neon) +if [ -z "${TRACE_DIR:-}" ]; then + TRACE_DIR="$KV_CACHE_TESTER_DIR/traces_neon" +elif [ ! -d "$TRACE_DIR" ] && [ -d "$KV_CACHE_TESTER_DIR/$TRACE_DIR" ]; then + TRACE_DIR="$KV_CACHE_TESTER_DIR/$TRACE_DIR" +fi pip install --quiet urllib3 requests 2>/dev/null || true From 6411d18500c71d1e503b56985f5da99c37de7bd8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 15 Apr 2026 11:51:46 -0500 Subject: [PATCH 63/78] remove --exclusive from CW salloc, not supported on dynamic nodes Co-Authored-By: Claude Opus 4.6 (1M context) --- runners/launch_h100-cw.sh | 2 +- runners/launch_h200-cw.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 10d6d796f..b53b625a2 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -7,7 +7,7 @@ LOCK_FILE="${SQUASH_FILE}.lock" set -x -JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:h100:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:h100:$TP --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then echo "ERROR: salloc failed to allocate a job" diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index a75205bc4..913423537 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -13,7 +13,7 @@ LOCK_FILE="${SQUASH_FILE}.lock" set -x -JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:h200:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:h200:$TP --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then echo "ERROR: salloc failed to allocate a job" From b07b4ebe9801b8ba87a6ae7e79ebffd69afc73dc Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 15 Apr 2026 12:08:15 -0500 Subject: [PATCH 64/78] update kv-cache-tester: traces only from traces-ratelimiting Only pulled trace data files (curated v8 set), no code changes. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index 04ef8a3ed..9ebcbac70 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit 04ef8a3ed6c5ba6f8a3654179fb8d62b4aa005a1 +Subproject commit 9ebcbac70efe26f9b030dee7788e23b4da001339 From 00f81186a07726f53a1f668904ec58fbdf1092cb Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 15 Apr 2026 12:15:25 -0500 Subject: [PATCH 65/78] update kv-cache-tester: remove debug logging Co-Authored-By: Claude Opus 4.6 (1M context) --- experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index 9ebcbac70..daf698a80 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit 9ebcbac70efe26f9b030dee7788e23b4da001339 +Subproject commit daf698a80eef3696651a2680d6ec203e918f5095 From 6abde55075c0d52c7cd6fb518139538bd8946815 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 15 Apr 2026 12:34:11 -0500 Subject: [PATCH 66/78] fix MI355X offloading: use native connector without HMA SimpleCPUOffloadConnector uses cuda.bindings (NVIDIA-only). MI355X must use --disable-hybrid-kv-cache-manager with the native OffloadingConnector. Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh | 2 +- benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh index 74f1e5168..f81f5c4ba 100755 --- a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh @@ -83,7 +83,7 @@ if [ "$OFFLOAD_MODE" = "on" ]; then # SimpleCPUOffloadConnector uses cuda.bindings (NVIDIA-only), skip on ROCm VLLM_CMD+=" --kv_offloading_backend native" VLLM_CMD+=" --kv_offloading_size $offload_size" - VLLM_CMD+=" --no-disable-hybrid-kv-cache-manager" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" elif [ "$OFFLOAD_MODE" = "noprefix" ]; then VLLM_CMD+=" --no-enable-prefix-caching" fi diff --git a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh index 7393ff321..a1dbfe1a5 100755 --- a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh @@ -87,7 +87,7 @@ if [ "$OFFLOAD_MODE" = "on" ]; then # SimpleCPUOffloadConnector uses cuda.bindings (NVIDIA-only), skip on ROCm VLLM_CMD+=" --kv_offloading_backend native" VLLM_CMD+=" --kv_offloading_size $offload_size" - VLLM_CMD+=" --no-disable-hybrid-kv-cache-manager" + VLLM_CMD+=" --disable-hybrid-kv-cache-manager" elif [ "$OFFLOAD_MODE" = "noprefix" ]; then VLLM_CMD+=" --no-enable-prefix-caching" fi From 55b53feb00797b5f28afa132efcef492e74fbd7f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 15 Apr 2026 12:48:20 -0500 Subject: [PATCH 67/78] mi355x dsr1 config --- .../multiturn_fp4_mi355x_trace_replay.sh | 26 
+++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh index f81f5c4ba..28a81ebe5 100755 --- a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh @@ -90,19 +90,19 @@ fi echo "$VLLM_CMD" > "$RESULT_DIR/vllm_command.txt" -# ---- ROCm / AITER tuning ---------------------------------------------------- -# Keep AITER for attention perf, but disable AITER MoE kernels — -# ck_moe_stage1 crashes with MXFP4 + expert-parallel on MI355X. -# See: https://github.com/vllm-project/vllm/issues/35637 -export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_USE_AITER_MOE=0 - -# If MEC FW < 177, RCCL cannot reclaim scratch memory — disable to avoid crashes. -# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates -version=$(rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}') -if [[ "$version" == "" || $version -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi +# # ---- ROCm / AITER tuning ---------------------------------------------------- +# # Keep AITER for attention perf, but disable AITER MoE kernels — +# # ck_moe_stage1 crashes with MXFP4 + expert-parallel on MI355X. +# # See: https://github.com/vllm-project/vllm/issues/35637 +# export VLLM_ROCM_USE_AITER=1 +# export VLLM_ROCM_USE_AITER_MOE=0 + +# # If MEC FW < 177, RCCL cannot reclaim scratch memory — disable to avoid crashes. +# # https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +# version=$(rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}') +# if [[ "$version" == "" || $version -lt 177 ]]; then +# export HSA_NO_SCRATCH_RECLAIM=1 +# fi # ---- Start vLLM server ------------------------------------------------------ echo "Starting vllm server..." From b5e14dcc6f23644d5aa273c0fe9e741e831784e5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 15 Apr 2026 13:29:58 -0500 Subject: [PATCH 68/78] remove second exclusive from b200 dgxc srun --- runners/launch_b200-dgxc-slurm.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 3ff289e61..d2ad4bc5d 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -229,7 +229,6 @@ else fi srun --jobid=$JOB_ID \ - --exclusive \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ From 5f377960eec51954a0cf82c0918dd0c762f2cc2d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 16 Apr 2026 15:19:52 -0500 Subject: [PATCH 69/78] add hash-block mode + HF dataset loading to multiturn workflows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Workflows: new hash_block_mode input on multiturn-sweep and benchmark-multiturn-tmpl, threaded into the trace_replay_tester via HASH_BLOCK_MODE env (default false, existing runs unchanged). - Benchmark scripts (b200/h200/mi355x): TRACE_DIR prefixed with "hf_" now loads from a Hugging Face dataset — e.g. hf_semianalysisai--cc-traces-0 maps to --hf-dataset semianalysisai/cc-traces-0. Otherwise behaves as before with --trace-directory. - Bump kv-cache-tester submodule to pick up --hash-block-mode and --hf-dataset support. - simulate_hash_block_mode.py: dry-run simulator matching the schema of neon_trace_simulation.json. 
Reports prefix-cache hit estimate and infinite-set upper bound; aggregate mode runs across a directory. --- .../workflows/benchmark-multiturn-tmpl.yml | 6 + .github/workflows/multiturn-sweep.yml | 6 + .../multiturn_fp4_b200_trace_replay.sh | 28 ++- .../multiturn_fp4_mi355x_trace_replay.sh | 20 +- .../multiturn_fp8_h200_trace_replay.sh | 20 +- .../multiturn_fp8_mi355x_trace_replay.sh | 20 +- .../multiturn/vllm_benchmark/kv-cache-tester | 2 +- .../simulate_hash_block_mode.py | 235 ++++++++++++++++++ 8 files changed, 324 insertions(+), 13 deletions(-) create mode 100644 experimental/multiturn/vllm_benchmark/simulate_hash_block_mode.py diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index 4785f9c50..e409aeb54 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -60,6 +60,11 @@ on: required: false type: string default: 'false' + hash-block-mode: + description: "Build each request's prompt deterministically from hash_ids (bypasses pullback/growth). For neon/interleaved-subagent traces." + required: false + type: string + default: 'false' trace-dir: description: "Override trace directory (relative to kv-cache-tester dir)" required: false @@ -84,6 +89,7 @@ env: SCRIPT_SUFFIX: ${{ inputs.script-suffix }} SPEC_DECODING: 'off' IGNORE_EOS: ${{ inputs.ignore-eos }} + HASH_BLOCK_MODE: ${{ inputs.hash-block-mode }} TRACE_DIR: ${{ inputs.trace-dir }} permissions: diff --git a/.github/workflows/multiturn-sweep.yml b/.github/workflows/multiturn-sweep.yml index aca23ecea..890951685 100644 --- a/.github/workflows/multiturn-sweep.yml +++ b/.github/workflows/multiturn-sweep.yml @@ -93,6 +93,11 @@ on: required: false default: 'false' type: string + hash_block_mode: + description: 'Build each request prompt deterministically from hash_ids, bypassing pullback/growth (true/false). Use for neon/interleaved-subagent traces.' + required: false + default: 'false' + type: string trace_dir: description: 'Override trace directory (e.g. traces, traces_neon). Relative to kv-cache-tester dir.' required: false @@ -202,6 +207,7 @@ jobs: ep: "${{ matrix.ep || inputs.ep }}" ref: ${{ inputs.ref }} ignore-eos: ${{ inputs.ignore_eos }} + hash-block-mode: ${{ inputs.hash_block_mode }} trace-dir: ${{ inputs.trace_dir }} # --------------------------------------------------------------------------- diff --git a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh index db3167ae2..b6f50361f 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh @@ -45,11 +45,24 @@ nvidia-smi # ---- Paths ----------------------------------------------------------------- MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" -# TRACE_DIR can be absolute, relative to kv-cache-tester, or empty (default: traces_neon) -if [ -z "${TRACE_DIR:-}" ]; then - TRACE_DIR="$KV_CACHE_TESTER_DIR/traces_neon" -elif [ ! -d "$TRACE_DIR" ] && [ -d "$KV_CACHE_TESTER_DIR/$TRACE_DIR" ]; then - TRACE_DIR="$KV_CACHE_TESTER_DIR/$TRACE_DIR" +# TRACE_DIR can be: +# - empty (default: traces_neon) +# - absolute path +# - relative to kv-cache-tester +# - hf_-- → loads from Hugging Face dataset / +# (e.g. 
hf_semianalysisai--cc-traces-0 → semianalysisai/cc-traces-0) +if [[ "${TRACE_DIR:-}" == hf_* ]]; then + HF_DATASET="${TRACE_DIR#hf_}" + HF_DATASET="${HF_DATASET/--//}" # replace first -- with / (org/repo separator) + TRACE_SOURCE_FLAG="--hf-dataset $HF_DATASET" + echo "Loading traces from Hugging Face dataset: $HF_DATASET" +else + if [ -z "${TRACE_DIR:-}" ]; then + TRACE_DIR="$KV_CACHE_TESTER_DIR/traces_neon" + elif [ ! -d "$TRACE_DIR" ] && [ -d "$KV_CACHE_TESTER_DIR/$TRACE_DIR" ]; then + TRACE_DIR="$KV_CACHE_TESTER_DIR/$TRACE_DIR" + fi + TRACE_SOURCE_FLAG="--trace-directory $TRACE_DIR" fi pip install --quiet urllib3 requests 2>/dev/null || true @@ -140,7 +153,7 @@ sleep 2 # ---- Run trace replay benchmark --------------------------------------------- REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" -REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" $TRACE_SOURCE_FLAG" REPLAY_CMD+=" --timing-strategy think-only" REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" REPLAY_CMD+=" --start-users $USERS" @@ -158,6 +171,9 @@ REPLAY_CMD+=" --no-color" if [ "${IGNORE_EOS:-false}" = "true" ]; then REPLAY_CMD+=" --ignore-eos" fi +if [ "${HASH_BLOCK_MODE:-false}" = "true" ]; then + REPLAY_CMD+=" --hash-block-mode" +fi echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" diff --git a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh index 28a81ebe5..9916029b9 100755 --- a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh @@ -45,7 +45,20 @@ nvidia-smi 2>/dev/null || rocm-smi 2>/dev/null || true # ---- Paths ----------------------------------------------------------------- MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" -TRACE_DIR="${TRACE_DIR:-$KV_CACHE_TESTER_DIR/traces_neon}" +# TRACE_DIR: local path (abs or relative to kv-cache-tester) or hf_-- +# (e.g. hf_semianalysisai--cc-traces-0 loads from HF dataset semianalysisai/cc-traces-0) +if [[ "${TRACE_DIR:-}" == hf_* ]]; then + HF_DATASET="${TRACE_DIR#hf_}" + HF_DATASET="${HF_DATASET/--//}" + TRACE_SOURCE_FLAG="--hf-dataset $HF_DATASET" + echo "Loading traces from Hugging Face dataset: $HF_DATASET" +else + TRACE_DIR="${TRACE_DIR:-$KV_CACHE_TESTER_DIR/traces_neon}" + if [ ! 
-d "$TRACE_DIR" ] && [ -d "$KV_CACHE_TESTER_DIR/$TRACE_DIR" ]; then + TRACE_DIR="$KV_CACHE_TESTER_DIR/$TRACE_DIR" + fi + TRACE_SOURCE_FLAG="--trace-directory $TRACE_DIR" +fi pip install --quiet urllib3 requests 2>/dev/null || true @@ -139,7 +152,7 @@ sleep 2 # ---- Run trace replay benchmark --------------------------------------------- REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" -REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" $TRACE_SOURCE_FLAG" REPLAY_CMD+=" --timing-strategy think-only" REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" REPLAY_CMD+=" --start-users $USERS" @@ -157,6 +170,9 @@ REPLAY_CMD+=" --no-color" if [ "${IGNORE_EOS:-false}" = "true" ]; then REPLAY_CMD+=" --ignore-eos" fi +if [ "${HASH_BLOCK_MODE:-false}" = "true" ]; then + REPLAY_CMD+=" --hash-block-mode" +fi echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" diff --git a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh index f33d0076b..6deb9f68b 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh @@ -44,7 +44,20 @@ nvidia-smi # ---- Paths ----------------------------------------------------------------- MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" -TRACE_DIR="${TRACE_DIR:-$KV_CACHE_TESTER_DIR/traces_neon}" +# TRACE_DIR: local path (abs or relative to kv-cache-tester) or hf_-- +# (e.g. hf_semianalysisai--cc-traces-0 loads from HF dataset semianalysisai/cc-traces-0) +if [[ "${TRACE_DIR:-}" == hf_* ]]; then + HF_DATASET="${TRACE_DIR#hf_}" + HF_DATASET="${HF_DATASET/--//}" + TRACE_SOURCE_FLAG="--hf-dataset $HF_DATASET" + echo "Loading traces from Hugging Face dataset: $HF_DATASET" +else + TRACE_DIR="${TRACE_DIR:-$KV_CACHE_TESTER_DIR/traces_neon}" + if [ ! 
-d "$TRACE_DIR" ] && [ -d "$KV_CACHE_TESTER_DIR/$TRACE_DIR" ]; then + TRACE_DIR="$KV_CACHE_TESTER_DIR/$TRACE_DIR" + fi + TRACE_SOURCE_FLAG="--trace-directory $TRACE_DIR" +fi pip install --quiet urllib3 requests 2>/dev/null || true @@ -130,7 +143,7 @@ sleep 2 # ---- Run trace replay benchmark --------------------------------------------- REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" -REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" $TRACE_SOURCE_FLAG" REPLAY_CMD+=" --timing-strategy think-only" REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" REPLAY_CMD+=" --start-users $USERS" @@ -148,6 +161,9 @@ REPLAY_CMD+=" --no-color" if [ "${IGNORE_EOS:-false}" = "true" ]; then REPLAY_CMD+=" --ignore-eos" fi +if [ "${HASH_BLOCK_MODE:-false}" = "true" ]; then + REPLAY_CMD+=" --hash-block-mode" +fi echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" diff --git a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh index a1dbfe1a5..239f4fed3 100755 --- a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh @@ -44,7 +44,20 @@ nvidia-smi 2>/dev/null || rocm-smi 2>/dev/null || true # ---- Paths ----------------------------------------------------------------- MULTITURN_DIR=/workspace/experimental/multiturn/vllm_benchmark KV_CACHE_TESTER_DIR="$MULTITURN_DIR/kv-cache-tester" -TRACE_DIR="$KV_CACHE_TESTER_DIR/traces" +# TRACE_DIR: local path (abs or relative to kv-cache-tester) or hf_-- +# (e.g. hf_semianalysisai--cc-traces-0 loads from HF dataset semianalysisai/cc-traces-0) +if [[ "${TRACE_DIR:-}" == hf_* ]]; then + HF_DATASET="${TRACE_DIR#hf_}" + HF_DATASET="${HF_DATASET/--//}" + TRACE_SOURCE_FLAG="--hf-dataset $HF_DATASET" + echo "Loading traces from Hugging Face dataset: $HF_DATASET" +else + TRACE_DIR="${TRACE_DIR:-$KV_CACHE_TESTER_DIR/traces}" + if [ ! 
-d "$TRACE_DIR" ] && [ -d "$KV_CACHE_TESTER_DIR/$TRACE_DIR" ]; then + TRACE_DIR="$KV_CACHE_TESTER_DIR/$TRACE_DIR" + fi + TRACE_SOURCE_FLAG="--trace-directory $TRACE_DIR" +fi pip install --quiet urllib3 requests 2>/dev/null || true @@ -131,7 +144,7 @@ sleep 2 # ---- Run trace replay benchmark --------------------------------------------- REPLAY_CMD="python3 $KV_CACHE_TESTER_DIR/trace_replay_tester.py" REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" -REPLAY_CMD+=" --trace-directory $TRACE_DIR" +REPLAY_CMD+=" $TRACE_SOURCE_FLAG" REPLAY_CMD+=" --timing-strategy think-only" REPLAY_CMD+=" --output-dir $RESULT_DIR/trace_replay" REPLAY_CMD+=" --start-users $USERS" @@ -149,6 +162,9 @@ REPLAY_CMD+=" --no-color" if [ "${IGNORE_EOS:-false}" = "true" ]; then REPLAY_CMD+=" --ignore-eos" fi +if [ "${HASH_BLOCK_MODE:-false}" = "true" ]; then + REPLAY_CMD+=" --hash-block-mode" +fi echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index daf698a80..17a83d1bf 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit daf698a80eef3696651a2680d6ec203e918f5095 +Subproject commit 17a83d1bf2e3057d5a10cb79a88a872dc2453751 diff --git a/experimental/multiturn/vllm_benchmark/simulate_hash_block_mode.py b/experimental/multiturn/vllm_benchmark/simulate_hash_block_mode.py new file mode 100644 index 000000000..5f6fdc324 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/simulate_hash_block_mode.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python3 +"""Dry-run the trace_replay_tester --hash-block-mode code path, pure-logic. + +Matches the schema of neon_trace_simulation.json: + { + "trace": "", + "total_requests": N, + "summary": {...}, + "requests": [ + {"req": i, "action": ..., "in_tokens": N, "hash_id_count": N, + "prev_hash_id_count": N, "kept": N, "removed": N, "new": N, + "kept_tokens": N, "full_reset": bool}, + ... + ] + } + +The "action" classification reproduces the legacy build_messages logic so this +file can be compared directly to neon_trace_simulation.json. In addition it +records hash-block-mode-specific fields so you can see what the hash-block +code path WOULD produce without having to run the real benchmark: + - shared_prefix_with_any_prior + - hash_block_hit_tokens + - hash_block_miss_tokens + +Usage: + # Single trace (same as neon_trace_simulation.json): + python3 simulate_hash_block_mode.py \\ + --trace kv-cache-tester/traces_neon/trace_0001.json \\ + --output neon_trace_simulation_hash_block.json + + # Aggregate across a whole directory: + python3 simulate_hash_block_mode.py \\ + --trace-dir kv-cache-tester/traces_neon \\ + --output neon_trace_simulation_hash_block_agg.json +""" + +import argparse +import json +from pathlib import Path + + +def classify_action(curr_seq: tuple, prefix_len: int, is_first: bool): + """Classify what hash-block mode's build_messages branch does for this request. + + Hash-block mode has exactly TWO code paths (see trace_replay_tester.py:1306-1323): + 1. FIRST — first request in the session (no prior) + 2. 
HASH_BLOCK_FILL — deterministic content from this request's hash_ids + + The sub-labels below refine HASH_BLOCK_FILL by how much prefix overlaps + with any prior request (what the prefix cache will actually hit): + + - HIT : full hash_id prefix match with some prior request + - PARTIAL_HIT : non-zero but not full prefix match + - MISS : zero prefix overlap with any prior request + - NO_HASH_IDS : request has an empty hash_ids list (fallback text path) + """ + if is_first: + return "FIRST" + if not curr_seq: + return "NO_HASH_IDS" + if prefix_len == 0: + return "MISS" + if prefix_len == len(curr_seq): + return "HIT" + return "PARTIAL_HIT" + + +def longest_shared_prefix(curr_seq, prior_seqs) -> int: + best = 0 + for p in prior_seqs: + lim = min(len(curr_seq), len(p)) + k = 0 + while k < lim and curr_seq[k] == p[k]: + k += 1 + if k > best: + best = k + if best == len(curr_seq): + break + return best + + +def simulate_trace(trace_path: Path) -> dict: + with open(trace_path) as f: + raw = json.load(f) + + block_size = raw.get("block_size", 64) + trace_id = raw.get("id", trace_path.stem) + + prior_seqs: list[tuple[int, ...]] = [] + counts = {"FIRST": 0, "HIT": 0, "PARTIAL_HIT": 0, "MISS": 0, "NO_HASH_IDS": 0} + set_seen: set = set() + set_hit = set_total = 0 + per_request = [] + total_in = total_hb_hit = 0 + prev_hash_id_count = 0 + + for i, req in enumerate(raw.get("requests", [])): + if req.get("type") == "subagent": + continue + + curr_seq = tuple(req.get("hash_ids") or []) + in_tokens = req.get("in") or req.get("input_tokens") or 0 + total_in += in_tokens + + # Longest hash_id prefix shared with ANY prior request → what vLLM's + # prefix cache will actually hit under hash-block mode. + prefix_len = longest_shared_prefix(curr_seq, prior_seqs) + hb_hit_tokens = min(prefix_len * block_size, in_tokens) + hb_miss_tokens = max(0, in_tokens - hb_hit_tokens) + total_hb_hit += hb_hit_tokens + + action = classify_action(curr_seq, prefix_len, is_first=(len(prior_seqs) == 0)) + counts[action] = counts.get(action, 0) + 1 + + # Infinite-independent-cache upper bound + set_hit += sum(1 for x in curr_seq if x in set_seen) + set_total += len(curr_seq) + + per_request.append({ + "req": i, + "action": action, + "in_tokens": in_tokens, + "hash_id_count": len(curr_seq), + "prev_hash_id_count": prev_hash_id_count, + "shared_prefix_with_any_prior": prefix_len, + "hash_block_hit_tokens": hb_hit_tokens, + "hash_block_miss_tokens": hb_miss_tokens, + }) + + set_seen.update(curr_seq) + prior_seqs.append(curr_seq) + prev_hash_id_count = len(curr_seq) + + hb_rate = total_hb_hit / total_in if total_in else 0.0 + set_rate = set_hit / set_total if set_total else 0.0 + + return { + "trace": str(trace_path), + "mode": "hash-block", + "total_requests": len(per_request), + "summary": { + "first": counts["FIRST"], + "hit": counts["HIT"], + "partial_hit": counts["PARTIAL_HIT"], + "miss": counts["MISS"], + "no_hash_ids": counts["NO_HASH_IDS"], + "hash_block_hit_tokens": total_hb_hit, + "hash_block_input_tokens": total_in, + "hash_block_cache_hit_rate": round(hb_rate, 4), + "set_cache_hit_blocks": set_hit, + "set_cache_total_blocks": set_total, + "set_cache_hit_rate": round(set_rate, 4), + }, + "requests": per_request, + } + + +def simulate_dir(trace_dir: Path) -> dict: + trace_files = sorted(trace_dir.glob("trace_*.json")) + if not trace_files: + raise SystemExit(f"No trace_*.json in {trace_dir}") + + print(f"Dry-running hash-block mode on {len(trace_files)} traces...") + per_trace = [] + totals = { + "first": 0, "hit": 0, 
"partial_hit": 0, "miss": 0, "no_hash_ids": 0, + "hash_block_hit_tokens": 0, "hash_block_input_tokens": 0, + "set_cache_hit_blocks": 0, "set_cache_total_blocks": 0, + "total_requests": 0, + } + for tf in trace_files: + sim = simulate_trace(tf) + s = sim["summary"] + totals["total_requests"] += sim["total_requests"] + for k in ("first", "hit", "partial_hit", "miss", "no_hash_ids", + "hash_block_hit_tokens", "hash_block_input_tokens", + "set_cache_hit_blocks", "set_cache_total_blocks"): + totals[k] += s[k] + per_trace.append({ + "trace": tf.name, + "total_requests": sim["total_requests"], + "summary": s, + }) + + hb_rate = (totals["hash_block_hit_tokens"] / totals["hash_block_input_tokens"] + if totals["hash_block_input_tokens"] else 0.0) + set_rate = (totals["set_cache_hit_blocks"] / totals["set_cache_total_blocks"] + if totals["set_cache_total_blocks"] else 0.0) + + return { + "trace_dir": str(trace_dir), + "mode": "hash-block", + "num_traces": len(trace_files), + "aggregate": { + **totals, + "hash_block_cache_hit_rate": round(hb_rate, 4), + "set_cache_hit_rate": round(set_rate, 4), + }, + "per_trace": per_trace, + } + + +def main(): + p = argparse.ArgumentParser() + src = p.add_mutually_exclusive_group(required=True) + src.add_argument("--trace", help="Single trace JSON — same schema as neon_trace_simulation.json") + src.add_argument("--trace-dir", help="Directory of trace_*.json — aggregate across all") + p.add_argument("--output", required=True) + args = p.parse_args() + + if args.trace: + data = simulate_trace(Path(args.trace)) + else: + data = simulate_dir(Path(args.trace_dir)) + + out = Path(args.output) + out.parent.mkdir(parents=True, exist_ok=True) + with open(out, "w") as f: + json.dump(data, f, indent=2) + + s = data.get("summary") or data["aggregate"] + n_req = data.get("total_requests") or data["aggregate"]["total_requests"] + print(f"\nSummary ({n_req:,} requests, hash-block mode):") + print(f" FIRST: {s['first']}") + print(f" HIT: {s['hit']} (full prefix match)") + print(f" PARTIAL_HIT: {s['partial_hit']} (some prefix overlap)") + print(f" MISS: {s['miss']} (no prefix overlap w/ any prior)") + print(f" NO_HASH_IDS: {s['no_hash_ids']} (fallback text path)") + print(f" hash-block prefix cache hit rate: {s.get('hash_block_cache_hit_rate', 0):.1%}") + print(f" infinite-set cache hit rate: {s.get('set_cache_hit_rate', 0):.1%}") + print(f"\nWritten to {out}") + + +if __name__ == "__main__": + main() From b3c4a83b04fd5a20e1373dcdde8f03654a05b385 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 16 Apr 2026 15:39:44 -0500 Subject: [PATCH 70/78] bump kv-cache-tester: print theoretical cache-hit ceilings at init --- experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index 17a83d1bf..c0d0dcf1a 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit 17a83d1bf2e3057d5a10cb79a88a872dc2453751 +Subproject commit c0d0dcf1afe721a3985a20d48cdfe74ee72bdd2e From fcb80e8cb71c01e9816cf7885323355b4ef22651 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 16 Apr 2026 16:00:09 -0500 Subject: [PATCH 71/78] bump kv-cache-tester: add theoretical_cumulative_cache_hit_rate (infinite cache) to assessment periods --- experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index c0d0dcf1a..495637e93 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit c0d0dcf1afe721a3985a20d48cdfe74ee72bdd2e +Subproject commit 495637e93f19bc54e51735b403678d34f6125dea From 4e497619c03c99df27d36b338f294092278cb397 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 16 Apr 2026 16:27:14 -0500 Subject: [PATCH 72/78] bump kv-cache-tester: fix RequestMetrics dataclass field ordering --- experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index 495637e93..b7d9b5486 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit 495637e93f19bc54e51735b403678d34f6125dea +Subproject commit b7d9b54865c1331d355b6fa4823da092cf06299e From 13f6d00f70f317ce688286cf61bdcd916b41eaeb Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 20 Apr 2026 10:59:35 -0500 Subject: [PATCH 73/78] feat: add --debug-trace flag and flamegraph generator for cache analysis - Add --debug-trace flag to trace replay tester (stores full request/response bodies including reasoning_content to JSONL) - Plumb debug_trace through GHA workflows (multiturn-sweep.yml, benchmark-multiturn-tmpl.yml) and all 4 benchmark scripts - Add b200-fp4-dsr1-weka-trace-debug config (tp4, 2 users, offload off) - Add flamegraph generator script for visualizing per-trace cache hit/miss patterns as icicle charts - Bump kv-cache-tester submodule Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/multiturn-agentic-trace.yaml | 6 + .../workflows/benchmark-multiturn-tmpl.yml | 6 + .github/workflows/multiturn-sweep.yml | 6 + .../multiturn_fp4_b200_trace_replay.sh | 3 + .../multiturn_fp4_mi355x_trace_replay.sh | 3 + .../multiturn_fp8_h200_trace_replay.sh | 3 + .../multiturn_fp8_mi355x_trace_replay.sh | 3 + .../flamegraphs/generate_flamegraphs.py | 201 ++++++++++++++++++ .../multiturn/vllm_benchmark/kv-cache-tester | 2 +- 9 files changed, 232 insertions(+), 1 deletion(-) create mode 100644 experimental/multiturn/vllm_benchmark/flamegraphs/generate_flamegraphs.py diff --git a/.github/configs/multiturn-agentic-trace.yaml b/.github/configs/multiturn-agentic-trace.yaml index 11824e8c9..b8a7a46fe 100644 --- a/.github/configs/multiturn-agentic-trace.yaml +++ b/.github/configs/multiturn-agentic-trace.yaml @@ -52,6 +52,12 @@ b200-fp4-dsr1-weka-trace: users: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256, 512] offload: ["on", "off"] +b200-fp4-dsr1-weka-trace-debug: + tp4: + ep: 4 + users: [2] + offload: ["off"] + mi355x-fp4-dsr1-weka-trace: tp8: users: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256, 512] diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index e409aeb54..39060eafe 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -70,6 +70,11 @@ on: required: false type: string default: '' + debug-trace: + description: "Store full request/response text to debug_trace.jsonl" + required: false + type: string + default: 'false' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} @@ -91,6 +96,7 @@ env: IGNORE_EOS: ${{ inputs.ignore-eos }} HASH_BLOCK_MODE: ${{ 
inputs.hash-block-mode }} TRACE_DIR: ${{ inputs.trace-dir }} + DEBUG_TRACE: ${{ inputs.debug-trace }} permissions: contents: read diff --git a/.github/workflows/multiturn-sweep.yml b/.github/workflows/multiturn-sweep.yml index 890951685..9a3e4d59a 100644 --- a/.github/workflows/multiturn-sweep.yml +++ b/.github/workflows/multiturn-sweep.yml @@ -103,6 +103,11 @@ on: required: false default: '' type: string + debug_trace: + description: 'Store full request/response text for every API call to debug_trace.jsonl (true/false)' + required: false + default: 'false' + type: string jobs: # --------------------------------------------------------------------------- @@ -209,6 +214,7 @@ jobs: ignore-eos: ${{ inputs.ignore_eos }} hash-block-mode: ${{ inputs.hash_block_mode }} trace-dir: ${{ inputs.trace_dir }} + debug-trace: ${{ inputs.debug_trace }} # --------------------------------------------------------------------------- # Collect & aggregate results diff --git a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh index b6f50361f..a9f0104f3 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh @@ -174,6 +174,9 @@ fi if [ "${HASH_BLOCK_MODE:-false}" = "true" ]; then REPLAY_CMD+=" --hash-block-mode" fi +if [ "${DEBUG_TRACE:-false}" = "true" ]; then + REPLAY_CMD+=" --debug-trace" +fi echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" diff --git a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh index 9916029b9..c229ebfa0 100755 --- a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh @@ -173,6 +173,9 @@ fi if [ "${HASH_BLOCK_MODE:-false}" = "true" ]; then REPLAY_CMD+=" --hash-block-mode" fi +if [ "${DEBUG_TRACE:-false}" = "true" ]; then + REPLAY_CMD+=" --debug-trace" +fi echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" diff --git a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh index 6deb9f68b..72035e9fd 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh @@ -164,6 +164,9 @@ fi if [ "${HASH_BLOCK_MODE:-false}" = "true" ]; then REPLAY_CMD+=" --hash-block-mode" fi +if [ "${DEBUG_TRACE:-false}" = "true" ]; then + REPLAY_CMD+=" --debug-trace" +fi echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" diff --git a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh index 239f4fed3..a89839872 100755 --- a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh @@ -165,6 +165,9 @@ fi if [ "${HASH_BLOCK_MODE:-false}" = "true" ]; then REPLAY_CMD+=" --hash-block-mode" fi +if [ "${DEBUG_TRACE:-false}" = "true" ]; then + REPLAY_CMD+=" --debug-trace" +fi echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" diff --git a/experimental/multiturn/vllm_benchmark/flamegraphs/generate_flamegraphs.py b/experimental/multiturn/vllm_benchmark/flamegraphs/generate_flamegraphs.py new file mode 100644 index 000000000..b67e7b5b3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/flamegraphs/generate_flamegraphs.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +"""Generate per-trace icicle/flame charts showing KV cache hit/miss 
patterns. + +For each trace JSON, produces a horizontal stacked bar chart where: + - Each row is a request (bottom = first, top = last) + - Green = cache hit blocks (prefix shared with a prior request) + - Red = cache miss blocks (new blocks not in any prior prefix) + - Width = total hash_id block count (shows context growth) + +Cache hit logic: for each request, find the longest prefix match against +ALL prior requests (simulating an infinite prefix cache). Blocks in the +matched prefix are hits; the rest are misses. +""" + +import argparse +import json +import os +import sys +from multiprocessing import Pool, cpu_count +from pathlib import Path + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches + + +def compute_cache_hits(requests: list[dict]) -> list[tuple[int, int]]: + """Return (hit_blocks, miss_blocks) for each request. + + Simulates an infinite prefix cache: for each request, the longest + prefix match against any prior request determines hit blocks. + """ + results = [] + prior_hash_id_lists: list[list] = [] + + for req in requests: + hids = req.get("hash_ids", []) + n = len(hids) + + if not prior_hash_id_lists: + results.append((0, n)) + prior_hash_id_lists.append(hids) + continue + + best_overlap = 0 + for prior in prior_hash_id_lists: + overlap = 0 + for a, b in zip(prior, hids): + if a == b: + overlap += 1 + else: + break + best_overlap = max(best_overlap, overlap) + + results.append((best_overlap, n - best_overlap)) + prior_hash_id_lists.append(hids) + + return results + + +def generate_flamegraph(trace_path: str, output_path: str) -> dict: + """Generate a single flamegraph PNG for a trace file. Returns summary stats.""" + with open(trace_path) as f: + data = json.load(f) + + trace_id = data.get("id", Path(trace_path).stem) + models = data.get("models", []) + block_size = data.get("block_size", 64) + requests = data.get("requests", []) + + if not requests: + return {"trace_id": trace_id, "num_requests": 0, "skipped": True} + + hit_miss = compute_cache_hits(requests) + num_requests = len(hit_miss) + + total_hit = sum(h for h, _ in hit_miss) + total_blocks = sum(h + m for h, m in hit_miss) + overall_hit_rate = total_hit / total_blocks if total_blocks > 0 else 0.0 + + fig_height = max(4, num_requests * 0.18 + 1.5) + fig_width = 12 + fig, ax = plt.subplots(figsize=(fig_width, fig_height)) + + y_positions = list(range(num_requests)) + hit_widths = [h for h, _ in hit_miss] + miss_widths = [m for _, m in hit_miss] + + bar_height = 0.8 + ax.barh(y_positions, hit_widths, height=bar_height, color="#2ecc71", label="Cache Hit") + ax.barh( + y_positions, + miss_widths, + left=hit_widths, + height=bar_height, + color="#e74c3c", + label="Cache Miss", + ) + + ax.set_xlabel(f"Hash-ID Blocks ({block_size} tokens/block)") + ax.set_ylabel("Request Index") + ax.set_yticks(range(0, num_requests, max(1, num_requests // 20))) + + model_str = ", ".join(models) if models else "unknown" + if len(model_str) > 60: + model_str = model_str[:57] + "..." 
+ ax.set_title( + f"{trace_id} | {model_str}\n" + f"{num_requests} requests | Cache Hit Rate: {overall_hit_rate:.1%} | " + f"Max context: {max(h + m for h, m in hit_miss):,} blocks " + f"({max(h + m for h, m in hit_miss) * block_size:,} tokens)", + fontsize=10, + ) + + hit_patch = mpatches.Patch(color="#2ecc71", label="Cache Hit (prefix reuse)") + miss_patch = mpatches.Patch(color="#e74c3c", label="Cache Miss (new blocks)") + ax.legend(handles=[hit_patch, miss_patch], loc="lower right", fontsize=8) + + ax.set_xlim(0, max(h + m for h, m in hit_miss) * 1.05) + plt.tight_layout() + fig.savefig(output_path, dpi=100, bbox_inches="tight") + plt.close(fig) + + return { + "trace_id": trace_id, + "num_requests": num_requests, + "overall_hit_rate": overall_hit_rate, + "total_blocks": total_blocks, + "skipped": False, + } + + +def _worker(args): + trace_path, output_path = args + try: + return generate_flamegraph(trace_path, output_path) + except Exception as e: + return {"trace_path": trace_path, "error": str(e)} + + +def main(): + parser = argparse.ArgumentParser(description="Generate per-trace cache hit/miss flamegraphs") + parser.add_argument( + "--input-dir", + required=True, + help="Directory containing trace_NNNN.json files", + ) + parser.add_argument( + "--output-dir", + required=True, + help="Directory to write flamegraph PNGs", + ) + parser.add_argument( + "--workers", + type=int, + default=min(cpu_count(), 8), + help="Number of parallel workers (default: min(cpu_count, 8))", + ) + args = parser.parse_args() + + input_dir = Path(args.input_dir) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + trace_files = sorted(input_dir.glob("trace_*.json")) + if not trace_files: + print(f"No trace_*.json files found in {input_dir}") + sys.exit(1) + + print(f"Found {len(trace_files)} traces. 
Generating flamegraphs with {args.workers} workers...") + + work_items = [] + for tf in trace_files: + out_path = output_dir / f"{tf.stem}.png" + work_items.append((str(tf), str(out_path))) + + with Pool(args.workers) as pool: + results = pool.map(_worker, work_items) + + errors = [r for r in results if "error" in r] + skipped = [r for r in results if r.get("skipped")] + success = [r for r in results if not r.get("skipped") and "error" not in r] + + print(f"\nDone: {len(success)} generated, {len(skipped)} skipped (empty), {len(errors)} errors") + if errors: + for e in errors[:5]: + print(f" ERROR: {e['trace_path']}: {e['error']}") + + if success: + hit_rates = [r["overall_hit_rate"] for r in success] + print(f"\nCache hit rate across traces:") + print(f" Mean: {sum(hit_rates)/len(hit_rates):.1%}") + print(f" Min: {min(hit_rates):.1%}") + print(f" Max: {max(hit_rates):.1%}") + print(f" Median: {sorted(hit_rates)[len(hit_rates)//2]:.1%}") + + +if __name__ == "__main__": + main() diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index b7d9b5486..6af686b83 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit b7d9b54865c1331d355b6fa4823da092cf06299e +Subproject commit 6af686b830b9d25942251eed8f24cfcb57fa29f0 From fc0be5b995c3f2d8efc6629f9ad36517691c2ded Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 20 Apr 2026 11:20:58 -0500 Subject: [PATCH 74/78] fix: include debug_trace.jsonl in artifact upload Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/benchmark-multiturn-tmpl.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index 39060eafe..192aa4539 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -188,6 +188,7 @@ jobs: results/workload_distribution_summary.txt results/workload_distribution_plots.png results/trace_replay/detailed_results.csv + results/trace_replay/debug_trace.jsonl results/status.txt if-no-files-found: ignore From b5ef2288dea07acb23149f319b7fbd7d30611a58 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 20 Apr 2026 11:34:32 -0500 Subject: [PATCH 75/78] fix: trim idle leading rows from metrics CSV after benchmark The metrics collector starts ~2 minutes before the trace replay sends its first request. Strip rows with zero activity and reset relative_time so the CSV starts at first actual usage. 
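For readability, the same trimming logic as a standalone helper (a sketch mirroring the inline python3 -c blocks added below; column names are those written by the metrics collector):

    import csv

    def trim_idle_rows(path: str) -> int:
        """Drop leading rows with no running requests and no prompt tokens,
        then rebase relative_time_sec to the first active sample."""
        with open(path) as fh:
            reader = csv.DictReader(fh)
            header = reader.fieldnames
            rows = list(reader)
        first = next((i for i, r in enumerate(rows)
                      if float(r.get("num_requests_running", 0)) > 0
                      or float(r.get("prompt_tokens_total", 0)) > 0), 0)
        if first == 0:
            return 0
        trimmed = rows[first:]
        t0 = float(trimmed[0]["timestamp_sec"])
        for r in trimmed:
            r["relative_time_sec"] = f"{float(r['timestamp_sec']) - t0:.3f}"
        with open(path, "w", newline="") as fh:
            writer = csv.DictWriter(fh, fieldnames=header)
            writer.writeheader()
            writer.writerows(trimmed)
        return first  # number of idle rows removed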
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../multiturn_fp4_b200_trace_replay.sh | 24 +++++++++++++++++++ .../multiturn_fp4_mi355x_trace_replay.sh | 24 +++++++++++++++++++ .../multiturn_fp8_h200_trace_replay.sh | 24 +++++++++++++++++++ .../multiturn_fp8_mi355x_trace_replay.sh | 24 +++++++++++++++++++ 4 files changed, 96 insertions(+) diff --git a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh index a9f0104f3..6ff8252c0 100755 --- a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh @@ -202,6 +202,30 @@ if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then wait "$METRICS_PID" 2>/dev/null || true fi +# Trim leading idle rows (no requests running, no tokens processed) +python3 -c " +import csv, sys +f = '$RESULT_DIR/metrics_server_metrics.csv' +try: + with open(f) as fh: + reader = csv.DictReader(fh) + header = reader.fieldnames + rows = list(reader) + first = next((i for i, r in enumerate(rows) if float(r.get('num_requests_running', 0)) > 0 or float(r.get('prompt_tokens_total', 0)) > 0), 0) + if first > 0: + trimmed = rows[first:] + t0 = float(trimmed[0]['timestamp_sec']) + for r in trimmed: + r['relative_time_sec'] = f'{float(r[\"timestamp_sec\"]) - t0:.3f}' + with open(f, 'w', newline='') as fh: + w = csv.DictWriter(fh, fieldnames=header) + w.writeheader() + w.writerows(trimmed) + print(f'Trimmed {first} idle rows from metrics CSV') +except Exception as e: + print(f'Warning: could not trim metrics CSV: {e}', file=sys.stderr) +" 2>&1 || true + # ---- Cleanup ----------------------------------------------------------------- echo "Stopping vllm server..." kill "$SERVER_PID" 2>/dev/null || true diff --git a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh index c229ebfa0..60c9d7608 100755 --- a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh @@ -201,6 +201,30 @@ if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then wait "$METRICS_PID" 2>/dev/null || true fi +# Trim leading idle rows (no requests running, no tokens processed) +python3 -c " +import csv, sys +f = '$RESULT_DIR/metrics_server_metrics.csv' +try: + with open(f) as fh: + reader = csv.DictReader(fh) + header = reader.fieldnames + rows = list(reader) + first = next((i for i, r in enumerate(rows) if float(r.get('num_requests_running', 0)) > 0 or float(r.get('prompt_tokens_total', 0)) > 0), 0) + if first > 0: + trimmed = rows[first:] + t0 = float(trimmed[0]['timestamp_sec']) + for r in trimmed: + r['relative_time_sec'] = f'{float(r[\"timestamp_sec\"]) - t0:.3f}' + with open(f, 'w', newline='') as fh: + w = csv.DictWriter(fh, fieldnames=header) + w.writeheader() + w.writerows(trimmed) + print(f'Trimmed {first} idle rows from metrics CSV') +except Exception as e: + print(f'Warning: could not trim metrics CSV: {e}', file=sys.stderr) +" 2>&1 || true + # ---- Cleanup ----------------------------------------------------------------- echo "Stopping vllm server..." 
kill "$SERVER_PID" 2>/dev/null || true diff --git a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh index 72035e9fd..21378bbdc 100755 --- a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh @@ -192,6 +192,30 @@ if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then wait "$METRICS_PID" 2>/dev/null || true fi +# Trim leading idle rows (no requests running, no tokens processed) +python3 -c " +import csv, sys +f = '$RESULT_DIR/metrics_server_metrics.csv' +try: + with open(f) as fh: + reader = csv.DictReader(fh) + header = reader.fieldnames + rows = list(reader) + first = next((i for i, r in enumerate(rows) if float(r.get('num_requests_running', 0)) > 0 or float(r.get('prompt_tokens_total', 0)) > 0), 0) + if first > 0: + trimmed = rows[first:] + t0 = float(trimmed[0]['timestamp_sec']) + for r in trimmed: + r['relative_time_sec'] = f'{float(r[\"timestamp_sec\"]) - t0:.3f}' + with open(f, 'w', newline='') as fh: + w = csv.DictWriter(fh, fieldnames=header) + w.writeheader() + w.writerows(trimmed) + print(f'Trimmed {first} idle rows from metrics CSV') +except Exception as e: + print(f'Warning: could not trim metrics CSV: {e}', file=sys.stderr) +" 2>&1 || true + # ---- Cleanup ----------------------------------------------------------------- echo "Stopping vllm server..." kill "$SERVER_PID" 2>/dev/null || true diff --git a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh index a89839872..c6aeb17f9 100755 --- a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh +++ b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh @@ -193,6 +193,30 @@ if [ -n "$METRICS_PID" ] && kill -0 "$METRICS_PID" 2>/dev/null; then wait "$METRICS_PID" 2>/dev/null || true fi +# Trim leading idle rows (no requests running, no tokens processed) +python3 -c " +import csv, sys +f = '$RESULT_DIR/metrics_server_metrics.csv' +try: + with open(f) as fh: + reader = csv.DictReader(fh) + header = reader.fieldnames + rows = list(reader) + first = next((i for i, r in enumerate(rows) if float(r.get('num_requests_running', 0)) > 0 or float(r.get('prompt_tokens_total', 0)) > 0), 0) + if first > 0: + trimmed = rows[first:] + t0 = float(trimmed[0]['timestamp_sec']) + for r in trimmed: + r['relative_time_sec'] = f'{float(r[\"timestamp_sec\"]) - t0:.3f}' + with open(f, 'w', newline='') as fh: + w = csv.DictWriter(fh, fieldnames=header) + w.writeheader() + w.writerows(trimmed) + print(f'Trimmed {first} idle rows from metrics CSV') +except Exception as e: + print(f'Warning: could not trim metrics CSV: {e}', file=sys.stderr) +" 2>&1 || true + # ---- Cleanup ----------------------------------------------------------------- echo "Stopping vllm server..." 
kill "$SERVER_PID" 2>/dev/null || true From f57f7c82e90e0d259ecd2ce3ec3322ddd5884a38 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 20 Apr 2026 11:52:55 -0500 Subject: [PATCH 76/78] bump kv-cache-tester: add raw_chunks to debug trace Co-Authored-By: Claude Opus 4.6 (1M context) --- experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester index 6af686b83..b98a141b9 160000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester @@ -1 +1 @@ -Subproject commit 6af686b830b9d25942251eed8f24cfcb57fa29f0 +Subproject commit b98a141b94567dc54299805dd013b210927481aa From a01b775b7fbb2b48e0048c6acec1a5b1db7e6d26 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 20 Apr 2026 13:41:56 -0500 Subject: [PATCH 77/78] feat: add --no-max-tokens flag for reasoning models (R1) Plumb no_max_tokens through GHA workflows and all benchmark scripts. When enabled, the trace replayer doesn't enforce max_tokens from the trace, letting models like DeepSeek R1 generate freely so they can close blocks and produce visible output. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/benchmark-multiturn-tmpl.yml | 6 ++++++ .github/workflows/multiturn-sweep.yml | 6 ++++++ benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh | 3 +++ benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh | 3 +++ benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh | 3 +++ benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh | 3 +++ experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +- 7 files changed, 25 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-multiturn-tmpl.yml b/.github/workflows/benchmark-multiturn-tmpl.yml index 192aa4539..056ae32c3 100644 --- a/.github/workflows/benchmark-multiturn-tmpl.yml +++ b/.github/workflows/benchmark-multiturn-tmpl.yml @@ -75,6 +75,11 @@ on: required: false type: string default: 'false' + no-max-tokens: + description: "Don't enforce max_tokens from trace; let model generate freely" + required: false + type: string + default: 'false' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} @@ -97,6 +102,7 @@ env: HASH_BLOCK_MODE: ${{ inputs.hash-block-mode }} TRACE_DIR: ${{ inputs.trace-dir }} DEBUG_TRACE: ${{ inputs.debug-trace }} + NO_MAX_TOKENS: ${{ inputs.no-max-tokens }} permissions: contents: read diff --git a/.github/workflows/multiturn-sweep.yml b/.github/workflows/multiturn-sweep.yml index 9a3e4d59a..068c62622 100644 --- a/.github/workflows/multiturn-sweep.yml +++ b/.github/workflows/multiturn-sweep.yml @@ -108,6 +108,11 @@ on: required: false default: 'false' type: string + no_max_tokens: + description: 'Don''t enforce max_tokens from trace; let model generate freely until EOS (true/false). Use for reasoning models like DeepSeek R1.' 
+        required: false
+        default: 'false'
+        type: string
 
 jobs:
   # ---------------------------------------------------------------------------
@@ -215,6 +220,7 @@ jobs:
       hash-block-mode: ${{ inputs.hash_block_mode }}
       trace-dir: ${{ inputs.trace_dir }}
       debug-trace: ${{ inputs.debug_trace }}
+      no-max-tokens: ${{ inputs.no_max_tokens }}
 
   # ---------------------------------------------------------------------------
   # Collect & aggregate results
diff --git a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh
index 6ff8252c0..2110f9f6a 100755
--- a/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh
+++ b/benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh
@@ -177,6 +177,9 @@ fi
 if [ "${DEBUG_TRACE:-false}" = "true" ]; then
     REPLAY_CMD+=" --debug-trace"
 fi
+if [ "${NO_MAX_TOKENS:-false}" = "true" ]; then
+    REPLAY_CMD+=" --no-max-tokens"
+fi
 
 echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
 
diff --git a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh
index 60c9d7608..da6849cd2 100755
--- a/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh
+++ b/benchmarks/single_node/multiturn_fp4_mi355x_trace_replay.sh
@@ -176,6 +176,9 @@ fi
 if [ "${DEBUG_TRACE:-false}" = "true" ]; then
     REPLAY_CMD+=" --debug-trace"
 fi
+if [ "${NO_MAX_TOKENS:-false}" = "true" ]; then
+    REPLAY_CMD+=" --no-max-tokens"
+fi
 
 echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
 
diff --git a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh
index 21378bbdc..51b1c6046 100755
--- a/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh
+++ b/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh
@@ -167,6 +167,9 @@ fi
 if [ "${DEBUG_TRACE:-false}" = "true" ]; then
     REPLAY_CMD+=" --debug-trace"
 fi
+if [ "${NO_MAX_TOKENS:-false}" = "true" ]; then
+    REPLAY_CMD+=" --no-max-tokens"
+fi
 
 echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
 
diff --git a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh
index c6aeb17f9..b7f939882 100755
--- a/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh
+++ b/benchmarks/single_node/multiturn_fp8_mi355x_trace_replay.sh
@@ -168,6 +168,9 @@ fi
 if [ "${DEBUG_TRACE:-false}" = "true" ]; then
     REPLAY_CMD+=" --debug-trace"
 fi
+if [ "${NO_MAX_TOKENS:-false}" = "true" ]; then
+    REPLAY_CMD+=" --no-max-tokens"
+fi
 
 echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
 
diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester
index b98a141b9..5a40d7488 160000
--- a/experimental/multiturn/vllm_benchmark/kv-cache-tester
+++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester
@@ -1 +1 @@
-Subproject commit b98a141b94567dc54299805dd013b210927481aa
+Subproject commit 5a40d74880516dae713a470a7631fd31609ff710

From 816e410c3db02172c886072c79f38cbba2de14ed Mon Sep 17 00:00:00 2001
From: Cam Quilici
Date: Mon, 20 Apr 2026 14:02:19 -0500
Subject: [PATCH 78/78] bump kv-cache-tester: fix no_max_tokens attribute error

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 experimental/multiturn/vllm_benchmark/kv-cache-tester | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester b/experimental/multiturn/vllm_benchmark/kv-cache-tester
index 5a40d7488..bd791bde4 160000
--- a/experimental/multiturn/vllm_benchmark/kv-cache-tester
+++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester
@@ -1 +1 @@
-Subproject commit 5a40d74880516dae713a470a7631fd31609ff710
+Subproject commit bd791bde4c0ea7d1921dfa1abf3d3fb3a0f38615
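
Note on PATCH 77 (illustrative addition, not part of the patches above): the
--no-max-tokens behavior reduces to a single decision when the replayer turns a
recorded trace turn into a request. The Python sketch below shows that decision
under stated assumptions; build_request and the turn fields are hypothetical
names for illustration only, not kv-cache-tester's actual API.

# Sketch: how a trace replayer might honor --no-max-tokens when building an
# OpenAI-compatible chat request from one recorded agentic turn.
def build_request(turn: dict, no_max_tokens: bool) -> dict:
    payload = {
        "model": turn["model"],
        "messages": turn["messages"],
        "stream": True,
    }
    # Enforce the trace's recorded completion length only when the flag is off.
    # With --no-max-tokens the model generates until EOS, so a reasoning model
    # can close its <think> block and still emit visible output.
    if not no_max_tokens and turn.get("max_tokens"):
        payload["max_tokens"] = turn["max_tokens"]
    return payload

if __name__ == "__main__":
    turn = {
        "model": "deepseek-r1",
        "messages": [{"role": "user", "content": "refactor utils.py"}],
        "max_tokens": 64,
    }
    print(build_request(turn, no_max_tokens=True))   # cap dropped: generate freely
    print(build_request(turn, no_max_tokens=False))  # cap preserved from the trace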