From bbc91bc588b059135eccff77facb9cd476a99c44 Mon Sep 17 00:00:00 2001
From: William Chen <57119977+OCWC22@users.noreply.github.com>
Date: Tue, 14 Apr 2026 19:14:27 -0700
Subject: [PATCH 01/18] feat(isb1): add KV cache stress benchmark with
 multi-turn synthetic traces
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add ISB-1 (Inference Stress Benchmark), a multi-turn, long-context KV cache
stress-testing dataset for InferenceX V3.

## What this adds

**35 synthetic multi-turn traces** across 7 context bands (8K → 1M+ tokens):
- 6 workload families: long_chat, coding, agent, rag, cache_stress, multimodal
- KV stress patterns: prefix reuse, offload cliff, compaction, reactivation, fanout
- Realistic conversation content with 60-95% prefix overlap between consecutive
  turns, which enables prefix-cache testing (trace sketch at the end of this message)
- Context assets from 15KB to 6.6MB inlined into traces for honest token counts

**Export bundles** for vLLM + SGLang replay:
- extension_131k: DeepSeek-R1, GPT-OSS, Qwen 3.5 (H200/B200)
- preview/long_context_500k: Qwen 3.5 500K-context stress test
- preview/long_context_1m: Qwen 3.5 1M-context stress test

**10 KV stress sweep configs** (isb1-kv-stress-pr993.yaml):
- 3 models × 2 GPUs × 2 engines
- Sweep: 2→256 concurrent users × on/off/noprefix offload modes, 1800s per run
  (expansion sketch below)

## Coexistence with kv-cache-tester

This dataset complements PR #993's kv-cache-tester (522 real Claude Code traces):
- kv-cache-tester: real workload distribution, natural performance profile
- ISB1: controlled KV stress patterns that force offload cliffs and cache pressure

Aside from README updates, no existing files under experimental/multiturn/ are
modified; new trace-replay scripts are added alongside them. Separate config
files, separate data directory (datasets/isb1/), shared replay infrastructure.

## Benchmark infrastructure

- benchmark_export_replay.py: replay harness that records actual_context_len
  telemetry per request (replay-loop sketch below)
- process_result_isb1.py: result aggregation with KV metrics
- Prometheus metrics: kv_cache_usage, prefix_cache_hits, kv_offload_bytes
  (scrape sketch below)
- Pareto frontier: throughput vs p99 TTFT at each concurrency level
  (frontier sketch below)
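## Illustrative sketches

The snippets below are minimal sketches to make the mechanisms above concrete.
They are not the shipped implementations, and every function and field name in
them is illustrative unless it appears in the file list.

First, how a multi-turn trace with a controlled prefix-overlap ratio can be
built. This sketch works in characters for brevity (a real generator would
size in tokens); `make_trace` and its parameters are hypothetical names:

```python
import json
import random

def make_trace(trace_id: str, num_turns: int, overlap: float,
               context_asset: str) -> dict:
    """Sketch: build one multi-turn trace with ~`overlap` prefix reuse.

    Every turn re-sends the inlined context asset plus the full prior
    history, so each prompt is an exact prefix of the next one --
    the property a prefix cache (and this benchmark) exploits.
    """
    shared = context_asset          # inlined asset => honest token counts
    history = ""
    turns = []
    for i in range(num_turns):
        prefix = shared + history
        # Size the fresh tail so prefix / (prefix + tail) ~= overlap.
        tail_len = max(1, int(len(prefix) * (1 - overlap) / overlap))
        tail = "".join(random.choices("abcdefgh ", k=tail_len))
        turns.append({"role": "user",
                      "content": prefix + f"\n[turn {i}] " + tail})
        history += f"\n[turn {i}] " + tail
    return {"trace_id": trace_id, "turns": turns}

trace = make_trace("chat_demo", num_turns=3, overlap=0.8,
                   context_asset="SYSTEM CONTEXT " * 100)
print(json.dumps(trace)[:120])
```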
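Next, how one `tp-configs` entry from the YAML expands into concrete runs.
The real expansion lives in utils/matrix_logic/generate_sweep_configs.py;
this sketch only mirrors the cross-product implied by the config shape:

```python
from itertools import product

# One tp-configs entry, transcribed from isb1-kv-stress-pr993.yaml.
tp_config = {
    "tp": 8, "ep": 1, "duration_s": 1800,
    "offload_modes": ["on", "off", "noprefix"],
    "users": [2, 4, 8, 16, 32, 64, 128, 256],
}

runs = [
    {"tp": tp_config["tp"], "ep": tp_config["ep"],
     "duration_s": tp_config["duration_s"],
     "offload_mode": mode, "concurrency": users}
    for mode, users in product(tp_config["offload_modes"], tp_config["users"])
]
assert len(runs) == 24  # 3 offload modes x 8 concurrency levels
```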
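The replay harness drives multi-turn traces against an OpenAI-compatible
endpoint. Below is a closed-loop sketch of that pattern (fixed user count,
each user replaying its turns sequentially, recording TTFT and the context
actually sent). The endpoint URL, payload shape, and the character-count
stand-in for actual_context_len are assumptions, not the harness's real
interface:

```python
import asyncio
import time

import aiohttp  # assumed HTTP client; the shipped harness may differ

URL = "http://localhost:8000/v1/chat/completions"   # assumed endpoint

async def replay_user(session, model, trace, results):
    """Replay one trace's turns sequentially, as a single simulated user."""
    history = []
    for turn in trace["turns"]:
        history.append(turn)
        payload = {"model": model, "messages": history,
                   "max_tokens": 1024, "stream": True}
        t0 = time.perf_counter()
        ttft = None
        async with session.post(URL, json=payload) as resp:
            async for _line in resp.content:      # SSE stream
                if ttft is None:                  # first bytes ~= first token
                    ttft = time.perf_counter() - t0
        results.append({
            "trace_id": trace["trace_id"],
            "ttft_s": ttft,
            # stand-in for actual_context_len: what was really sent
            "actual_context_chars": sum(len(m["content"]) for m in history),
        })

async def replay(traces, model, users=8):
    """Closed loop: at most `users` traces in flight at any moment."""
    results, sem = [], asyncio.Semaphore(users)
    async with aiohttp.ClientSession() as session:
        async def one(tr):
            async with sem:
                await replay_user(session, model, tr, results)
        await asyncio.gather(*(one(t) for t in traces))
    return results
```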
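Pulling the KV counters from a Prometheus text endpoint can be as simple as
the sketch below. Engines prefix and label these metrics differently across
versions, so the names are dashboard-level identifiers, not a stable exporter
API:

```python
import re
import urllib.request

KV_METRICS = ("kv_cache_usage", "prefix_cache_hits", "kv_offload_bytes")

def scrape_kv_metrics(url="http://localhost:8000/metrics"):
    """Best-effort grab of the KV counters named in this commit."""
    text = urllib.request.urlopen(url, timeout=5).read().decode()
    values = {}
    for name in KV_METRICS:
        # Tolerate engine prefixes and label sets, e.g. vllm:...{...} 0.42
        m = re.search(rf"^\S*{name}\S*\s+([-+0-9.eE]+)\s*$",
                      text, re.MULTILINE)
        if m:
            values[name] = float(m.group(1))
    return values
```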
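Finally, the Pareto step behind plot_pareto.py reduced to its core: at each
concurrency level, keep the runs no other run beats on both axes (throughput
up, p99 TTFT down). Field names and the sample numbers are illustrative only:

```python
def pareto_frontier(runs):
    """Keep runs not dominated: no other run has >= throughput AND <= p99 TTFT."""
    frontier = []
    for r in runs:
        dominated = any(
            o["throughput_tok_s"] >= r["throughput_tok_s"]
            and o["p99_ttft_s"] <= r["p99_ttft_s"]
            and o != r
            for o in runs
        )
        if not dominated:
            frontier.append(r)
    return sorted(frontier, key=lambda r: r["throughput_tok_s"])

runs = [
    {"cfg": "offload=on",  "throughput_tok_s": 9200, "p99_ttft_s": 3.1},
    {"cfg": "offload=off", "throughput_tok_s": 7800, "p99_ttft_s": 1.4},
    {"cfg": "noprefix",    "throughput_tok_s": 5100, "p99_ttft_s": 2.8},
]
print([r["cfg"] for r in pareto_frontier(runs)])  # noprefix is dominated
```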
---
 .gitattributes | 2 +
 .github/configs/isb1-kv-stress-pr993.yaml | 4326 +++++++++++++
 .github/configs/isb1-kv-stress.yaml | 96 +
 .github/configs/isb1-master.yaml | 1723 +++++
 .github/configs/isb1-qwen-1m-preview.yaml | 53 +
 .github/configs/isb1-triattn-preview.yaml | 291 +
 .github/workflows/benchmark-isb1-tmpl.yml | 451 ++
 .github/workflows/collect-results.yml | 22 +
 .../workflows/run-isb1-kv-stress-sweep.yml | 110 +
 .github/workflows/run-isb1-sweep.yml | 256 +
 .gitignore | 5 +-
 benchmarks/benchmark_lib.sh | 698 +++
 benchmarks/single_node/dsr1_fp4_b200.sh | 27 +-
 benchmarks/single_node/dsr1_fp8_b200.sh | 43 +-
 benchmarks/single_node/dsr1_fp8_b200_vllm.sh | 108 +
 benchmarks/single_node/dsr1_fp8_h200.sh | 40 +-
 benchmarks/single_node/dsr1_fp8_h200_vllm.sh | 92 +
 .../single_node/dsr1triattn_fp8_h100_vllm.sh | 117 +
 .../single_node/dsr1triattn_fp8_h200_vllm.sh | 117 +
 benchmarks/single_node/gptoss_fp4_b200.sh | 34 +-
 .../single_node/gptoss_fp4_b200_sglang.sh | 97 +
 benchmarks/single_node/gptoss_fp4_h100.sh | 40 +-
 .../single_node/gptoss_fp4_h100_sglang.sh | 85 +
 benchmarks/single_node/gptoss_fp4_h200.sh | 25 +-
 .../single_node/gptoss_fp4_h200_sglang.sh | 83 +
 .../gptosstriattn_fp4_h100_vllm.sh | 127 +
 .../gptosstriattn_fp4_h200_vllm.sh | 127 +
 .../single_node/qwen3.5_fp8_b200_sglang.sh | 102 +
 .../single_node/qwen3.5_fp8_b200_vllm.sh | 95 +
 .../single_node/qwen3.5_fp8_h100_sglang.sh | 91 +
 .../single_node/qwen3.5_fp8_h100_vllm.sh | 104 +
 .../single_node/qwen3.5_fp8_h200_sglang.sh | 98 +
 .../single_node/qwen3.5_fp8_h200_vllm.sh | 93 +
 .../qwen3.5triattn_fp8_h100_vllm.sh | 127 +
 .../qwen3.5triattn_fp8_h200_vllm.sh | 127 +
 datasets/isb1/.gitattributes | 2 +
 .../isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md | 122 +
 datasets/isb1/GMI_EXECUTION_PLAN.md | 175 +
 datasets/isb1/README.md | 125 +
 datasets/isb1/exports/core/chat_8k1k.json | 3 +
 .../isb1/exports/core/chat_8k1k_qwen3.5.json | 3 +
 datasets/isb1/exports/core/code_8k1k.json | 3 +
 .../isb1/exports/core/code_8k1k_qwen3.5.json | 3 +
 .../exports/extension_131k/chat_131k1k.json | 3 +
 .../extension_131k/chat_131k1k_dsr1.json | 3 +
 .../extension_131k/chat_131k1k_qwen3.5.json | 3 +
 .../exports/extension_131k/code_131k1k.json | 3 +
 .../extension_131k/code_131k1k_qwen3.5.json | 3 +
 .../exports/extension_32k/chat_32k1k.json | 3 +
 .../extension_32k/chat_32k1k_qwen3.5.json | 3 +
 .../exports/extension_32k/code_32k1k.json | 3 +
 .../extension_32k/code_32k1k_qwen3.5.json | 3 +
 .../exports/extension_64k/chat_64k1k.json | 3 +
 .../extension_64k/chat_64k1k_qwen3.5.json | 3 +
 .../exports/extension_64k/code_64k1k.json | 3 +
 .../extension_64k/code_64k1k_qwen3.5.json | 3 +
 .../exports/preview/long_context_1m/README.md | 33 +
 ...play__chat_qwen3.5_ulc2_1m_preview_v1.json | 3 +
 ...ay__coding_qwen3.5_ulc2_1m_preview_v1.json | 3 +
 .../preview/long_context_1m/manifest.json | 3 +
 .../preview/long_context_500k/README.md | 45 +
 ...lay__chat_gptoss_xlc2_500k_preview_v1.json | 3 +
 ...ay__chat_qwen3.5_xlc2_500k_preview_v1.json | 3 +
 ...y__coding_gptoss_xlc2_500k_preview_v1.json | 3 +
 ...__coding_qwen3.5_xlc2_500k_preview_v1.json | 3 +
 .../preview/long_context_500k/manifest.json | 3 +
 .../long_context_500k/manifest_qwen3.5.json | 3 +
 .../isb1/scripts/adapt_trace_replay_result.py | 214 +
 .../analyze_benchmark_distributions.py | 157 +
 .../isb1/scripts/collect_sweep_results.py | 183 +
 .../generate_qwen35_low_band_exports.py | 98 +
 datasets/isb1/scripts/gmi_analyze_sweep.py | 250 +
 datasets/isb1/scripts/gmi_full_suite.sh | 135 +
 datasets/isb1/scripts/gmi_kv_sweep.sh | 176 +
 .../isb1/scripts/gmi_portable_benchmark.sh | 1019 +++
 datasets/isb1/scripts/gmi_test_matrix.sh | 88 +
 .../isb1/scripts/gpu_profile_collector.sh | 42 +
 datasets/isb1/scripts/isb1_results_db.py | 816 +++
 datasets/isb1/scripts/metrics_collector.py | 356 ++
 datasets/isb1/scripts/plot_pareto.py | 210 +
 experimental/README.md | 10 +-
 experimental/multiturn/README.md | 43 +-
 .../multiturn/vllm_benchmark/.gitignore | 7 +
 .../multiturn/vllm_benchmark/README.md | 33 +
 .../aiperf_synthetic_traces.json | 5559 +++++++++++++++++
 .../aiperf_traces/generate_aiperf_traces.py | 81 +
 .../vllm_benchmark/kv-cache-tester/README.md | 11 +
 .../kv-cache-tester/traces/.gitkeep | 0
 .../multiturn/vllm_benchmark/launch/README.md | 8 +
 .../launch/lmcache_vllm_b200.sh | 25 +
 .../launch/lmcache_vllm_h200.sh | 25 +
 .../trace_replay_dsr1_fp8_b200_vllm.sh | 34 +
 .../trace_replay_dsr1_fp8_h200_vllm.sh | 34 +
 .../trace_replay_gptoss_fp4_b200_sglang.sh | 32 +
 .../trace_replay_gptoss_fp4_b200_vllm.sh | 34 +
 .../trace_replay_gptoss_fp4_h200_sglang.sh | 32 +
 .../trace_replay_gptoss_fp4_h200_vllm.sh | 34 +
 .../trace_replay_qwen3.5_fp8_b200_sglang.sh | 32 +
 .../trace_replay_qwen3.5_fp8_b200_vllm.sh | 34 +
 .../trace_replay_qwen3.5_fp8_h200_sglang.sh | 32 +
 .../trace_replay_qwen3.5_fp8_h200_vllm.sh | 34 +
 runners/launch_b200-dgxc-slurm.sh | 9 +-
 runners/launch_b200-dgxc.sh | 11 +-
 runners/launch_b200-nb.sh | 7 +-
 runners/launch_h100-cr.sh | 11 +-
 runners/launch_h100-cw.sh | 5 +-
 runners/launch_h100-dgxc-slurm.sh | 5 +-
 runners/launch_h200-cw.sh | 7 +-
 runners/launch_h200-dgxc-slurm.sh | 5 +-
 runners/launch_h200-nb.sh | 7 +-
 runners/lib_single_node_script.sh | 41 +
 .../bench_serving/benchmark_export_replay.py | 1536 +++++
 utils/gate_isb1.py | 298 +
 utils/matrix_logic/generate_sweep_configs.py | 334 +-
 .../test_generate_sweep_configs.py | 773 +++
 utils/matrix_logic/test_validation.py | 541 ++
 utils/matrix_logic/validation.py | 647 +-
 utils/process_result.py | 17 +
 utils/process_result_isb1.py | 490 ++
 utils/summarize_isb1.py | 238 +
 utils/test_benchmark_export_replay.py | 766 +++
 utils/test_gate_isb1.py | 218 +
 utils/test_process_result.py | 27 +
 utils/test_process_result_isb1.py | 1006 +++
 utils/test_summarize_isb1.py | 105 +
 utils/test_verify_producer_sync.py | 64 +
 utils/verify_producer_sync.py | 117 +
 127 files changed, 27676 insertions(+), 99 deletions(-)

 create mode 100644 .gitattributes
 create mode 100644 .github/configs/isb1-kv-stress-pr993.yaml
 create mode 100644 .github/configs/isb1-kv-stress.yaml
 create mode 100644 .github/configs/isb1-master.yaml
 create mode 100644 .github/configs/isb1-qwen-1m-preview.yaml
 create mode 100644 .github/configs/isb1-triattn-preview.yaml
 create mode 100644 .github/workflows/benchmark-isb1-tmpl.yml
 create mode 100644 .github/workflows/run-isb1-kv-stress-sweep.yml
 create mode 100644 .github/workflows/run-isb1-sweep.yml
 create mode 100644 benchmarks/single_node/dsr1_fp8_b200_vllm.sh
 create mode 100644 benchmarks/single_node/dsr1_fp8_h200_vllm.sh
 create mode 100755 benchmarks/single_node/dsr1triattn_fp8_h100_vllm.sh
 create mode 100755 benchmarks/single_node/dsr1triattn_fp8_h200_vllm.sh
 create mode 100644 benchmarks/single_node/gptoss_fp4_b200_sglang.sh
 create mode 100644 benchmarks/single_node/gptoss_fp4_h100_sglang.sh
 create mode 100644 benchmarks/single_node/gptoss_fp4_h200_sglang.sh
 create mode 100755 benchmarks/single_node/gptosstriattn_fp4_h100_vllm.sh
 create mode 100755 benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh
 create mode 100755 benchmarks/single_node/qwen3.5_fp8_b200_sglang.sh
 create mode 100755 benchmarks/single_node/qwen3.5_fp8_b200_vllm.sh
 create mode 100755 benchmarks/single_node/qwen3.5_fp8_h100_sglang.sh
 create mode 100755 benchmarks/single_node/qwen3.5_fp8_h100_vllm.sh
 create mode 100755 benchmarks/single_node/qwen3.5_fp8_h200_sglang.sh
 create mode 100755 benchmarks/single_node/qwen3.5_fp8_h200_vllm.sh
 create mode 100755 benchmarks/single_node/qwen3.5triattn_fp8_h100_vllm.sh
 create mode 100755 benchmarks/single_node/qwen3.5triattn_fp8_h200_vllm.sh
 create mode 100644 datasets/isb1/.gitattributes
 create mode 100644 datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md
 create mode 100644 datasets/isb1/GMI_EXECUTION_PLAN.md
 create mode 100644 datasets/isb1/README.md
 create mode 100644 datasets/isb1/exports/core/chat_8k1k.json
 create mode 100644 datasets/isb1/exports/core/chat_8k1k_qwen3.5.json
 create mode 100644 datasets/isb1/exports/core/code_8k1k.json
 create mode 100644 datasets/isb1/exports/core/code_8k1k_qwen3.5.json
 create mode 100644 datasets/isb1/exports/extension_131k/chat_131k1k.json
 create mode 100644 datasets/isb1/exports/extension_131k/chat_131k1k_dsr1.json
 create mode 100644 datasets/isb1/exports/extension_131k/chat_131k1k_qwen3.5.json
 create mode 100644 datasets/isb1/exports/extension_131k/code_131k1k.json
 create mode 100644 datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
 create mode 100644 datasets/isb1/exports/extension_32k/chat_32k1k.json
 create mode 100644 datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json
 create mode 100644 datasets/isb1/exports/extension_32k/code_32k1k.json
 create mode 100644 datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json
 create mode 100644 datasets/isb1/exports/extension_64k/chat_64k1k.json
 create mode 100644 datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json
 create mode 100644 datasets/isb1/exports/extension_64k/code_64k1k.json
 create mode 100644 datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json
 create mode 100644 datasets/isb1/exports/preview/long_context_1m/README.md
 create mode 100644 datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1.json
 create mode 100644 datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json
 create mode 100644 datasets/isb1/exports/preview/long_context_1m/manifest.json
 create mode 100644 datasets/isb1/exports/preview/long_context_500k/README.md
 create mode 100644 datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1.json
 create mode 100644 datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1.json
 create mode 100644 datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json
 create mode 100644 datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json
 create mode 100644 datasets/isb1/exports/preview/long_context_500k/manifest.json
 create mode 100644 datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json
 create mode 100644 datasets/isb1/scripts/adapt_trace_replay_result.py
 create mode 100644 datasets/isb1/scripts/analyze_benchmark_distributions.py
 create mode 100644 datasets/isb1/scripts/collect_sweep_results.py
 create mode 100755 datasets/isb1/scripts/generate_qwen35_low_band_exports.py
 create mode 100644 datasets/isb1/scripts/gmi_analyze_sweep.py
 create mode 100755 datasets/isb1/scripts/gmi_full_suite.sh
 create mode 100644 datasets/isb1/scripts/gmi_kv_sweep.sh
 create mode 100755 datasets/isb1/scripts/gmi_portable_benchmark.sh
 create mode 100755 datasets/isb1/scripts/gmi_test_matrix.sh
 create mode 100755 datasets/isb1/scripts/gpu_profile_collector.sh
 create mode 100644 datasets/isb1/scripts/isb1_results_db.py
 create mode 100644 datasets/isb1/scripts/metrics_collector.py
 create mode 100644 datasets/isb1/scripts/plot_pareto.py
 create mode 100644 experimental/multiturn/vllm_benchmark/.gitignore
 create mode 100644 experimental/multiturn/vllm_benchmark/README.md
 create mode 100644 experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json
 create mode 100644 experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py
 create mode 100644 experimental/multiturn/vllm_benchmark/kv-cache-tester/README.md
 create mode 100644 experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/.gitkeep
 create mode 100644 experimental/multiturn/vllm_benchmark/launch/README.md
 create mode 100755 experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_b200.sh
 create mode 100755 experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh
 create mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_b200_vllm.sh
 create mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_h200_vllm.sh
 create mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_sglang.sh
 create mode 100755
experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_vllm.sh create mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_sglang.sh create mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_vllm.sh create mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_sglang.sh create mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_vllm.sh create mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_sglang.sh create mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_vllm.sh create mode 100644 runners/lib_single_node_script.sh create mode 100644 utils/bench_serving/benchmark_export_replay.py create mode 100644 utils/gate_isb1.py create mode 100644 utils/process_result_isb1.py create mode 100644 utils/summarize_isb1.py create mode 100644 utils/test_benchmark_export_replay.py create mode 100644 utils/test_gate_isb1.py create mode 100644 utils/test_process_result_isb1.py create mode 100644 utils/test_summarize_isb1.py create mode 100644 utils/test_verify_producer_sync.py create mode 100644 utils/verify_producer_sync.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..476a21b1c --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +datasets/isb1/exports/preview/long_context_1m/*.json filter=lfs diff=lfs merge=lfs -text +datasets/isb1/exports/**/*.json filter=lfs diff=lfs merge=lfs -text diff --git a/.github/configs/isb1-kv-stress-pr993.yaml b/.github/configs/isb1-kv-stress-pr993.yaml new file mode 100644 index 000000000..583d51302 --- /dev/null +++ b/.github/configs/isb1-kv-stress-pr993.yaml @@ -0,0 +1,4326 @@ +dsr1-fp4-b200-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-r1-fp4 + model-prefix: dsr1 + multinode: true + precision: fp4 + runner: b200-multinode + runtime-stack-id: standalone:dynamo-sglang +dsr1-fp4-b200-dynamo-sglang-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.8.post1-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-r1-fp4 + model-prefix: dsr1 + multinode: true + precision: fp4 + runner: b200-multinode + runtime-stack-id: standalone:dynamo-sglang +dsr1-fp4-b200-dynamo-trt-isb1-kv-stress: + benchmark-type: 
isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-r1-fp4 + model-prefix: dsr1 + multinode: true + precision: fp4 + runner: b200-multinode + runtime-stack-id: standalone:dynamo-trt +dsr1-fp4-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: nvidia/DeepSeek-R1-0528-FP4-V2 + model-prefix: dsr1 + multinode: false + precision: fp4 + runner: b200 + runtime-stack-id: standalone:sglang +dsr1-fp4-b200-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: nvidia/DeepSeek-R1-0528-FP4-V2 + model-prefix: dsr1 + multinode: false + precision: fp4 + runner: b200 + runtime-stack-id: standalone:trt +dsr1-fp4-b200-trt-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post3 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: nvidia/DeepSeek-R1-0528-FP4-V2 + model-prefix: dsr1 + multinode: false + precision: fp4 + runner: b200 + runtime-stack-id: standalone:trt +dsr1-fp4-b300-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-trt + hardware-profile-id: nvidia:b300_sxm_288gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: 
datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-r1-fp4 + model-prefix: dsr1 + multinode: true + precision: fp4 + runner: b300 + runtime-stack-id: standalone:dynamo-trt +dsr1-fp4-gb200-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-sglang + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: lmsysorg/sglang:v0.5.8-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: nvidia/DeepSeek-R1-0528-NVFP4-v2 + model-prefix: dsr1 + multinode: true + precision: fp4 + runner: gb200 + runtime-stack-id: standalone:dynamo-sglang +dsr1-fp4-gb200-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-trt + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: nvidia/DeepSeek-R1-0528-NVFP4-v2 + model-prefix: dsr1 + multinode: true + precision: fp4 + runner: gb200 + runtime-stack-id: standalone:dynamo-trt +dsr1-fp4-gb300-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-sglang + hardware-profile-id: nvidia:gb300_grace_blackwell_288gb + image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: nvidia/DeepSeek-R1-0528-NVFP4-v2 + model-prefix: dsr1 + multinode: true + precision: fp4 + runner: gb300 + runtime-stack-id: standalone:dynamo-sglang +dsr1-fp4-gb300-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-trt + hardware-profile-id: nvidia:gb300_grace_blackwell_288gb + image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + 
search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: nvidia/DeepSeek-R1-0528-NVFP4-v2 + model-prefix: dsr1 + multinode: true + precision: fp4 + runner: gb300 + runtime-stack-id: standalone:dynamo-trt +dsr1-fp4-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: amd/DeepSeek-R1-0528-MXFP4-Preview + model-prefix: dsr1 + multinode: false + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:atom +dsr1-fp4-mi355x-atom-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.2.0-ubuntu24.04-pytorch2.9-atom0.1.1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: amd/DeepSeek-R1-0528-MXFP4 + model-prefix: dsr1 + multinode: false + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:atom +dsr1-fp4-mi355x-sglang-disagg-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: sglang-disagg + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: amd/DeepSeek-R1-0528-MXFP4 + model-prefix: dsr1 + multinode: true + precision: fp4 + runner: mi355x-disagg + runtime-stack-id: standalone:sglang-disagg +dsr1-fp4-mi355x-sglang-disagg-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: sglang-disagg + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 
+ - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: amd/DeepSeek-R1-0528-MXFP4 + model-prefix: dsr1 + multinode: true + precision: fp4 + runner: mi355x-disagg + runtime-stack-id: standalone:sglang-disagg +dsr1-fp4-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: lmsysorg/sglang:v0.5.9-rocm700-mi35x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: amd/DeepSeek-R1-0528-MXFP4-Preview + model-prefix: dsr1 + multinode: false + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:sglang +dsr1-fp8-b200-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: true + precision: fp8 + runner: b200-multinode + runtime-stack-id: standalone:dynamo-sglang +dsr1-fp8-b200-dynamo-sglang-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: true + precision: fp8 + runner: b200-multinode + runtime-stack-id: standalone:dynamo-sglang +dsr1-fp8-b200-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + 
- noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: true + precision: fp8 + runner: b200-multinode + runtime-stack-id: standalone:dynamo-trt +dsr1-fp8-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: false + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +dsr1-fp8-b200-sglang-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: false + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +dsr1-fp8-b200-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: false + precision: fp8 + runner: b200 + runtime-stack-id: standalone:trt +dsr1-fp8-b200-trt-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post3 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: false + precision: fp8 + runner: b200 + runtime-stack-id: standalone:trt +dsr1-fp8-b300-dynamo-trt-isb1-kv-stress: + benchmark-type: 
isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-trt + hardware-profile-id: nvidia:b300_sxm_288gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: true + precision: fp8 + runner: b300 + runtime-stack-id: standalone:dynamo-trt +dsr1-fp8-gb200-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-sglang + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: lmsysorg/sglang:v0.5.8.post1-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: true + precision: fp8 + runner: gb200 + runtime-stack-id: standalone:dynamo-sglang +dsr1-fp8-gb200-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-trt + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: true + precision: fp8 + runner: gb200 + runtime-stack-id: standalone:dynamo-trt +dsr1-fp8-gb300-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-sglang + hardware-profile-id: nvidia:gb300_grace_blackwell_288gb + image: lmsysorg/sglang:v0.5.8.post1-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: true + precision: fp8 + runner: gb300 + runtime-stack-id: standalone:dynamo-sglang +dsr1-fp8-gb300-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-trt + 
hardware-profile-id: nvidia:gb300_grace_blackwell_288gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: true + precision: fp8 + runner: gb300 + runtime-stack-id: standalone:dynamo-trt +dsr1-fp8-h100-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-sglang + hardware-profile-id: nvidia:h100_sxm_80gb + image: lmsysorg/sglang:v0.5.8-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: true + precision: fp8 + runner: h100-multinode + runtime-stack-id: standalone:dynamo-sglang +dsr1-fp8-h100-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-trt + hardware-profile-id: nvidia:h100_sxm_80gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: true + precision: fp8 + runner: h100-multinode + runtime-stack-id: standalone:dynamo-trt +dsr1-fp8-h200-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.8.post1-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: true + precision: fp8 + runner: h200-multinode + runtime-stack-id: standalone:dynamo-sglang +dsr1-fp8-h200-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: dynamo-trt + hardware-profile-id: nvidia:h200_sxm_141gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 + 
kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: true + precision: fp8 + runner: h200-multinode + runtime-stack-id: standalone:dynamo-trt +dsr1-fp8-h200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: false + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +dsr1-fp8-h200-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: trt + hardware-profile-id: nvidia:h200_sxm_141gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: false + precision: fp8 + runner: h200 + runtime-stack-id: standalone:trt +dsr1-fp8-h200-trt-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: trt + hardware-profile-id: nvidia:h200_sxm_141gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: false + precision: fp8 + runner: h200 + runtime-stack-id: standalone:trt +dsr1-fp8-mi300x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: sglang + hardware-profile-id: amd:mi300x_192gb + image: lmsysorg/sglang:v0.5.9-rocm700-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + 
tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: false + precision: fp8 + runner: mi300x + runtime-stack-id: standalone:sglang +dsr1-fp8-mi325x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: sglang + hardware-profile-id: amd:mi325x_288gb + image: lmsysorg/sglang:v0.5.9-rocm700-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: false + precision: fp8 + runner: mi325x + runtime-stack-id: standalone:sglang +dsr1-fp8-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: false + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:atom +dsr1-fp8-mi355x-atom-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: false + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:atom +dsr1-fp8-mi355x-sglang-disagg-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: sglang-disagg + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: 
deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: true + precision: fp8 + runner: mi355x-disagg + runtime-stack-id: standalone:sglang-disagg +dsr1-fp8-mi355x-sglang-disagg-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + disagg: true + framework: sglang-disagg + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: true + precision: fp8 + runner: mi355x-disagg + runtime-stack-id: standalone:sglang-disagg +dsr1-fp8-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: dsr1 + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: lmsysorg/sglang:v0.5.9-rocm700-mi35x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + multinode: false + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:sglang +glm5-fp4-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: glm5 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: nvidia/GLM-5-NVFP4 + model-prefix: glm5 + multinode: false + precision: fp4 + runner: b200 + runtime-stack-id: standalone:sglang +glm5-fp8-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: glm5 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + multinode: false + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +glm5-fp8-h200-sglang-isb1-kv-stress: + 
benchmark-type: isb1_kv_stress + canonical-model-id: glm5 + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:glm5-hopper + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + multinode: false + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +glm5-fp8-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: glm5 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + multinode: false + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:atom +glm5-fp8-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: glm5 + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + multinode: false + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:sglang +gptoss-fp4-b200-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gptoss + framework: trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: openai/gpt-oss-120b + model-prefix: gptoss + multinode: false + precision: fp4 + runner: b200 + runtime-stack-id: standalone:trt +gptoss-fp4-b200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gptoss + framework: vllm + hardware-profile-id: nvidia:b200_sxm_180gb + image: vllm/vllm-openai:v0.15.1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: 
multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: openai/gpt-oss-120b + model-prefix: gptoss + multinode: false + precision: fp4 + runner: b200 + runtime-stack-id: standalone:vllm +gptoss-fp4-gb200-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gptoss + disagg: true + framework: dynamo-trt + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.7.0.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: openai/gpt-oss-120b + model-prefix: gptoss + multinode: true + precision: fp4 + runner: gb200 + runtime-stack-id: standalone:dynamo-trt +gptoss-fp4-h100-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gptoss + framework: vllm + hardware-profile-id: nvidia:h100_sxm_80gb + image: vllm/vllm-openai:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: openai/gpt-oss-120b + model-prefix: gptoss + multinode: false + precision: fp4 + runner: h100 + runtime-stack-id: standalone:vllm +gptoss-fp4-h200-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gptoss + framework: trt + hardware-profile-id: nvidia:h200_sxm_141gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc11 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: openai/gpt-oss-120b + model-prefix: gptoss + multinode: false + precision: fp4 + runner: h200 + runtime-stack-id: standalone:trt +gptoss-fp4-h200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gptoss + framework: vllm + hardware-profile-id: nvidia:h200_sxm_141gb + image: vllm/vllm-openai:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 
+ - 32 + - 64 + - 128 + - 256 + model: openai/gpt-oss-120b + model-prefix: gptoss + multinode: false + precision: fp4 + runner: h200 + runtime-stack-id: standalone:vllm +gptoss-fp4-mi300x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gptoss + framework: vllm + hardware-profile-id: amd:mi300x_192gb + image: vllm/vllm-openai-rocm:v0.17.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: openai/gpt-oss-120b + model-prefix: gptoss + multinode: false + precision: fp4 + runner: mi300x + runtime-stack-id: standalone:vllm +gptoss-fp4-mi325x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gptoss + framework: vllm + hardware-profile-id: amd:mi325x_288gb + image: vllm/vllm-openai-rocm:v0.17.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: openai/gpt-oss-120b + model-prefix: gptoss + multinode: false + precision: fp4 + runner: mi325x + runtime-stack-id: standalone:vllm +gptoss-fp4-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gptoss + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: openai/gpt-oss-120b + model-prefix: gptoss + multinode: false + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:atom +gptoss-fp4-mi355x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gptoss + framework: vllm + hardware-profile-id: amd:mi355x_288gb + image: vllm/vllm-openai-rocm:v0.17.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: amd/gpt-oss-120b-w-mxfp4-a-fp8 + model-prefix: gptoss + multinode: false + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:vllm +kimik2.5-fp4-b200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimik2.5 + framework: vllm + 
hardware-profile-id: nvidia:b200_sxm_180gb + image: vllm/vllm-openai:v0.17.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + multinode: false + precision: fp4 + runner: b200 + runtime-stack-id: standalone:vllm +kimik2.5-fp4-gb200-dynamo-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimik2.5 + disagg: true + framework: dynamo-vllm + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: vllm/vllm-openai:v0.18.0-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + multinode: true + precision: fp4 + runner: gb200 + runtime-stack-id: standalone:dynamo-vllm +kimik2.5-fp4-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimik2.5 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + multinode: false + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:atom +kimik2.5-fp4-mi355x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimik2.5 + framework: vllm + hardware-profile-id: amd:mi355x_288gb + image: vllm/vllm-openai-rocm:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + multinode: false + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:vllm +kimik2.5-int4-b200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimik2.5 + framework: vllm + hardware-profile-id: nvidia:b200_sxm_180gb + image: vllm/vllm-openai:v0.15.1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - 
duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + multinode: false + precision: int4 + runner: b200 + runtime-stack-id: standalone:vllm +kimik2.5-int4-h200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimik2.5 + framework: vllm + hardware-profile-id: nvidia:h200_sxm_141gb + image: vllm/vllm-openai:v0.16.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + multinode: false + precision: int4 + runner: h200 + runtime-stack-id: standalone:vllm +kimik2.5-int4-mi300x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimik2.5 + framework: vllm + hardware-profile-id: amd:mi300x_192gb + image: vllm/vllm-openai-rocm:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + multinode: false + precision: int4 + runner: mi300x + runtime-stack-id: standalone:vllm +kimik2.5-int4-mi325x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimik2.5 + framework: vllm + hardware-profile-id: amd:mi325x_288gb + image: vllm/vllm-openai-rocm:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + multinode: false + precision: int4 + runner: mi325x + runtime-stack-id: standalone:vllm +kimik2.5-int4-mi355x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimik2.5 + framework: vllm + hardware-profile-id: amd:mi355x_288gb + image: vllm/vllm-openai-rocm:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: moonshotai/Kimi-K2.5 + 
model-prefix: kimik2.5 + multinode: false + precision: int4 + runner: mi355x + runtime-stack-id: standalone:vllm +minimaxm2.5-fp4-b200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimaxm2.5 + framework: vllm + hardware-profile-id: nvidia:b200_sxm_180gb + image: vllm/vllm-openai:v0.19.0-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: nvidia/MiniMax-M2.5-NVFP4 + model-prefix: minimaxm2.5 + multinode: false + precision: fp4 + runner: b200 + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-b200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimaxm2.5 + framework: vllm + hardware-profile-id: nvidia:b200_sxm_180gb + image: vllm/vllm-openai:v0.19.0-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + multinode: false + precision: fp8 + runner: b200 + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-h100-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimaxm2.5 + framework: vllm + hardware-profile-id: nvidia:h100_sxm_80gb + image: vllm/vllm-openai:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + multinode: false + precision: fp8 + runner: h100 + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-h200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimaxm2.5 + framework: vllm + hardware-profile-id: nvidia:h200_sxm_141gb + image: vllm/vllm-openai:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + multinode: false + precision: fp8 + runner: h200 + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-mi300x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimaxm2.5 + framework: vllm + hardware-profile-id: amd:mi300x_192gb + image: 
vllm/vllm-openai-rocm:v0.16.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + multinode: false + precision: fp8 + runner: mi300x + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-mi325x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimaxm2.5 + framework: vllm + hardware-profile-id: amd:mi325x_288gb + image: vllm/vllm-openai-rocm:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + multinode: false + precision: fp8 + runner: mi325x + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimaxm2.5 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + multinode: false + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:atom +minimaxm2.5-fp8-mi355x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimaxm2.5 + framework: vllm + hardware-profile-id: amd:mi355x_288gb + image: vllm/vllm-openai-rocm:v0.19.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + multinode: false + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:vllm +qwen3.5-bf16-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3.5 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + 
offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + multinode: false + precision: bf16 + runner: b200 + runtime-stack-id: standalone:sglang +qwen3.5-bf16-mi300x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3.5 + framework: sglang + hardware-profile-id: amd:mi300x_192gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + multinode: false + precision: bf16 + runner: mi300x + runtime-stack-id: standalone:sglang +qwen3.5-bf16-mi325x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3.5 + framework: sglang + hardware-profile-id: amd:mi325x_288gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + multinode: false + precision: bf16 + runner: mi325x + runtime-stack-id: standalone:sglang +qwen3.5-bf16-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3.5 + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260215 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + multinode: false + precision: bf16 + runner: mi355x + runtime-stack-id: standalone:sglang +qwen3.5-fp4-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3.5 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + 
- 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: nvidia/Qwen3.5-397B-A17B-NVFP4 + model-prefix: qwen3.5 + multinode: false + precision: fp4 + runner: b200 + runtime-stack-id: standalone:sglang +qwen3.5-fp4-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3.5 + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: lmsysorg/sglang:v0.5.10-rocm720-mi35x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: amd/Qwen3.5-397B-A17B-MXFP4 + model-prefix: qwen3.5 + multinode: false + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:sglang +qwen3.5-fp8-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3.5 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130-amd64 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + multinode: false + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-b200-sglang-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3.5 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + multinode: false + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-h200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3.5 + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.9-cu129-amd64 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + multinode: false + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang 
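As a rule of thumb for the sweep entries above: each kv-stress entry fans out into the full cross-product of its `users` ladder and `offload-modes` list at a fixed `duration-s`. A minimal sketch of that expansion, assuming a hypothetical `expand_search_space` helper (the workflow's actual expansion logic is not shown in this patch):

```python
# Hedged sketch: how one isb1_kv_stress search-space entry fans out into
# individual runs. The helper is illustrative; only the grid arithmetic
# follows directly from the YAML above.
from itertools import product

search_space = {
    "duration_s": 1800,
    "offload_modes": ["on", "off", "noprefix"],
    "users": [2, 4, 8, 16, 32, 64, 128, 256],
}

def expand_search_space(space):
    """Yield one run spec per (users, offload-mode) grid point."""
    for users, mode in product(space["users"], space["offload_modes"]):
        yield {"users": users, "offload_mode": mode, "duration_s": space["duration_s"]}

runs = list(expand_search_space(search_space))
# 8 user levels x 3 offload modes = 24 runs; at 1800 s each, that is
# 12 hours of steady-state sweep time per entry (before warmup/teardown).
print(len(runs), sum(r["duration_s"] for r in runs) / 3600)
```

With the common 8-level user ladder and three offload modes this is 24 runs, i.e. 12 hours of steady-state sweep time per entry before warmup and teardown; the mi355x entries add a 512-user level, raising that to 27 runs (13.5 hours).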
+qwen3.5-fp8-h200-sglang-isb1-kv-stress-500k: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3.5 + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.9-cu129-amd64 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + users: + - 1 + - 2 + - 4 + - 8 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + tp: 8 + users: + - 1 + - 2 + - 4 + - 8 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + multinode: false + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-h200-sglang-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3.5 + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.10.post1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + multinode: false + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi300x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3.5 + framework: sglang + hardware-profile-id: amd:mi300x_192gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + multinode: false + precision: fp8 + runner: mi300x + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi325x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3.5 + framework: sglang + hardware-profile-id: amd:mi325x_288gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + multinode: false + precision: fp8 + runner: mi325x + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3.5 + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: 
datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + multinode: false + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi355x-sglang-isb1-kv-stress-500k: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3.5 + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + users: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + tp: 8 + users: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + multinode: false + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:sglang diff --git a/.github/configs/isb1-kv-stress.yaml b/.github/configs/isb1-kv-stress.yaml new file mode 100644 index 000000000..9ee07ef5d --- /dev/null +++ b/.github/configs/isb1-kv-stress.yaml @@ -0,0 +1,96 @@ +# Dedicated ISB1 KV cache stress sweeps (CTO-approved schema). +# +# This file is intentionally separate from isb1-master.yaml and uses +# benchmark-type: isb1_kv_stress with kv-stress-configs. 
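Because this file deliberately diverges from the isb1-master.yaml schema, a small structural check can catch entries that drift between the two layouts. A minimal sketch, assuming PyYAML is available; the assertions simply mirror the fields every entry in this file carries and are illustrative, not part of the repository's tooling:

```python
# Hedged sketch: verify every entry in the dedicated KV-stress config opts
# into the isb1_kv_stress schema rather than the master replay schema.
import yaml  # PyYAML, assumed available alongside the repo's Python tooling

with open(".github/configs/isb1-kv-stress.yaml") as f:
    entries = yaml.safe_load(f)

for name, entry in entries.items():
    # Every entry declares the KV-stress benchmark type...
    assert entry["benchmark-type"] == "isb1_kv_stress", name
    for cfg in entry["kv-stress-configs"]:
        # ...and each stress config points at a materialized export bundle
        # replayed in multi-turn mode.
        assert cfg["export-file"].endswith(".json"), name
        assert cfg["request-mode"] == "multi-turn", name
```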
+ +gptoss-fp4-h200-isb1-kv-stress-vllm-code: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_kv_stress + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + workload-type: code + search-space: + - users: [2, 4, 8, 16, 32, 64, 128, 256] + offload-modes: ["on", "off", "noprefix"] + duration-s: 1800 + +gptoss-fp4-b200-isb1-kv-stress-vllm-code: + image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: b200 + benchmark-type: isb1_kv_stress + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + workload-type: code + search-space: + - users: [2, 4, 8, 16, 32, 64, 128, 256] + offload-modes: ["on", "off", "noprefix"] + duration-s: 1800 + +qwen3.5-fp8-h200-isb1-kv-stress-vllm-code: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_kv_stress + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 131272 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + workload-type: code + search-space: + - users: [2, 4, 8, 16, 32, 64, 128, 256] + offload-modes: ["on", "off", "noprefix"] + duration-s: 1800 + +qwen3.5-fp8-b200-isb1-kv-stress-vllm-code: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_kv_stress + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 131272 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + workload-type: code + search-space: + - users: [2, 4, 8, 16, 32, 64, 128, 256] + offload-modes: ["on", "off", "noprefix"] + duration-s: 1800 diff --git a/.github/configs/isb1-master.yaml b/.github/configs/isb1-master.yaml new file mode 100644 index 000000000..99c111967 --- /dev/null +++ b/.github/configs/isb1-master.yaml @@ -0,0 +1,1723 @@ +# PR2 packaged the core 8k1k replay bundles. +# PR4 adds truthful long-context extension replay lanes using only the materialized +# extension_32k / extension_64k / extension_131k code bundles. +# These extension lanes are served-shape replay artifacts derived from larger source +# workloads; they are not native 500k+/1M+ InferenceX served-lane claims. +# +# Core entries keep an explicit 8k1k max-model-len. Extension entries intentionally +# omit max-model-len so the ISB1 workflow derives the served-shape value from the +# export stem (32k1k / 64k1k / 131k1k) at execution time. 
+# +# Official replay-configs pin support-status: supported so the workflow only replays +# the supported subset of mixed-status export bundles. +# All currently runnable rows also resolve to +# benchmark_certification_status=dataset_replay_verified. +# Phase 2 adds truthful chat-extension widening plus bounded preview/offload +# lanes. Preview rows stay explicit via support-status: reviewed_preview and the +# dedicated preview export paths. The current replay closure covers dsr1, +# gptoss, and qwen3.5 across core 8k1k plus extension bands, with bounded +# 500k code preview for gptoss and qwen3.5 on standalone sglang/vllm across +# b200/h100/h200. + +dsr1-fp8-b200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +dsr1-fp8-h200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +dsr1-fp8-b200-isb1-vllm: + image: vllm/vllm-openai:v0.19.0-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +dsr1-fp8-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + 
num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-b200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-h100-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-h200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-b200-isb1-vllm: + # Keep the existing B200 GPT-OSS vLLM pin from the official throughput lane. 
+ image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-h100-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-b200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-h100-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + 
support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-h200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-b200-isb1-vllm: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-h100-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +dsr1-fp8-b200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 
+ model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +dsr1-fp8-h200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +dsr1-fp8-b200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.19.0-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + 
support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +dsr1-fp8-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +gptoss-fp4-b200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + request-mode: 
multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: datasets/isb1/exports/extension_131k/sglang/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +gptoss-fp4-h100-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: datasets/isb1/exports/extension_131k/sglang/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +gptoss-fp4-h200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: 
datasets/isb1/exports/extension_131k/sglang/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +gptoss-fp4-b200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: datasets/isb1/exports/extension_131k/vllm/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +gptoss-fp4-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: datasets/isb1/exports/extension_131k/vllm/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + 
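+# NOTE: the extension lanes in this file share one sweep ladder: 32k/64k
+# exports sweep max-concurrency 4 then 8, 131k exports drop to 2 then 4, and
+# the first search-space point of each lane sets num-warmup-sessions: 1.
+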
+gptoss-fp4-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: datasets/isb1/exports/extension_131k/vllm/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-b200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-h100-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: 
datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-h200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-b200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 
4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-b200-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: 
qwen3.5 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +qwen3.5-fp8-h100-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +qwen3.5-fp8-h200-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +qwen3.5-fp8-b200-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +qwen3.5-fp8-h100-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +qwen3.5-fp8-h200-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + 
runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-b200-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-h100-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-h200-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-b200-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-h100-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: 
nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-h200-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-b200-isb1-sglang-offload-core-preview-chat: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + max-turns-per-session: 6 + num-warmup-sessions: 0 + +gptoss-fp4-h100-isb1-sglang-offload-core-preview-chat: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + max-turns-per-session: 6 + num-warmup-sessions: 0 + +gptoss-fp4-h200-isb1-sglang-offload-core-preview-chat: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: 
datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + max-turns-per-session: 6 + num-warmup-sessions: 0 + +gptoss-fp4-b200-isb1-vllm-offload-core-preview-code: + image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + max-turns-per-session: 6 + num-warmup-sessions: 0 + +gptoss-fp4-h100-isb1-vllm-offload-core-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + max-turns-per-session: 6 + num-warmup-sessions: 0 + +gptoss-fp4-h200-isb1-vllm-offload-core-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + 
max-turns-per-session: 6 + num-warmup-sessions: 0 diff --git a/.github/configs/isb1-qwen-1m-preview.yaml b/.github/configs/isb1-qwen-1m-preview.yaml new file mode 100644 index 000000000..1de9c7339 --- /dev/null +++ b/.github/configs/isb1-qwen-1m-preview.yaml @@ -0,0 +1,53 @@ +# Manual-only gated Qwen 1M preview surface. +# The selected export cells remain support-status=reviewed_preview and +# benchmark_certification_status=dataset_replay_verified, but this file is +# intentionally separate from isb1-master.yaml so the lane stays out of the +# ordinary runnable support statement. +# +# Use only for explicit validation dispatches while KV-offload observability and +# correctness remain under review. Running this file does not imply native 1M +# served-lane support or KV-offload certification. + +qwen3.5-fp8-b200-isb1-sglang-1m-gated-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 1048576 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 1 + max-turns-per-session: 3 + num-warmup-sessions: 0 + +qwen3.5-fp8-b200-isb1-vllm-1m-gated-preview-code: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 1048576 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 1 + max-turns-per-session: 3 + num-warmup-sessions: 0 diff --git a/.github/configs/isb1-triattn-preview.yaml b/.github/configs/isb1-triattn-preview.yaml new file mode 100644 index 000000000..629cb8fe9 --- /dev/null +++ b/.github/configs/isb1-triattn-preview.yaml @@ -0,0 +1,291 @@ +# TriAttention KV-compression preview lanes for ISB1 replay benchmarks. +# +# These entries deploy vLLM with the TriAttention plugin enabled for runtime +# KV-cache compression on H100/H200 Hopper-class GPUs. The plugin uses env +# vars TRIATTN_RUNTIME_KV_BUDGET and TRIATTN_RUNTIME_SPARSE_STATS_PATH, +# configured in the benchmark scripts. +# +# Key differences from baseline vLLM ISB1 entries: +# - model-prefix includes "triattn" suffix to route to dedicated scripts +# - Prefix caching disabled (incompatible with KV compression) +# - max-num-batched-tokens lowered to 1024 (prevents OOM from large prefills) +# - KV budget auto-detected: 2048 for code workloads, 12000 for chat workloads +# +# This file is intentionally separate from isb1-master.yaml — TriAttention +# preview lanes stay out of the ordinary runnable support statement. +# Use only for explicit validation dispatches. 
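+#
+# As a sketch (the workflow and its inputs are the ones defined in
+# run-isb1-sweep.yml; the flag values here are illustrative, not a committed
+# invocation), a gated validation dispatch looks like:
+#
+#   gh workflow run run-isb1-sweep.yml \
+#     -f config-files=.github/configs/isb1-triattn-preview.yaml \
+#     -f runner-type="h100 h200"
+#
+# and the dedicated triattn scripts are expected to wire the plugin env vars
+# roughly as follows (the case split is an assumed illustration of the
+# auto-detection described above, not the scripts' literal logic):
+#
+#   case "$EXPORT_FILE" in
+#     *code_*) export TRIATTN_RUNTIME_KV_BUDGET=2048 ;;   # code workloads
+#     *)       export TRIATTN_RUNTIME_KV_BUDGET=12000 ;;  # chat workloads
+#   esac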
+# +# Prerequisites: +# - triattention pip package installed in the container (or installed at runtime) +# - Optional: pre-calibrated stats at /workspace/triattn_stats/_stats.pt + +# --------------------------------------------------------------------------- +# DeepSeek-R1 FP8 — H100/H200 with TriAttention — core 8k1k +# --------------------------------------------------------------------------- + +dsr1triattn-fp8-h100-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1triattn + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +dsr1triattn-fp8-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1triattn + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +# --------------------------------------------------------------------------- +# DeepSeek-R1 FP8 — H100/H200 with TriAttention — long-context extensions +# --------------------------------------------------------------------------- + +dsr1triattn-fp8-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1triattn + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + +dsr1triattn-fp8-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1triattn + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - 
export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + +# --------------------------------------------------------------------------- +# Qwen 3.5 FP8 — H100/H200 with TriAttention — extension only +# (Qwen 3.5 is not present in core 8k1k exports; only extension 131k) +# --------------------------------------------------------------------------- + +qwen3.5triattn-fp8-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5triattn + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + +qwen3.5triattn-fp8-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5triattn + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + +# --------------------------------------------------------------------------- +# GPT-OSS-120B FP4 — H100/H200 with TriAttention — core 8k1k +# --------------------------------------------------------------------------- + +gptosstriattn-fp4-h100-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptosstriattn + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptosstriattn-fp4-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptosstriattn + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +# --------------------------------------------------------------------------- +# GPT-OSS-120B FP4 — H100/H200 with 
TriAttention — long-context extensions +# --------------------------------------------------------------------------- + +gptosstriattn-fp4-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptosstriattn + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + +gptosstriattn-fp4-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptosstriattn + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 diff --git a/.github/workflows/benchmark-isb1-tmpl.yml b/.github/workflows/benchmark-isb1-tmpl.yml new file mode 100644 index 000000000..d152d2062 --- /dev/null +++ b/.github/workflows/benchmark-isb1-tmpl.yml @@ -0,0 +1,451 @@ +name: Template - Benchmark ISB1 +on: + workflow_call: + inputs: + runner: + required: true + type: string + image: + required: true + type: string + model: + required: true + type: string + model-prefix: + required: true + type: string + precision: + required: true + type: string + framework: + required: true + type: string + exp-name: + required: true + type: string + benchmark-type: + required: true + type: string + export-file: + required: true + type: string + runtime-stack-id: + required: true + type: string + hardware-profile-id: + required: true + type: string + canonical-model-id: + required: true + type: string + support-status: + required: false + type: string + default: '' + request-mode: + required: true + type: string + max-concurrency: + required: true + type: string + max-sessions: + required: false + type: string + default: '' + max-turns-per-session: + required: false + type: string + default: '' + max-output-len: + required: false + type: string + default: '' + num-warmup-sessions: + required: false + type: string + default: '0' + ignore-waits: + required: false + type: boolean + default: false + ignore-eos: + required: false + type: boolean + default: false + max-model-len: + required: false + type: string + default: '' + tp-override: + required: false + type: string + default: '' + ep-override: + required: false + type: string + default: '' + trace-source: + required: false + type: string + default: '' + offload-mode: + required: false + type: string + default: '' + kv-cache-dtype: + required: false + type: string + default: '' + disable-prefix-caching: + required: false + type: boolean + default: false + benchmark-duration-s: + required: false + type: string + 
default: '' + workload-type: + required: false + type: string + default: '' + vllm-cpu-offload-gb: + required: false + type: string + default: '' + vllm-swap-space-gb: + required: false + type: string + default: '' + sglang-mem-fraction-override: + required: false + type: string + default: '' + sglang-chunked-prefill-override: + required: false + type: string + default: '' + ref: + description: Git ref (branch/sha) to checkout + required: false + type: string + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_CACHE: '/mnt/hf_hub_cache/' + EXP_NAME: ${{ inputs.exp-name }} + MODEL: ${{ inputs.model }} + MODEL_PREFIX: ${{ inputs.model-prefix }} + IMAGE: ${{ inputs.image }} + FRAMEWORK: ${{ inputs.framework }} + PRECISION: ${{ inputs.precision }} + BENCHMARK_TYPE: ${{ inputs.benchmark-type }} + EXPORT_FILE: ${{ inputs.export-file }} + RUNTIME_STACK_ID: ${{ inputs.runtime-stack-id }} + HARDWARE_PROFILE_ID: ${{ inputs.hardware-profile-id }} + CANONICAL_MODEL_ID: ${{ inputs.canonical-model-id }} + SUPPORT_STATUS: ${{ inputs.support-status }} + REQUEST_MODE: ${{ inputs.request-mode }} + MAX_CONCURRENCY: ${{ inputs.max-concurrency }} + MAX_SESSIONS: ${{ inputs.max-sessions }} + MAX_TURNS_PER_SESSION: ${{ inputs.max-turns-per-session }} + MAX_OUTPUT_LEN: ${{ inputs.max-output-len }} + NUM_WARMUP_SESSIONS: ${{ inputs.num-warmup-sessions }} + IGNORE_WAITS: ${{ inputs.ignore-waits }} + IGNORE_EOS: ${{ inputs.ignore-eos }} + OFFLOAD_MODE: ${{ inputs.offload-mode }} + KV_CACHE_DTYPE: ${{ inputs.kv-cache-dtype }} + DISABLE_PREFIX_CACHING: ${{ inputs.disable-prefix-caching }} + BENCHMARK_DURATION_S: ${{ inputs.benchmark-duration-s }} + WORKLOAD_TYPE: ${{ inputs.workload-type }} + VLLM_CPU_OFFLOAD_GB: ${{ inputs.vllm-cpu-offload-gb }} + VLLM_SWAP_SPACE_GB: ${{ inputs.vllm-swap-space-gb }} + SGLANG_MEM_FRACTION_OVERRIDE: ${{ inputs.sglang-mem-fraction-override }} + SGLANG_CHUNKED_PREFILL_OVERRIDE: ${{ inputs.sglang-chunked-prefill-override }} + TP_OVERRIDE: ${{ inputs.tp-override }} + EP_OVERRIDE: ${{ inputs.ep-override }} + TRACE_SOURCE: ${{ inputs.trace-source }} + PYTHONDONTWRITEBYTECODE: '1' + PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache + +permissions: + contents: read + +jobs: + benchmark: + runs-on: ${{ inputs.runner }} + timeout-minutes: 300 + name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | ${{ inputs.benchmark-type }} conc-${{ inputs.max-concurrency }}" + steps: + - name: Resource cleanup (pre-run) + run: &resource-cleanup | + if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then + echo "[Docker] Cleaning up resources ..." + docker ps -aq | xargs -r docker rm -f + docker network prune -f + while [ -n "$(docker ps -aq)" ]; do + docker ps -a + sleep 5 + done + fi + + if command -v squeue >/dev/null 2>&1; then + if [[ "${{ runner.name }}" == h100-* || "${{ runner.name }}" == h200-* || "${{ runner.name }}" == b200-* ]]; then + echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." + scancel --name="${{ runner.name }}" || true + while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do + squeue --name="${{ runner.name }}" + sleep 5 + done + else + echo "[Slurm] Cleaning up jobs for user: $USER ..." 
+              scancel -u "$USER" || true
+              while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do
+                squeue -u "$USER"
+                sleep 5
+              done
+            fi
+          fi
+
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          token: ${{ secrets.REPO_PAT }}
+          fetch-depth: 0
+          ref: ${{ inputs.ref || github.ref }}
+          clean: false
+
+      - name: Certify ISB1 export contract
+        env:
+          INPUT_EXPORT_FILE: ${{ inputs.export-file }}
+          INPUT_RUNTIME_STACK_ID: ${{ inputs.runtime-stack-id }}
+          INPUT_HARDWARE_PROFILE_ID: ${{ inputs.hardware-profile-id }}
+          INPUT_CANONICAL_MODEL_ID: ${{ inputs.canonical-model-id }}
+          INPUT_SUPPORT_STATUS: ${{ inputs.support-status }}
+          INPUT_MAX_MODEL_LEN: ${{ inputs.max-model-len }}
+        run: |
+          python3 - <<'PY'
+          import json
+          import os
+          import re
+          from pathlib import Path
+
+          export_path = Path(os.environ["INPUT_EXPORT_FILE"])
+          if not export_path.exists():
+              raise SystemExit(f"Missing ISB1 export file: {export_path}")
+
+          payload = json.loads(export_path.read_text())
+          exports = payload.get("exports")
+          if not isinstance(exports, list) or not exports:
+              raise SystemExit(
+                  f"ISB1 export file must contain a non-empty 'exports' list: {export_path}"
+              )
+
+          support_status = os.environ.get("INPUT_SUPPORT_STATUS", "").strip() or None
+          explicit_max_model_len = os.environ.get("INPUT_MAX_MODEL_LEN", "").strip()
+          # Export stems such as 'code_32k1k' encode the served shape as <isl>k<osl>k;
+          # mixed-shape exports lack that marker and must pin max-model-len explicitly.
+          if not re.search(r"(?P<isl>\d+)k(?P<osl>\d+)k", export_path.stem) and not explicit_max_model_len:
+              raise SystemExit(
+                  "Mixed-shape ISB1 exports require explicit max-model-len in the workflow input. "
+                  f"Missing for '{export_path}'."
+              )
+
+          identity_cells = [
+              cell
+              for cell in exports
+              if cell.get("runtime_stack_id") == os.environ["INPUT_RUNTIME_STACK_ID"]
+              and cell.get("hardware_profile_id") == os.environ["INPUT_HARDWARE_PROFILE_ID"]
+              and cell.get("canonical_model_id") == os.environ["INPUT_CANONICAL_MODEL_ID"]
+          ]
+          identity_statuses = sorted(
+              {
+                  cell.get("support_status")
+                  for cell in identity_cells
+                  if cell.get("support_status") is not None
+              }
+          )
+          matching_cells = [
+              cell
+              for cell in identity_cells
+              if support_status is None or cell.get("support_status") == support_status
+          ]
+
+          if support_status is None and len(identity_statuses) > 1:
+              raise SystemExit(
+                  f"Ambiguous ISB1 support tier for {export_path}; identity spans {identity_statuses}. "
+                  "Pin support-status explicitly."
+              )
+          if not matching_cells:
+              raise SystemExit(
+                  "No ISB1 export cell matches the requested workflow identity/tier for "
+                  f"{export_path}. Available tiers for that identity: {identity_statuses or ['']}"
+              )
+
+          certification_statuses = sorted(
+              {
+                  cell.get("benchmark_certification_status")
+                  for cell in matching_cells
+                  if cell.get("benchmark_certification_status") is not None
+              }
+          )
+          if not certification_statuses:
+              raise SystemExit(
+                  "Selected ISB1 export cells must declare benchmark_certification_status. "
+                  f"Missing for '{export_path}'."
+              )
+          if certification_statuses != ["dataset_replay_verified"]:
+              raise SystemExit(
+                  "Current InferenceX ISB1 consumer lanes only accept "
+                  "benchmark_certification_status=dataset_replay_verified. "
+                  f"Selected cells for '{export_path}' resolved to {certification_statuses}."
+              )
+
+          print(
+              "Certified ISB1 export contract for "
+              f"{export_path} with support-status={support_status or ''} "
+              f"and benchmark_certification_status={certification_statuses[0]}"
+          )
+          PY
+
+      - name: Derive ISB1 runner env
+        env:
+          INPUT_RUNNER: ${{ inputs.runner }}
+          INPUT_EXPORT_FILE: ${{ inputs.export-file }}
+          INPUT_MAX_MODEL_LEN: ${{ inputs.max-model-len }}
+          INPUT_MAX_CONCURRENCY: ${{ inputs.max-concurrency }}
+          INPUT_TP_OVERRIDE: ${{ inputs.tp-override }}
+        run: |
+          python3 - <<'PY' >> "$GITHUB_ENV"
+          import json
+          import os
+          import re
+          from pathlib import Path
+
+          runner = os.environ["INPUT_RUNNER"].lower()
+          export_file = os.environ["INPUT_EXPORT_FILE"]
+          explicit_max_model_len = os.environ.get("INPUT_MAX_MODEL_LEN", "").strip()
+          max_concurrency = os.environ["INPUT_MAX_CONCURRENCY"]
+
+          if runner.startswith(("h100", "h200", "b200")):
+              tp = 8
+          else:
+              raise SystemExit(
+                  f"ISB1 replay lane is NVIDIA-first in PR1b; unsupported runner '{runner}'."
+              )
+
+          tp_override = os.environ.get("INPUT_TP_OVERRIDE", "").strip()
+          if tp_override:
+              tp = int(tp_override)
+
+          if tp < 8:
+              raise SystemExit(
+                  f"ISB1 replay requires TP=8 on NVIDIA runners; derived TP={tp} for runner '{runner}'."
+              )
+
+          export_path = Path(export_file)
+          # Stems such as 'code_32k1k' encode ISL/OSL in units of 1024 tokens.
+          match = re.search(r"(?P<isl>\d+)k(?P<osl>\d+)k", export_path.stem)
+
+          if match:
+              isl = int(match.group("isl")) * 1024
+              osl = int(match.group("osl")) * 1024
+          else:
+              try:
+                  payload = json.loads(export_path.read_text())
+              except Exception as exc:
+                  raise SystemExit(
+                      f"Could not inspect preview export metadata from '{export_file}': {exc}"
+                  )
+              served_shape = payload.get("served_shape") or {}
+              isl = int(served_shape.get("isl", 0) or 0)
+              osl = int(served_shape.get("osl", 0) or 0)
+              if not explicit_max_model_len:
+                  raise SystemExit(
+                      "Mixed-shape preview exports require explicit max-model-len in the ISB1 config. "
+                      f"Missing for '{export_file}'."
+                  )
+
+          if explicit_max_model_len:
+              max_model_len = int(explicit_max_model_len)
+          else:
+              # Headroom over ISL+OSL: +200 tokens for long shapes, +20 for short,
+              # e.g. code_32k1k -> 32768 + 1024 + 200 = 33992.
+              max_model_len = isl + osl + (200 if max(isl, osl) >= 8192 else 20)
+
+          print(f"TP={tp}")
+          print("EP_SIZE=1")
+          print("DP_ATTENTION=false")
+          print("SPEC_DECODING=none")
+          print("DISAGG=false")
+          print(f"CONC={max_concurrency}")
+          print(f"ISL={isl}")
+          print(f"OSL={osl}")
+          print(f"MAX_MODEL_LEN={max_model_len}")
+          print("RANDOM_RANGE_RATIO=1.0")
+          print(f"EXPORT_STEM={Path(export_file).stem}")
+          PY
+
+      - id: launch
+        name: Launch job script
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+          RUNNER_TYPE: ${{ inputs.runner }}
+        run: |
+          RESULT_FILENAME="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_${BENCHMARK_TYPE}_${EXPORT_STEM}_conc${MAX_CONCURRENCY}_${RUNNER_NAME}"
+          echo "RESULT_FILENAME=${RESULT_FILENAME}" >> "$GITHUB_ENV"
+          echo "result_filename=${RESULT_FILENAME}" >> "$GITHUB_OUTPUT"
+          bash ./runners/launch_${RUNNER_NAME%%_*}.sh
+
+          FOUND_RESULT_FILE=
+          for i in {1..10}; do
+            if [ -f "$RESULT_FILENAME.json" ]; then
+              FOUND_RESULT_FILE=true
+              break
+            fi
+            echo "Waiting for result file... (attempt $i)"
+            sleep 1
+          done
+
+          if [ -z "$FOUND_RESULT_FILE" ]; then
+            echo "Run failed: Replay result $RESULT_FILENAME.json not found."
>&2 + exit 1 + fi + + - name: Process result + run: | + python3 utils/process_result_isb1.py + + - name: Upload result + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: isb1_${{ steps.launch.outputs.result_filename }} + path: agg_${{ steps.launch.outputs.result_filename }}.json + + - name: Upload raw replay result + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: replay_${{ steps.launch.outputs.result_filename }} + path: ${{ steps.launch.outputs.result_filename }}.json + if-no-files-found: ignore + + - name: Upload server logs + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: server_logs_${{ steps.launch.outputs.result_filename }} + path: server.log + if-no-files-found: ignore + + - name: Upload GPU metrics + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: gpu_metrics_${{ steps.launch.outputs.result_filename }} + path: gpu_metrics.csv + if-no-files-found: ignore + + - name: Upload KV metrics + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: kv_metrics_${{ steps.launch.outputs.result_filename }} + path: kv_metrics.csv + if-no-files-found: ignore + + - name: Resource cleanup (post-run) + if: always() + run: *resource-cleanup diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index 353918609..6582914ca 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -29,6 +29,7 @@ jobs: pattern: ${{ inputs.result-prefix && format('{0}_*', inputs.result-prefix) || '*' }} - name: Print summary + if: inputs.result-prefix != 'isb1' run: | pip install tabulate python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY @@ -38,8 +39,29 @@ jobs: pip install tabulate python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }} + - name: ISB1 operator summary + if: inputs.result-prefix == 'isb1' + run: | + pip install tabulate + python3 utils/summarize_isb1.py results/ >> $GITHUB_STEP_SUMMARY + + - name: ISB1 gate report + if: inputs.result-prefix == 'isb1' + run: | + AGGREGATE_PATH="agg_${{ inputs.result-prefix }}.json" + python3 utils/gate_isb1.py "$AGGREGATE_PATH" | tee isb1_gate_report.json + python3 utils/gate_isb1.py "$AGGREGATE_PATH" --format markdown >> $GITHUB_STEP_SUMMARY + - name: Upload aggregated results uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: results_${{ inputs.result-prefix || 'all' }} path: agg_${{ inputs.result-prefix || 'all' }}.json + + - name: Upload ISB1 gate report + if: inputs.result-prefix == 'isb1' + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: isb1_gate_report + path: isb1_gate_report.json + if-no-files-found: ignore diff --git a/.github/workflows/run-isb1-kv-stress-sweep.yml b/.github/workflows/run-isb1-kv-stress-sweep.yml new file mode 100644 index 000000000..f72ef3307 --- /dev/null +++ b/.github/workflows/run-isb1-kv-stress-sweep.yml @@ -0,0 +1,110 @@ +name: Run ISB1 KV Stress Sweep +run-name: ISB1 KV Stress - ${{ github.event.inputs.config-file || '.github/configs/isb1-kv-stress.yaml' }} + +on: + workflow_dispatch: + inputs: + config-file: + description: ISB1 KV stress config file path + required: true + default: .github/configs/isb1-kv-stress.yaml + runner-type: + description: Optional space-separated 
runner filters (e.g. h200 b200) + required: false + default: '' + runner-config: + description: Runner config YAML + required: false + default: .github/configs/runners.yaml + ref: + description: Git ref to checkout + required: false + default: '' + +jobs: + setup: + runs-on: ubuntu-latest + outputs: + kv-stress-matrix: ${{ steps.generate.outputs.kv-stress-matrix }} + has-matrix: ${{ steps.generate.outputs.has-matrix }} + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + ref: ${{ inputs.ref || github.ref }} + + - name: Install dependencies + run: pip install pydantic pyyaml + + - id: generate + env: + CONFIG_FILE: ${{ inputs.config-file }} + RUNNER_CONFIG: ${{ inputs.runner-config }} + RUNNER_TYPE: ${{ inputs.runner-type }} + run: | + if [ ! -f "$CONFIG_FILE" ]; then + echo "Missing ISB1 KV stress config file: $CONFIG_FILE" >&2 + exit 1 + fi + + cmd=(python3 utils/matrix_logic/generate_sweep_configs.py isb1-kv-stress-sweep --config-files "$CONFIG_FILE" --runner-config "$RUNNER_CONFIG") + + if [ -n "$RUNNER_TYPE" ]; then + read -r -a runner_types <<< "$RUNNER_TYPE" + cmd+=(--runner-type "${runner_types[@]}") + fi + + matrix_json="$("${cmd[@]}")" + compact_matrix="$(printf '%s' "$matrix_json" | python3 -c 'import json,sys; print(json.dumps(json.load(sys.stdin)))')" + has_matrix="$(printf '%s' "$compact_matrix" | python3 -c 'import json,sys; print("true" if json.load(sys.stdin) else "false")')" + + { + echo "kv-stress-matrix=$compact_matrix" + echo "has-matrix=$has_matrix" + } >> "$GITHUB_OUTPUT" + + sweep: + needs: setup + if: ${{ needs.setup.outputs.has-matrix == 'true' }} + uses: ./.github/workflows/benchmark-isb1-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.setup.outputs.kv-stress-matrix) }} + secrets: inherit + with: + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + precision: ${{ matrix.config.precision }} + framework: ${{ matrix.config.framework }} + exp-name: ${{ matrix.config.exp-name }} + benchmark-type: ${{ matrix.config.benchmark-type }} + export-file: ${{ matrix.config.export-file }} + runtime-stack-id: ${{ matrix.config.runtime-stack-id }} + hardware-profile-id: ${{ matrix.config.hardware-profile-id }} + canonical-model-id: ${{ matrix.config.canonical-model-id }} + support-status: ${{ matrix.config.support-status || '' }} + request-mode: ${{ matrix.config.request-mode }} + max-concurrency: ${{ matrix.config.max-concurrency }} + max-model-len: ${{ matrix.config.max-model-len || '' }} + tp-override: ${{ matrix.config.tp || '' }} + ep-override: ${{ matrix.config.ep || '' }} + trace-source: ${{ matrix.config.trace-source || '' }} + offload-mode: ${{ matrix.config.offload-mode }} + kv-cache-dtype: ${{ matrix.config.kv-cache-dtype }} + disable-prefix-caching: ${{ matrix.config.disable-prefix-caching }} + benchmark-duration-s: ${{ matrix.config.benchmark-duration-s }} + workload-type: ${{ matrix.config.workload-type }} + ref: ${{ inputs.ref || github.ref }} + + collect-results: + needs: [setup, sweep] + if: ${{ always() && needs.setup.outputs.has-matrix == 'true' && needs.sweep.result != 'skipped' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + result-prefix: isb1 diff --git a/.github/workflows/run-isb1-sweep.yml b/.github/workflows/run-isb1-sweep.yml new file mode 100644 index 000000000..a8f3177de 
--- /dev/null +++ b/.github/workflows/run-isb1-sweep.yml @@ -0,0 +1,256 @@ +name: Run ISB1 Sweep +run-name: ISB1 Sweep - ${{ github.event.inputs.config-files || '.github/configs/isb1-master.yaml' }} + +on: + workflow_dispatch: + inputs: + config-files: + description: Space-separated ISB1 config file paths + required: true + default: .github/configs/isb1-master.yaml + runner-config: + description: Runner config YAML + required: false + default: .github/configs/runners.yaml + model-prefix: + description: Optional space-separated model-prefix filters + required: false + default: '' + precision: + description: Optional space-separated precision filters + required: false + default: '' + framework: + description: Optional space-separated framework filters + required: false + default: '' + runner-type: + description: Optional space-separated runner filters + required: false + default: '' + runner-node-filter: + description: Optional runner-node substring filter + required: false + default: '' + max-concurrency: + description: Optional cap applied to replay max-concurrency + required: false + default: '' + vllm-cpu-offload-gb: + description: Optional vLLM CPU offload budget in GB for long-context runs + required: false + default: '' + vllm-swap-space-gb: + description: Optional vLLM swap-space budget in GB for long-context runs + required: false + default: '' + sglang-mem-fraction-override: + description: Optional SGLang mem-fraction-static override for long-context runs + required: false + default: '' + sglang-chunked-prefill-override: + description: Optional SGLang chunked-prefill-size override for long-context runs + required: false + default: '' + ref: + description: Git ref to checkout + required: false + default: '' + +jobs: + setup: + runs-on: ubuntu-latest + outputs: + replay-matrix: ${{ steps.generate.outputs.replay-matrix }} + has-matrix: ${{ steps.generate.outputs.has-matrix }} + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + ref: ${{ inputs.ref || github.ref }} + + - name: Install dependencies + run: pip install pydantic pyyaml + + - id: generate + env: + CONFIG_FILES: ${{ inputs.config-files }} + RUNNER_CONFIG: ${{ inputs.runner-config }} + MODEL_PREFIX: ${{ inputs.model-prefix }} + PRECISION: ${{ inputs.precision }} + FRAMEWORK: ${{ inputs.framework }} + RUNNER_TYPE: ${{ inputs.runner-type }} + RUNNER_NODE_FILTER: ${{ inputs.runner-node-filter }} + MAX_CONCURRENCY: ${{ inputs.max-concurrency }} + run: | + read -r -a config_files <<< "$CONFIG_FILES" + + for config_file in "${config_files[@]}"; do + if [ ! -f "$config_file" ]; then + echo "Missing ISB1 config file: $config_file" >&2 + echo "PR1b adds the workflow lane only; the committed config arrives in PR2." 
>&2 + exit 1 + fi + done + + cmd=(python3 utils/matrix_logic/generate_sweep_configs.py isb1-sweep --config-files "${config_files[@]}" --runner-config "$RUNNER_CONFIG") + + if [ -n "$MODEL_PREFIX" ]; then + read -r -a model_prefixes <<< "$MODEL_PREFIX" + cmd+=(--model-prefix "${model_prefixes[@]}") + fi + if [ -n "$PRECISION" ]; then + read -r -a precisions <<< "$PRECISION" + cmd+=(--precision "${precisions[@]}") + fi + if [ -n "$FRAMEWORK" ]; then + read -r -a frameworks <<< "$FRAMEWORK" + cmd+=(--framework "${frameworks[@]}") + fi + if [ -n "$RUNNER_TYPE" ]; then + read -r -a runner_types <<< "$RUNNER_TYPE" + cmd+=(--runner-type "${runner_types[@]}") + fi + if [ -n "$RUNNER_NODE_FILTER" ]; then + cmd+=(--runner-node-filter "$RUNNER_NODE_FILTER") + fi + if [ -n "$MAX_CONCURRENCY" ]; then + cmd+=(--max-concurrency "$MAX_CONCURRENCY") + fi + + matrix_json="$("${cmd[@]}")" + compact_matrix="$(printf '%s' "$matrix_json" | python3 -c 'import json,sys; print(json.dumps(json.load(sys.stdin)))')" + has_matrix="$(printf '%s' "$compact_matrix" | python3 -c 'import json,sys; print("true" if json.load(sys.stdin) else "false")')" + + { + echo "replay-matrix=$compact_matrix" + echo "has-matrix=$has_matrix" + } >> "$GITHUB_OUTPUT" + + - name: Write ISB1 preflight run manifest + env: + REPLAY_MATRIX: ${{ steps.generate.outputs.replay-matrix }} + HAS_MATRIX: ${{ steps.generate.outputs.has-matrix }} + INPUT_CONFIG_FILES: ${{ inputs.config-files }} + INPUT_RUNNER_CONFIG: ${{ inputs.runner-config }} + INPUT_MODEL_PREFIX: ${{ inputs.model-prefix }} + INPUT_PRECISION: ${{ inputs.precision }} + INPUT_FRAMEWORK: ${{ inputs.framework }} + INPUT_RUNNER_TYPE: ${{ inputs.runner-type }} + INPUT_RUNNER_NODE_FILTER: ${{ inputs.runner-node-filter }} + INPUT_MAX_CONCURRENCY: ${{ inputs.max-concurrency }} + INPUT_VLLM_CPU_OFFLOAD_GB: ${{ inputs.vllm-cpu-offload-gb }} + INPUT_VLLM_SWAP_SPACE_GB: ${{ inputs.vllm-swap-space-gb }} + INPUT_SGLANG_MEM_FRACTION_OVERRIDE: ${{ inputs.sglang-mem-fraction-override }} + INPUT_SGLANG_CHUNKED_PREFILL_OVERRIDE: ${{ inputs.sglang-chunked-prefill-override }} + INPUT_REF: ${{ inputs.ref || github.ref }} + WORKFLOW_RUN_ID: ${{ github.run_id }} + WORKFLOW_RUN_ATTEMPT: ${{ github.run_attempt }} + WORKFLOW_SHA: ${{ github.sha }} + run: | + python3 - <<'PY' + import json + import os + from collections import Counter + + matrix_rows = json.loads(os.environ.get("REPLAY_MATRIX") or "[]") + + def count_by(field: str) -> dict[str, int]: + values = [row.get(field) for row in matrix_rows] + normalized = ["" if value is None else str(value) for value in values] + return dict(sorted(Counter(normalized).items())) + + manifest = { + "dispatch_inputs": { + "config-files": os.environ.get("INPUT_CONFIG_FILES", ""), + "runner-config": os.environ.get("INPUT_RUNNER_CONFIG", ""), + "model-prefix": os.environ.get("INPUT_MODEL_PREFIX", ""), + "precision": os.environ.get("INPUT_PRECISION", ""), + "framework": os.environ.get("INPUT_FRAMEWORK", ""), + "runner-type": os.environ.get("INPUT_RUNNER_TYPE", ""), + "runner-node-filter": os.environ.get("INPUT_RUNNER_NODE_FILTER", ""), + "max-concurrency": os.environ.get("INPUT_MAX_CONCURRENCY", ""), + "vllm-cpu-offload-gb": os.environ.get("INPUT_VLLM_CPU_OFFLOAD_GB", ""), + "vllm-swap-space-gb": os.environ.get("INPUT_VLLM_SWAP_SPACE_GB", ""), + "sglang-mem-fraction-override": os.environ.get("INPUT_SGLANG_MEM_FRACTION_OVERRIDE", ""), + "sglang-chunked-prefill-override": os.environ.get("INPUT_SGLANG_CHUNKED_PREFILL_OVERRIDE", ""), + "ref": os.environ.get("INPUT_REF", ""), + }, 
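+            # matrix_summary gives reviewers a cheap sanity check on sweep
+            # fan-out (cells per model prefix / framework / runner) before any
+            # GPU runner picks up a job.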
+ "matrix_summary": { + "has_matrix": os.environ.get("HAS_MATRIX", "false"), + "total_cells": len(matrix_rows), + "by_model_prefix": count_by("model-prefix"), + "by_framework": count_by("framework"), + "by_runner": count_by("runner"), + "by_support_status": count_by("support-status"), + }, + "workflow_context": { + "run_id": os.environ.get("WORKFLOW_RUN_ID", ""), + "run_attempt": os.environ.get("WORKFLOW_RUN_ATTEMPT", ""), + "sha": os.environ.get("WORKFLOW_SHA", ""), + }, + "matrix_rows": matrix_rows, + } + + with open("isb1_run_manifest.json", "w", encoding="utf-8") as fh: + json.dump(manifest, fh, indent=2, sort_keys=True) + PY + + - name: Upload ISB1 run manifest + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: isb1_run_manifest + path: isb1_run_manifest.json + if-no-files-found: error + + sweep: + needs: setup + if: ${{ needs.setup.outputs.has-matrix == 'true' }} + uses: ./.github/workflows/benchmark-isb1-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.setup.outputs.replay-matrix) }} + secrets: inherit + with: + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + precision: ${{ matrix.config.precision }} + framework: ${{ matrix.config.framework }} + exp-name: ${{ matrix.config.exp-name }} + benchmark-type: ${{ matrix.config.benchmark-type }} + export-file: ${{ matrix.config.export-file }} + runtime-stack-id: ${{ matrix.config.runtime-stack-id }} + hardware-profile-id: ${{ matrix.config.hardware-profile-id }} + canonical-model-id: ${{ matrix.config.canonical-model-id }} + support-status: ${{ matrix.config.support-status || '' }} + request-mode: ${{ matrix.config.request-mode }} + max-concurrency: ${{ matrix.config.max-concurrency }} + max-sessions: ${{ matrix.config.max-sessions || '' }} + max-turns-per-session: ${{ matrix.config.max-turns-per-session || '' }} + max-output-len: ${{ matrix.config.max-output-len || '' }} + num-warmup-sessions: ${{ matrix.config.num-warmup-sessions || '0' }} + ignore-waits: ${{ matrix.config.ignore-waits || false }} + ignore-eos: ${{ matrix.config.ignore-eos || false }} + max-model-len: ${{ matrix.config.max-model-len || '' }} + offload-mode: ${{ matrix.config.offload-mode || '' }} + kv-cache-dtype: ${{ matrix.config.kv-cache-dtype || '' }} + disable-prefix-caching: ${{ matrix.config.disable-prefix-caching || false }} + benchmark-duration-s: ${{ matrix.config.benchmark-duration-s || '' }} + vllm-cpu-offload-gb: ${{ inputs.vllm-cpu-offload-gb || '' }} + vllm-swap-space-gb: ${{ inputs.vllm-swap-space-gb || '' }} + sglang-mem-fraction-override: ${{ inputs.sglang-mem-fraction-override || '' }} + sglang-chunked-prefill-override: ${{ inputs.sglang-chunked-prefill-override || '' }} + ref: ${{ inputs.ref || github.ref }} + + collect-results: + needs: [setup, sweep] + if: ${{ always() && needs.setup.outputs.has-matrix == 'true' && needs.sweep.result != 'skipped' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + result-prefix: isb1 diff --git a/.gitignore b/.gitignore index 03d36472a..1b87019c5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ **/__pycache__/** -**/.coverage \ No newline at end of file +**/.coverage +**/.DS_Store +prompt-exports/ +.claude \ No newline at end of file diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 535313252..ea35df323 100644 --- a/benchmarks/benchmark_lib.sh +++ 
b/benchmarks/benchmark_lib.sh @@ -66,6 +66,304 @@ stop_gpu_monitor() { GPU_MONITOR_PID="" } +KV_METRICS_PID="" +KV_METRICS_CSV="/workspace/kv_metrics.csv" +VLLM_OFFLOAD_EXTRA_ARGS="" +VLLM_EXTRA_ARGS="" +SGLANG_EXTRA_ARGS="" + +build_yarn_override_json() { + local max_model_len="${1:?}" + local factor="2.0" + if (( max_model_len > 600000 )); then + factor="4.0" + fi + echo "{\"text_config\":{\"rope_parameters\":{\"mrope_interleaved\":true,\"mrope_section\":[11,11,10],\"rope_type\":\"yarn\",\"rope_theta\":10000000,\"partial_rotary_factor\":0.25,\"factor\":${factor},\"original_max_position_embeddings\":262144}}}" +} + +apply_yarn_config_if_needed() { + local model="${1:?}" + local max_model_len="${2:?}" + if [[ "$model" == *"Qwen3.5"* || "$model" == *"qwen3.5"* || "$model" == *"Qwen3_5"* ]] && (( max_model_len > 262144 )); then + YARN_OVERRIDE_JSON=$(build_yarn_override_json "$max_model_len") + export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 + export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 + echo "YaRN enabled: factor=$(echo "$YARN_OVERRIDE_JSON" | grep -o '"factor":[0-9.]*' | cut -d: -f2) for max-model-len=$max_model_len" + fi +} + +_append_config_kv_once() { + local key="$1" + local value="$2" + + if [[ ! -f config.yaml ]]; then + return 0 + fi + + if ! grep -Eq "^${key}:" config.yaml; then + echo "${key}: ${value}" >> config.yaml + fi +} + +_remove_config_kv() { + local key="$1" + + if [[ ! -f config.yaml ]]; then + return 0 + fi + + local tmp_file + tmp_file=$(mktemp) + grep -Ev "^${key}:" config.yaml > "$tmp_file" + mv "$tmp_file" config.yaml +} + +_detect_total_cpu_dram_gb() { + if [[ -n "${TOTAL_CPU_DRAM_GB:-}" ]]; then + echo "${TOTAL_CPU_DRAM_GB}" + return 0 + fi + + if [[ -f /proc/meminfo ]]; then + awk '/MemTotal/{printf "%.0f", $2/1048576}' /proc/meminfo + return 0 + fi + + if command -v sysctl >/dev/null 2>&1; then + local mem_bytes + mem_bytes=$(sysctl -n hw.memsize 2>/dev/null || echo "") + if [[ -n "$mem_bytes" ]]; then + awk -v bytes="$mem_bytes" 'BEGIN {printf "%.0f", bytes/1073741824}' + return 0 + fi + fi + + echo "64" +} + +apply_vllm_offload_config() { + local mode="${OFFLOAD_MODE:-legacy}" + local detected_dram_gb="" + + VLLM_OFFLOAD_EXTRA_ARGS="" + VLLM_EXTRA_ARGS="" + + case "$mode" in + on) + PREFIX_CACHING_CONFIG="" + _remove_config_kv "no-enable-prefix-caching" + _remove_config_kv "cpu-offload-gb" + _remove_config_kv "swap-space" + detected_dram_gb="$(_detect_total_cpu_dram_gb)" + VLLM_OFFLOAD_EXTRA_ARGS="--kv_offloading_backend native --kv_offloading_size ${detected_dram_gb} --disable-hybrid-kv-cache-manager" + ;; + off) + PREFIX_CACHING_CONFIG="" + _remove_config_kv "no-enable-prefix-caching" + _remove_config_kv "cpu-offload-gb" + _remove_config_kv "swap-space" + ;; + noprefix) + PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" + _remove_config_kv "cpu-offload-gb" + _remove_config_kv "swap-space" + _append_config_kv_once "no-enable-prefix-caching" "true" + ;; + legacy|"") + if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + _append_config_kv_once "cpu-offload-gb" "${VLLM_CPU_OFFLOAD_GB}" + fi + if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + _append_config_kv_once "swap-space" "${VLLM_SWAP_SPACE_GB}" + fi + ;; + *) + echo "WARN: Unknown OFFLOAD_MODE='${mode}', falling back to legacy behavior" >&2 + if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + _append_config_kv_once "cpu-offload-gb" "${VLLM_CPU_OFFLOAD_GB}" + fi + if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + _append_config_kv_once "swap-space" "${VLLM_SWAP_SPACE_GB}" + fi + ;; + esac + + if [[ 
"${DISABLE_PREFIX_CACHING:-false}" == "true" ]]; then + PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" + _append_config_kv_once "no-enable-prefix-caching" "true" + fi + + if [[ "${KV_CACHE_DTYPE:-}" == "fp8" ]]; then + _append_config_kv_once "kv-cache-dtype" "fp8" + fi + + if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then + VLLM_EXTRA_ARGS="${VLLM_EXTRA_ARGS:-} --hf-overrides '${YARN_OVERRIDE_JSON}'" + fi +} + +apply_sglang_offload_config() { + local mode="${OFFLOAD_MODE:-legacy}" + + SGLANG_EXTRA_ARGS="" + + case "$mode" in + on) + echo "WARN: OFFLOAD_MODE=on requested for SGLang, but native KV offload is not supported. Leaving cache mode unchanged." >&2 + ;; + off) + RADIX_CACHE_ARGS="" + ;; + noprefix) + RADIX_CACHE_ARGS="--disable-radix-cache" + ;; + legacy|"") + ;; + *) + echo "WARN: Unknown OFFLOAD_MODE='${mode}' for SGLang; leaving radix cache args unchanged." >&2 + ;; + esac + + if [[ "${DISABLE_PREFIX_CACHING:-false}" == "true" ]]; then + RADIX_CACHE_ARGS="--disable-radix-cache" + fi + + if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then + SGLANG_EXTRA_ARGS="${SGLANG_EXTRA_ARGS:-} --json-model-override-args '${YARN_OVERRIDE_JSON}'" + fi +} + +# launch_vllm_server [extra args...] +# Sets: SERVER_PID, SERVER_LOG +launch_vllm_server() { + local model="$1" + local port="$2" + local config_yaml_path="$3" + shift 3 || true + local extra_args=("$@") + + if [[ -z "$model" || -z "$port" || -z "$config_yaml_path" ]]; then + echo "launch_vllm_server requires: model port config_yaml_path" >&2 + return 1 + fi + + hf download "$model" + apply_vllm_offload_config + + SERVER_LOG="${SERVER_LOG:-/workspace/server.log}" + + local vllm_max_num_seqs="${VLLM_MAX_NUM_SEQS:-}" + if [[ -z "$vllm_max_num_seqs" ]]; then + local conc_value="${CONC:-256}" + if [[ "$conc_value" =~ ^[0-9]+$ ]] && (( conc_value > 256 )); then + vllm_max_num_seqs="$conc_value" + else + vllm_max_num_seqs="256" + fi + fi + + local vllm_tp="${TP:-1}" + local vllm_gpu_mem_util="${VLLM_GPU_MEMORY_UTILIZATION:-0.9}" + + local offload_args=() + if [[ -n "$VLLM_OFFLOAD_EXTRA_ARGS" ]]; then + # shellcheck disable=SC2206 + offload_args=($VLLM_OFFLOAD_EXTRA_ARGS) + fi + + PYTHONNOUSERSITE=1 vllm serve "$model" --host 0.0.0.0 --port "$port" \ + --config "$config_yaml_path" \ + --gpu-memory-utilization "$vllm_gpu_mem_util" \ + --tensor-parallel-size "$vllm_tp" \ + --max-num-seqs "$vllm_max_num_seqs" \ + "${extra_args[@]}" \ + "${offload_args[@]}" \ + > "$SERVER_LOG" 2>&1 & + + SERVER_PID=$! + export SERVER_PID + export SERVER_LOG +} + +# launch_sglang_server [extra args...] +# Sets: SERVER_PID, SERVER_LOG +launch_sglang_server() { + local model="$1" + local port="$2" + shift 2 || true + local extra_args=("$@") + + if [[ -z "$model" || -z "$port" ]]; then + echo "launch_sglang_server requires: model port" >&2 + return 1 + fi + + hf download "$model" + if [[ -n "${OFFLOAD_MODE:-}" || "${DISABLE_PREFIX_CACHING:-false}" == "true" ]]; then + apply_sglang_offload_config + fi + + SERVER_LOG="${SERVER_LOG:-/workspace/server.log}" + + local sglang_tp="${TP:-1}" + local sglang_dp="${DP_SIZE:-1}" + + PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ + --model-path "$model" \ + --host 0.0.0.0 \ + --port "$port" \ + --tensor-parallel-size "$sglang_tp" \ + --data-parallel-size "$sglang_dp" \ + "${extra_args[@]}" \ + > "$SERVER_LOG" 2>&1 & + + SERVER_PID=$! 
+ export SERVER_PID + export SERVER_LOG +} + +start_kv_metrics_collector() { + local port="${1:-8888}" + local output="${2:-$KV_METRICS_CSV}" + local interval="${3:-2.0}" + local collector_script + + collector_script="$(cd "$(dirname "${BASH_SOURCE[0]}")/../datasets/isb1/scripts" && pwd)/metrics_collector.py" + + if [[ ! -f "$collector_script" ]]; then + echo "[KV Metrics] Collector script not found at $collector_script, skipping" + return 0 + fi + + if [[ -n "$KV_METRICS_PID" ]] && kill -0 "$KV_METRICS_PID" 2>/dev/null; then + echo "[KV Metrics] Collector already running (PID=$KV_METRICS_PID)" + return 0 + fi + + KV_METRICS_CSV="$output" + python3 "$collector_script" \ + --metrics-url "http://0.0.0.0:${port}/metrics" \ + --output "$output" \ + --interval "$interval" >/tmp/kv_metrics_collector.log 2>&1 & + KV_METRICS_PID=$! + + echo "[KV Metrics] Started (PID=$KV_METRICS_PID, interval=${interval}s, output=$output)" +} + +stop_kv_metrics_collector() { + if [[ -n "$KV_METRICS_PID" ]] && kill -0 "$KV_METRICS_PID" 2>/dev/null; then + kill "$KV_METRICS_PID" 2>/dev/null || true + wait "$KV_METRICS_PID" 2>/dev/null || true + echo "[KV Metrics] Stopped (PID=$KV_METRICS_PID)" + if [[ -f "$KV_METRICS_CSV" ]]; then + local lines + lines=$(wc -l < "$KV_METRICS_CSV") + echo "[KV Metrics] Collected $lines rows -> $KV_METRICS_CSV" + fi + fi + KV_METRICS_PID="" +} + # Check if required environment variables are set # Usage: check_env_vars VAR1 VAR2 VAR3 ... # Exits with code 1 if any variable is not set @@ -395,6 +693,194 @@ run_benchmark_serving() { return $benchmark_exit_code } +is_isb1_replay_benchmark() { + [[ "${BENCHMARK_TYPE:-}" == "isb1_replay" ]] +} + +is_isb1_kv_stress_benchmark() { + [[ "${BENCHMARK_TYPE:-}" == "isb1_kv_stress" ]] +} + +resolve_replay_request_mode_for_harness() { + local requested_mode="${1:-auto}" + + case "$requested_mode" in + ""|auto|chat|completions) + printf '%s' "${requested_mode:-auto}" + ;; + multi-turn|multi_turn|multiturn) + printf 'auto' + ;; + *) + echo "WARN: Unsupported replay request mode '$requested_mode'; using 'auto' for the harness boundary" >&2 + printf 'auto' + ;; + esac +} + +run_isb1_kv_stress_campaign_cell() { + check_env_vars \ + BENCHMARK_TYPE \ + EXPORT_FILE \ + MAX_CONCURRENCY \ + OFFLOAD_MODE \ + BENCHMARK_DURATION_S \ + KV_CACHE_DTYPE \ + WORKLOAD_TYPE + + if ! is_isb1_kv_stress_benchmark; then + echo "Error: run_isb1_kv_stress_campaign_cell called with BENCHMARK_TYPE='${BENCHMARK_TYPE:-}'" >&2 + return 1 + fi + + local port="${PORT:-8888}" + local kv_metrics_output="/workspace/kv_metrics.csv" + local metadata_path="/workspace/kv_stress_campaign_metadata.json" + local replay_exit_code=0 + + start_gpu_monitor + start_kv_metrics_collector "$port" "$kv_metrics_output" 2.0 + + run_benchmark_export_replay "$@" || replay_exit_code=$? 
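+
+    # Both collectors are stopped before the metadata snapshot below, so the
+    # row counts logged by stop_kv_metrics_collector describe the same
+    # finished run that the metadata file points at.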
+ + stop_kv_metrics_collector + stop_gpu_monitor + + python3 - <<'PY' +import json +import os +import time + +metadata = { + "benchmark_type": os.getenv("BENCHMARK_TYPE", ""), + "export_file": os.getenv("EXPORT_FILE", ""), + "runtime_stack_id": os.getenv("RUNTIME_STACK_ID", ""), + "hardware_profile_id": os.getenv("HARDWARE_PROFILE_ID", ""), + "canonical_model_id": os.getenv("CANONICAL_MODEL_ID", ""), + "request_mode": os.getenv("REQUEST_MODE", ""), + "max_concurrency": os.getenv("MAX_CONCURRENCY", ""), + "offload_mode": os.getenv("OFFLOAD_MODE", ""), + "disable_prefix_caching": os.getenv("DISABLE_PREFIX_CACHING", ""), + "kv_cache_dtype": os.getenv("KV_CACHE_DTYPE", ""), + "benchmark_duration_s": os.getenv("BENCHMARK_DURATION_S", ""), + "workload_type": os.getenv("WORKLOAD_TYPE", ""), + "metrics_files": { + "gpu": "/workspace/gpu_metrics.csv", + "kv": "/workspace/kv_metrics.csv", + }, + "captured_at_epoch_s": int(time.time()), +} +with open("/workspace/kv_stress_campaign_metadata.json", "w", encoding="utf-8") as f: + json.dump(metadata, f, indent=2, sort_keys=True) +PY + + echo "[KV Stress] Campaign metadata written to $metadata_path" + return "$replay_exit_code" +} + +run_single_node_benchmark() { + if ! is_isb1_replay_benchmark && ! is_isb1_kv_stress_benchmark; then + run_benchmark_serving "$@" + return $? + fi + + set +x + local model="" + local port="" + local result_filename="" + local result_dir="" + local workspace_dir="" + local trust_remote_code=false + local server_pid="" + + while [[ $# -gt 0 ]]; do + case $1 in + --model) model="$2"; shift 2 ;; + --port) port="$2"; shift 2 ;; + --result-filename) result_filename="$2"; shift 2 ;; + --result-dir) result_dir="$2"; shift 2 ;; + --bench-serving-dir) workspace_dir="$2"; shift 2 ;; + --trust-remote-code) trust_remote_code=true; shift ;; + --server-pid) server_pid="$2"; shift 2 ;; + --backend|--input-len|--output-len|--random-range-ratio|--num-prompts|--max-concurrency) + shift 2 + ;; + --use-chat-template) + shift + ;; + *) + echo "Unknown parameter: $1" + return 1 + ;; + esac + done + + if [[ -z "$model" ]]; then + echo "Error: --model is required" + return 1 + fi + if [[ -z "$port" ]]; then + echo "Error: --port is required" + return 1 + fi + if [[ -z "$result_filename" ]]; then + echo "Error: --result-filename is required" + return 1 + fi + if [[ -z "$result_dir" ]]; then + echo "Error: --result-dir is required" + return 1 + fi + + local replay_args=( + --model "$model" + --port "$port" + --export-file "${EXPORT_FILE}" + --runtime-stack-id "${RUNTIME_STACK_ID}" + --hardware-profile-id "${HARDWARE_PROFILE_ID}" + --canonical-model-id "${CANONICAL_MODEL_ID}" + --request-mode "${REQUEST_MODE:-auto}" + --max-concurrency "${MAX_CONCURRENCY}" + --num-warmup-sessions "${NUM_WARMUP_SESSIONS:-0}" + --result-filename "$result_filename" + --result-dir "$result_dir" + ) + + if [[ -n "$workspace_dir" ]]; then + replay_args+=(--bench-serving-dir "$workspace_dir") + fi + if [[ -n "${MAX_SESSIONS:-}" ]]; then + replay_args+=(--max-sessions "${MAX_SESSIONS}") + fi + if [[ -n "${SUPPORT_STATUS:-}" ]]; then + replay_args+=(--support-status "${SUPPORT_STATUS}") + fi + if [[ -n "${MAX_TURNS_PER_SESSION:-}" ]]; then + replay_args+=(--max-turns-per-session "${MAX_TURNS_PER_SESSION}") + fi + if [[ -n "${MAX_OUTPUT_LEN:-}" ]]; then + replay_args+=(--max-output-len "${MAX_OUTPUT_LEN}") + fi + if [[ "${IGNORE_WAITS:-false}" == "true" ]]; then + replay_args+=(--ignore-waits) + fi + if [[ "${IGNORE_EOS:-false}" == "true" ]]; then + replay_args+=(--ignore-eos) + 
fi
+    if [[ "$trust_remote_code" == true ]]; then
+        replay_args+=(--trust-remote-code)
+    fi
+    if [[ -n "$server_pid" ]]; then
+        replay_args+=(--server-pid "$server_pid")
+    fi
+
+    if is_isb1_kv_stress_benchmark; then
+        run_isb1_kv_stress_campaign_cell "${replay_args[@]}"
+    else
+        run_benchmark_export_replay "${replay_args[@]}"
+    fi
+}
+
 
 # --------------------------------
 # Profiling trace helpers
@@ -805,3 +1291,215 @@ run_eval() {
     fi
     return $eval_rc
 }
+
+
+# ---------------------------------------------------------------------------
+# Export replay benchmark wrapper (multi-turn)
+# ---------------------------------------------------------------------------
+
+# Replay ISB1 export sessions/events against a running server. Replaying
+# multi-turn sessions exercises a growing KV cache across conversation turns.
+#
+# IMPORTANT: The server MUST be started with prefix/radix caching ENABLED
+# for meaningful multi-turn results. Do NOT use --disable-radix-cache or
+# --no-enable-prefix-caching with multi-turn benchmarks.
+#
+# Supports:
+#   - inferencex_multiturn exports via /v1/chat/completions (standalone vLLM/SGLang)
+#   - inferencex_trace_replay exports via either chat or projected completions
+#     mode (useful for TRT / Dynamo-style cells)
+#
+# Parameters:
+#   --model: Model name sent to the target server
+#   --port: Server port
+#   --export-file: Path to export JSON
+#   --runtime-stack-id: Filter selected export cells to one runtime stack
+#   --hardware-profile-id: Filter selected export cells to one hardware row
+#   --canonical-model-id: Filter selected export cells to one canonical model row
+#   --trace-id: Optional filter down to a single trace
+#   --support-status: Optional support-status label recorded with results
+#   --request-mode: auto|chat|completions (default: auto)
+#   --max-concurrency: Max concurrent replay sessions
+#   --num-warmup-sessions: Warmup sessions before measurement
+#   --result-filename: Result filename without extension
+#   --result-dir: Result directory
+#   --bench-serving-dir: Directory containing the replay harness (default: cwd)
+#   --max-sessions: Optional session limit for smoke runs
+#   --max-turns-per-session: Optional turn cap for smoke runs
+#   --max-output-len: Optional per-turn output cap
+#   --ignore-waits: Ignore inter-turn wait gaps from export metadata
+#   --ignore-eos: Keep generating past EOS up to the output cap
+#   --trust-remote-code: Optional flag
+#   --server-pid: Optional server process ID to monitor
+run_benchmark_export_replay() {
+    set +x
+    local model=""
+    local port=""
+    local export_file=""
+    local runtime_stack_id=""
+    local hardware_profile_id=""
+    local canonical_model_id=""
+    local trace_id=""
+    local support_status=""
+    local request_mode="auto"
+    local max_concurrency="8"
+    local num_warmup_sessions="1"
+    local result_filename=""
+    local result_dir=""
+    local workspace_dir=""
+    local max_sessions=""
+    local max_turns_per_session=""
+    local max_output_len=""
+    local ignore_waits=false
+    local trust_remote_code=false
+    local ignore_eos=false
+    local server_pid=""
+
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --model) model="$2"; shift 2 ;;
+            --port) port="$2"; shift 2 ;;
+            --export-file) export_file="$2"; shift 2 ;;
+            --runtime-stack-id) runtime_stack_id="$2"; shift 2 ;;
+            --hardware-profile-id) hardware_profile_id="$2"; shift 2 ;;
+            --canonical-model-id) canonical_model_id="$2"; shift 2 ;;
+            --trace-id) trace_id="$2"; shift 2 ;;
+            --support-status) support_status="$2"; shift 2 ;;
+            --request-mode) request_mode="$2"; shift 2 ;;
+            --max-concurrency) max_concurrency="$2"; shift 2 ;;
+            --num-warmup-sessions) num_warmup_sessions="$2"; shift 2 ;;
+            --result-filename) result_filename="$2"; shift 2 ;;
+            --result-dir) result_dir="$2"; shift 2 ;;
+            --bench-serving-dir) workspace_dir="$2";
shift 2 ;; + --max-sessions) max_sessions="$2"; shift 2 ;; + --max-turns-per-session) max_turns_per_session="$2"; shift 2 ;; + --max-output-len) max_output_len="$2"; shift 2 ;; + --ignore-waits) ignore_waits=true; shift ;; + --trust-remote-code) trust_remote_code=true; shift ;; + --ignore-eos) ignore_eos=true; shift ;; + --server-pid) server_pid="$2"; shift 2 ;; + *) echo "Unknown parameter: $1"; return 1 ;; + esac + done + + if [[ -z "$model" ]]; then echo "Error: --model is required"; return 1; fi + if [[ -z "$port" ]]; then echo "Error: --port is required"; return 1; fi + if [[ -z "$export_file" ]]; then echo "Error: --export-file is required"; return 1; fi + if [[ -z "$result_filename" ]]; then echo "Error: --result-filename is required"; return 1; fi + if [[ -z "$result_dir" ]]; then echo "Error: --result-dir is required"; return 1; fi + + if [[ -z "$workspace_dir" ]]; then + workspace_dir=$(pwd) + fi + + local requested_request_mode="$request_mode" + local harness_request_mode + harness_request_mode=$(resolve_replay_request_mode_for_harness "$request_mode") + + local benchmark_cmd=( + python3 "$workspace_dir/utils/bench_serving/benchmark_export_replay.py" + --model "$model" + --base-url "http://0.0.0.0:$port" + --export-file "$export_file" + --request-mode "$harness_request_mode" + --max-concurrency "$max_concurrency" + --num-warmup-sessions "$num_warmup_sessions" + --save-result + --result-dir "$result_dir" + --result-filename "$result_filename.json" + --metadata + "benchmark_type=${BENCHMARK_TYPE:-isb1_replay}" + "export_file=$export_file" + "runtime_stack_id=$runtime_stack_id" + "hardware_profile_id=$hardware_profile_id" + "canonical_model_id=$canonical_model_id" + "request_mode=$requested_request_mode" + "harness_request_mode=$harness_request_mode" + ) + + if [[ -n "${WORKLOAD_TYPE:-}" ]]; then + benchmark_cmd+=(--metadata "workload_type=${WORKLOAD_TYPE}") + fi + if [[ -n "${BENCHMARK_DURATION_S:-}" ]]; then + benchmark_cmd+=(--metadata "benchmark_duration_s=${BENCHMARK_DURATION_S}") + fi + if [[ -n "${OFFLOAD_MODE:-}" ]]; then + benchmark_cmd+=(--metadata "offload_mode=${OFFLOAD_MODE}") + fi + if [[ -n "${KV_CACHE_DTYPE:-}" ]]; then + benchmark_cmd+=(--metadata "kv_cache_dtype=${KV_CACHE_DTYPE}") + fi + if [[ -n "${DISABLE_PREFIX_CACHING:-}" ]]; then + benchmark_cmd+=(--metadata "disable_prefix_caching=${DISABLE_PREFIX_CACHING}") + fi + + if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + benchmark_cmd+=(--metadata "vllm_cpu_offload_gb=${VLLM_CPU_OFFLOAD_GB}") + fi + if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + benchmark_cmd+=(--metadata "vllm_swap_space_gb=${VLLM_SWAP_SPACE_GB}") + fi + if [[ -n "${SGLANG_MEM_FRACTION_OVERRIDE:-}" ]]; then + benchmark_cmd+=(--metadata "sglang_mem_fraction_override=${SGLANG_MEM_FRACTION_OVERRIDE}") + fi + if [[ -n "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-}" ]]; then + benchmark_cmd+=(--metadata "sglang_chunked_prefill_override=${SGLANG_CHUNKED_PREFILL_OVERRIDE}") + fi + + if [[ -n "$runtime_stack_id" ]]; then + benchmark_cmd+=(--runtime-stack-id "$runtime_stack_id") + fi + if [[ -n "$hardware_profile_id" ]]; then + benchmark_cmd+=(--hardware-profile-id "$hardware_profile_id") + fi + if [[ -n "$canonical_model_id" ]]; then + benchmark_cmd+=(--canonical-model-id "$canonical_model_id") + fi + if [[ -n "$trace_id" ]]; then + benchmark_cmd+=(--trace-id "$trace_id") + fi + if [[ -n "$support_status" ]]; then + benchmark_cmd+=(--support-status "$support_status") + fi + if [[ -n "$max_sessions" ]]; then + benchmark_cmd+=(--max-sessions "$max_sessions") + fi + 
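+    # The optional caps below (turns per session, output length) combine with
+    # --max-sessions above to shrink a full export into a short smoke run
+    # without editing the export file itself.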
if [[ -n "$max_turns_per_session" ]]; then + benchmark_cmd+=(--max-turns-per-session "$max_turns_per_session") + fi + if [[ -n "$max_output_len" ]]; then + benchmark_cmd+=(--max-output-len "$max_output_len") + fi + if [[ "$ignore_waits" == true ]]; then + benchmark_cmd+=(--ignore-waits) + fi + if [[ "$trust_remote_code" == true ]]; then + benchmark_cmd+=(--trust-remote-code) + fi + if [[ "$ignore_eos" == true ]]; then + benchmark_cmd+=(--ignore-eos) + fi + + set -x + if [[ -n "$server_pid" ]]; then + "${benchmark_cmd[@]}" & + local benchmark_pid=$! + + while kill -0 "$benchmark_pid" 2>/dev/null; do + if ! kill -0 "$server_pid" 2>/dev/null; then + echo "ERROR: Server process $server_pid died during export replay benchmark" + kill "$benchmark_pid" 2>/dev/null + wait "$benchmark_pid" 2>/dev/null + set +x + return 1 + fi + sleep 2 + done + + wait "$benchmark_pid" + local benchmark_exit_code=$? + else + "${benchmark_cmd[@]}" + local benchmark_exit_code=$? + fi + set +x + + return $benchmark_exit_code +} diff --git a/benchmarks/single_node/dsr1_fp4_b200.sh b/benchmarks/single_node/dsr1_fp4_b200.sh index d88941628..e11290b95 100644 --- a/benchmarks/single_node/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/dsr1_fp4_b200.sh @@ -31,13 +31,26 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" -EVAL_CONTEXT_ARGS="" +RUNTIME_CONTEXT_ARGS="" +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + RUNTIME_CONTEXT_ARGS="--context-length $MAX_MODEL_LEN" +fi if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context - EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" + RUNTIME_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi set -x PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ @@ -45,7 +58,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0. --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 16384 \ --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ ---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +--enable-symm-mem $RADIX_CACHE_ARGS --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
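For reviewers who want to exercise this replay path by hand, here is a minimal, hypothetical single-cell invocation. It only uses environment variables visible in these hunks and in benchmark_lib.sh; the model/TP/concurrency values are illustrative, and the script's check_env_vars preamble (outside this hunk) may require additional variables.

```bash
# Hypothetical smoke invocation of one ISB1 replay cell (run inside the
# benchmark container). BENCHMARK_TYPE=isb1_replay keeps the radix cache
# enabled and routes run_single_node_benchmark to the export-replay harness;
# MAX_MODEL_LEN feeds --context-length.
export BENCHMARK_TYPE=isb1_replay
export EXPORT_FILE=datasets/isb1/exports/extension_131k/chat_131k1k_dsr1.json
export RUNTIME_STACK_ID=sglang HARDWARE_PROFILE_ID=b200 CANONICAL_MODEL_ID=dsr1
export MAX_CONCURRENCY=8 MAX_MODEL_LEN=140000
export MODEL=nvidia/DeepSeek-R1-FP4 TP=8 EP_SIZE=8 CONC=8 ISL=8192 OSL=1024
export RANDOM_RANGE_RATIO=1.0 RESULT_FILENAME=isb1_replay_smoke
bash benchmarks/single_node/dsr1_fp4_b200.sh
```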
@@ -54,7 +67,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -64,7 +77,8 @@ run_benchmark_serving \ --num-prompts $((CONC * 10)) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -73,5 +87,8 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/dsr1_fp8_b200.sh b/benchmarks/single_node/dsr1_fp8_b200.sh index e6d8a0e9c..0fbe9bd6c 100644 --- a/benchmarks/single_node/dsr1_fp8_b200.sh +++ b/benchmarks/single_node/dsr1_fp8_b200.sh @@ -38,9 +38,9 @@ if [[ $TP -eq 8 ]]; then MAX_RUNNING_REQUESTS=128 CUDA_GRAPH_MAX_BATCH_SIZE=128 - MEM_FRAC_STATIC=0.82 - CHUNKED_PREFILL_SIZE=32768 - MAX_PREFILL_TOKENS=32768 + MEM_FRAC_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.82}" + CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" + MAX_PREFILL_TOKENS="$CHUNKED_PREFILL_SIZE" elif [[ $TP -eq 4 ]]; then if [[ $ISL -ne 8192 ]] || [[ $OSL -ne 1024 ]]; then echo "TP=4 not yet supported for ISL=$ISL OSL=$OSL!" @@ -52,9 +52,9 @@ elif [[ $TP -eq 4 ]]; then MAX_RUNNING_REQUESTS=32 CUDA_GRAPH_MAX_BATCH_SIZE=32 - MEM_FRAC_STATIC=0.95 - CHUNKED_PREFILL_SIZE=8192 - MAX_PREFILL_TOKENS=8192 + MEM_FRAC_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.95}" + CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-8192}" + MAX_PREFILL_TOKENS="$CHUNKED_PREFILL_SIZE" SCHEDULER_RECV_INTERVAL=10 else @@ -63,21 +63,34 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" -EVAL_CONTEXT_ARGS="" +RUNTIME_CONTEXT_ARGS="" +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + RUNTIME_CONTEXT_ARGS="--context-length $MAX_MODEL_LEN" +fi if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context - EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" + RUNTIME_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi set -x PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ --tensor-parallel-size=$TP --data-parallel-size=1 \ --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC --kv-cache-dtype fp8_e4m3 --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ ---enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ ---attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +--enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL $RADIX_CACHE_ARGS \ +--attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend 
flashinfer_trtllm --quantization fp8 $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -86,7 +99,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -96,7 +109,8 @@ run_benchmark_serving \ --num-prompts "$((CONC * 10))" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -105,5 +119,8 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi stop_gpu_monitor -set +x \ No newline at end of file +set +x diff --git a/benchmarks/single_node/dsr1_fp8_b200_vllm.sh b/benchmarks/single_node/dsr1_fp8_b200_vllm.sh new file mode 100644 index 000000000..5c3639fa9 --- /dev/null +++ b/benchmarks/single_node/dsr1_fp8_b200_vllm.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark; then + PREFIX_CACHING_CONFIG="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' +$PREFIX_CACHING_CONFIG +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 +export VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
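+
+# SERVER_PID is consumed twice: wait_for_server_ready polls it during startup,
+# and run_single_node_benchmark forwards it to the replay harness so a server
+# crash mid-benchmark fails the run instead of hanging the client.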
+ +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/dsr1_fp8_h200.sh b/benchmarks/single_node/dsr1_fp8_h200.sh index c820d180b..a9730917a 100644 --- a/benchmarks/single_node/dsr1_fp8_h200.sh +++ b/benchmarks/single_node/dsr1_fp8_h200.sh @@ -23,34 +23,50 @@ PORT=${PORT:-8888} # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi export TORCH_CUDA_ARCH_LIST="9.0" -EVAL_CONTEXT_ARGS="" +RUNTIME_CONTEXT_ARGS="" +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + RUNTIME_CONTEXT_ARGS="--context-length $MAX_MODEL_LEN" +fi if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context - EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" + RUNTIME_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.82}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" set -x if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ - --disable-radix-cache --max-running-requests 512 --cuda-graph-max-bs 512 \ - --chunked-prefill-size 32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ + $RADIX_CACHE_ARGS --max-running-requests 512 --cuda-graph-max-bs 512 \ + --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens "$CHUNKED_PREFILL_SIZE" --mem-fraction-static "$MEM_FRACTION_STATIC" \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & else PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ - --disable-radix-cache --max-running-requests 256 --cuda-graph-max-bs 256 \ - --chunked-prefill-size 32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ + $RADIX_CACHE_ARGS --max-running-requests 256 --cuda-graph-max-bs 256 \ + --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens "$CHUNKED_PREFILL_SIZE" --mem-fraction-static "$MEM_FRACTION_STATIC" \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & fi SERVER_PID=$! @@ -58,7 +74,7 @@ SERVER_PID=$! 
# Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -68,7 +84,8 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -77,5 +94,8 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/dsr1_fp8_h200_vllm.sh b/benchmarks/single_node/dsr1_fp8_h200_vllm.sh new file mode 100644 index 000000000..65348e831 --- /dev/null +++ b/benchmarks/single_node/dsr1_fp8_h200_vllm.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + PREFIX_CACHING_CONFIG="" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' +$PREFIX_CACHING_CONFIG +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +apply_vllm_offload_config + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +if ! is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi + +set -x +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if ! is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi +set +x diff --git a/benchmarks/single_node/dsr1triattn_fp8_h100_vllm.sh b/benchmarks/single_node/dsr1triattn_fp8_h100_vllm.sh new file mode 100755 index 000000000..60f06b13e --- /dev/null +++ b/benchmarks/single_node/dsr1triattn_fp8_h100_vllm.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for DeepSeek-R1 FP8 on H100. 
+# +# Differences from baseline dsr1_fp8_h200_vllm.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. +TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/deepseek_r1_0528_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." +fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
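+
+# Optional sanity check (assumes the plugin logs its name on load): confirm
+# TriAttention actually initialized before trusting the numbers, e.g.
+#   grep -i triattn "$SERVER_LOG" || echo "WARN: TriAttention not detected"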
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/dsr1triattn_fp8_h200_vllm.sh b/benchmarks/single_node/dsr1triattn_fp8_h200_vllm.sh new file mode 100755 index 000000000..1c4722964 --- /dev/null +++ b/benchmarks/single_node/dsr1triattn_fp8_h200_vllm.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for DeepSeek-R1 FP8 on H200. +# +# Differences from baseline dsr1_fp8_h200_vllm.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. +TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/deepseek_r1_0528_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." 
+fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/gptoss_fp4_b200.sh b/benchmarks/single_node/gptoss_fp4_b200.sh index f6a6f72e9..95240230e 100644 --- a/benchmarks/single_node/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/gptoss_fp4_b200.sh @@ -34,15 +34,33 @@ if [ "${EVAL_ONLY}" = "true" ]; then CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark; then + PREFIX_CACHING_CONFIG="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + cat > config.yaml << EOF kv-cache-dtype: fp8 compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' -no-enable-prefix-caching: true +$PREFIX_CACHING_CONFIG max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 max-model-len: $CALCULATED_MAX_MODEL_LEN EOF +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 @@ -52,6 +70,9 @@ PORT=${PORT:-8888} # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT \ @@ -59,7 +80,8 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --gpu-memory-utilization 0.9 \ --tensor-parallel-size $TP \ --max-num-seqs 512 \ ---disable-log-requests > $SERVER_LOG 2>&1 & +--disable-log-requests $VLLM_OFFLOAD_EXTRA_ARGS \ +> 
$SERVER_LOG 2>&1 & SERVER_PID=$! @@ -68,7 +90,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -78,7 +100,8 @@ run_benchmark_serving \ --num-prompts $(( CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -87,5 +110,8 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/gptoss_fp4_b200_sglang.sh b/benchmarks/single_node/gptoss_fp4_b200_sglang.sh new file mode 100644 index 000000000..f3d9ad82c --- /dev/null +++ b/benchmarks/single_node/gptoss_fp4_b200_sglang.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export NCCL_NVLS_ENABLE=1 +export SGL_ENABLE_JIT_DEEPGEMM=false +export SGLANG_ENABLE_FLASHINFER_GEMM=true +export PYTHONUNBUFFERED=1 + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +if [[ $CONC -ge 16 ]]; then + SCHEDULER_RECV_INTERVAL=30 +else + SCHEDULER_RECV_INTERVAL=10 +fi + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path="$MODEL" --host=0.0.0.0 --port="$PORT" \ +--trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 \ +--cuda-graph-max-bs 128 --max-running-requests 128 \ +--mem-fraction-static "$MEM_FRACTION_STATIC" --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens 32768 \ +--context-length "$CONTEXT_LENGTH" --kv-cache-dtype fp8_e4m3 \ +$RADIX_CACHE_ARGS --enable-flashinfer-allreduce-fusion \ +--scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" \ +--reasoning-parser gpt-oss --tokenizer-worker-num 6 --stream-interval 30 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
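+
+# Note: --backend vllm below names the OpenAI-compatible client used by the
+# serving harness (correct even though this server is SGLang); on the ISB1
+# replay path run_single_node_benchmark drops the flag entirely.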
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_single_node_benchmark \
+  --model "$MODEL" \
+  --port "$PORT" \
+  --backend vllm \
+  --input-len "$ISL" \
+  --output-len "$OSL" \
+  --random-range-ratio "$RANDOM_RANGE_RATIO" \
+  --num-prompts "$((CONC * 10))" \
+  --max-concurrency "$CONC" \
+  --result-filename "$RESULT_FILENAME" \
+  --result-dir /workspace/ \
+  --server-pid "$SERVER_PID"
+
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT"
+  append_lm_eval_summary
+fi
+
+if [[ -n "${OFFLOAD_MODE:-}" ]]; then
+  stop_kv_metrics_collector
+fi
+stop_gpu_monitor
+set +x
diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh
index 8d0e773a2..dc5baf287 100644
--- a/benchmarks/single_node/gptoss_fp4_h100.sh
+++ b/benchmarks/single_node/gptoss_fp4_h100.sh
@@ -17,20 +17,42 @@ fi
 
 hf download "$MODEL"
 
-MAX_MODEL_LEN=10240
+if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then
+  MAX_MODEL_LEN="${MAX_MODEL_LEN}"
+else
+  MAX_MODEL_LEN=10240
+fi
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
     MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
 fi
 
+PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true"
+if is_isb1_replay_benchmark; then
+  PREFIX_CACHING_CONFIG=""
+fi
+
 cat > config.yaml << EOF
-no-enable-prefix-caching: true
+$PREFIX_CACHING_CONFIG
 max-cudagraph-capture-size: 2048
 max-num-batched-tokens: 8192
 max-model-len: $MAX_MODEL_LEN
 EOF
 
+if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then
+  echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml
+fi
+if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then
+  echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml
+fi
+# Apply offload configuration once, after the base config.yaml exists.
+if [[ -n "${OFFLOAD_MODE:-}" ]]; then
+  apply_vllm_offload_config
+fi
+
 export PYTHONNOUSERSITE=1
 export VLLM_MXFP4_USE_MARLIN=1
 SERVER_LOG=/workspace/server.log
@@ -38,13 +60,17 @@ PORT=${PORT:-8888}
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
+if [[ -n "${OFFLOAD_MODE:-}" ]]; then
+  start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0
+fi
 
 set -x
 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
 --config config.yaml \
 --gpu-memory-utilization=0.9 \
 --tensor-parallel-size=$TP \
---max-num-seqs=$CONC > $SERVER_LOG 2>&1 &
+--max-num-seqs=$CONC $VLLM_OFFLOAD_EXTRA_ARGS \
+> $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
@@ -53,7 +79,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -63,7 +89,8 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -72,5 +99,8 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/gptoss_fp4_h100_sglang.sh b/benchmarks/single_node/gptoss_fp4_h100_sglang.sh new file mode 100644 index 000000000..a045cd99c --- /dev/null +++ b/benchmarks/single_node/gptoss_fp4_h100_sglang.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export TORCH_CUDA_ARCH_LIST="9.0" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ +--host 0.0.0.0 --port "$PORT" --trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 \ +$RADIX_CACHE_ARGS --max-running-requests 256 --cuda-graph-max-bs 256 \ +--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens 32768 --mem-fraction-static "$MEM_FRACTION_STATIC" \ +--context-length "$CONTEXT_LENGTH" --reasoning-parser gpt-oss --stream-interval 10 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/gptoss_fp4_h200.sh b/benchmarks/single_node/gptoss_fp4_h200.sh index 2a9359b96..9be9959bf 100644 --- a/benchmarks/single_node/gptoss_fp4_h200.sh +++ b/benchmarks/single_node/gptoss_fp4_h200.sh @@ -18,7 +18,9 @@ fi hf download "$MODEL" # Start GPU monitoring (power, temperature, clocks every second) -start_gpu_monitor +if ! 
is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi set -x pip install datasets pandas @@ -37,14 +39,21 @@ if [ "${EVAL_ONLY}" = "true" ]; then CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + PREFIX_CACHING_CONFIG="" +fi + # Create config.yaml cat > config.yaml << EOF -no-enable-prefix-caching: true +$PREFIX_CACHING_CONFIG max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 max-model-len: $CALCULATED_MAX_MODEL_LEN EOF +apply_vllm_offload_config + SERVER_LOG=/workspace/server.log export TORCH_CUDA_ARCH_LIST="9.0" PORT=${PORT:-8888} @@ -55,14 +64,15 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --config config.yaml \ --gpu-memory-utilization 0.9 \ --tensor-parallel-size $TP \ - --max-num-seqs $CONC > $SERVER_LOG 2>&1 & + --max-num-seqs $CONC $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & SERVER_PID=$! # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -72,7 +82,8 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -81,5 +92,7 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring -stop_gpu_monitor +if ! is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi set +x diff --git a/benchmarks/single_node/gptoss_fp4_h200_sglang.sh b/benchmarks/single_node/gptoss_fp4_h200_sglang.sh new file mode 100644 index 000000000..069b1a452 --- /dev/null +++ b/benchmarks/single_node/gptoss_fp4_h200_sglang.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export TORCH_CUDA_ARCH_LIST="9.0" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" + +if ! is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ +--host 0.0.0.0 --port "$PORT" --trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 \ +$RADIX_CACHE_ARGS --max-running-requests 256 --cuda-graph-max-bs 256 \ +--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens 32768 --mem-fraction-static "$MEM_FRACTION_STATIC" \ +--context-length "$CONTEXT_LENGTH" --reasoning-parser gpt-oss --stream-interval 10 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
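+
+# NOTE: the launch above drops --disable-radix-cache for ISB1 replay and
+# KV-stress runs, so SGLang's radix (prefix) cache can serve the traces'
+# 60-95% shared prefixes, which is exactly the behavior under test.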
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if ! is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi +set +x diff --git a/benchmarks/single_node/gptosstriattn_fp4_h100_vllm.sh b/benchmarks/single_node/gptosstriattn_fp4_h100_vllm.sh new file mode 100755 index 000000000..cfff2a12d --- /dev/null +++ b/benchmarks/single_node/gptosstriattn_fp4_h100_vllm.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for GPT-OSS-120B FP4 on H100. +# +# Differences from baseline gptoss_fp4_h100.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. +TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/gpt_oss_120b_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." 
+fi
+
+export ENABLE_TRIATTENTION=1
+echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}"
+# --- End TriAttention setup ---
+
+if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then
+  MAX_MODEL_LEN="${MAX_MODEL_LEN}"
+else
+  MAX_MODEL_LEN=10240
+fi
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+  setup_eval_context
+  MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
+cat > config.yaml << EOF
+enable-prefix-caching: false
+max-cudagraph-capture-size: 2048
+max-num-batched-tokens: 1024
+max-model-len: $MAX_MODEL_LEN
+EOF
+
+if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then
+  echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml
+fi
+if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then
+  echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml
+fi
+if [[ -n "${OFFLOAD_MODE:-}" ]]; then
+  apply_vllm_offload_config
+fi
+
+export PYTHONNOUSERSITE=1
+export VLLM_MXFP4_USE_MARLIN=1
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+start_gpu_monitor
+if [[ -n "${OFFLOAD_MODE:-}" ]]; then
+  start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0
+fi
+
+set -x
+vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
+--config config.yaml \
+--gpu-memory-utilization=0.9 \
+--tensor-parallel-size=$TP \
+--max-num-seqs=$CONC $VLLM_OFFLOAD_EXTRA_ARGS \
+> $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_single_node_benchmark \
+  --model "$MODEL" \
+  --port "$PORT" \
+  --backend vllm \
+  --input-len "$ISL" \
+  --output-len "$OSL" \
+  --random-range-ratio "$RANDOM_RANGE_RATIO" \
+  --num-prompts $(( $CONC * 10 )) \
+  --max-concurrency "$CONC" \
+  --result-filename "$RESULT_FILENAME" \
+  --result-dir /workspace/ \
+  --server-pid "$SERVER_PID"
+
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT"
+  append_lm_eval_summary
+fi
+
+if [[ -n "${OFFLOAD_MODE:-}" ]]; then
+  stop_kv_metrics_collector
+fi
+stop_gpu_monitor
+set +x
diff --git a/benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh b/benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh
new file mode 100755
index 000000000..fc6f465bc
--- /dev/null
+++ b/benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh
@@ -0,0 +1,127 @@
+#!/usr/bin/env bash
+# TriAttention-enabled vLLM benchmark for GPT-OSS-120B FP4 on H200.
+#
+# Differences from baseline gptoss_fp4_h200.sh:
+# - Installs triattention vLLM plugin
+# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads)
+# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available
+# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks)
+# - Explicitly disables prefix caching (incompatible with KV compression)
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+  MODEL \
+  TP \
+  CONC \
+  ISL \
+  OSL \
+  RANDOM_RANGE_RATIO \
+  RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+nvidia-smi
+
+hf download "$MODEL"
+
+# --- TriAttention plugin setup ---
+pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image."
+
+# Auto-detect KV budget from export filename: chat workloads get larger budget.
+TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/gpt_oss_120b_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." +fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + MAX_MODEL_LEN="${MAX_MODEL_LEN}" +else + MAX_MODEL_LEN=10240 +fi + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $MAX_MODEL_LEN +EOF + +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +export PYTHONNOUSERSITE=1 +export VLLM_MXFP4_USE_MARLIN=1 +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ +--config config.yaml \ +--gpu-memory-utilization=0.9 \ +--tensor-parallel-size=$TP \ +--max-num-seqs=$CONC $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_b200_sglang.sh b/benchmarks/single_node/qwen3.5_fp8_b200_sglang.sh new file mode 100755 index 000000000..97fb5127c --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_b200_sglang.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export NCCL_NVLS_ENABLE=1 +export SGL_ENABLE_JIT_DEEPGEMM=false +export SGLANG_ENABLE_FLASHINFER_GEMM=true +export PYTHONUNBUFFERED=1 +export TORCH_CUDA_ARCH_LIST="10.0" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +if [[ $CONC -ge 16 ]]; then + SCHEDULER_RECV_INTERVAL=30 +else + SCHEDULER_RECV_INTERVAL=10 +fi + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.8}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path="$MODEL" --host=0.0.0.0 --port="$PORT" \ +--trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 --ep-size 1 \ +--quantization fp8 --kv-cache-dtype fp8_e4m3 \ +--mamba-ssm-dtype bfloat16 \ +--cuda-graph-max-bs "$CONC" --max-running-requests 128 \ +--mem-fraction-static "$MEM_FRACTION_STATIC" --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens "$CHUNKED_PREFILL_SIZE" \ +--context-length "$CONTEXT_LENGTH" \ +--attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \ +$RADIX_CACHE_ARGS --enable-flashinfer-allreduce-fusion \ +--scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" \ +--reasoning-parser qwen3 --tool-call-parser qwen3_coder \ +--tokenizer-worker-num 6 --stream-interval 30 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
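+
+# NOTE: --cuda-graph-max-bs above is tied to $CONC, so CUDA graphs are
+# captured only up to the sweep cell's concurrency rather than a fixed cap.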
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_b200_vllm.sh b/benchmarks/single_node/qwen3.5_fp8_b200_vllm.sh new file mode 100755 index 000000000..e48c56700 --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_b200_vllm.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +apply_yarn_config_if_needed "$MODEL" "$CALCULATED_MAX_MODEL_LEN" + +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + PREFIX_CACHING_CONFIG="" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' +$PREFIX_CACHING_CONFIG +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +apply_vllm_offload_config + +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 +export VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl + +if ! is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi + +set -x +vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size "$TP" \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS $VLLM_EXTRA_ARGS \ +> "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if ! 
is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_h100_sglang.sh b/benchmarks/single_node/qwen3.5_fp8_h100_sglang.sh new file mode 100755 index 000000000..61df75cff --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_h100_sglang.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export TORCH_CUDA_ARCH_LIST="9.0" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.8}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-16384}" + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ +--host 0.0.0.0 --port "$PORT" --trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 --ep-size 1 \ +--quantization fp8 --kv-cache-dtype fp8_e4m3 \ +--mamba-ssm-dtype bfloat16 \ +$RADIX_CACHE_ARGS --enable-flashinfer-allreduce-fusion \ +--max-running-requests 128 --cuda-graph-max-bs 128 \ +--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --mem-fraction-static "$MEM_FRACTION_STATIC" \ +--context-length "$CONTEXT_LENGTH" \ +--reasoning-parser qwen3 --tool-call-parser qwen3_coder \ +--attention-backend flashinfer \ +--stream-interval 30 --tokenizer-worker-num 6 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
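+
+# NOTE: the 0.8 mem-fraction and 16384 chunked-prefill defaults above can be
+# adjusted per cell via SGLANG_MEM_FRACTION_OVERRIDE and
+# SGLANG_CHUNKED_PREFILL_OVERRIDE, e.g. to free KV headroom for long-context
+# replay.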
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_single_node_benchmark \
+  --model "$MODEL" \
+  --port "$PORT" \
+  --backend vllm \
+  --input-len "$ISL" \
+  --output-len "$OSL" \
+  --random-range-ratio "$RANDOM_RANGE_RATIO" \
+  --num-prompts "$((CONC * 10))" \
+  --max-concurrency "$CONC" \
+  --result-filename "$RESULT_FILENAME" \
+  --result-dir /workspace/ \
+  --server-pid "$SERVER_PID"
+
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT"
+  append_lm_eval_summary
+fi
+
+if [[ -n "${OFFLOAD_MODE:-}" ]]; then
+  stop_kv_metrics_collector
+fi
+stop_gpu_monitor
+set +x
diff --git a/benchmarks/single_node/qwen3.5_fp8_h100_vllm.sh b/benchmarks/single_node/qwen3.5_fp8_h100_vllm.sh
new file mode 100755
index 000000000..6f576ea0f
--- /dev/null
+++ b/benchmarks/single_node/qwen3.5_fp8_h100_vllm.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+  MODEL \
+  TP \
+  CONC \
+  ISL \
+  OSL \
+  RANDOM_RANGE_RATIO \
+  RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+nvidia-smi
+
+hf download "$MODEL"
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))}
+if [ "${EVAL_ONLY}" = "true" ]; then
+  setup_eval_context
+  CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
+PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true"
+if is_isb1_replay_benchmark; then
+  PREFIX_CACHING_CONFIG=""
+fi
+
+cat > config.yaml << EOF
+kv-cache-dtype: fp8
+$PREFIX_CACHING_CONFIG
+max-cudagraph-capture-size: 2048
+max-num-batched-tokens: 8192
+max-model-len: $CALCULATED_MAX_MODEL_LEN
+EOF
+
+if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then
+  echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml
+fi
+if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then
+  echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml
+fi
+# Apply offload configuration once, after the base config.yaml exists.
+if [[ -n "${OFFLOAD_MODE:-}" ]]; then
+  apply_vllm_offload_config
+fi
+
+export TORCH_CUDA_ARCH_LIST="9.0"
+export PYTHONNOUSERSITE=1
+
+start_gpu_monitor
+if [[ -n "${OFFLOAD_MODE:-}" ]]; then
+  start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0
+fi
+
+set -x
+vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
+--config config.yaml \
+--gpu-memory-utilization 0.9 \
+--tensor-parallel-size "$TP" \
+--max-num-seqs 256 \
+--disable-log-requests \
+--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \
+> "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_sglang.sh b/benchmarks/single_node/qwen3.5_fp8_h200_sglang.sh new file mode 100755 index 000000000..b3d5ea50b --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_h200_sglang.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export TORCH_CUDA_ARCH_LIST="9.0" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +apply_yarn_config_if_needed "$MODEL" "$CONTEXT_LENGTH" + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.8}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-16384}" + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +if ! is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ +--host 0.0.0.0 --port "$PORT" --trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 --ep-size 1 \ +--reasoning-parser qwen3 --tool-call-parser qwen3_coder \ +--enable-flashinfer-allreduce-fusion \ +--max-running-requests 128 \ +--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \ +--mem-fraction-static "$MEM_FRACTION_STATIC" \ +--cuda-graph-max-bs 128 \ +--context-length "$CONTEXT_LENGTH" \ +--kv-cache-dtype fp8_e4m3 \ +--quantization fp8 \ +--attention-backend flashinfer \ +--stream-interval 30 \ +--tokenizer-worker-num 6 \ +--mamba-ssm-dtype bfloat16 \ +$RADIX_CACHE_ARGS \ +$SGLANG_EXTRA_ARGS \ +> "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if ! 
is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_vllm.sh b/benchmarks/single_node/qwen3.5_fp8_h200_vllm.sh new file mode 100755 index 000000000..de5c66c44 --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_h200_vllm.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +apply_yarn_config_if_needed "$MODEL" "$CALCULATED_MAX_MODEL_LEN" + +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + PREFIX_CACHING_CONFIG="" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +$PREFIX_CACHING_CONFIG +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +apply_vllm_offload_config + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +if ! is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi + +set -x +vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size "$TP" \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS $VLLM_EXTRA_ARGS \ +> "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if ! is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi +set +x diff --git a/benchmarks/single_node/qwen3.5triattn_fp8_h100_vllm.sh b/benchmarks/single_node/qwen3.5triattn_fp8_h100_vllm.sh new file mode 100755 index 000000000..87e81ab22 --- /dev/null +++ b/benchmarks/single_node/qwen3.5triattn_fp8_h100_vllm.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for Qwen 3.5 FP8 on H100. 
+# +# Differences from baseline qwen3.5_fp8_h100_vllm.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. +TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/qwen3_5_397b_a17b_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." +fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size "$TP" \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/qwen3.5triattn_fp8_h200_vllm.sh b/benchmarks/single_node/qwen3.5triattn_fp8_h200_vllm.sh new file mode 100755 index 000000000..83fb3b8c6 --- /dev/null +++ b/benchmarks/single_node/qwen3.5triattn_fp8_h200_vllm.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for Qwen 3.5 FP8 on H200. +# +# Differences from baseline qwen3.5_fp8_h200_vllm.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. +TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/qwen3_5_397b_a17b_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." 
+fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size "$TP" \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/datasets/isb1/.gitattributes b/datasets/isb1/.gitattributes new file mode 100644 index 000000000..d7fa37c52 --- /dev/null +++ b/datasets/isb1/.gitattributes @@ -0,0 +1,2 @@ +exports/**/*.json linguist-generated=true +exports/**/*.json text eol=lf diff --git a/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md b/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md new file mode 100644 index 000000000..175765ab1 --- /dev/null +++ b/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md @@ -0,0 +1,122 @@ +--- +version: 1.0.0 +date: 2026-04-14 +author: William Chen +status: proposed +--- + +# ISB1 ↔ kv-cache-tester Coexistence Plan + +## The Two Systems + +| | kv-cache-tester (Cameron's) | ISB1 (ours) | +|---|---|---| +| **Location** | `experimental/multiturn/vllm_benchmark/kv-cache-tester/` | `datasets/isb1/exports/` | +| **Traces** | 522 real Claude Code sessions | 35 synthetic multi-turn traces | +| **Source** | Real production agentic workloads | Synthetic with controlled stress patterns | +| **Replay** | `trace_replay_tester.py` | `benchmark_export_replay.py` | +| **Config** | `multiturn-agentic-trace.yaml` | `isb1-kv-stress-pr993.yaml` | +| **Metrics** | Prometheus sidecar (`metrics_collector.py`) | `process_result_isb1.py` | + +## Why Both Are Needed + +**kv-cache-tester** shows how chips perform under **real workloads** — actual Claude Code +sessions with natural token distributions. 
This is the ground truth for "how does inference +actually work in production?" + +**ISB1** shows how chips perform under **controlled stress conditions** — specific KV cache +behaviors that real workloads rarely trigger but production systems must handle: + +| Stress Pattern | kv-cache-tester | ISB1 | +|---|---|---| +| Natural agentic workload distribution | ✅ (522 real traces) | ❌ | +| Targeted prefix reuse testing | ❌ | ✅ (high_prefix stress class) | +| Forced KV offload cliff | ❌ (depends on trace) | ✅ (offload_cliff stress, 128K-1M context) | +| Session reactivation after idle | ❌ | ✅ (reactivation stress, idle windows) | +| KV compaction under long sessions | ❌ | ✅ (compaction_heavy stress, 25+ turns) | +| Shared prefix fanout | ❌ | ✅ (fanout stress, branching requests) | +| 500K-1M context depth | ❌ (real traces are shorter) | ✅ (xlc2/ulc1/ulc2 bands) | + +Together they give the Pareto frontier Cameron wants: kv-cache-tester at realistic operating +points, ISB1 at stress-test extremes. + +## How They Coexist in PR #993 + +### Configs (no conflict) +```yaml +# Cameron's existing config — uses kv-cache-tester traces +# .github/configs/multiturn-agentic-trace.yaml +h200-fp8-llama70b: + trace-file: experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/... + +# Our config — uses ISB1 export traces +# .github/configs/isb1-kv-stress-pr993.yaml +dsr1-fp8-h200-isb1-kv-stress-vllm-pr993: + export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json +``` + +### Workflows (no conflict) +```yaml +# Cameron's workflow +# .github/workflows/multiturn-sweep.yml → benchmark-multiturn-tmpl.yml +# Uses: trace_replay_tester.py + +# Our workflow +# .github/workflows/run-isb1-sweep.yml → benchmark-isb1-tmpl.yml +# Uses: benchmark_export_replay.py +``` + +### Data directories (no conflict) +``` +experimental/multiturn/vllm_benchmark/ ← Cameron's (untouched) + kv-cache-tester/ 522 real traces + replayer + aiperf/ AIPerf submodule + bench/metrics_collector.py Prometheus sidecar + analysis/plot_pareto.py Pareto charts + +datasets/isb1/ ← Ours (separate directory) + exports/ ISB1 replay bundles + extension_131k/ 131K context (DSR1, GPT-OSS, Qwen) + preview/long_context_500k/ 500K Qwen preview + preview/long_context_1m/ 1M Qwen preview +``` + +### Shared infrastructure we USE from PR #993 +- vLLM offload API flags (`--kv_offloading_backend native`, etc.) +- Prometheus metrics collector (could share `metrics_collector.py`) +- Offload mode sweep pattern (on/off/noprefix) +- Runner launch scripts (`runners/launch_*.sh`) +- Concurrency sweep structure + +### What we DO NOT touch +- `experimental/multiturn/vllm_benchmark/` — entirely Cameron's +- `kv-cache-tester/` submodule — real traces, don't modify +- `aiperf/` submodule — alternative benchmark, don't modify +- `benchmark-multiturn-tmpl.yml` — Cameron's workflow template + +## Recommended PR Structure + +### Option A: Single PR with two benchmark lanes (cleanest) +PR #993 ships with BOTH: +- Lane 1: kv-cache-tester (real traces) — Cameron's existing work +- Lane 2: ISB1 (synthetic stress traces) — our addition + +Both use the same vLLM server configs, offload modes, and concurrency sweeps. +Results are compared side by side — real vs stress. + +### Option B: ISB1 as follow-up PR (safest) +PR #993 ships with kv-cache-tester only (Cameron's work). +We submit a follow-up PR that adds ISB1 as a second benchmark lane. +Uses the same runner infrastructure and offload configs. 
+
+### Recommendation: Option A
+Cameron explicitly asked for "realistic multi-turn benchmarks" at GTC. Having both
+real traces AND synthetic stress traces in the same PR makes a stronger story:
+"Here's how chips perform under real workloads AND here's where they break under
+targeted KV stress." That's the complete Pareto frontier.
+
+## What We Need From Cameron's Team
+1. Confirm ISB1 configs don't conflict with multiturn-agentic-trace.yaml
+2. Confirm datasets/isb1/exports/ is the right location for our files
+3. Decide: do we share metrics_collector.py or use process_result_isb1.py?
+4. Agree on result format for combined Pareto visualization
diff --git a/datasets/isb1/GMI_EXECUTION_PLAN.md b/datasets/isb1/GMI_EXECUTION_PLAN.md
new file mode 100644
index 000000000..1ae696acd
--- /dev/null
+++ b/datasets/isb1/GMI_EXECUTION_PLAN.md
@@ -0,0 +1,175 @@
+# ISB1 KV Cache Benchmark — GMI Cloud Execution Plan
+
+## Available Hardware
+
+| GPU | HBM | Available | Max Context Before Offload |
+|-----|-----|-----------|---------------------------|
+| **GB200** | 192GB HBM3e | ✅ | ~384K tokens (FP8 KV) |
+| **H100** | 80GB HBM3 | ✅ | ~128K tokens (FP8 KV) |
+
+## Execution Order
+
+Run benchmarks in this order — cheapest/fastest first to validate the setup works.
+
+### Phase 1: Validation Run (1 hour)
+
+Prove the pipeline works end-to-end before burning GPU hours.
+
+```bash
+# On H100 — single model, single concurrency, 5 min duration
+export MODEL=deepseek-ai/DeepSeek-R1-0528
+export TP=8
+export EXPORT_FILE=datasets/isb1/exports/extension_131k/vllm/code_131k1k.json
+
+# Launch server
+bash benchmarks/single_node/dsr1_fp8_h100_vllm.sh
+
+# Run ONE cell: 2 users, offload=off, 300s
+python utils/bench_serving/benchmark_export_replay.py \
+  --export-file $EXPORT_FILE \
+  --max-concurrency 2 \
+  --duration 300 \
+  --request-mode multi-turn
+
+# Verify result has actual_context_len > 0
+python utils/process_result_isb1.py --result-file results/*.json
+```
+
+**Pass criteria:** TTFT and throughput numbers appear. `actual_context_len` > 100K.
+
+### Phase 2: H100 KV Stress Sweep (9 hours)
+
+H100 80GB is the interesting GPU — KV cache fills up first.
+
+```bash
+# Models to test:
+# 1. DeepSeek-R1 FP8 (TP8)
+# 2. GPT-OSS FP4 (TP8)
+
+# Sweep per model:
+# users: [2, 4, 8, 16, 32, 64]  # H100 can't do 128+ at 131K
+# offload-modes: [on, off, noprefix]
+# duration: 1800s (30 min)
+# export: extension_131k/vllm/code_131k1k.json
+
+# Total cells: 2 models × 6 concurrency × 3 offload = 36 cells
+# Time: 36 × 30 min = 18 hours sequential → ~9 hours wall-clock with the
+# two models run in parallel on separate H100 nodes
+```
+
+**What to look for:**
+- Offload cliff: at what concurrency does offload=on start helping?
+- Prefix cache hit rate: does it stay >50% under load?
+- Preemption count: how many requests get evicted?
+- TTFT degradation: when does p99 TTFT exceed 10s?
+
+### Phase 3: GB200 KV Stress Sweep (18 hours)
+
+GB200 192GB has 2.4x more HBM — the cliff comes later.
+
+```bash
+# Same sweep but higher concurrency (more HBM room):
+# users: [2, 4, 8, 16, 32, 64, 128, 256]
+# offload-modes: [on, off, noprefix]
+# duration: 1800s
+
+# Add Qwen 3.5 (needs more memory for MoE):
+# 3 models × 8 concurrency × 3 offload = 72 cells
+# Time: 72 × 30 min = 36 hours → cut duration to 900s to fit ~18 hours
+```
+
+**What to look for:**
+- At what concurrency does GB200 hit its offload cliff?
+- Is the cliff at ~3x H100's cliff? (The max-context headroom above is ~3x:
+  KV room grows faster than raw HBM once model weights are subtracted.)
+- Does 192GB allow prefix caching to stay effective longer?
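+
+Either sweep phase reduces to a loop over the two axes. A minimal sketch,
+assuming the Phase 1 replay flags and the `OFFLOAD_MODE` variable consumed by
+the launch scripts; the checked-in `gmi_kv_sweep.sh` is the canonical driver,
+and the model and user list here are placeholders:
+
+```bash
+export EXPORT_FILE=datasets/isb1/exports/extension_131k/vllm/code_131k1k.json
+
+for OFFLOAD_MODE in on off noprefix; do
+  export OFFLOAD_MODE
+  # Relaunch the server per offload mode: offload flags apply at startup.
+  bash benchmarks/single_node/dsr1_fp8_h100_vllm.sh
+  for USERS in 2 4 8 16 32 64; do
+    python utils/bench_serving/benchmark_export_replay.py \
+      --export-file "$EXPORT_FILE" \
+      --max-concurrency "$USERS" \
+      --duration 1800 \
+      --request-mode multi-turn
+  done
+done
+```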
+ +### Phase 4: Long Context Preview (4 hours, GB200 only) + +500K and 1M token traces — only GB200 has enough memory. + +```bash +# 500K preview (Qwen 3.5 only): +export EXPORT_FILE=datasets/isb1/exports/preview/long_context_500k/\ +inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json + +# 1M preview (Qwen 3.5 only): +export EXPORT_FILE=datasets/isb1/exports/preview/long_context_1m/\ +inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json + +# Low concurrency (these are HUGE contexts): +# users: [1, 2, 4] +# offload-modes: [on, off] +# duration: 900s +``` + +**What to look for:** +- Can GB200 serve 1M context at all? +- What's the TTFT for a 1M token prefill? +- Does KV offload work at this scale? + +## Estimated GPU Time + +| Phase | GPU | Duration | Cost (est) | +|-------|-----|----------|------------| +| 1. Validation | H100 | 1 hour | ~$3 | +| 2. H100 sweep | H100 | 9 hours | ~$27 | +| 3. GB200 sweep | GB200 | 18 hours | ~$90 | +| 4. Long context | GB200 | 4 hours | ~$20 | +| **Total** | | **32 hours** | **~$140** | + +## Portable Run Script + +Use `gmi_portable_benchmark.sh` for manual runs without GitHub Actions: + +```bash +# Set GMI-specific env vars +export GMI_API_KEY="..." +export HF_TOKEN="..." +export MODEL=deepseek-ai/DeepSeek-R1-0528 +export GPU_TYPE=h100 # or gb200 + +# Run the portable benchmark +bash datasets/isb1/scripts/gmi_portable_benchmark.sh \ + --model $MODEL \ + --gpu $GPU_TYPE \ + --export-file datasets/isb1/exports/extension_131k/vllm/code_131k1k.json \ + --users 2,4,8,16,32,64 \ + --offload-modes on,off,noprefix \ + --duration 1800 +``` + +## Result Collection + +After each phase, results go to: +``` +results/ +├── h100_dsr1_fp8_kv_stress/ +│ ├── users_2_offload_on.json +│ ├── users_2_offload_off.json +│ └── ... +└── gb200_dsr1_fp8_kv_stress/ + └── ... +``` + +Process and visualize: +```bash +# Aggregate results +python datasets/isb1/scripts/collect_sweep_results.py \ + --results-dir results/ \ + --output results/sweep_summary.json + +# Generate Pareto frontier chart +python datasets/isb1/scripts/plot_pareto.py \ + --summary results/sweep_summary.json \ + --output results/pareto_frontier.png +``` + +## What Success Looks Like + +After all phases, we have: +1. **Pareto frontier chart:** throughput vs p99 TTFT for H100 and GB200 +2. **Offload cliff identification:** exact concurrency where offload starts helping +3. **Prefix cache benefit:** measured hit rate under realistic multi-turn load +4. **HBM scaling evidence:** does 2.4x more HBM give 2.4x more capacity? +5. **Long context feasibility:** can GB200 serve 500K/1M context at all? + +These results go into the InferenceX PR as evidence that the benchmark works. diff --git a/datasets/isb1/README.md b/datasets/isb1/README.md new file mode 100644 index 000000000..e3746eb58 --- /dev/null +++ b/datasets/isb1/README.md @@ -0,0 +1,125 @@ +# ISB1 replay artifacts for InferenceX + +This directory is the InferenceX-side consumer package for ISB1 replay. + +InferenceX consumes committed file artifacts only: +- replay export JSON bundles under `datasets/isb1/exports/` +- consumer configs in `.github/configs/isb1-*.yaml` +- replay processing through `utils/bench_serving/benchmark_export_replay.py` +- result normalization through `utils/process_result_isb1.py` + + +## Why not random data? + +Random data benchmarks show worst-case performance. Real inference workloads +have multi-turn conversations where each turn shares context with previous +turns. 
This enables: + +- **Prefix caching** — 60-95% of each request's tokens are shared with the + previous turn. Prefix cache hit rates directly affect throughput. +- **KV cache reuse** — the server reuses computed KV cache entries instead of + recomputing them. This is the biggest performance optimization in production. +- **Realistic offload behavior** — KV cache grows across turns, eventually + exceeding GPU memory and requiring CPU offload. Random data never reaches + this point because each request is independent. + +These traces stress-test the exact KV cache behaviors that determine real +production performance. + +InferenceX does **not** import external runtime code and does **not** make live-serving claims from export-file existence alone. + +--- + +## Current ground truth (verified 2026-04-12) + +The definitive strict audit found: + +- **26 PASSED** +- **0 FAILED** +- **10 N/A** + +Strict audit rule: count only model-architecture-valid cells. + +### Strict verified coverage + +| Model | Chat | Code | +|---|---|---| +| `dsr1` | `8k`, `32k`, `64k`, `131k` | `8k`, `32k`, `64k`, `131k` | +| `gptoss` | `8k`, `32k`, `64k`, `131k` | `8k`, `32k`, `64k`, `131k` | +| `qwen3.5` | `8k`, `32k`, `64k`, `131k`, `500k` | `8k`, `32k`, `64k`, `131k`, `500k` | + +### Existing but excluded from the strict pass count + +- `gptoss` `500k` chat/code preview files exist, but strict coverage stops at `131k` +- `qwen3.5` `1M` chat/code preview files exist, but were excluded from the strict audit +- `dsr1` has no strict `500k` or `1M` lane because the model tops out at `163840` + +--- + +## Inventory + +### Export-file counts + +- **50 export files** +- **3 JSON manifests** +- **53 total JSON files** under `datasets/isb1/exports/` +- **888 total cells** +- **5,094 total turns** +- **13 MB actual message content** +- **All export files are valid JSON** + +### Export-file breakdown + +| Class | Count | +|---|---:| +| Core `8k1k` | 8 | +| Extension `32k1k` | 8 | +| Extension `64k1k` | 8 | +| Extension `131k1k` | 10 | +| Preview `offload_core` | 4 | +| Preview `500k` | 8 | +| Preview `1M` | 4 | +| JSON manifests | 3 | + +--- + +## Claim boundary + +Safe claims: +- InferenceX carries the full audited ISB1 replay corpus described above. +- Strict replay-file coverage is **26 passed / 0 failed / 10 N/A**. +- DSR1 strict coverage stops at `131k`. +- GPT-OSS strict coverage stops at `131k`. +- Qwen strict coverage reaches `500k`. +- GPT-OSS `500k` and Qwen `1M` files exist, but are excluded from the strict pass count. 
+ +Unsafe claims: +- `26/26` valid cells verified (10 N/A due to model `max_position_embeddings` limits: DSR1=163,840, GPT-OSS=131,072, Qwen3.5=1,010,000) +- strict GPT-OSS `500k` coverage +- strict Qwen `1M` coverage +- turning preview-file existence into live benchmark certification + +--- + +## Key docs + +- [`COVERAGE_AUDIT_2026-04-11.md`](COVERAGE_AUDIT_2026-04-11.md) — definitive strict audit, file-path mapping, and N/A rationale +- [`LONG_CONTEXT_TRUTH_MATRIX.md`](LONG_CONTEXT_TRUTH_MATRIX.md) — canonical claim boundary +- [`SUPPORT_MATRIX.md`](SUPPORT_MATRIX.md) — lane-by-lane audited support table +- [`PRODUCER_GAPS.md`](PRODUCER_GAPS.md) — what remains truly open vs no longer applicable +- [`RUNBOOK_EXTERNAL_GMI.md`](RUNBOOK_EXTERNAL_GMI.md) — external operator path +- [`RUNBOOK_INTERNAL_SEMIANALYSIS.md`](RUNBOOK_INTERNAL_SEMIANALYSIS.md) — internal workflow-backed path +- [`INVESTIGATION_KV_CACHE_PROFILING_2026-04-11.md`](INVESTIGATION_KV_CACHE_PROFILING_2026-04-11.md) — what the long-context preview paths actually measure + +--- + +## Export roots + +- `datasets/isb1/exports/core/` +- `datasets/isb1/exports/extension_32k/` +- `datasets/isb1/exports/extension_64k/` +- `datasets/isb1/exports/extension_131k/` +- `datasets/isb1/exports/preview/offload_core/` +- `datasets/isb1/exports/preview/long_context_500k/` +- `datasets/isb1/exports/preview/long_context_1m/` + diff --git a/datasets/isb1/exports/core/chat_8k1k.json b/datasets/isb1/exports/core/chat_8k1k.json new file mode 100644 index 000000000..c3c2e1124 --- /dev/null +++ b/datasets/isb1/exports/core/chat_8k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08070a63d22aa247e38475fdd7e206ea41bab731f2499f0d32210b317933b075 +size 3615534 diff --git a/datasets/isb1/exports/core/chat_8k1k_qwen3.5.json b/datasets/isb1/exports/core/chat_8k1k_qwen3.5.json new file mode 100644 index 000000000..243cea119 --- /dev/null +++ b/datasets/isb1/exports/core/chat_8k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04d60ff42c01d6bf117a6bddac7eae99cef2d052235101fa540fd3a7eb6466de +size 136407 diff --git a/datasets/isb1/exports/core/code_8k1k.json b/datasets/isb1/exports/core/code_8k1k.json new file mode 100644 index 000000000..1c1dd2461 --- /dev/null +++ b/datasets/isb1/exports/core/code_8k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c746a650eb624d9d40ee19aad4a9d126b4e60602f13793c09a6a8cfde81d6ee +size 2605444 diff --git a/datasets/isb1/exports/core/code_8k1k_qwen3.5.json b/datasets/isb1/exports/core/code_8k1k_qwen3.5.json new file mode 100644 index 000000000..52957e59e --- /dev/null +++ b/datasets/isb1/exports/core/code_8k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e4fc73e3ff51469ad736fda8e15df09a14bd2d430d8a9a1600ae2ca1cd13075 +size 138620 diff --git a/datasets/isb1/exports/extension_131k/chat_131k1k.json b/datasets/isb1/exports/extension_131k/chat_131k1k.json new file mode 100644 index 000000000..daefd2dad --- /dev/null +++ b/datasets/isb1/exports/extension_131k/chat_131k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eab224b3f15a3118204a912a3e53f3081c96ac2be1f4861b4dda5593580b2da1 +size 1231308 diff --git a/datasets/isb1/exports/extension_131k/chat_131k1k_dsr1.json b/datasets/isb1/exports/extension_131k/chat_131k1k_dsr1.json new file mode 100644 index 000000000..e1ce42508 --- /dev/null +++ b/datasets/isb1/exports/extension_131k/chat_131k1k_dsr1.json @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:ea824f39557d4bc7cc5a3e09c61815ebd32b2a7c3e78046c62c4d9da340f92d2 +size 312933 diff --git a/datasets/isb1/exports/extension_131k/chat_131k1k_qwen3.5.json b/datasets/isb1/exports/extension_131k/chat_131k1k_qwen3.5.json new file mode 100644 index 000000000..c25a74094 --- /dev/null +++ b/datasets/isb1/exports/extension_131k/chat_131k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20550fdc8fbb1aeaa9cf2b4fdb7807f4e8abcac5b2f871de573ea061f88e8dc5 +size 312996 diff --git a/datasets/isb1/exports/extension_131k/code_131k1k.json b/datasets/isb1/exports/extension_131k/code_131k1k.json new file mode 100644 index 000000000..1b29e38f6 --- /dev/null +++ b/datasets/isb1/exports/extension_131k/code_131k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:082c4b75b81ca680adccf6f00fc8e4069098cfb25e20ebc5ca88ce0dd47c3cc3 +size 1802776 diff --git a/datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json b/datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json new file mode 100644 index 000000000..1b955eb08 --- /dev/null +++ b/datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:385d33f9705f6c0227ea04b03d0ed2c47730a3ce408b1619445e50b67429a9d2 +size 398078 diff --git a/datasets/isb1/exports/extension_32k/chat_32k1k.json b/datasets/isb1/exports/extension_32k/chat_32k1k.json new file mode 100644 index 000000000..7378882af --- /dev/null +++ b/datasets/isb1/exports/extension_32k/chat_32k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606a6174834ddac7704bd199995d1b3f7c1d34b39ad4a904b80b09a22b1b04dc +size 1390574 diff --git a/datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json b/datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json new file mode 100644 index 000000000..8fd721f45 --- /dev/null +++ b/datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a44061cd4fac9b02347afcd4cbbfc4e5152020f23d6eccfccf548e198b4b7c70 +size 351049 diff --git a/datasets/isb1/exports/extension_32k/code_32k1k.json b/datasets/isb1/exports/extension_32k/code_32k1k.json new file mode 100644 index 000000000..5a09c88f5 --- /dev/null +++ b/datasets/isb1/exports/extension_32k/code_32k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49442fa6a1ec7114c26da5aa61ec7b7dfc6662f5e636edd95e5a019ae47ca2be +size 1337748 diff --git a/datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json b/datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json new file mode 100644 index 000000000..a110e6c14 --- /dev/null +++ b/datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f74b077263ea89567e9a09cfcecc5ea90040891170d4d65636156f9349733aa +size 337547 diff --git a/datasets/isb1/exports/extension_64k/chat_64k1k.json b/datasets/isb1/exports/extension_64k/chat_64k1k.json new file mode 100644 index 000000000..709a833b2 --- /dev/null +++ b/datasets/isb1/exports/extension_64k/chat_64k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0e7fa8895d4774cf36d9d78d9f02a35282f420598e7b373c5378330ea663b05 +size 2473612 diff --git a/datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json b/datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json new file mode 100644 index 000000000..79ad2cb87 --- 
/dev/null +++ b/datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0533834348310306dc9e56ad4d54671a7615c9d7852fa677320bad51ee2ceaa6 +size 621810 diff --git a/datasets/isb1/exports/extension_64k/code_64k1k.json b/datasets/isb1/exports/extension_64k/code_64k1k.json new file mode 100644 index 000000000..bb1ca8974 --- /dev/null +++ b/datasets/isb1/exports/extension_64k/code_64k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1804919d069fb037802c0d97605fb8bc6b12050f242f9ca00fc7aa7f372db81b +size 788105 diff --git a/datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json b/datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json new file mode 100644 index 000000000..73beb4b57 --- /dev/null +++ b/datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9513a2d11519546a701d6b2889cbf18b01f5ba36abc3b6f8fb34669566e6c311 +size 200074 diff --git a/datasets/isb1/exports/preview/long_context_1m/README.md b/datasets/isb1/exports/preview/long_context_1m/README.md new file mode 100644 index 000000000..3e5ea5af9 --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_1m/README.md @@ -0,0 +1,33 @@ +# Gated 1M-class Qwen3.5 preview lane + +This directory carries the committed InferenceX-side Qwen3.5 artifacts for a +bounded `1M`-class ISB1 coding replay preview. + +## What these files are + +- dedicated replay bundles restricted to `qwen3_5_397b_a17b` +- producer cells for standalone `vllm` and standalone `sglang` +- committed bundle coverage for `nvidia:b200_sxm_180gb`, `nvidia:h100_sxm_80gb`, and `nvidia:h200_sxm_141gb` +- restricted to `ulc2_1m_plus` +- restricted to `support_status=reviewed_preview` at the selected export-cell level +- restricted to `benchmark_certification_status=dataset_replay_verified` +- exposed downstream only through the separate manual config + `.github/configs/isb1-qwen-1m-preview.yaml` +- explicit `max-model-len: 1048576` when the manual config is used + +## Current claim boundary + +These files are committed preview artifacts plus a gated/manual validation path. +They do **not** imply ordinary runnable ISB1 support in `isb1-master.yaml`. + +Safe wording: +- InferenceX carries bounded 1M-class Qwen3.5 replay preview artifacts. +- InferenceX carries a separate gated/manual Qwen3.5 1M validation path. + +Unsafe wording: +- native 1M served-lane support +- ordinary/general runnable consumer support +- KV-offload certification + +See `manifest.json` for the exact preview boundary and +`.github/configs/isb1-qwen-1m-preview.yaml` for the manual validation surface. 
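+
+## Example: bounded manual run
+
+A minimal sketch of exercising this lane by hand through the committed portable
+helper (the gated workflow path goes through the manual config named above);
+the `b200` target, `vllm` engine, and `code` workload are illustrative choices,
+not requirements:
+
+```bash
+# HF_TOKEN (or HUGGING_FACE_HUB_TOKEN) must be set for model access.
+datasets/isb1/scripts/gmi_portable_benchmark.sh \
+  --gpu-type b200 \
+  --model qwen3.5 \
+  --engine vllm \
+  --context-band 1m \
+  --workload code
+```
+
+The `1m` band pins `max-model-len` to `1048576` and bounds execution to
+single-concurrency, matching the preview boundary above.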
diff --git a/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1.json b/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1.json new file mode 100644 index 000000000..a37edd86a --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd16cc4de821cf4803d662e4c5091359b7a5b2b730d03c976eb331be0cd6b1cb +size 286074 diff --git a/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json b/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json new file mode 100644 index 000000000..5fd23f78c --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35572a38f071d240519f7fdbd60aa203eb4832d835df97a8a5ef874d5d402456 +size 122465512 diff --git a/datasets/isb1/exports/preview/long_context_1m/manifest.json b/datasets/isb1/exports/preview/long_context_1m/manifest.json new file mode 100644 index 000000000..3c1cfb8db --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_1m/manifest.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63e05e30fc8eddf2dd35b21b0575af6943428b2ab7e6ebe5a3df257d0344ad8b +size 2445 diff --git a/datasets/isb1/exports/preview/long_context_500k/README.md b/datasets/isb1/exports/preview/long_context_500k/README.md new file mode 100644 index 000000000..8efb153d5 --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/README.md @@ -0,0 +1,45 @@ +# Bounded 500k-class preview lanes + +This directory carries the smallest honest InferenceX consumer handoff for bounded +`500k`-class ISB1 coding replay paths. + +## What these files are + +- dedicated replay bundles derived from committed `131k1k` extension exports +- restricted to `gpt_oss_120b` or `qwen3_5_397b_a17b` +- restricted to `xlc2_384k_512k` +- restricted to standalone `vllm` and standalone `sglang` +- restricted to `nvidia:b200_sxm_180gb`, `nvidia:h100_sxm_80gb`, and `nvidia:h200_sxm_141gb` +- restricted to `support_status=reviewed_preview` +- restricted to `benchmark_certification_status=dataset_replay_verified` +- wired in the consumer with explicit `max-model-len: 524288` + +## What these files are not + +- not a native InferenceX `500k+` served lane +- not a native InferenceX `1M+` served lane +- not a supported-tier long-context expansion +- not a chat preview lane +- not an offload-depth lane +- not a KV-offload certification claim + +## Why the files exist + +The existing `extension_131k/*/code_131k1k.json` and model-scoped +`code_131k1k_qwen3.5.json` bundles already contain honest `xlc2_384k_512k` +replay cells, but they are mixed with lower-band cells. The InferenceX workflow +selects rows by runtime, hardware, model, and support tier — not by +`context_band`. + +These dedicated files isolate only the `xlc2_384k_512k` rows so InferenceX can +run bounded `500k`-class previews without over-selecting lower-band cells. 
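+
+A minimal sketch of that isolation step, assuming each export cell carries the
+`context_band` field named above (paths and field access are illustrative; the
+authoritative schema is whatever the committed bundles carry):
+
+```python
+import json
+from pathlib import Path
+
+# One of the mixed-band extension_131k/*/code_131k1k.json source bundles.
+source = Path("datasets/isb1/exports/extension_131k/vllm/code_131k1k.json")
+cells = json.loads(source.read_text()).get("exports", [])
+
+# Keep only the xlc2_384k_512k rows; every lower-band cell stays behind.
+kept = [cell for cell in cells if cell.get("context_band") == "xlc2_384k_512k"]
+print(f"kept {len(kept)} of {len(cells)} cells")
+```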
+ +## Consumer contract + +- `isb1-master.yaml` pins these rows as `reviewed_preview` +- `isb1-master.yaml` pins `max-model-len: 524288` +- current search space is intentionally bounded to single-concurrency preview execution +- result processing preserves `context_bands`, `profile_id`, and the producer handoff claim boundary + +See `manifest.json` for the GPT-OSS derivation record and `manifest_qwen3.5.json` +for the Qwen derivation record. diff --git a/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1.json b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1.json new file mode 100644 index 000000000..ed88496d8 --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e200fb08b06dffc83189c393c0711e090cf8f579c719e69512e2fcfb3933e33 +size 153848 diff --git a/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1.json b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1.json new file mode 100644 index 000000000..37f8e26a2 --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aa883fbca2ea93ec4d3cb748265a1c66e98554c658d8a0e51ed877a95e7faf1 +size 150709 diff --git a/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json new file mode 100644 index 000000000..f996cc838 --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5102d06da0cf4adfc640f1206cb26812369150d888165813012fe85183fec35 +size 157679 diff --git a/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json new file mode 100644 index 000000000..00046987f --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18faa3c3271f2f1acf3892379d3e1d13f1e0e6e1bbefdf00e5e7c5cb54bb3c72 +size 32685533 diff --git a/datasets/isb1/exports/preview/long_context_500k/manifest.json b/datasets/isb1/exports/preview/long_context_500k/manifest.json new file mode 100644 index 000000000..deae83d6d --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/manifest.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fb9e807a7f1c9df7cc0244309f594561913d05aeff434eb3d3e1ee322e0ffd5 +size 2344 diff --git a/datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json b/datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json new file mode 100644 index 000000000..aed23b2db --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:99682e56f2fff3506c27ce5b1e3c61273b7a0bdf9abf70e9a254b4af1cf2b936
+size 2303
diff --git a/datasets/isb1/scripts/adapt_trace_replay_result.py b/datasets/isb1/scripts/adapt_trace_replay_result.py
new file mode 100644
index 000000000..445ab7d9c
--- /dev/null
+++ b/datasets/isb1/scripts/adapt_trace_replay_result.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python3
+"""Adapt kv-cache trace replay CSV output into the ISB1 replay JSON schema."""
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+from pathlib import Path
+from statistics import mean
+from typing import Any
+
+
+def _to_float(value: Any) -> float | None:
+    if value in (None, ""):
+        return None
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return None
+
+
+def _percentile(values: list[float], p: float) -> float:
+    if not values:
+        return 0.0
+    if len(values) == 1:
+        return values[0]
+    ordered = sorted(values)
+    idx = (len(ordered) - 1) * p
+    lo = int(idx)
+    hi = min(lo + 1, len(ordered) - 1)
+    frac = idx - lo
+    return ordered[lo] * (1 - frac) + ordered[hi] * frac
+
+
+def _read_csv_rows(path: Path) -> list[dict[str, str]]:
+    with path.open("r", encoding="utf-8", newline="") as handle:
+        return list(csv.DictReader(handle))
+
+
+def _pick(row: dict[str, str], *keys: str) -> float | None:
+    for key in keys:
+        if key in row:
+            value = _to_float(row.get(key))
+            if value is not None:
+                return value
+    return None
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Adapt kv-cache trace replay CSV output into ISB1 replay JSON schema"
+    )
+    parser.add_argument("--input-dir", default="/workspace", help="Directory containing trace replay outputs")
+    parser.add_argument(
+        "--detailed-csv",
+        default="detailed_results.csv",
+        help="Detailed replay CSV filename (inside --input-dir)",
+    )
+    parser.add_argument(
+        "--summary-json",
+        default=None,
+        help="Optional summary JSON path (used as supplemental source if present)",
+    )
+    parser.add_argument("--output-json", required=True, help="Output adapted replay JSON path")
+    parser.add_argument("--model-id", default="", help="Model ID for output metadata")
+    parser.add_argument("--max-concurrency", type=int, default=1, help="Max concurrency used")
+    parser.add_argument("--request-mode", default="multi-turn", help="Request mode metadata")
+    parser.add_argument(
+        "--benchmark-certification-status",
+        default="dataset_replay_verified",
+        help="Benchmark certification status to stamp in selection",
+    )
+    parser.add_argument(
+        "--support-status",
+        default="reviewed_preview",
+        help="Support status to stamp in selection",
+    )
+    parser.add_argument(
+        "--result-stem",
+        default="",
+        help="Optional result stem to infer total wall time from <input-dir>/<result-stem>.json",
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    input_dir = Path(args.input_dir)
+    detailed_csv_path = input_dir / args.detailed_csv
+    output_path = Path(args.output_json)
+
+    if not detailed_csv_path.exists():
+        raise SystemExit(f"Missing detailed CSV: {detailed_csv_path}")
+
+    rows = _read_csv_rows(detailed_csv_path)
+    ttft_ms: list[float] = []
+    tpot_ms: list[float] = []
+    output_tokens: list[float] = []
+    prompt_tokens: list[float] = []
+    session_ids: set[str] = set()
+
+    for row in rows:
+        ttft = _pick(row, "ttft_ms", "ttft", "time_to_first_token_ms")
+        if ttft is not None:
+            ttft_ms.append(ttft)
+
+        tpot = _pick(row, "tpot_ms", "tpot", "time_per_output_token_ms")
+        if tpot is not None:
+            tpot_ms.append(tpot)
+
+        out_tok = _pick(row, "output_tokens", "generated_tokens", "completion_tokens")
+        if out_tok 
is not None: + output_tokens.append(out_tok) + + in_tok = _pick(row, "input_tokens", "prompt_tokens", "content_token_count") + if in_tok is not None: + prompt_tokens.append(in_tok) + + for key in ("session_id", "session", "conversation_id"): + sid = row.get(key) + if sid: + session_ids.add(str(sid)) + break + + completed_sessions = len(session_ids) if session_ids else len(rows) + total_sessions = completed_sessions + + total_output_tokens = sum(output_tokens) + total_prompt_tokens = sum(prompt_tokens) + total_token_count = total_output_tokens + total_prompt_tokens + + total_wall_time_s = 0.0 + if args.result_stem: + maybe_summary = input_dir / f"{args.result_stem}.json" + if maybe_summary.exists(): + try: + summary = json.loads(maybe_summary.read_text(encoding="utf-8")) + total_wall_time_s = float( + _to_float(summary.get("test_duration_seconds")) + or _to_float(summary.get("duration_s")) + or _to_float(summary.get("total_duration_s")) + or 0.0 + ) + except Exception: + total_wall_time_s = 0.0 + + if total_wall_time_s <= 0 and args.summary_json: + summary_path = Path(args.summary_json) + if summary_path.exists(): + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + total_wall_time_s = float( + _to_float(summary.get("test_duration_seconds")) + or _to_float(summary.get("duration_s")) + or _to_float(summary.get("total_duration_s")) + or 0.0 + ) + except Exception: + total_wall_time_s = 0.0 + + if total_wall_time_s <= 0: + total_wall_time_s = 1.0 + + aggregate_metrics = { + "total_token_throughput_tps": total_token_count / total_wall_time_s, + "output_throughput_tps": total_output_tokens / total_wall_time_s, + "mean_ttft_ms": mean(ttft_ms) if ttft_ms else 0.0, + "median_ttft_ms": _percentile(ttft_ms, 0.50), + "p99_ttft_ms": _percentile(ttft_ms, 0.99), + "mean_tpot_ms": mean(tpot_ms) if tpot_ms else 0.0, + "median_tpot_ms": _percentile(tpot_ms, 0.50), + "p99_tpot_ms": _percentile(tpot_ms, 0.99), + "completed_sessions": completed_sessions, + "total_sessions": total_sessions, + "session_throughput_sps": completed_sessions / total_wall_time_s, + "total_wall_time_s": total_wall_time_s, + } + + adapted = { + "model_id": args.model_id, + "max_concurrency": args.max_concurrency, + "request_mode": args.request_mode, + "harness_request_mode": "auto", + "aggregate_metrics": aggregate_metrics, + "selection": { + "support_statuses": [args.support_status], + "benchmark_certification_statuses": [args.benchmark_certification_status], + }, + "server_metrics_summary": { + "observability_status": "unavailable", + "gpu_cache_metric_name": None, + "cpu_cache_metric_name": None, + "gpu_cache_usage_peak": 0.0, + "cpu_cache_usage_peak": 0.0, + "preemption_count": 0, + "kv_offload_observed": False, + "cpu_cache_metric_available": False, + }, + "depth_telemetry": { + "total_actual_input_tokens": int(total_prompt_tokens), + "max_actual_context_len_per_turn": int(max(prompt_tokens) if prompt_tokens else 0), + }, + "num_sessions": total_sessions, + "max_turns": None, + "per_turn_metrics": {}, + } + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(adapted, indent=2, sort_keys=True), encoding="utf-8") + print(f"Wrote adapted replay JSON: {output_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/analyze_benchmark_distributions.py b/datasets/isb1/scripts/analyze_benchmark_distributions.py new file mode 100644 index 000000000..06c5a65f1 --- /dev/null +++ 
b/datasets/isb1/scripts/analyze_benchmark_distributions.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Analyze ISL/OSL/turn distributions for ISB1 exports or kv-cache traces") + parser.add_argument("--export-file", default=None, help="ISB1 export JSON file") + parser.add_argument("--trace-dir", default=None, help="kv-cache-tester trace directory") + parser.add_argument("--output-dir", required=True, help="Output directory") + return parser.parse_args() + + +def _percentile(values: list[float], p: float) -> float: + if not values: + return 0.0 + if len(values) == 1: + return values[0] + ordered = sorted(values) + idx = (len(ordered) - 1) * p + lo = int(idx) + hi = min(lo + 1, len(ordered) - 1) + frac = idx - lo + return ordered[lo] * (1 - frac) + ordered[hi] * frac + + +def _histogram(values: list[int], bins: list[int]) -> dict[str, int]: + counts: dict[str, int] = {} + for value in values: + placed = False + prev = 0 + for bound in bins: + if value <= bound: + key = f"{prev + 1}-{bound}" + counts[key] = counts.get(key, 0) + 1 + placed = True + break + prev = bound + if not placed: + key = f">{bins[-1]}" + counts[key] = counts.get(key, 0) + 1 + return counts + + +def _extract_isb1(export_payload: dict[str, Any]) -> tuple[list[int], list[int], list[int]]: + isl: list[int] = [] + osl: list[int] = [] + turns_per_session: list[int] = [] + + for cell in export_payload.get("exports", []): + session = cell.get("session") or {} + turns = session.get("turns") or [] + turns_per_session.append(len(turns)) + for turn in turns: + input_tokens = ( + turn.get("actual_input_tokens") + or turn.get("content_token_count") + or turn.get("prompt_tokens") + or turn.get("input_tokens") + or 0 + ) + output_tokens = ( + turn.get("expected_output_tokens") + or turn.get("target_output_tokens") + or turn.get("output_tokens") + or 0 + ) + try: + isl.append(int(input_tokens)) + except Exception: + isl.append(0) + try: + osl.append(int(output_tokens)) + except Exception: + osl.append(0) + + return isl, osl, turns_per_session + + +def _extract_trace_dir(trace_dir: Path) -> tuple[list[int], list[int], list[int]]: + isl: list[int] = [] + osl: list[int] = [] + turns_per_session: list[int] = [] + + files = list(sorted(trace_dir.glob("*.json"))) + if not files: + raise SystemExit(f"No JSON traces found in {trace_dir}") + + for path in files: + payload = json.loads(path.read_text(encoding="utf-8")) + sessions = payload.get("sessions") or [] + for session in sessions: + turns = session.get("turns") or [] + turns_per_session.append(len(turns)) + for turn in turns: + isl.append(int(turn.get("content_token_count", 0) or 0)) + osl.append(int(turn.get("target_output_tokens", 0) or 0)) + + return isl, osl, turns_per_session + + +def build_report(isl: list[int], osl: list[int], turns_per_session: list[int], source: str) -> dict[str, Any]: + return { + "source": source, + "num_sessions": len(turns_per_session), + "num_turns": len(isl), + "isl": { + "p50": _percentile([float(x) for x in isl], 0.50), + "p95": _percentile([float(x) for x in isl], 0.95), + "max": max(isl) if isl else 0, + "histogram": _histogram(isl, [1024, 4096, 8192, 16384, 32768, 65536]), + }, + "osl": { + "p50": _percentile([float(x) for x in osl], 0.50), + "p95": _percentile([float(x) for x in osl], 0.95), + "max": max(osl) if osl else 0, + "histogram": 
_histogram(osl, [64, 128, 256, 512, 1024, 2048, 4096]), + }, + "turns_per_session": { + "p50": _percentile([float(x) for x in turns_per_session], 0.50), + "p95": _percentile([float(x) for x in turns_per_session], 0.95), + "max": max(turns_per_session) if turns_per_session else 0, + "histogram": _histogram(turns_per_session, [2, 4, 8, 16, 32]), + }, + } + + +def main() -> int: + args = parse_args() + if bool(args.export_file) == bool(args.trace_dir): + raise SystemExit("Provide exactly one of --export-file or --trace-dir") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + if args.export_file: + export_path = Path(args.export_file) + payload = json.loads(export_path.read_text(encoding="utf-8")) + isl, osl, turns_per_session = _extract_isb1(payload) + report = build_report(isl, osl, turns_per_session, source=str(export_path)) + else: + trace_dir = Path(args.trace_dir) + isl, osl, turns_per_session = _extract_trace_dir(trace_dir) + report = build_report(isl, osl, turns_per_session, source=str(trace_dir)) + + output_path = output_dir / "distribution_report.json" + output_path.write_text(json.dumps(report, indent=2, sort_keys=True), encoding="utf-8") + print(f"Wrote: {output_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/collect_sweep_results.py b/datasets/isb1/scripts/collect_sweep_results.py new file mode 100644 index 000000000..0d7155428 --- /dev/null +++ b/datasets/isb1/scripts/collect_sweep_results.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import csv +import json +import sqlite3 +from pathlib import Path +from typing import Any + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Aggregate sweep results from DB or agg_*.json directory") + parser.add_argument("--db-path", default=None, help="SQLite DB path") + parser.add_argument("--json-dir", default=None, help="Directory containing agg_*.json files") + parser.add_argument("--output-dir", required=True, help="Output directory") + parser.add_argument("--cliff-ttft-ms", type=float, default=5000.0, help="TTFT p99 threshold for capacity cliff") + return parser.parse_args() + + +def _to_float(value: Any) -> float | None: + if value in (None, ""): + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _to_int(value: Any) -> int | None: + if value in (None, ""): + return None + try: + return int(float(value)) + except (TypeError, ValueError): + return None + + +def collect_from_db(db_path: Path) -> list[dict[str, Any]]: + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + rows = conn.execute( + """ + SELECT offload_mode, throughput_tok_s, ttft_p99_ms, max_concurrency, raw_result_json + FROM benchmark_runs + WHERE offload_mode IS NOT NULL + ORDER BY id ASC + """ + ).fetchall() + conn.close() + + out: list[dict[str, Any]] = [] + for row in rows: + concurrency = row["max_concurrency"] + if concurrency in (None, "") and row["raw_result_json"]: + try: + payload = json.loads(row["raw_result_json"]) + concurrency = payload.get("conc") or payload.get("max_concurrency") + except Exception: + pass + out.append( + { + "offload_mode": row["offload_mode"], + "concurrency": _to_int(concurrency), + "throughput_tok_s": _to_float(row["throughput_tok_s"]), + "ttft_p99_ms": _to_float(row["ttft_p99_ms"]), + "source": "db", + } + ) + return out + + +def collect_from_json_dir(json_dir: Path) -> 
list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for path in sorted(json_dir.glob("agg_*.json")): + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except Exception: + continue + rows.append( + { + "offload_mode": payload.get("offload_mode"), + "concurrency": _to_int(payload.get("conc") or payload.get("max_concurrency")), + "throughput_tok_s": _to_float(payload.get("throughput_tok_s") or payload.get("tput_per_gpu")), + "ttft_p99_ms": _to_float(payload.get("ttft_p99_ms") or payload.get("p99_ttft_ms")), + "source": str(path.name), + } + ) + return rows + + +def compute_capacity_cliff(rows: list[dict[str, Any]], threshold_ms: float) -> dict[str, Any]: + cliff: dict[str, Any] = {} + for mode in sorted({row.get("offload_mode") for row in rows if row.get("offload_mode")}): + mode_rows = sorted( + [r for r in rows if r.get("offload_mode") == mode and r.get("concurrency") is not None], + key=lambda r: r["concurrency"], + ) + cliff_row = None + for row in mode_rows: + if (row.get("ttft_p99_ms") or 0.0) > threshold_ms: + cliff_row = row + break + cliff[str(mode)] = cliff_row + return cliff + + +def compute_offload_benefit(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + by_conc: dict[int, dict[str, dict[str, Any]]] = {} + for row in rows: + conc = row.get("concurrency") + mode = row.get("offload_mode") + if conc is None or mode is None: + continue + by_conc.setdefault(int(conc), {})[str(mode)] = row + + deltas: list[dict[str, Any]] = [] + for conc in sorted(by_conc): + modes = by_conc[conc] + on = modes.get("on") + off = modes.get("off") + if not on or not off: + continue + on_tput = on.get("throughput_tok_s") or 0.0 + off_tput = off.get("throughput_tok_s") or 0.0 + deltas.append( + { + "concurrency": conc, + "throughput_on": on_tput, + "throughput_off": off_tput, + "offload_benefit_delta_tps": on_tput - off_tput, + } + ) + return deltas + + +def write_csv(path: Path, rows: list[dict[str, Any]]) -> None: + with path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.writer(handle) + writer.writerow(["offload_mode", "concurrency", "throughput_tok_s", "ttft_p99_ms", "source"]) + for row in rows: + writer.writerow([ + row.get("offload_mode"), + row.get("concurrency"), + row.get("throughput_tok_s"), + row.get("ttft_p99_ms"), + row.get("source"), + ]) + + +def main() -> int: + args = parse_args() + if not args.db_path and not args.json_dir: + raise SystemExit("Provide --db-path or --json-dir") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + rows: list[dict[str, Any]] = [] + if args.db_path: + rows.extend(collect_from_db(Path(args.db_path))) + if args.json_dir: + rows.extend(collect_from_json_dir(Path(args.json_dir))) + + summary = { + "num_rows": len(rows), + "capacity_cliff": compute_capacity_cliff(rows, args.cliff_ttft_ms), + "offload_benefit": compute_offload_benefit(rows), + "rows": rows, + } + + json_path = output_dir / "sweep_aggregate.json" + csv_path = output_dir / "sweep_aggregate.csv" + json_path.write_text(json.dumps(summary, indent=2, sort_keys=True), encoding="utf-8") + write_csv(csv_path, rows) + + print(f"Wrote: {json_path}") + print(f"Wrote: {csv_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/generate_qwen35_low_band_exports.py b/datasets/isb1/scripts/generate_qwen35_low_band_exports.py new file mode 100755 index 000000000..51be8b531 --- /dev/null +++ b/datasets/isb1/scripts/generate_qwen35_low_band_exports.py @@ -0,0 +1,98 @@ 
+#!/usr/bin/env python3 +"""Generate dedicated Qwen 3.5 ISB1 export bundles for 8k/32k/64k lanes. + +These files are derived from the committed generic export bundles by selecting only +GPT-OSS cells that are actually runnable (`supported` or `reviewed_preview`), then +rewriting model identity fields to the Qwen 3.5 replay identity while keeping trace +payloads unchanged. +""" + +from __future__ import annotations + +import json +from copy import deepcopy +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[3] +EXPORT_ROOT = ROOT / "datasets" / "isb1" / "exports" + +QWEN_MODEL_ID = "qwen3_5_397b_a17b" +GPTOSS_MODEL_ID = "gpt_oss_120b" +ALLOWED_SUPPORT_STATUSES = {"supported", "reviewed_preview"} + +TARGETS = [ + ("core", "8k1k", "chat", "vllm"), + ("core", "8k1k", "chat", "sglang"), + ("core", "8k1k", "code", "vllm"), + ("core", "8k1k", "code", "sglang"), + ("extension_32k", "32k1k", "chat", "vllm"), + ("extension_32k", "32k1k", "chat", "sglang"), + ("extension_32k", "32k1k", "code", "vllm"), + ("extension_32k", "32k1k", "code", "sglang"), + ("extension_64k", "64k1k", "chat", "vllm"), + ("extension_64k", "64k1k", "chat", "sglang"), + ("extension_64k", "64k1k", "code", "vllm"), + ("extension_64k", "64k1k", "code", "sglang"), +] + + +def _source_path(lane: str, shape: str, surface: str, engine: str) -> Path: + return EXPORT_ROOT / lane / engine / f"{surface}_{shape}.json" + + +def _target_path(lane: str, shape: str, surface: str, engine: str) -> Path: + return EXPORT_ROOT / lane / engine / f"{surface}_{shape}_qwen3.5.json" + + +def _rewrite_bundle_id(bundle_id: str, lane: str, engine: str, surface: str, shape: str) -> str: + expected_prefix = f"isb1_{lane}_{engine}_{surface}_{shape}" + if bundle_id != expected_prefix: + raise ValueError( + f"Unexpected bundle_id {bundle_id!r}; expected {expected_prefix!r} for {lane}/{engine}/{surface}_{shape}" + ) + return f"{bundle_id}_qwen3_5" + + +def _rewrite_cell(cell: dict) -> dict: + rewritten = deepcopy(cell) + rewritten["canonical_model_id"] = QWEN_MODEL_ID + rewritten["thinking_history_policy"] = "strip_reasoning" + rewritten["history_projection_mode"] = "strip_reasoning_history" + rewritten["support_status"] = "reviewed_preview" + return rewritten + + +def build_export(lane: str, shape: str, surface: str, engine: str) -> tuple[Path, int]: + source_path = _source_path(lane, shape, surface, engine) + target_path = _target_path(lane, shape, surface, engine) + + payload = json.loads(source_path.read_text()) + exports = payload.get("exports") + if not isinstance(exports, list): + raise ValueError(f"Missing exports list in {source_path}") + + filtered = [ + _rewrite_cell(cell) + for cell in exports + if cell.get("canonical_model_id") == GPTOSS_MODEL_ID + and cell.get("support_status") in ALLOWED_SUPPORT_STATUSES + ] + if not filtered: + raise ValueError(f"No runnable GPT-OSS cells found in {source_path}") + + payload["bundle_id"] = _rewrite_bundle_id(payload.get("bundle_id"), lane, engine, surface, shape) + payload["exports"] = filtered + + target_path.write_text(json.dumps(payload, indent=2) + "\n") + return target_path, len(filtered) + + +def main() -> int: + for lane, shape, surface, engine in TARGETS: + target_path, count = build_export(lane, shape, surface, engine) + print(f"wrote {target_path.relative_to(ROOT)} ({count} cells)") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/gmi_analyze_sweep.py b/datasets/isb1/scripts/gmi_analyze_sweep.py new file mode 100644 index 
000000000..d0c3465b2 --- /dev/null +++ b/datasets/isb1/scripts/gmi_analyze_sweep.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import csv +import json +import sqlite3 +import subprocess +import sys +from pathlib import Path +from statistics import median +from typing import Any + +from isb1_results_db import render_table + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Analyze KV sweep runs from ISB1 SQLite results.") + parser.add_argument("--db-path", required=True, help="Path to SQLite DB (isb1_results.db)") + parser.add_argument("--output-dir", default=".", help="Directory to write summary outputs") + parser.add_argument("--pareto", action="store_true", help="Also run plot_pareto.py") + parser.add_argument( + "--distributions", + action="store_true", + help="Also run analyze_benchmark_distributions.py", + ) + parser.add_argument("--export-file", default=None, help="Export JSON for --distributions") + parser.add_argument("--trace-dir", default=None, help="Trace directory for --distributions") + return parser.parse_args() + + +def _to_float(value: Any) -> float | None: + if value in (None, ""): + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _to_int(value: Any) -> int | None: + if value in (None, ""): + return None + try: + return int(float(value)) + except (TypeError, ValueError): + return None + + +def _extract_concurrency(raw_result_json: str | None) -> int | None: + if not raw_result_json: + return None + try: + payload = json.loads(raw_result_json) + except json.JSONDecodeError: + return None + return _to_int(payload.get("conc") or payload.get("max_concurrency")) + + +def percentile(values: list[float], p: float) -> float | None: + if not values: + return None + ordered = sorted(values) + if len(ordered) == 1: + return ordered[0] + idx = (len(ordered) - 1) * p + lo = int(idx) + hi = min(lo + 1, len(ordered) - 1) + frac = idx - lo + return ordered[lo] * (1 - frac) + ordered[hi] * frac + + +def load_rows(db_path: Path) -> list[dict[str, Any]]: + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + rows = conn.execute( + """ + SELECT + id, + offload_mode, + ttft_p50_ms, + ttft_p99_ms, + throughput_tok_s, + preemption_count, + status, + raw_result_json + FROM benchmark_runs + WHERE offload_mode IS NOT NULL + ORDER BY id ASC + """ + ).fetchall() + conn.close() + + normalized: list[dict[str, Any]] = [] + for row in rows: + concurrency = _extract_concurrency(row["raw_result_json"]) + normalized.append( + { + "offload_mode": row["offload_mode"], + "concurrency": concurrency, + "ttft_p50_ms": _to_float(row["ttft_p50_ms"]), + "ttft_p99_ms": _to_float(row["ttft_p99_ms"]), + "throughput_tok_s": _to_float(row["throughput_tok_s"]), + "preemption_count": _to_int(row["preemption_count"]) or 0, + "status": row["status"], + } + ) + return normalized + + +def summarize(rows: list[dict[str, Any]]) -> dict[str, Any]: + grouped: dict[tuple[str, int], list[dict[str, Any]]] = {} + for row in rows: + if row["concurrency"] is None: + continue + key = (row["offload_mode"], row["concurrency"]) + grouped.setdefault(key, []).append(row) + + summary_rows: list[dict[str, Any]] = [] + for (offload_mode, concurrency), items in sorted(grouped.items(), key=lambda x: (x[0][0], x[0][1])): + ttft_p50_values = [x["ttft_p50_ms"] for x in items if x["ttft_p50_ms"] is not None] + ttft_p99_values = [x["ttft_p99_ms"] for x in items if x["ttft_p99_ms"] is not None] + 
throughput_values = [x["throughput_tok_s"] for x in items if x["throughput_tok_s"] is not None] + preemptions = [x["preemption_count"] for x in items] + success_count = sum(1 for x in items if x["status"] == "success") + + summary_rows.append( + { + "offload_mode": offload_mode, + "concurrency": concurrency, + "runs": len(items), + "success_runs": success_count, + "ttft_p50_ms": median(ttft_p50_values) if ttft_p50_values else None, + "ttft_p99_ms": percentile(ttft_p99_values, 0.99), + "throughput_tok_s": median(throughput_values) if throughput_values else None, + "preemptions": int(median(preemptions)) if preemptions else 0, + } + ) + + return { + "total_rows": len(rows), + "grouped_rows": len(summary_rows), + "summary": summary_rows, + } + + +def write_summary_json(output_dir: Path, summary: dict[str, Any]) -> Path: + output_path = output_dir / "sweep_summary.json" + output_path.write_text(json.dumps(summary, indent=2)) + return output_path + + +def write_pareto_csv(output_dir: Path, summary: dict[str, Any]) -> Path: + output_path = output_dir / "pareto_data.csv" + with output_path.open("w", newline="") as handle: + writer = csv.writer(handle) + writer.writerow(["offload_mode", "concurrency", "throughput_tok_s", "ttft_p99_ms"]) + for row in summary["summary"]: + writer.writerow( + [ + row["offload_mode"], + row["concurrency"], + row["throughput_tok_s"], + row["ttft_p99_ms"], + ] + ) + return output_path + + +def print_console_summary(summary: dict[str, Any]) -> None: + headers = [ + "offload_mode", + "concurrency", + "runs", + "success_runs", + "ttft_p50_ms", + "ttft_p99_ms", + "throughput_tok_s", + "preemptions", + ] + rows = [ + [ + row["offload_mode"], + row["concurrency"], + row["runs"], + row["success_runs"], + row["ttft_p50_ms"], + row["ttft_p99_ms"], + row["throughput_tok_s"], + row["preemptions"], + ] + for row in summary["summary"] + ] + + print(f"Total rows: {summary['total_rows']}") + print(f"Grouped rows: {summary['grouped_rows']}") + if rows: + print(render_table(headers, rows)) + else: + print("No sweep rows with offload_mode + concurrency found.") + + +def main() -> int: + args = parse_args() + db_path = Path(args.db_path) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + rows = load_rows(db_path) + summary = summarize(rows) + summary_path = write_summary_json(output_dir, summary) + pareto_path = write_pareto_csv(output_dir, summary) + + print_console_summary(summary) + print(f"Wrote: {summary_path}") + print(f"Wrote: {pareto_path}") + + script_dir = Path(__file__).resolve().parent + + if args.pareto: + pareto_cmd = [ + sys.executable, + str(script_dir / "plot_pareto.py"), + "--db-path", + str(db_path), + "--output-dir", + str(output_dir), + ] + subprocess.run(pareto_cmd, check=True) + + if args.distributions: + dist_cmd = [ + sys.executable, + str(script_dir / "analyze_benchmark_distributions.py"), + "--output-dir", + str(output_dir), + ] + if args.export_file: + dist_cmd.extend(["--export-file", args.export_file]) + elif args.trace_dir: + dist_cmd.extend(["--trace-dir", args.trace_dir]) + else: + raise SystemExit("--distributions requires --export-file or --trace-dir") + subprocess.run(dist_cmd, check=True) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/gmi_full_suite.sh b/datasets/isb1/scripts/gmi_full_suite.sh new file mode 100755 index 000000000..fad23efc1 --- /dev/null +++ b/datasets/isb1/scripts/gmi_full_suite.sh @@ -0,0 +1,135 @@ +#!/usr/bin/env bash +set -Eeuo 
pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+PORTABLE_SCRIPT="$SCRIPT_DIR/gmi_portable_benchmark.sh"
+
+usage() {
+  echo "Usage: gmi_full_suite.sh --gpu-type <h100|h200|b200> [--db-path <path>]"
+}
+
+GPU_TYPE=""
+DB_PATH=""
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --gpu-type)
+      GPU_TYPE="$2"
+      shift 2
+      ;;
+    --db-path)
+      DB_PATH="$2"
+      shift 2
+      ;;
+    --help|-h)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown: $1" >&2
+      exit 1
+      ;;
+  esac
+done
+
+[[ -n "$GPU_TYPE" ]] || {
+  usage >&2
+  exit 1
+}
+
+case "$GPU_TYPE" in
+  h100|h200|b200) ;;
+  *)
+    echo "Unsupported --gpu-type: $GPU_TYPE" >&2
+    exit 1
+    ;;
+esac
+
+[[ -x "$PORTABLE_SCRIPT" ]] || {
+  echo "Expected executable helper at $PORTABLE_SCRIPT" >&2
+  exit 1
+}
+
+if [[ -n "$DB_PATH" ]]; then
+  export ISB1_RESULTS_DB_PATH="$DB_PATH"
+fi
+
+PASSED=0
+FAILED=0
+SKIPPED=0
+
+run_combo() {
+  local model="$1"
+  local engine="$2"
+  local band="$3"
+  local workload="${4:-code}"
+
+  echo "========================================="
+  echo ">>> $model × $engine × $band × $workload on $GPU_TYPE"
+  echo "========================================="
+
+  if "$PORTABLE_SCRIPT" \
+    --gpu-type "$GPU_TYPE" \
+    --model "$model" \
+    --engine "$engine" \
+    --context-band "$band" \
+    --workload "$workload"; then
+    ((PASSED++)) || true
+  else
+    echo "FAILED: $model × $engine × $band × $workload" >&2
+    ((FAILED++)) || true
+  fi
+}
+
+# Core 8k — all models × all engines × chat + code
+for model in qwen3.5 gptoss dsr1; do
+  for engine in vllm sglang; do
+    for workload in chat code; do
+      run_combo "$model" "$engine" 8k "$workload"
+    done
+  done
+done
+
+# 131k — all models × all engines × chat + code
+for model in qwen3.5 gptoss dsr1; do
+  for engine in vllm sglang; do
+    for workload in chat code; do
+      run_combo "$model" "$engine" 131k "$workload"
+    done
+  done
+done
+
+# 500k — qwen3.5 + gptoss only (DSR1 max context is 163840 tokens, below this band)
+for model in qwen3.5 gptoss; do
+  for engine in vllm sglang; do
+    for workload in chat code; do
+      run_combo "$model" "$engine" 500k "$workload"
+    done
+  done
+done
+
+# 1m — qwen3.5 only (the only model with 1M context), b200 only
+if [[ "$GPU_TYPE" == "b200" ]]; then
+  for engine in vllm sglang; do
+    for workload in chat code; do
+      run_combo qwen3.5 "$engine" 1m "$workload"
+    done
+  done
+else
+  SKIPPED=4
+fi
+
+echo
+echo "========================================="
+echo "SUITE COMPLETE: passed=$PASSED failed=$FAILED skipped=$SKIPPED"
+echo "========================================="
+
+if command -v python3 >/dev/null 2>&1; then
+  summary_cmd=(python3 "$SCRIPT_DIR/isb1_results_db.py" summary)
+  if [[ -n "$DB_PATH" ]]; then
+    summary_cmd+=(--db-path "$DB_PATH")
+  fi
+  "${summary_cmd[@]}" 2>/dev/null || true
+fi
+
+[[ "$FAILED" -eq 0 ]]
diff --git a/datasets/isb1/scripts/gmi_kv_sweep.sh b/datasets/isb1/scripts/gmi_kv_sweep.sh
new file mode 100644
index 000000000..e953aba1a
--- /dev/null
+++ b/datasets/isb1/scripts/gmi_kv_sweep.sh
@@ -0,0 +1,176 @@
+#!/usr/bin/env bash
+# Sweep user concurrency x KV offload modes via gmi_portable_benchmark.sh runs.
+set -Eeuo pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+PORTABLE_SCRIPT="$SCRIPT_DIR/gmi_portable_benchmark.sh"
+
+usage() {
+  cat <<'EOF'
+Usage:
+  gmi_kv_sweep.sh \
+    --gpu-type <h100|h200|b200> \
+    --model <qwen3.5|gptoss|dsr1> \
+    --engine <vllm|sglang> \
+    --context-band <8k|32k|64k|131k|500k|1m> \
+    --workload <chat|code> \
+    [--users "2,4,8,16,32,64"] \
+    [--offload-modes "on,off,noprefix"] \
+    [--kv-cache-dtype <auto|fp8>] \
+    [--benchmark-duration-s <seconds>] \
+    [--disable-prefix-caching] \
+    [--total-cpu-dram-gb <gb>] \
+    [--trace-source <isb1|kv_cache_tester|aiperf>] \
+    [--db-path <path>]
+EOF
+}
+
+die() {
+  echo "ERROR: $*" >&2
+ exit 1 +} + +trim() { + local x="$1" + x="${x#${x%%[![:space:]]*}}" + x="${x%${x##*[![:space:]]}}" + printf '%s' "$x" +} + +GPU_TYPE="" +MODEL="" +ENGINE="" +CONTEXT_BAND="" +WORKLOAD="" +USERS="2,4,8,16,32,64" +OFFLOAD_MODES="on,off,noprefix" +KV_CACHE_DTYPE="" +BENCHMARK_DURATION_S="1800" +DISABLE_PREFIX_CACHING=false +TOTAL_CPU_DRAM_GB="" +TRACE_SOURCE="isb1" +DB_PATH="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --gpu-type) GPU_TYPE="$2"; shift 2 ;; + --model) MODEL="$2"; shift 2 ;; + --engine) ENGINE="$2"; shift 2 ;; + --context-band) CONTEXT_BAND="$2"; shift 2 ;; + --workload) WORKLOAD="$2"; shift 2 ;; + --users) USERS="$2"; shift 2 ;; + --offload-modes) OFFLOAD_MODES="$2"; shift 2 ;; + --kv-cache-dtype) KV_CACHE_DTYPE="$2"; shift 2 ;; + --benchmark-duration-s) BENCHMARK_DURATION_S="$2"; shift 2 ;; + --disable-prefix-caching) DISABLE_PREFIX_CACHING=true; shift ;; + --total-cpu-dram-gb) TOTAL_CPU_DRAM_GB="$2"; shift 2 ;; + --trace-source) TRACE_SOURCE="$2"; shift 2 ;; + --db-path) DB_PATH="$2"; shift 2 ;; + -h|--help) usage; exit 0 ;; + *) die "Unknown argument: $1" ;; + esac +done + +[[ -n "$GPU_TYPE" ]] || die "--gpu-type is required" +[[ -n "$MODEL" ]] || die "--model is required" +[[ -n "$ENGINE" ]] || die "--engine is required" +[[ -n "$CONTEXT_BAND" ]] || die "--context-band is required" +[[ -n "$WORKLOAD" ]] || die "--workload is required" +[[ -x "$PORTABLE_SCRIPT" ]] || die "Expected executable script: $PORTABLE_SCRIPT" + +case "$ENGINE" in + vllm|sglang) ;; + *) die "Unsupported --engine: $ENGINE" ;; +esac + +case "$TRACE_SOURCE" in + isb1|kv_cache_tester|aiperf) ;; + *) die "Unsupported --trace-source: $TRACE_SOURCE" ;; +esac + +IFS=',' read -r -a user_list <<< "$USERS" +IFS=',' read -r -a mode_list <<< "$OFFLOAD_MODES" + +[[ "${#user_list[@]}" -gt 0 ]] || die "--users cannot be empty" +[[ "${#mode_list[@]}" -gt 0 ]] || die "--offload-modes cannot be empty" + +TOTAL=0 +PASSED=0 +FAILED=0 + +for raw_mode in "${mode_list[@]}"; do + mode=$(trim "$raw_mode") + [[ -n "$mode" ]] || continue + + case "$mode" in + on|off|noprefix|legacy) ;; + *) die "Unsupported offload mode in --offload-modes: $mode" ;; + esac + + if [[ "$ENGINE" == "sglang" && "$mode" == "on" ]]; then + echo "Skipping mode=on for SGLang (no native offload support)" + continue + fi + + for raw_users in "${user_list[@]}"; do + users=$(trim "$raw_users") + [[ "$users" =~ ^[0-9]+$ ]] || die "Invalid user concurrency: $users" + + TOTAL=$((TOTAL + 1)) + echo "========================================================" + echo "Run $TOTAL: model=$MODEL engine=$ENGINE users=$users mode=$mode" + echo "========================================================" + + cmd=( + "$PORTABLE_SCRIPT" + --gpu-type "$GPU_TYPE" + --model "$MODEL" + --engine "$ENGINE" + --context-band "$CONTEXT_BAND" + --workload "$WORKLOAD" + --benchmark-type isb1_kv_stress + --benchmark-duration-s "$BENCHMARK_DURATION_S" + --max-concurrency "$users" + --trace-source "$TRACE_SOURCE" + --offload-mode "$mode" + ) + + if [[ -n "$KV_CACHE_DTYPE" ]]; then + cmd+=(--kv-cache-dtype "$KV_CACHE_DTYPE") + fi + if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then + cmd+=(--disable-prefix-caching) + fi + if [[ -n "$TOTAL_CPU_DRAM_GB" ]]; then + cmd+=(--total-cpu-dram-gb "$TOTAL_CPU_DRAM_GB") + fi + if [[ -n "$DB_PATH" ]]; then + if ISB1_RESULTS_DB_PATH="$DB_PATH" "${cmd[@]}"; then + PASSED=$((PASSED + 1)) + echo "PASS users=$users mode=$mode" + else + FAILED=$((FAILED + 1)) + echo "FAIL users=$users mode=$mode" >&2 + fi + else + if "${cmd[@]}"; then + 
PASSED=$((PASSED + 1))
+        echo "PASS users=$users mode=$mode"
+      else
+        FAILED=$((FAILED + 1))
+        echo "FAIL users=$users mode=$mode" >&2
+      fi
+    fi
+  done
+done
+
+echo
+echo "KV sweep complete"
+echo "  total:  $TOTAL"
+echo "  passed: $PASSED"
+echo "  failed: $FAILED"
+
+if [[ -n "$DB_PATH" && -f "$DB_PATH" ]]; then
+  echo "  db: $DB_PATH"
+fi
+
+[[ "$FAILED" -eq 0 ]]
diff --git a/datasets/isb1/scripts/gmi_portable_benchmark.sh b/datasets/isb1/scripts/gmi_portable_benchmark.sh
new file mode 100755
index 000000000..f41722e36
--- /dev/null
+++ b/datasets/isb1/scripts/gmi_portable_benchmark.sh
@@ -0,0 +1,1019 @@
+#!/usr/bin/env bash
+# Run a single ISB1 benchmark cell (model x engine x context band x workload) on a local GPU node.
+set -Eeuo pipefail
+
+usage() {
+  cat <<'EOF'
+Usage:
+  gmi_portable_benchmark.sh \
+    --gpu-type <h100|h200|b200> \
+    --model <qwen3.5|gptoss|dsr1> \
+    --engine <vllm|sglang> \
+    --context-band <8k|32k|64k|131k|500k|1m> \
+    --workload <chat|code> \
+    [--benchmark-type <isb1_replay|isb1_kv_stress>] \
+    [--offload-mode <on|off|noprefix|legacy>] \
+    [--kv-cache-dtype <auto|fp8>] \
+    [--disable-prefix-caching] \
+    [--total-cpu-dram-gb <gb>] \
+    [--benchmark-duration-s <seconds>] \
+    [--max-concurrency <n>] \
+    [--trace-source <isb1|kv_cache_tester|aiperf>]
+
+Required environment:
+  HF_TOKEN or HUGGING_FACE_HUB_TOKEN  Hugging Face token for model access
+
+Optional environment:
+  PORT                    API port (default: 8000)
+  TP                      Tensor parallelism (default: 8)
+  HEALTH_TIMEOUT_S        Readiness timeout in seconds (default: 1800)
+  HEALTH_POLL_INTERVAL_S  Readiness poll interval (default: 10)
+  BENCHMARK_OUTPUT_ROOT   Output root (default: <repo-root>/datasets/isb1/results/gmi)
+  GMI_RUN_LABEL           Optional suffix added to result names
+EOF
+}
+
+die() {
+  echo "ERROR: $*" >&2
+  exit 1
+}
+
+require_cmd() {
+  command -v "$1" >/dev/null 2>&1 || die "Missing required command: $1"
+}
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+REPO_ROOT=$(cd "$SCRIPT_DIR/../../.." && pwd)
+source "$REPO_ROOT/benchmarks/benchmark_lib.sh"
+PORT=${PORT:-8000}
+TP=${TP:-8}
+HEALTH_TIMEOUT_S=${HEALTH_TIMEOUT_S:-1800}
+HEALTH_POLL_INTERVAL_S=${HEALTH_POLL_INTERVAL_S:-10}
+BENCHMARK_OUTPUT_ROOT=${BENCHMARK_OUTPUT_ROOT:-"$REPO_ROOT/datasets/isb1/results/gmi"}
+REQUEST_MODE=multi-turn
+HARNESS_REQUEST_MODE=auto
+IGNORE_WAITS=true
+
+GPU_TYPE=""
+MODEL_KEY=""
+ENGINE=""
+CONTEXT_BAND=""
+WORKLOAD=""
+BENCHMARK_TYPE="isb1_replay"
+OFFLOAD_MODE=""
+KV_CACHE_DTYPE=""
+DISABLE_PREFIX_CACHING=false
+TOTAL_CPU_DRAM_GB=""
+BENCHMARK_DURATION_S=""
+MAX_CONCURRENCY_OVERRIDE=""
+TRACE_SOURCE="isb1"
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --gpu-type)
+      GPU_TYPE="$2"
+      shift 2
+      ;;
+    --model)
+      MODEL_KEY="$2"
+      shift 2
+      ;;
+    --engine)
+      ENGINE="$2"
+      shift 2
+      ;;
+    --context-band)
+      CONTEXT_BAND="$2"
+      shift 2
+      ;;
+    --workload)
+      WORKLOAD="$2"
+      shift 2
+      ;;
+    --benchmark-type)
+      BENCHMARK_TYPE="$2"
+      shift 2
+      ;;
+    --offload-mode)
+      OFFLOAD_MODE="$2"
+      shift 2
+      ;;
+    --kv-cache-dtype)
+      KV_CACHE_DTYPE="$2"
+      shift 2
+      ;;
+    --disable-prefix-caching)
+      DISABLE_PREFIX_CACHING=true
+      shift
+      ;;
+    --total-cpu-dram-gb)
+      TOTAL_CPU_DRAM_GB="$2"
+      shift 2
+      ;;
+    --benchmark-duration-s)
+      BENCHMARK_DURATION_S="$2"
+      shift 2
+      ;;
+    --max-concurrency)
+      MAX_CONCURRENCY_OVERRIDE="$2"
+      shift 2
+      ;;
+    --trace-source)
+      TRACE_SOURCE="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      die "Unknown argument: $1"
+      ;;
+  esac
+done
+
+[[ -n "$GPU_TYPE" ]] || die "--gpu-type is required"
+[[ -n "$MODEL_KEY" ]] || die "--model is required"
+[[ -n "$ENGINE" ]] || die "--engine is required"
+[[ -n "$CONTEXT_BAND" ]] || die "--context-band is required"
+[[ -n "$WORKLOAD" ]] || die "--workload is required"
+
+case "$GPU_TYPE" in
+  h100|h200|b200) ;;
+  *) die "Unsupported --gpu-type: $GPU_TYPE" ;;
+esac
+
+case 
"$ENGINE" in + vllm|sglang) ;; + *) die "Unsupported --engine: $ENGINE" ;; +esac + +case "$CONTEXT_BAND" in + 8k|32k|64k|131k|500k|1m) ;; + *) die "Unsupported --context-band: $CONTEXT_BAND" ;; +esac + +case "$WORKLOAD" in + chat|code) ;; + *) die "Unsupported --workload: $WORKLOAD (must be chat or code)" ;; +esac + +case "$BENCHMARK_TYPE" in + isb1_replay|isb1_kv_stress) ;; + *) die "Unsupported --benchmark-type: $BENCHMARK_TYPE" ;; +esac + +case "$TRACE_SOURCE" in + isb1|kv_cache_tester|aiperf) ;; + *) die "Unsupported --trace-source: $TRACE_SOURCE" ;; +esac + +case "${OFFLOAD_MODE:-}" in + ""|on|off|noprefix|legacy) ;; + *) die "Unsupported --offload-mode: $OFFLOAD_MODE" ;; +esac + +case "${KV_CACHE_DTYPE:-}" in + ""|auto|fp8) ;; + *) die "Unsupported --kv-cache-dtype: $KV_CACHE_DTYPE" ;; +esac + +if [[ -n "$TOTAL_CPU_DRAM_GB" ]] && ! [[ "$TOTAL_CPU_DRAM_GB" =~ ^[0-9]+([.][0-9]+)?$ ]]; then + die "--total-cpu-dram-gb must be numeric" +fi +if [[ -n "$MAX_CONCURRENCY_OVERRIDE" ]] && ! [[ "$MAX_CONCURRENCY_OVERRIDE" =~ ^[0-9]+$ ]]; then + die "--max-concurrency must be a positive integer" +fi +if [[ -n "$BENCHMARK_DURATION_S" ]] && ! [[ "$BENCHMARK_DURATION_S" =~ ^[0-9]+([.][0-9]+)?$ ]]; then + die "--benchmark-duration-s must be numeric" +fi + +require_cmd docker +require_cmd curl +require_cmd python3 +require_cmd nvidia-smi + +HF_TOKEN_VALUE=${HF_TOKEN:-${HUGGING_FACE_HUB_TOKEN:-}} +[[ -n "$HF_TOKEN_VALUE" ]] || die "Set HF_TOKEN or HUGGING_FACE_HUB_TOKEN before running" + +if [[ -z "$TOTAL_CPU_DRAM_GB" ]]; then + if [[ -r /proc/meminfo ]]; then + TOTAL_CPU_DRAM_GB=$(awk '/MemTotal:/ {printf "%.0f", $2/1048576}' /proc/meminfo) + else + TOTAL_CPU_DRAM_GB=0 + fi +fi + +case "$MODEL_KEY" in + qwen3.5) + MODEL_HF_ID="Qwen/Qwen3.5-397B-A17B-FP8" + MODEL_PREFIX="qwen3.5" + CANONICAL_MODEL_ID="qwen3_5_397b_a17b" + PRECISION="fp8" + ;; + gptoss) + MODEL_HF_ID="openai/gpt-oss-120b" + MODEL_PREFIX="gptoss" + CANONICAL_MODEL_ID="gpt_oss_120b" + PRECISION="fp4" + ;; + dsr1) + MODEL_HF_ID="deepseek-ai/DeepSeek-R1-0528" + MODEL_PREFIX="dsr1" + CANONICAL_MODEL_ID="deepseek_r1_0528" + PRECISION="fp8" + ;; + *) + die "Unsupported --model: $MODEL_KEY" + ;; +esac + +case "$GPU_TYPE" in + b200) + HARDWARE_PROFILE_ID="nvidia:b200_sxm_180gb" + RUNNER_TYPE="b200-gmi-baremetal" + ;; + h100) + HARDWARE_PROFILE_ID="nvidia:h100_sxm_80gb" + RUNNER_TYPE="h100-gmi-baremetal" + ;; + h200) + HARDWARE_PROFILE_ID="nvidia:h200_sxm_141gb" + RUNNER_TYPE="h200-gmi-baremetal" + ;; +esac + +case "$ENGINE" in + vllm) + RUNTIME_STACK_ID="standalone:vllm" + if [[ "$GPU_TYPE" == "b200" ]]; then + IMAGE="vllm/vllm-openai:v0.19.0-cu130" + else + IMAGE="vllm/vllm-openai:v0.18.0" + fi + ;; + sglang) + RUNTIME_STACK_ID="standalone:sglang" + IMAGE="lmsysorg/sglang:v0.5.9-cu130" + ;; +esac + +case "$CONTEXT_BAND" in + 8k) + MAX_MODEL_LEN=10240 + MAX_CONCURRENCY=4 + NUM_WARMUP_SESSIONS=1 + MAX_SESSIONS="" + MAX_TURNS_PER_SESSION="" + MAX_NUM_BATCHED_TOKENS=8192 + MAX_ACTIVE_REQUESTS=128 + ;; + 32k) + MAX_MODEL_LEN=33792 + MAX_CONCURRENCY=4 + NUM_WARMUP_SESSIONS=1 + MAX_SESSIONS="" + MAX_TURNS_PER_SESSION="" + MAX_NUM_BATCHED_TOKENS=8192 + MAX_ACTIVE_REQUESTS=64 + ;; + 64k) + MAX_MODEL_LEN=66560 + MAX_CONCURRENCY=4 + NUM_WARMUP_SESSIONS=1 + MAX_SESSIONS="" + MAX_TURNS_PER_SESSION="" + MAX_NUM_BATCHED_TOKENS=4096 + MAX_ACTIVE_REQUESTS=64 + ;; + 131k) + MAX_MODEL_LEN=132296 + MAX_CONCURRENCY=2 + NUM_WARMUP_SESSIONS=1 + MAX_SESSIONS="" + MAX_TURNS_PER_SESSION="" + MAX_NUM_BATCHED_TOKENS=2048 + MAX_ACTIVE_REQUESTS=32 + ;; + 500k) + 
MAX_MODEL_LEN=524288 + MAX_CONCURRENCY=1 + NUM_WARMUP_SESSIONS=0 + MAX_SESSIONS=2 + MAX_TURNS_PER_SESSION=4 + MAX_NUM_BATCHED_TOKENS=1024 + MAX_ACTIVE_REQUESTS=8 + ;; + 1m) + MAX_MODEL_LEN=1048576 + MAX_CONCURRENCY=1 + NUM_WARMUP_SESSIONS=0 + MAX_SESSIONS=1 + MAX_TURNS_PER_SESSION=3 + MAX_NUM_BATCHED_TOKENS=1024 + MAX_ACTIVE_REQUESTS=4 + ;; +esac + +if [[ -n "$MAX_CONCURRENCY_OVERRIDE" ]]; then + MAX_CONCURRENCY="$MAX_CONCURRENCY_OVERRIDE" +fi + +select_export_file() { + case "$MODEL_KEY:$CONTEXT_BAND:$ENGINE:$WORKLOAD" in + # ── Chat exports (committed at 8k–131k) ────────────────────── + qwen3.5:8k:*:chat) + printf 'datasets/isb1/exports/core/%s/chat_8k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:32k:*:chat) + printf 'datasets/isb1/exports/extension_32k/%s/chat_32k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:64k:*:chat) + printf 'datasets/isb1/exports/extension_64k/%s/chat_64k1k_qwen3.5.json\n' "$ENGINE" + ;; + *:8k:*:chat) + printf 'datasets/isb1/exports/core/%s/chat_8k1k.json\n' "$ENGINE" + ;; + *:32k:*:chat) + printf 'datasets/isb1/exports/extension_32k/%s/chat_32k1k.json\n' "$ENGINE" + ;; + *:64k:*:chat) + printf 'datasets/isb1/exports/extension_64k/%s/chat_64k1k.json\n' "$ENGINE" + ;; + gptoss:131k:*:chat) + printf 'datasets/isb1/exports/extension_131k/%s/chat_131k1k.json\n' "$ENGINE" + ;; + qwen3.5:131k:*:chat) + printf 'datasets/isb1/exports/extension_131k/%s/chat_131k1k_qwen3.5.json\n' "$ENGINE" + ;; + dsr1:131k:*:chat) + printf 'datasets/isb1/exports/extension_131k/%s/chat_131k1k_dsr1.json\n' "$ENGINE" + ;; + gptoss:500k:*:chat) + printf 'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" + ;; + qwen3.5:500k:*:chat) + printf 'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" + ;; + # dsr1:500k:chat — model max 164k, exceeds capability + qwen3.5:1m:*:chat) + printf 'datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1__%s.json\n' "$ENGINE" + ;; + # dsr1:1m:chat, gptoss:1m:chat — models don't support 1M context + + # ── Code exports ────────────────────────────────────────────── + qwen3.5:8k:*:code) + printf 'datasets/isb1/exports/core/%s/code_8k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:32k:*:code) + printf 'datasets/isb1/exports/extension_32k/%s/code_32k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:64k:*:code) + printf 'datasets/isb1/exports/extension_64k/%s/code_64k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:131k:*:code) + printf 'datasets/isb1/exports/extension_131k/%s/code_131k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:500k:*:code) + printf 'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" + ;; + qwen3.5:1m:*:code) + printf 'datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__%s.json\n' "$ENGINE" + ;; + gptoss:8k:*:code) + printf 'datasets/isb1/exports/core/%s/code_8k1k.json\n' "$ENGINE" + ;; + gptoss:32k:*:code) + printf 'datasets/isb1/exports/extension_32k/%s/code_32k1k.json\n' "$ENGINE" + ;; + gptoss:64k:*:code) + printf 'datasets/isb1/exports/extension_64k/%s/code_64k1k.json\n' "$ENGINE" + ;; + gptoss:131k:*:code) + printf 'datasets/isb1/exports/extension_131k/%s/code_131k1k.json\n' "$ENGINE" + ;; + gptoss:500k:*:code) + printf 
'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" + ;; + # gptoss:1m — GPT-OSS max_position_embeddings=131072; 1M exceeds model capability + dsr1:8k:*:code) + printf 'datasets/isb1/exports/core/%s/code_8k1k.json\n' "$ENGINE" + ;; + dsr1:32k:*:code) + printf 'datasets/isb1/exports/extension_32k/%s/code_32k1k.json\n' "$ENGINE" + ;; + dsr1:64k:*:code) + printf 'datasets/isb1/exports/extension_64k/%s/code_64k1k.json\n' "$ENGINE" + ;; + dsr1:131k:*:code) + printf 'datasets/isb1/exports/extension_131k/%s/code_131k1k.json\n' "$ENGINE" + ;; + # dsr1:500k/1m — DeepSeek R1 max_position_embeddings=163840; 500k/1M exceed model capability + *) + return 1 + ;; + esac +} + +TRACE_DIR="" +TRACE_REPLAY_SUMMARY_JSON="" +if [[ "$TRACE_SOURCE" == "isb1" ]]; then + EXPORT_FILE=$(select_export_file) || die "No committed ISB1 export for model=$MODEL_KEY engine=$ENGINE context=$CONTEXT_BAND workload=$WORKLOAD" + EXPORT_PATH="$REPO_ROOT/$EXPORT_FILE" + [[ -f "$EXPORT_PATH" ]] || die "Export file not found: $EXPORT_FILE" + + readarray -t EXPORT_METADATA < <( + python3 - "$EXPORT_PATH" "$RUNTIME_STACK_ID" "$HARDWARE_PROFILE_ID" "$CANONICAL_MODEL_ID" <<'PY' +import json +import sys +from pathlib import Path + +export_path = Path(sys.argv[1]) +runtime_stack_id = sys.argv[2] +hardware_profile_id = sys.argv[3] +canonical_model_id = sys.argv[4] +payload = json.loads(export_path.read_text()) +matches = [ + cell + for cell in payload.get("exports", []) + if cell.get("runtime_stack_id") == runtime_stack_id + and cell.get("hardware_profile_id") == hardware_profile_id + and cell.get("canonical_model_id") == canonical_model_id +] +if not matches: + raise SystemExit( + f"No matching export cells for runtime={runtime_stack_id} hardware={hardware_profile_id} model={canonical_model_id}" + ) +support_statuses = sorted({cell.get("support_status") for cell in matches if cell.get("support_status")}) +cert_statuses = sorted( + {cell.get("benchmark_certification_status") for cell in matches if cell.get("benchmark_certification_status")} +) +trace_ids = sorted({cell.get("trace_id") for cell in matches if cell.get("trace_id")}) +if len(support_statuses) > 1: + raise SystemExit(f"Ambiguous support statuses: {support_statuses}") +if len(cert_statuses) > 1: + raise SystemExit(f"Ambiguous certification statuses: {cert_statuses}") +print(support_statuses[0] if support_statuses else "") +print(cert_statuses[0] if cert_statuses else "") +print(",".join(trace_ids)) +print(len(matches)) +PY + ) + + SUPPORT_STATUS=${EXPORT_METADATA[0]} + BENCHMARK_CERTIFICATION_STATUS=${EXPORT_METADATA[1]} + TRACE_IDS=${EXPORT_METADATA[2]} + MATCHED_CELL_COUNT=${EXPORT_METADATA[3]} +else + SUPPORT_STATUS=${SUPPORT_STATUS:-reviewed_preview} + BENCHMARK_CERTIFICATION_STATUS=${BENCHMARK_CERTIFICATION_STATUS:-dataset_replay_verified} + TRACE_IDS="$TRACE_SOURCE" + MATCHED_CELL_COUNT="n/a" + if [[ "$TRACE_SOURCE" == "kv_cache_tester" ]]; then + TRACE_DIR=${TRACE_DIR:-"$REPO_ROOT/experimental/multiturn/vllm_benchmark/kv-cache-tester/traces"} + EXPORT_FILE="experimental/multiturn/vllm_benchmark/trace_source_kv_cache_tester.json" + else + TRACE_DIR=${TRACE_DIR:-"$REPO_ROOT/experimental/multiturn/vllm_benchmark/aiperf_traces"} + EXPORT_FILE="experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json" + fi + EXPORT_PATH="$REPO_ROOT/$EXPORT_FILE" +fi + +case "$ENGINE" in + vllm) + VLLM_CPU_OFFLOAD_GB="" + VLLM_SWAP_SPACE_GB="" + if [[ "$CONTEXT_BAND" == "500k" ]]; then + 
VLLM_CPU_OFFLOAD_GB=40 + VLLM_SWAP_SPACE_GB=32 + elif [[ "$CONTEXT_BAND" == "1m" ]]; then + VLLM_CPU_OFFLOAD_GB=80 + VLLM_SWAP_SPACE_GB=64 + fi + case "$CONTEXT_BAND" in + 8k|32k) VLLM_MAX_NUM_SEQS=128 ;; + 64k) VLLM_MAX_NUM_SEQS=64 ;; + 131k) VLLM_MAX_NUM_SEQS=32 ;; + 500k) VLLM_MAX_NUM_SEQS=8 ;; + 1m) VLLM_MAX_NUM_SEQS=4 ;; + esac + ;; + sglang) + case "$GPU_TYPE" in + h100) + SGLANG_MEM_FRACTION_STATIC=0.80 + SGLANG_CHUNKED_PREFILL_SIZE=8192 + ;; + h200) + SGLANG_MEM_FRACTION_STATIC=0.82 + SGLANG_CHUNKED_PREFILL_SIZE=16384 + ;; + b200) + SGLANG_MEM_FRACTION_STATIC=0.85 + SGLANG_CHUNKED_PREFILL_SIZE=32768 + ;; + esac + if [[ "$CONTEXT_BAND" == "500k" || "$CONTEXT_BAND" == "1m" ]]; then + SGLANG_MEM_FRACTION_STATIC=0.85 + SGLANG_CHUNKED_PREFILL_SIZE=8192 + fi + ;; +esac + +DATE_STAMP=$(date +%Y%m%d-%H%M%S) +SAFE_CONTEXT=${CONTEXT_BAND//[^[:alnum:]]/_} +SAFE_MODEL=${MODEL_KEY//[^[:alnum:]._-]/_} +SAFE_ENGINE=${ENGINE//[^[:alnum:]._-]/_} +SAFE_GPU=${GPU_TYPE//[^[:alnum:]._-]/_} +SAFE_WORKLOAD=${WORKLOAD//[^[:alnum:]._-]/_} +RUN_LABEL=${GMI_RUN_LABEL:-} +if [[ -n "$RUN_LABEL" ]]; then + RUN_LABEL="-${RUN_LABEL//[^[:alnum:]._-]/_}" +fi +RESULT_STEM="gmi-${SAFE_GPU}-${SAFE_MODEL}-${SAFE_ENGINE}-${SAFE_WORKLOAD}-${SAFE_CONTEXT}-${DATE_STAMP}${RUN_LABEL}" +RUN_DIR="$BENCHMARK_OUTPUT_ROOT/$RESULT_STEM" +SERVER_LOG="$RUN_DIR/server.log" +SUMMARY_JSON="$RUN_DIR/agg_${RESULT_STEM}.json" +TRACE_REPLAY_SUMMARY_JSON="$RUN_DIR/trace_replay_summary.json" +GPU_PROFILE_CSV="$RUN_DIR/${RESULT_STEM}_gpu_profile.csv" +GPU_PROFILER_PID="" +GPU_MEM_PEAK=0 +GPU_MEM_AVG=0 +GPU_UTIL_AVG=0 +mkdir -p "$RUN_DIR" +mkdir -p "$HOME/.cache/huggingface" + +CONTAINER_NAME="isb1-${RESULT_STEM}" +LOG_TAIL_PID="" +CONTAINER_ID="" +ISB1_RESULTS_DB_PATH=${ISB1_RESULTS_DB_PATH:-} + +stop_gpu_profiler() { + if [[ -n "$GPU_PROFILER_PID" ]]; then + kill "$GPU_PROFILER_PID" >/dev/null 2>&1 || true + wait "$GPU_PROFILER_PID" >/dev/null 2>&1 || true + GPU_PROFILER_PID="" + fi +} + +cleanup() { + local exit_code=$? 
+ set +e + stop_gpu_profiler + if [[ -n "$LOG_TAIL_PID" ]]; then + kill "$LOG_TAIL_PID" >/dev/null 2>&1 || true + fi + if [[ -n "$CONTAINER_NAME" ]]; then + docker rm -f "$CONTAINER_NAME" >/dev/null 2>&1 || true + fi + exit $exit_code +} +trap cleanup EXIT + +launch_server() { + # Apply YaRN for Qwen long-context + apply_yarn_config_if_needed "$MODEL_HF_ID" "$MAX_MODEL_LEN" 2>/dev/null || true + + local docker_cmd=() + docker_cmd=( + docker run -d --rm + --name "$CONTAINER_NAME" + --gpus all + --ipc host + --network host + --shm-size 16g + -e HF_TOKEN="$HF_TOKEN_VALUE" + -e HUGGING_FACE_HUB_TOKEN="$HF_TOKEN_VALUE" + -e NVIDIA_VISIBLE_DEVICES=all + -e PYTHONUNBUFFERED=1 + -v "$HOME/.cache/huggingface:/root/.cache/huggingface" + -v "$REPO_ROOT:/workspace" + -w /workspace + ) + + if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then + docker_cmd+=(-e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1) + docker_cmd+=(-e SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1) + fi + + if [[ "$ENGINE" == "vllm" ]]; then + local cmd=( + vllm serve "$MODEL_HF_ID" + --host 0.0.0.0 + --port "$PORT" + --tensor-parallel-size "$TP" + --gpu-memory-utilization 0.90 + --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" + --max-model-len "$MAX_MODEL_LEN" + --max-num-seqs "$VLLM_MAX_NUM_SEQS" + --disable-log-requests + --trust-remote-code + ) + + case "${OFFLOAD_MODE:-}" in + on) + cmd+=( + --kv_offloading_backend native + --kv_offloading_size "$TOTAL_CPU_DRAM_GB" + --disable-hybrid-kv-cache-manager + ) + ;; + off) + ;; + noprefix) + cmd+=(--no-enable-prefix-caching) + ;; + legacy|"") + if [[ -n "$VLLM_CPU_OFFLOAD_GB" ]]; then + cmd+=(--cpu-offload-gb "$VLLM_CPU_OFFLOAD_GB") + fi + if [[ -n "$VLLM_SWAP_SPACE_GB" ]]; then + cmd+=(--swap-space "$VLLM_SWAP_SPACE_GB") + fi + ;; + esac + + if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then + cmd+=(--no-enable-prefix-caching) + fi + + if [[ "${KV_CACHE_DTYPE:-}" == "fp8" ]]; then + cmd+=(--kv-cache-dtype fp8) + fi + + if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then + cmd+=(--hf-overrides "$YARN_OVERRIDE_JSON") + fi + + CONTAINER_ID=$("${docker_cmd[@]}" "$IMAGE" bash -lc "$(printf '%q ' "${cmd[@]}")") + else + local cmd=( + python3 -m sglang.launch_server + --model-path "$MODEL_HF_ID" + --host 0.0.0.0 + --port "$PORT" + --trust-remote-code + --tensor-parallel-size "$TP" + --data-parallel-size 1 + --context-length "$MAX_MODEL_LEN" + --max-running-requests "$MAX_ACTIVE_REQUESTS" + --cuda-graph-max-bs "$MAX_ACTIVE_REQUESTS" + --chunked-prefill-size "$SGLANG_CHUNKED_PREFILL_SIZE" + --max-prefill-tokens "$SGLANG_CHUNKED_PREFILL_SIZE" + --mem-fraction-static "$SGLANG_MEM_FRACTION_STATIC" + --attention-backend flashinfer + --stream-interval 10 + --decode-log-interval 1 + ) + + case "${OFFLOAD_MODE:-}" in + on) + echo "WARNING: OFFLOAD_MODE=on is not supported for SGLang; continuing without native offload" >&2 + ;; + noprefix) + cmd+=(--disable-radix-cache) + ;; + off|legacy|"") + ;; + esac + + if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then + cmd+=(--disable-radix-cache) + fi + + if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then + cmd+=(--json-model-override-args "$YARN_OVERRIDE_JSON") + fi + + CONTAINER_ID=$("${docker_cmd[@]}" "$IMAGE" bash -lc "$(printf '%q ' "${cmd[@]}")") + fi + + [[ -n "$CONTAINER_ID" ]] || die "Failed to start Docker container" + docker logs -f "$CONTAINER_NAME" > "$SERVER_LOG" 2>&1 & + LOG_TAIL_PID=$! +} + +wait_for_server_ready() { + local deadline=$((SECONDS + HEALTH_TIMEOUT_S)) + until curl --output /dev/null --silent --fail "http://127.0.0.1:${PORT}/health"; do + if ! 
docker ps --format '{{.Names}}' | grep -Fxq "$CONTAINER_NAME"; then + echo "Container exited before becoming healthy. Recent logs:" >&2 + docker logs "$CONTAINER_NAME" >&2 || true + return 1 + fi + if (( SECONDS >= deadline )); then + echo "Timed out waiting for http://127.0.0.1:${PORT}/health" >&2 + docker logs "$CONTAINER_NAME" | tail -n 200 >&2 || true + return 1 + fi + sleep "$HEALTH_POLL_INTERVAL_S" + done +} + +echo "==> GMI portable benchmark" +echo "repo: $REPO_ROOT" +echo "gpu-type: $GPU_TYPE" +echo "model: $MODEL_KEY ($MODEL_HF_ID)" +echo "engine: $ENGINE" +echo "context-band: $CONTEXT_BAND" +echo "workload: $WORKLOAD" +echo "benchmark-type: $BENCHMARK_TYPE" +echo "trace-source: $TRACE_SOURCE" +echo "max-concurrency: $MAX_CONCURRENCY" +echo "max-model-len: $MAX_MODEL_LEN" +echo "docker image: $IMAGE" +echo "export-file: $EXPORT_FILE" +if [[ "$TRACE_SOURCE" != "isb1" ]]; then + echo "trace-dir: $TRACE_DIR" +fi +echo "runtime-stack-id: $RUNTIME_STACK_ID" +echo "hardware-profile-id: $HARDWARE_PROFILE_ID" +echo "canonical-model-id: $CANONICAL_MODEL_ID" +echo "support-status: ${SUPPORT_STATUS:-}" +echo "certification: ${BENCHMARK_CERTIFICATION_STATUS:-}" +echo "matched export cells: $MATCHED_CELL_COUNT" +echo "trace-ids: ${TRACE_IDS:-}" +echo "output dir: $RUN_DIR" +echo "offload-mode: ${OFFLOAD_MODE:-legacy}" +echo "kv-cache-dtype: ${KV_CACHE_DTYPE:-auto}" +echo "disable-prefix-cache: $DISABLE_PREFIX_CACHING" +echo "total-cpu-dram-gb: $TOTAL_CPU_DRAM_GB" +if [[ "$ENGINE" == "vllm" ]]; then + echo "vllm cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB:-0}" + echo "vllm swap-space-gb: ${VLLM_SWAP_SPACE_GB:-0}" +else + echo "sglang mem fraction: $SGLANG_MEM_FRACTION_STATIC" + echo "sglang chunked pf: $SGLANG_CHUNKED_PREFILL_SIZE" +fi + +"$SCRIPT_DIR/gpu_profile_collector.sh" --output "$GPU_PROFILE_CSV" --interval 2 & +GPU_PROFILER_PID=$! 
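+# The profiler launched above samples nvidia-smi every 2s into "$GPU_PROFILE_CSV", +# one CSV row per GPU under the header +#   timestamp,gpu_bus_id,gpu_util_pct,mem_util_pct,mem_used_mb,mem_total_mb,temp_c,power_w +# After the replay, mem_used_mb is reduced to GPU_MEM_PEAK/GPU_MEM_AVG (GB, i.e. MB/1024) +# and gpu_util_pct to GPU_UTIL_AVG for DB ingest. Illustrative row (hypothetical values): +#   2026/04/14 19:20:01.123, 00000000:0F:00.0, 97, 54, 140512, 183359, 61, 612.45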
+ +launch_server +wait_for_server_ready + +if [[ "$TRACE_SOURCE" == "isb1" ]]; then + echo "==> Server is healthy; starting export replay" + + benchmark_cmd=( + python3 "$REPO_ROOT/utils/bench_serving/benchmark_export_replay.py" + --model "$MODEL_HF_ID" + --base-url "http://127.0.0.1:${PORT}" + --export-file "$EXPORT_PATH" + --request-mode "$HARNESS_REQUEST_MODE" + --max-concurrency "$MAX_CONCURRENCY" + --num-warmup-sessions "$NUM_WARMUP_SESSIONS" + --save-result + --result-dir "$RUN_DIR" + --result-filename "$RESULT_STEM.json" + --runtime-stack-id "$RUNTIME_STACK_ID" + --hardware-profile-id "$HARDWARE_PROFILE_ID" + --canonical-model-id "$CANONICAL_MODEL_ID" + --metadata "benchmark_type=$BENCHMARK_TYPE" + --metadata "export_file=$EXPORT_FILE" + --metadata "runtime_stack_id=$RUNTIME_STACK_ID" + --metadata "hardware_profile_id=$HARDWARE_PROFILE_ID" + --metadata "canonical_model_id=$CANONICAL_MODEL_ID" + --metadata "request_mode=$REQUEST_MODE" + --metadata "gmi_gpu_type=$GPU_TYPE" + --metadata "gmi_engine=$ENGINE" + --metadata "gmi_context_band=$CONTEXT_BAND" + --metadata "gmi_workload=$WORKLOAD" + --trust-remote-code + ) + if [[ -n "$BENCHMARK_DURATION_S" ]]; then + benchmark_cmd+=(--metadata "benchmark_duration_s=$BENCHMARK_DURATION_S") + fi + if [[ "$BENCHMARK_TYPE" == "isb1_kv_stress" ]]; then + benchmark_cmd+=(--metadata "campaign_class=kv_stress") + fi + if [[ -n "$SUPPORT_STATUS" ]]; then + benchmark_cmd+=(--support-status "$SUPPORT_STATUS") + fi + if [[ -n "$MAX_SESSIONS" ]]; then + benchmark_cmd+=(--max-sessions "$MAX_SESSIONS") + fi + if [[ -n "$MAX_TURNS_PER_SESSION" ]]; then + benchmark_cmd+=(--max-turns-per-session "$MAX_TURNS_PER_SESSION") + fi + if [[ "$IGNORE_WAITS" == "true" ]]; then + benchmark_cmd+=(--ignore-waits) + fi + if [[ "$ENGINE" == "vllm" ]]; then + if [[ -n "$VLLM_CPU_OFFLOAD_GB" ]]; then + benchmark_cmd+=(--metadata "vllm_cpu_offload_gb=$VLLM_CPU_OFFLOAD_GB") + fi + if [[ -n "$VLLM_SWAP_SPACE_GB" ]]; then + benchmark_cmd+=(--metadata "vllm_swap_space_gb=$VLLM_SWAP_SPACE_GB") + fi + else + benchmark_cmd+=(--metadata "sglang_mem_fraction_override=$SGLANG_MEM_FRACTION_STATIC") + benchmark_cmd+=(--metadata "sglang_chunked_prefill_override=$SGLANG_CHUNKED_PREFILL_SIZE") + fi + + "${benchmark_cmd[@]}" +else + echo "==> Server is healthy; starting trace replay ($TRACE_SOURCE)" + + trace_cmd=( + python3 "$REPO_ROOT/experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py" + --api-endpoint "http://localhost:$PORT" + --trace-directory "$TRACE_DIR" + --output-dir "$RUN_DIR" + --start-users "$MAX_CONCURRENCY" + --max-users "$MAX_CONCURRENCY" + --test-duration "${BENCHMARK_DURATION_S:-1800}" + --seed 42 + --no-color + ) + + "${trace_cmd[@]}" + + python3 "$SCRIPT_DIR/adapt_trace_replay_result.py" \ + --input-dir "$RUN_DIR" \ + --detailed-csv detailed_results.csv \ + --summary-json "$TRACE_REPLAY_SUMMARY_JSON" \ + --output-json "$RUN_DIR/${RESULT_STEM}.json" \ + --model-id "$MODEL_HF_ID" \ + --max-concurrency "$MAX_CONCURRENCY" \ + --request-mode "$REQUEST_MODE" \ + --support-status "$SUPPORT_STATUS" \ + --benchmark-certification-status "$BENCHMARK_CERTIFICATION_STATUS" \ + --result-stem "$RESULT_STEM" +fi + +echo "==> Processing ISB1 result" +( + cd "$RUN_DIR" + export RUNNER_TYPE="$RUNNER_TYPE" + export FRAMEWORK="$ENGINE" + export PRECISION="$PRECISION" + export RESULT_FILENAME="$RESULT_STEM" + export MODEL_PREFIX="$MODEL_PREFIX" + export IMAGE="$IMAGE" + export TP="$TP" + export EP_SIZE=1 + export DP_ATTENTION=false + export 
BENCHMARK_TYPE="$BENCHMARK_TYPE" + export EXPORT_FILE="$EXPORT_FILE" + export RUNTIME_STACK_ID="$RUNTIME_STACK_ID" + export HARDWARE_PROFILE_ID="$HARDWARE_PROFILE_ID" + export CANONICAL_MODEL_ID="$CANONICAL_MODEL_ID" + export REQUEST_MODE="$REQUEST_MODE" + export TRACE_SOURCE="$TRACE_SOURCE" + export WORKLOAD_TYPE="$WORKLOAD" + export MAX_CONCURRENCY="$MAX_CONCURRENCY" + export IGNORE_WAITS="$IGNORE_WAITS" + export DISPATCH_REF="manual:gmi-portable" + export MAX_MODEL_LEN="$MAX_MODEL_LEN" + export OFFLOAD_MODE="${OFFLOAD_MODE:-}" + export KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-}" + export DISABLE_PREFIX_CACHING="$DISABLE_PREFIX_CACHING" + if [[ -n "$BENCHMARK_DURATION_S" ]]; then + export BENCHMARK_DURATION_S="$BENCHMARK_DURATION_S" + fi + if [[ -n "$SUPPORT_STATUS" ]]; then + export SUPPORT_STATUS="$SUPPORT_STATUS" + fi + if [[ -n "$VLLM_CPU_OFFLOAD_GB" ]]; then + export VLLM_CPU_OFFLOAD_GB="$VLLM_CPU_OFFLOAD_GB" + fi + if [[ -n "$VLLM_SWAP_SPACE_GB" ]]; then + export VLLM_SWAP_SPACE_GB="$VLLM_SWAP_SPACE_GB" + fi + if [[ -n "${SGLANG_MEM_FRACTION_STATIC:-}" ]]; then + export SGLANG_MEM_FRACTION_OVERRIDE="$SGLANG_MEM_FRACTION_STATIC" + fi + if [[ -n "${SGLANG_CHUNKED_PREFILL_SIZE:-}" ]]; then + export SGLANG_CHUNKED_PREFILL_OVERRIDE="$SGLANG_CHUNKED_PREFILL_SIZE" + fi + python3 "$REPO_ROOT/utils/process_result_isb1.py" | tee "$SUMMARY_JSON" +) + +stop_gpu_profiler + +if [[ -f "$GPU_PROFILE_CSV" ]]; then + GPU_STATS=$(python3 - "$GPU_PROFILE_CSV" <<'PY' +import csv +import sys + +with open(sys.argv[1], newline="") as handle: + rows = list(csv.DictReader(handle)) + +if rows: + mems = [float(row.get("mem_used_mb", "0") or 0) for row in rows] + utils = [float(row.get("gpu_util_pct", "0") or 0) for row in rows] + print(f"{max(mems) / 1024:.2f} {sum(mems) / len(mems) / 1024:.2f} {sum(utils) / len(utils):.1f}") +else: + print("0 0 0") +PY + 2>/dev/null) || GPU_STATS="0 0 0" + read -r GPU_MEM_PEAK GPU_MEM_AVG GPU_UTIL_AVG <<< "$GPU_STATS" +fi + +if [[ "$BENCHMARK_TYPE" == "isb1_kv_stress" ]]; then + CAMPAIGN_METADATA_JSON="$RUN_DIR/kv_stress_campaign_metadata.json" + python3 - \ + "$CAMPAIGN_METADATA_JSON" \ + "$BENCHMARK_TYPE" \ + "$WORKLOAD" \ + "$MAX_CONCURRENCY" \ + "${OFFLOAD_MODE:-}" \ + "${KV_CACHE_DTYPE:-}" \ + "$DISABLE_PREFIX_CACHING" \ + "${BENCHMARK_DURATION_S:-}" <<'PY' +import json +import sys + +payload = { + "benchmark_type": sys.argv[2], + "campaign_class": "kv_stress", + "workload_type": sys.argv[3], + "max_concurrency": sys.argv[4], + "offload_mode": sys.argv[5] or None, + "kv_cache_dtype": sys.argv[6] or None, + "disable_prefix_caching": sys.argv[7], + "benchmark_duration_s": sys.argv[8] or None, +} +with open(sys.argv[1], "w", encoding="utf-8") as f: + json.dump(payload, f, indent=2, sort_keys=True) +PY +fi + +if [[ -f "$SUMMARY_JSON" ]] && command -v python3 >/dev/null 2>&1; then + db_ingest_cmd=( + python3 "$SCRIPT_DIR/isb1_results_db.py" ingest "$SUMMARY_JSON" + --gpu-type "$GPU_TYPE" + --model "$MODEL_KEY" + --engine "$ENGINE" + --context-band "$CONTEXT_BAND" + --workload-type "$WORKLOAD" + --trace-source "$TRACE_SOURCE" + --max-model-len "$MAX_MODEL_LEN" + --tp "$TP" + --gpu-mem-peak-gb "${GPU_MEM_PEAK:-0}" + --gpu-mem-avg-gb "${GPU_MEM_AVG:-0}" + --gpu-util-avg-pct "${GPU_UTIL_AVG:-0}" + --gpu-profile-csv "$GPU_PROFILE_CSV" + ) + if [[ -n "$ISB1_RESULTS_DB_PATH" ]]; then + db_ingest_cmd+=(--db-path "$ISB1_RESULTS_DB_PATH") + fi + if [[ -n "${OFFLOAD_MODE:-}" ]]; then + db_ingest_cmd+=(--offload-mode "$OFFLOAD_MODE") + fi + if [[ -n "${KV_CACHE_DTYPE:-}" ]]; then + 
db_ingest_cmd+=(--kv-cache-dtype "$KV_CACHE_DTYPE") + fi + if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then + db_ingest_cmd+=(--disable-prefix-caching 1) + fi + if [[ -n "$BENCHMARK_DURATION_S" ]]; then + db_ingest_cmd+=(--benchmark-duration-s "$BENCHMARK_DURATION_S") + fi + if [[ "$BENCHMARK_TYPE" == "isb1_kv_stress" ]]; then + db_ingest_cmd+=(--campaign-class kv_stress) + fi + if [[ "$ENGINE" == "vllm" ]]; then + if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + db_ingest_cmd+=(--vllm-cpu-offload-gb "$VLLM_CPU_OFFLOAD_GB") + fi + if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + db_ingest_cmd+=(--vllm-swap-space-gb "$VLLM_SWAP_SPACE_GB") + fi + else + db_ingest_cmd+=(--sglang-mem-fraction "$SGLANG_MEM_FRACTION_STATIC") + db_ingest_cmd+=(--sglang-chunked-prefill "$SGLANG_CHUNKED_PREFILL_SIZE") + fi + "${db_ingest_cmd[@]}" 2>/dev/null || echo "WARNING: DB ingest failed" >&2 +fi + +python3 - "$SUMMARY_JSON" <<'PY' +import json +import sys +from pathlib import Path + +summary = json.loads(Path(sys.argv[1]).read_text()) +print("==> Summary") +for key, value in [ + ("result_filename", summary.get("result_filename")), + ("support_status", summary.get("support_status")), + ("benchmark_certification_status", summary.get("benchmark_certification_status")), + ("completed_sessions", f"{summary.get('completed_sessions')}/{summary.get('total_sessions')}"), + ("effective_max_context_depth", summary.get("effective_max_context_depth")), + ("context_pressure_class", summary.get("context_pressure_class")), + ("context_pressure_signal", summary.get("context_pressure_signal", {}).get("status")), + ("depth_coverage_ratio", summary.get("depth_coverage_ratio")), + ("depth_coverage_class", summary.get("depth_coverage_class")), + ("max_actual_context_len", summary.get("max_actual_context_len_per_turn")), + ("preemption_count", summary.get("preemption_count")), + ("session_throughput_sps", summary.get("session_throughput_sps")), + ("tput_per_gpu", summary.get("tput_per_gpu")), + ("output_tput_per_gpu", summary.get("output_tput_per_gpu")), + ("mean_ttft_s", summary.get("mean_ttft")), + ("p99_ttft_s", summary.get("p99_ttft")), + ("server_logs", Path(sys.argv[1]).with_name("server.log")), + ("raw_replay_result", Path(sys.argv[1]).with_name(summary.get("result_filename", "run") + ".json")), + ("processed_result", Path(sys.argv[1])), +]: + print(f" {key}: {value}") +PY diff --git a/datasets/isb1/scripts/gmi_test_matrix.sh b/datasets/isb1/scripts/gmi_test_matrix.sh new file mode 100755 index 000000000..5deadb072 --- /dev/null +++ b/datasets/isb1/scripts/gmi_test_matrix.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +usage() { + cat <<'EOF' +Usage: + gmi_test_matrix.sh --gpu-type <h100|h200|b200> + +Runs a curated GMI Cloud matrix: + - Qwen3.5 × vllm × 8k/131k/500k/1m + - Qwen3.5 × sglang × 500k + - GPT-OSS × vllm × 131k (chat+code) + - GPT-OSS × vllm × 500k + - DSR1 × sglang × 131k (chat+code) +EOF +} + +GPU_TYPE="" +while [[ $# -gt 0 ]]; do + case "$1" in + --gpu-type) + GPU_TYPE="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage >&2 + exit 1 + ;; + esac +done + +[[ -n "$GPU_TYPE" ]] || { + usage >&2 + exit 1 +} + +case "$GPU_TYPE" in + h100|h200|b200) ;; + *) + echo "Unsupported --gpu-type: $GPU_TYPE" >&2 + exit 1 + ;; +esac + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +PORTABLE_SCRIPT="$SCRIPT_DIR/gmi_portable_benchmark.sh" +[[ -x "$PORTABLE_SCRIPT" ]] || { + echo "Expected executable helper at $PORTABLE_SCRIPT" >&2 + exit 1 +} + +run_case() { + local model="$1" + local engine="$2" + local
context_band="$3" + local workload="${4:-code}" + + echo + echo "============================================================" + echo "Running: gpu=${GPU_TYPE} model=${model} engine=${engine} context=${context_band} workload=${workload}" + echo "============================================================" + + "$PORTABLE_SCRIPT" \ + --gpu-type "$GPU_TYPE" \ + --model "$model" \ + --engine "$engine" \ + --context-band "$context_band" \ + --workload "$workload" +} + +run_case qwen3.5 vllm 8k chat +run_case qwen3.5 vllm 131k code +run_case qwen3.5 vllm 500k code +run_case qwen3.5 sglang 500k chat +run_case gptoss vllm 131k code +run_case gptoss vllm 131k chat +run_case gptoss vllm 500k chat +run_case dsr1 sglang 131k code +run_case dsr1 sglang 131k chat +run_case qwen3.5 vllm 1m code + +echo +echo "Curated GMI test matrix completed successfully." diff --git a/datasets/isb1/scripts/gpu_profile_collector.sh b/datasets/isb1/scripts/gpu_profile_collector.sh new file mode 100755 index 000000000..4ba03f223 --- /dev/null +++ b/datasets/isb1/scripts/gpu_profile_collector.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +# Usage: gpu_profile_collector.sh --output /tmp/gpu.csv [--interval 2] +# Runs nvidia-smi polling until killed (SIGTERM/SIGINT) + +OUTPUT="" +INTERVAL=2 + +while [[ $# -gt 0 ]]; do + case "$1" in + --output) + OUTPUT="$2" + shift 2 + ;; + --interval) + INTERVAL="$2" + shift 2 + ;; + *) + echo "Unknown arg: $1" >&2 + exit 1 + ;; + esac +done + +[[ -n "$OUTPUT" ]] || { + echo "ERROR: --output required" >&2 + exit 1 +} + +mkdir -p "$(dirname "$OUTPUT")" +echo "timestamp,gpu_bus_id,gpu_util_pct,mem_util_pct,mem_used_mb,mem_total_mb,temp_c,power_w" > "$OUTPUT" + +trap 'exit 0' SIGTERM SIGINT + +while true; do + nvidia-smi \ + --query-gpu=timestamp,gpu_bus_id,utilization.gpu,utilization.memory,memory.used,memory.total,temperature.gpu,power.draw \ + --format=csv,noheader,nounits >> "$OUTPUT" 2>/dev/null || true + sleep "$INTERVAL" +done diff --git a/datasets/isb1/scripts/isb1_results_db.py b/datasets/isb1/scripts/isb1_results_db.py new file mode 100644 index 000000000..e052fa766 --- /dev/null +++ b/datasets/isb1/scripts/isb1_results_db.py @@ -0,0 +1,816 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import csv +import json +import sqlite3 +import sys +import uuid +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Iterable, Sequence + +SCRIPT_DIR = Path(__file__).resolve().parent +REPO_ROOT = SCRIPT_DIR.parent.parent.parent +DEFAULT_DB_PATH = REPO_ROOT / "datasets/isb1/results/isb1_results.db" +TABLE_NAME = "benchmark_runs" + +SCHEMA_SQL = f""" +CREATE TABLE IF NOT EXISTS {TABLE_NAME} ( + id INTEGER PRIMARY KEY, + run_id TEXT, + timestamp TEXT, + gpu_type TEXT, + model TEXT, + engine TEXT, + context_band TEXT, + workload_type TEXT, + max_model_len INTEGER, + tp INTEGER, + vllm_cpu_offload_gb REAL, + vllm_swap_space_gb REAL, + sglang_mem_fraction REAL, + sglang_chunked_prefill INTEGER, + ttft_p50_ms REAL, + ttft_p99_ms REAL, + tpot_p50_ms REAL, + tpot_p99_ms REAL, + throughput_tok_s REAL, + total_sessions INTEGER, + completed_sessions INTEGER, + total_turns INTEGER, + completed_turns INTEGER, + preemption_count INTEGER, + gpu_mem_peak_gb REAL, + gpu_mem_avg_gb REAL, + gpu_util_avg_pct REAL, + kv_cache_usage_pct REAL, + server_startup_s REAL, + benchmark_duration_s REAL, + campaign_class TEXT, + trace_source TEXT, + total_actual_input_tokens INTEGER, + max_actual_context_len INTEGER, + 
depth_coverage_ratio REAL, + depth_coverage_class TEXT, + producer_estimated_kv_bytes_peak INTEGER, + producer_expected_offload_mode TEXT, + offload_mode_match INTEGER, + offload_mode TEXT, + kv_cache_dtype TEXT, + disable_prefix_caching INTEGER, + cpu_cache_usage_peak_pct REAL, + raw_result_json TEXT, + status TEXT, + error_message TEXT +) +""" + +INSERT_COLUMNS = [ + "run_id", + "timestamp", + "gpu_type", + "model", + "engine", + "context_band", + "workload_type", + "max_model_len", + "tp", + "vllm_cpu_offload_gb", + "vllm_swap_space_gb", + "sglang_mem_fraction", + "sglang_chunked_prefill", + "ttft_p50_ms", + "ttft_p99_ms", + "tpot_p50_ms", + "tpot_p99_ms", + "throughput_tok_s", + "total_sessions", + "completed_sessions", + "total_turns", + "completed_turns", + "preemption_count", + "gpu_mem_peak_gb", + "gpu_mem_avg_gb", + "gpu_util_avg_pct", + "kv_cache_usage_pct", + "server_startup_s", + "benchmark_duration_s", + "campaign_class", + "trace_source", + "total_actual_input_tokens", + "max_actual_context_len", + "depth_coverage_ratio", + "depth_coverage_class", + "producer_estimated_kv_bytes_peak", + "producer_expected_offload_mode", + "offload_mode_match", + "offload_mode", + "kv_cache_dtype", + "disable_prefix_caching", + "cpu_cache_usage_peak_pct", + "raw_result_json", + "status", + "error_message", +] + +GROUPABLE_COLUMNS = { + "gpu_type", + "model", + "engine", + "context_band", + "workload_type", + "status", + "tp", + "max_model_len", + "depth_coverage_class", + "offload_mode", + "campaign_class", + "trace_source", +} + +DEFAULT_QUERY_COLUMNS = [ + "timestamp", + "gpu_type", + "model", + "engine", + "context_band", + "workload_type", + "status", + "ttft_p50_ms", + "ttft_p99_ms", + "throughput_tok_s", + "gpu_mem_peak_gb", + "gpu_util_avg_pct", + "preemption_count", + "depth_coverage_ratio", + "max_actual_context_len", + "depth_coverage_class", + "run_id", +] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Store and analyze ISB1 benchmark runs in SQLite.") + subparsers = parser.add_subparsers(dest="command", required=True) + + ingest = subparsers.add_parser("ingest", help="Read a processed ISB1 JSON file and insert a benchmark run.") + ingest.add_argument("json_file", help="Path to utils/process_result_isb1.py output JSON.") + ingest.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") + ingest.add_argument("--gpu-type", required=True, choices=["h100", "h200", "b200"]) + ingest.add_argument("--model", required=True, choices=["qwen3.5", "gptoss", "dsr1"]) + ingest.add_argument("--engine", required=True, choices=["vllm", "sglang"]) + ingest.add_argument("--context-band", required=True, choices=["8k", "32k", "64k", "131k", "500k", "1m"]) + ingest.add_argument("--workload-type", choices=["chat", "code"], help="Workload type (chat or code)") + ingest.add_argument("--run-id", help="Optional run UUID. Generated if omitted.") + ingest.add_argument("--timestamp", help="Optional ISO-8601 timestamp. 
Uses current UTC time if omitted.") + ingest.add_argument("--max-model-len", type=int) + ingest.add_argument("--tp", type=int) + ingest.add_argument("--vllm-cpu-offload-gb", type=float) + ingest.add_argument("--vllm-swap-space-gb", type=float) + ingest.add_argument("--sglang-mem-fraction", type=float) + ingest.add_argument("--sglang-chunked-prefill", type=int) + ingest.add_argument("--ttft-p50-ms", type=float) + ingest.add_argument("--ttft-p99-ms", type=float) + ingest.add_argument("--tpot-p50-ms", type=float) + ingest.add_argument("--tpot-p99-ms", type=float) + ingest.add_argument("--throughput-tok-s", type=float) + ingest.add_argument("--total-sessions", type=int) + ingest.add_argument("--completed-sessions", type=int) + ingest.add_argument("--total-turns", type=int) + ingest.add_argument("--completed-turns", type=int) + ingest.add_argument("--preemption-count", type=int) + ingest.add_argument("--gpu-mem-peak-gb", type=float) + ingest.add_argument("--gpu-mem-avg-gb", type=float) + ingest.add_argument("--gpu-util-avg-pct", type=float) + ingest.add_argument("--kv-cache-usage-pct", type=float) + ingest.add_argument("--server-startup-s", type=float) + ingest.add_argument("--benchmark-duration-s", type=float) + ingest.add_argument("--campaign-class") + ingest.add_argument("--trace-source", choices=["isb1", "kv_cache_tester", "aiperf"]) + ingest.add_argument("--offload-mode", choices=["on", "off", "noprefix", "legacy"]) + ingest.add_argument("--kv-cache-dtype", choices=["auto", "fp8"]) + ingest.add_argument("--disable-prefix-caching", type=int, choices=[0, 1]) + ingest.add_argument("--gpu-profile-csv", help="Optional GPU profile CSV path to stash in raw_result_json metadata.") + ingest.add_argument("--status", default="success", choices=["success", "failed", "timeout"]) + ingest.add_argument("--error-message") + + query = subparsers.add_parser("query", help="Print runs or an aggregated grouped view.") + query.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") + query.add_argument("--group-by", help="Comma-separated columns to group by, for example gpu_type,context_band.") + + export_csv = subparsers.add_parser("export-csv", help="Export all benchmark rows to CSV.") + export_csv.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") + export_csv.add_argument("--output", help="Destination CSV path. 
Defaults to stdout.") + + summary = subparsers.add_parser("summary", help="Print a concise findings summary.") + summary.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") + + return parser.parse_args() + + +_MIGRATIONS = [ + f"ALTER TABLE {TABLE_NAME} ADD COLUMN total_actual_input_tokens INTEGER", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN max_actual_context_len INTEGER", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN depth_coverage_ratio REAL", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN depth_coverage_class TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN producer_estimated_kv_bytes_peak INTEGER", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN producer_expected_offload_mode TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN offload_mode_match INTEGER", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN offload_mode TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN kv_cache_dtype TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN disable_prefix_caching INTEGER", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN cpu_cache_usage_peak_pct REAL", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN workload_type TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN campaign_class TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN trace_source TEXT", +] + + +def ensure_db(conn: sqlite3.Connection) -> None: + conn.execute(SCHEMA_SQL) + conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{TABLE_NAME}_run_id ON {TABLE_NAME}(run_id)") + conn.execute( + f"CREATE INDEX IF NOT EXISTS idx_{TABLE_NAME}_grouping " + f"ON {TABLE_NAME}(gpu_type, model, engine, context_band, status)" + ) + # Idempotent migrations for existing databases + for migration_sql in _MIGRATIONS: + try: + conn.execute(migration_sql) + except sqlite3.OperationalError: + pass # Column already exists + conn.commit() + + +def connect_db(db_path: str | Path) -> sqlite3.Connection: + db_path = Path(db_path) + db_path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + ensure_db(conn) + return conn + + +def utc_now_iso() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat() + + +def to_float(value: Any) -> float | None: + if value in (None, ""): + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def to_int(value: Any) -> int | None: + if value in (None, ""): + return None + try: + return int(float(value)) + except (TypeError, ValueError): + return None + + +def seconds_to_ms(value: Any) -> float | None: + parsed = to_float(value) + return None if parsed is None else parsed * 1000.0 + + +def choose(*values: Any) -> Any: + for value in values: + if value not in (None, ""): + return value + return None + + +def load_payload(path: str | Path) -> dict[str, Any]: + payload = json.loads(Path(path).read_text()) + if not isinstance(payload, dict): + raise SystemExit(f"Expected a JSON object in {path}") + return payload + + +def derive_total_turns(payload: dict[str, Any], total_sessions: int | None) -> int | None: + max_turns = to_int(payload.get("max_turns")) + if max_turns is not None and total_sessions is not None: + return max_turns * total_sessions + per_turn_metrics = payload.get("per_turn_metrics") or {} + if isinstance(per_turn_metrics, dict) and total_sessions is not None: + return len(per_turn_metrics) * total_sessions + return None + + +def derive_completed_turns(payload: dict[str, Any]) -> int | None: + per_turn_metrics = payload.get("per_turn_metrics") or {} + if not isinstance(per_turn_metrics, dict): + return None + completed = 0 + saw_value = False + for 
turn_metrics in per_turn_metrics.values(): + if not isinstance(turn_metrics, dict): + continue + value = to_int(turn_metrics.get("completed")) + if value is None: + continue + completed += value + saw_value = True + return completed if saw_value else None + + +def build_raw_payload(payload: dict[str, Any], args: argparse.Namespace) -> dict[str, Any]: + enriched = dict(payload) + metadata = { + "source_json": str(Path(args.json_file).resolve()), + "db_path": str(Path(args.db_path).resolve()), + } + if args.gpu_profile_csv: + metadata["gpu_profile_csv"] = str(Path(args.gpu_profile_csv).resolve()) + if args.status != "success": + metadata["status_override"] = args.status + if args.error_message: + metadata["error_message"] = args.error_message + enriched["_isb1_results_db"] = metadata + return enriched + + +def insert_run(args: argparse.Namespace) -> None: + payload = load_payload(args.json_file) + aggregate = payload.get("aggregate_metrics") or {} + runtime_overrides = payload.get("runtime_overrides") or {} + server_metrics_summary = payload.get("server_metrics_summary") or {} + + total_sessions = to_int(choose(args.total_sessions, payload.get("total_sessions"), aggregate.get("total_sessions"))) + completed_sessions = to_int( + choose(args.completed_sessions, payload.get("completed_sessions"), aggregate.get("completed_sessions")) + ) + + gpu_cache_peak = to_float(server_metrics_summary.get("gpu_cache_usage_peak")) + if gpu_cache_peak is None: + gpu_cache_peak = to_float(payload.get("peak_gpu_cache_usage")) + + row = { + "run_id": args.run_id or str(uuid.uuid4()), + "timestamp": args.timestamp or utc_now_iso(), + "gpu_type": args.gpu_type, + "model": args.model, + "engine": args.engine, + "context_band": args.context_band, + "workload_type": choose( + getattr(args, 'workload_type', None), + payload.get("benchmark_surface"), + ), + "max_model_len": to_int(choose(args.max_model_len, payload.get("max_model_len"))), + "tp": to_int(choose(args.tp, payload.get("tp"))), + "vllm_cpu_offload_gb": to_float( + choose( + args.vllm_cpu_offload_gb, + runtime_overrides.get("vllm_cpu_offload_gb"), + payload.get("vllm_cpu_offload_gb"), + ) + ), + "vllm_swap_space_gb": to_float( + choose( + args.vllm_swap_space_gb, + runtime_overrides.get("vllm_swap_space_gb"), + payload.get("vllm_swap_space_gb"), + ) + ), + "sglang_mem_fraction": to_float( + choose( + args.sglang_mem_fraction, + runtime_overrides.get("sglang_mem_fraction_override"), + payload.get("sglang_mem_fraction_override"), + ) + ), + "sglang_chunked_prefill": to_int( + choose( + args.sglang_chunked_prefill, + runtime_overrides.get("sglang_chunked_prefill_override"), + payload.get("sglang_chunked_prefill_override"), + ) + ), + "ttft_p50_ms": to_float( + choose(args.ttft_p50_ms, aggregate.get("median_ttft_ms"), seconds_to_ms(payload.get("median_ttft"))) + ), + "ttft_p99_ms": to_float( + choose(args.ttft_p99_ms, aggregate.get("p99_ttft_ms"), seconds_to_ms(payload.get("p99_ttft"))) + ), + "tpot_p50_ms": to_float( + choose(args.tpot_p50_ms, aggregate.get("median_tpot_ms"), seconds_to_ms(payload.get("median_tpot"))) + ), + "tpot_p99_ms": to_float( + choose(args.tpot_p99_ms, aggregate.get("p99_tpot_ms"), seconds_to_ms(payload.get("p99_tpot"))) + ), + "throughput_tok_s": to_float( + choose(args.throughput_tok_s, aggregate.get("total_token_throughput_tps"), payload.get("throughput_tok_s")) + ), + "total_sessions": total_sessions, + "completed_sessions": completed_sessions, + "total_turns": to_int(choose(args.total_turns, derive_total_turns(payload, 
total_sessions))), + "completed_turns": to_int(choose(args.completed_turns, derive_completed_turns(payload))), + "preemption_count": to_int(choose(args.preemption_count, payload.get("preemption_count"))), + "gpu_mem_peak_gb": to_float(choose(args.gpu_mem_peak_gb, payload.get("gpu_mem_peak_gb"))), + "gpu_mem_avg_gb": to_float(choose(args.gpu_mem_avg_gb, payload.get("gpu_mem_avg_gb"))), + "gpu_util_avg_pct": to_float(choose(args.gpu_util_avg_pct, payload.get("gpu_util_avg_pct"))), + "kv_cache_usage_pct": to_float( + choose(args.kv_cache_usage_pct, payload.get("kv_cache_usage_pct"), gpu_cache_peak * 100.0 if gpu_cache_peak is not None else None) + ), + "server_startup_s": to_float(choose(args.server_startup_s, payload.get("server_startup_s"))), + "benchmark_duration_s": to_float( + choose(args.benchmark_duration_s, payload.get("benchmark_duration_s"), aggregate.get("total_wall_time_s")) + ), + "campaign_class": choose( + getattr(args, 'campaign_class', None), + payload.get("campaign_class"), + ), + "trace_source": choose( + getattr(args, 'trace_source', None), + payload.get("trace_source"), + ), + "total_actual_input_tokens": to_int( + (payload.get("depth_telemetry") or {}).get("total_actual_input_tokens") + or payload.get("total_actual_input_tokens") + ), + "max_actual_context_len": to_int( + (payload.get("depth_telemetry") or {}).get("max_actual_context_len_per_turn") + or payload.get("max_actual_context_len_per_turn") + ), + "depth_coverage_ratio": to_float(payload.get("depth_coverage_ratio")), + "depth_coverage_class": payload.get("depth_coverage_class"), + "producer_estimated_kv_bytes_peak": to_int(payload.get("producer_estimated_kv_bytes_peak")), + "producer_expected_offload_mode": payload.get("producer_expected_offload_mode"), + "offload_mode_match": ( + 1 if payload.get("producer_expectation_validation", {}).get("offload_mode_match") is True + else 0 if payload.get("producer_expectation_validation", {}).get("offload_mode_match") is False + else None + ), + "offload_mode": choose(getattr(args, 'offload_mode', None), payload.get("offload_mode")), + "kv_cache_dtype": choose(getattr(args, 'kv_cache_dtype', None), payload.get("kv_cache_dtype")), + "disable_prefix_caching": to_int( + choose( + getattr(args, 'disable_prefix_caching', None), + payload.get("disable_prefix_caching"), + ) + ), + "cpu_cache_usage_peak_pct": to_float( + payload.get("peak_cpu_cache_usage", 0.0) * 100.0 + if payload.get("peak_cpu_cache_usage") is not None else None + ), + "raw_result_json": json.dumps(build_raw_payload(payload, args), sort_keys=True), + "status": args.status, + "error_message": choose(args.error_message, payload.get("error_message")), + } + + conn = connect_db(args.db_path) + placeholders = ", ".join("?" for _ in INSERT_COLUMNS) + sql = f"INSERT INTO {TABLE_NAME} ({', '.join(INSERT_COLUMNS)}) VALUES ({placeholders})" + conn.execute(sql, [row[column] for column in INSERT_COLUMNS]) + conn.commit() + conn.close() + + print( + f"Inserted run {row['run_id']} into {Path(args.db_path)} " + f"({row['gpu_type']} {row['model']} {row['engine']} {row['context_band']}, status={row['status']})." 
+ ) + + +def fetch_rows(conn: sqlite3.Connection, sql: str, params: Sequence[Any] = ()) -> list[sqlite3.Row]: + return list(conn.execute(sql, params)) + + +def stringify(value: Any) -> str: + if value is None: + return "" + if isinstance(value, float): + return f"{value:.2f}" + return str(value) + + +def render_table(headers: Sequence[str], rows: Iterable[Sequence[Any]]) -> str: + normalized_rows = [[stringify(value) for value in row] for row in rows] + widths = [len(header) for header in headers] + for row in normalized_rows: + for idx, value in enumerate(row): + widths[idx] = max(widths[idx], len(value)) + + def fmt_row(row: Sequence[str]) -> str: + return " | ".join(value.ljust(widths[idx]) for idx, value in enumerate(row)) + + divider = "-+-".join("-" * width for width in widths) + lines = [fmt_row(headers), divider] + for row in normalized_rows: + lines.append(fmt_row(row)) + return "\n".join(lines) + + +def print_query(args: argparse.Namespace) -> None: + conn = connect_db(args.db_path) + + if args.group_by: + group_columns = [column.strip() for column in args.group_by.split(",") if column.strip()] + if not group_columns: + raise SystemExit("--group-by requires at least one column") + invalid = [column for column in group_columns if column not in GROUPABLE_COLUMNS] + if invalid: + raise SystemExit( + f"Unsupported --group-by columns: {', '.join(invalid)}. " + f"Allowed: {', '.join(sorted(GROUPABLE_COLUMNS))}" + ) + + select_prefix = ", ".join(group_columns) + sql = f""" + SELECT + {select_prefix}, + COUNT(*) AS runs, + SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) AS success_runs, + SUM(CASE WHEN status != 'success' THEN 1 ELSE 0 END) AS non_success_runs, + ROUND(AVG(ttft_p50_ms), 2) AS avg_ttft_p50_ms, + ROUND(AVG(throughput_tok_s), 2) AS avg_throughput_tok_s, + ROUND(MAX(gpu_mem_peak_gb), 2) AS max_gpu_mem_peak_gb, + SUM(CASE WHEN COALESCE(preemption_count, 0) > 0 THEN 1 ELSE 0 END) AS preemption_runs + FROM {TABLE_NAME} + GROUP BY {select_prefix} + ORDER BY {select_prefix} + """ + rows = fetch_rows(conn, sql) + headers = group_columns + [ + "runs", + "success_runs", + "non_success_runs", + "avg_ttft_p50_ms", + "avg_throughput_tok_s", + "max_gpu_mem_peak_gb", + "preemption_runs", + ] + print(render_table(headers, ([row[header] for header in headers] for row in rows))) + else: + sql = f"SELECT {', '.join(DEFAULT_QUERY_COLUMNS)} FROM {TABLE_NAME} ORDER BY id DESC" + rows = fetch_rows(conn, sql) + print(render_table(DEFAULT_QUERY_COLUMNS, ([row[column] for column in DEFAULT_QUERY_COLUMNS] for row in rows))) + + conn.close() + + +def export_csv_rows(args: argparse.Namespace) -> None: + conn = connect_db(args.db_path) + rows = fetch_rows(conn, f"SELECT * FROM {TABLE_NAME} ORDER BY id ASC") + headers = [description[0] for description in conn.execute(f"SELECT * FROM {TABLE_NAME} LIMIT 0").description] + + if args.output: + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + handle = output_path.open("w", newline="") + else: + handle = sys.stdout + + try: + writer = csv.writer(handle) + writer.writerow(headers) + for row in rows: + writer.writerow([row[header] for header in headers]) + finally: + if args.output: + handle.close() + print(f"Exported {len(rows)} rows to {args.output}") + + conn.close() + + +def print_summary(args: argparse.Namespace) -> None: + conn = connect_db(args.db_path) + total_runs = conn.execute(f"SELECT COUNT(*) FROM {TABLE_NAME}").fetchone()[0] + if total_runs == 0: + print(f"No runs found in {args.db_path}") + 
conn.close() + return + + status_rows = fetch_rows(conn, f"SELECT status, COUNT(*) AS count FROM {TABLE_NAME} GROUP BY status ORDER BY status") + preemption_rows = fetch_rows( + conn, + f""" + SELECT gpu_type, model, engine, context_band, preemption_count, status + FROM {TABLE_NAME} + WHERE COALESCE(preemption_count, 0) > 0 + ORDER BY preemption_count DESC, id DESC + LIMIT 10 + """, + ) + highest_memory_rows = fetch_rows( + conn, + f""" + SELECT gpu_type, model, engine, context_band, gpu_mem_peak_gb, kv_cache_usage_pct, status + FROM {TABLE_NAME} + WHERE gpu_mem_peak_gb IS NOT NULL + ORDER BY gpu_mem_peak_gb DESC, id DESC + LIMIT 5 + """, + ) + slowest_ttft_rows = fetch_rows( + conn, + f""" + SELECT gpu_type, model, engine, context_band, ttft_p50_ms, ttft_p99_ms, status + FROM {TABLE_NAME} + WHERE ttft_p50_ms IS NOT NULL + ORDER BY ttft_p50_ms DESC, id DESC + LIMIT 5 + """, + ) + highest_kv_rows = fetch_rows( + conn, + f""" + SELECT gpu_type, model, engine, context_band, kv_cache_usage_pct, gpu_mem_peak_gb, status + FROM {TABLE_NAME} + WHERE kv_cache_usage_pct IS NOT NULL + ORDER BY kv_cache_usage_pct DESC, id DESC + LIMIT 5 + """, + ) + long_context_rollup = fetch_rows( + conn, + f""" + SELECT + context_band, + COUNT(*) AS runs, + SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) AS success_runs, + ROUND(AVG(ttft_p50_ms), 2) AS avg_ttft_p50_ms, + ROUND(MAX(gpu_mem_peak_gb), 2) AS max_gpu_mem_peak_gb, + SUM(CASE WHEN COALESCE(preemption_count, 0) > 0 THEN 1 ELSE 0 END) AS preemption_runs + FROM {TABLE_NAME} + WHERE context_band IN ('131k', '500k', '1m') + GROUP BY context_band + ORDER BY CASE context_band WHEN '131k' THEN 1 WHEN '500k' THEN 2 WHEN '1m' THEN 3 ELSE 99 END + """, + ) + + print(f"ISB1 results summary ({args.db_path})") + print(f"Total runs: {total_runs}") + print(render_table(["status", "count"], ([row["status"], row["count"]] for row in status_rows))) + print() + + if long_context_rollup: + print("Long-context rollup") + print( + render_table( + ["context_band", "runs", "success_runs", "avg_ttft_p50_ms", "max_gpu_mem_peak_gb", "preemption_runs"], + ( + [ + row["context_band"], + row["runs"], + row["success_runs"], + row["avg_ttft_p50_ms"], + row["max_gpu_mem_peak_gb"], + row["preemption_runs"], + ] + for row in long_context_rollup + ), + ) + ) + print() + + # Depth coverage rollup + depth_coverage_rows = fetch_rows( + conn, + f""" + SELECT + context_band, + COUNT(*) AS runs, + ROUND(AVG(depth_coverage_ratio), 4) AS avg_depth_coverage, + MAX(max_actual_context_len) AS max_actual_ctx, + SUM(CASE WHEN depth_coverage_class = 'configuration_only' THEN 1 ELSE 0 END) AS config_only_runs, + SUM(CASE WHEN depth_coverage_class = 'full' THEN 1 ELSE 0 END) AS full_depth_runs + FROM {TABLE_NAME} + WHERE context_band IN ('131k', '500k', '1m') + AND depth_coverage_ratio IS NOT NULL + GROUP BY context_band + ORDER BY CASE context_band WHEN '131k' THEN 1 WHEN '500k' THEN 2 WHEN '1m' THEN 3 ELSE 99 END + """, + ) + if depth_coverage_rows: + print("Depth coverage (actual vs configured)") + print( + render_table( + ["context_band", "runs", "avg_depth_coverage", "max_actual_ctx", "config_only_runs", "full_depth_runs"], + ( + [ + row["context_band"], + row["runs"], + row["avg_depth_coverage"], + row["max_actual_ctx"], + row["config_only_runs"], + row["full_depth_runs"], + ] + for row in depth_coverage_rows + ), + ) + ) + print() + + if preemption_rows: + print("Runs with preemptions") + print( + render_table( + ["gpu_type", "model", "engine", "context_band", "preemption_count", "status"], + ( + 
[ + row["gpu_type"], + row["model"], + row["engine"], + row["context_band"], + row["preemption_count"], + row["status"], + ] + for row in preemption_rows + ), + ) + ) + print() + else: + print("Runs with preemptions: none") + print() + + if highest_memory_rows: + print("Highest peak GPU memory") + print( + render_table( + ["gpu_type", "model", "engine", "context_band", "gpu_mem_peak_gb", "kv_cache_usage_pct", "status"], + ( + [ + row["gpu_type"], + row["model"], + row["engine"], + row["context_band"], + row["gpu_mem_peak_gb"], + row["kv_cache_usage_pct"], + row["status"], + ] + for row in highest_memory_rows + ), + ) + ) + print() + + if slowest_ttft_rows: + print("Slowest TTFT p50 runs") + print( + render_table( + ["gpu_type", "model", "engine", "context_band", "ttft_p50_ms", "ttft_p99_ms", "status"], + ( + [ + row["gpu_type"], + row["model"], + row["engine"], + row["context_band"], + row["ttft_p50_ms"], + row["ttft_p99_ms"], + row["status"], + ] + for row in slowest_ttft_rows + ), + ) + ) + print() + + if highest_kv_rows: + print("Highest KV-cache usage") + print( + render_table( + ["gpu_type", "model", "engine", "context_band", "kv_cache_usage_pct", "gpu_mem_peak_gb", "status"], + ( + [ + row["gpu_type"], + row["model"], + row["engine"], + row["context_band"], + row["kv_cache_usage_pct"], + row["gpu_mem_peak_gb"], + row["status"], + ] + for row in highest_kv_rows + ), + ) + ) + + conn.close() + + +def main() -> int: + args = parse_args() + if args.command == "ingest": + insert_run(args) + elif args.command == "query": + print_query(args) + elif args.command == "export-csv": + export_csv_rows(args) + elif args.command == "summary": + print_summary(args) + else: + raise SystemExit(f"Unknown command: {args.command}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/metrics_collector.py b/datasets/isb1/scripts/metrics_collector.py new file mode 100644 index 000000000..3de1f7615 --- /dev/null +++ b/datasets/isb1/scripts/metrics_collector.py @@ -0,0 +1,356 @@ +#!/usr/bin/env python3 +"""Prometheus metrics scraper for ISB1 KV stress benchmarks.""" + +from __future__ import annotations + +import argparse +import asyncio +import csv +import json +import re +import signal +import statistics +import time +from pathlib import Path +from typing import Dict +from urllib.request import Request, urlopen + +PROM_LINE_RE = re.compile( + r"^\s*([a-zA-Z_:][a-zA-Z0-9_:]*)(?:\{[^}]*\})?\s+([-+]?(?:\d+\.\d*|\d*\.\d+|\d+)(?:[eE][-+]?\d+)?)\s*$" +) + +CANONICAL_METRICS: dict[str, tuple[str, ...]] = { + # Required vLLM metrics + "vllm:gpu_cache_usage_perc": ( + "vllm:gpu_cache_usage_perc", + "vllm_gpu_cache_usage_perc", + ), + "vllm:cpu_cache_usage_perc": ( + "vllm:cpu_cache_usage_perc", + "vllm_cpu_cache_usage_perc", + ), + "vllm:num_preemptions_total": ( + "vllm:num_preemptions_total", + "vllm_num_preemptions_total", + ), + "vllm:num_requests_running": ( + "vllm:num_requests_running", + "vllm_num_requests_running", + ), + "vllm:num_requests_waiting": ( + "vllm:num_requests_waiting", + "vllm_num_requests_waiting", + ), + "vllm:kv_offload_bytes_gpu_to_cpu": ( + "vllm:kv_offload_bytes_gpu_to_cpu", + "vllm_kv_offload_bytes_gpu_to_cpu", + ), + "vllm:kv_offload_bytes_cpu_to_gpu": ( + "vllm:kv_offload_bytes_cpu_to_gpu", + "vllm_kv_offload_bytes_cpu_to_gpu", + ), + "vllm:prompt_tokens_total": ( + "vllm:prompt_tokens_total", + "vllm_prompt_tokens_total", + ), + "vllm:generation_tokens_total": ( + "vllm:generation_tokens_total", + "vllm_generation_tokens_total", + 
), + # Optional but useful in vLLM + "vllm:num_requests_swapped": ( + "vllm:num_requests_swapped", + "vllm_num_requests_swapped", + ), + # PR #993 parity metrics (vLLM) + "vllm:prefix_cache_hit_rate": ( + "vllm:prefix_cache_hit_rate", + "vllm_prefix_cache_hit_rate", + ), + "vllm:cpu_prefix_cache_hit_rate": ( + "vllm:cpu_prefix_cache_hit_rate", + "vllm_cpu_prefix_cache_hit_rate", + ), + "vllm:kv_offload_time_gpu_to_cpu_seconds": ( + "vllm:kv_offload_time_gpu_to_cpu_seconds", + "vllm_kv_offload_time_gpu_to_cpu_seconds", + ), + "vllm:kv_offload_time_cpu_to_gpu_seconds": ( + "vllm:kv_offload_time_cpu_to_gpu_seconds", + "vllm_kv_offload_time_cpu_to_gpu_seconds", + ), + "vllm:prompt_tokens_local_compute": ( + "vllm:prompt_tokens_local_compute", + "vllm_prompt_tokens_local_compute", + ), + "vllm:prompt_tokens_local_cache_hit": ( + "vllm:prompt_tokens_local_cache_hit", + "vllm_prompt_tokens_local_cache_hit", + ), + "vllm:prompt_tokens_external_kv_transfer": ( + "vllm:prompt_tokens_external_kv_transfer", + "vllm_prompt_tokens_external_kv_transfer", + ), + # SGLang equivalents (best-effort) + "sglang:kv_cache_usage": ( + "sglang:kv_cache_usage", + "sglang_kv_cache_usage", + "sglang_kv_cache_utilization", + ), + "sglang:cache_hit_rate": ( + "sglang:cache_hit_rate", + "sglang_cache_hit_rate", + "sglang_radix_cache_hit_rate", + ), + "sglang:num_requests_running": ( + "sglang:num_requests_running", + "sglang_num_requests_running", + "sglang_scheduler_num_running_requests", + ), + "sglang:num_requests_waiting": ( + "sglang:num_requests_waiting", + "sglang_num_requests_waiting", + "sglang_scheduler_num_waiting_requests", + ), + "sglang:prompt_tokens_total": ( + "sglang:prompt_tokens_total", + "sglang_prompt_tokens_total", + "sglang_num_prompt_tokens_total", + ), + "sglang:generation_tokens_total": ( + "sglang:generation_tokens_total", + "sglang_generation_tokens_total", + "sglang_num_generation_tokens_total", + ), + # PR #993 parity metrics (SGLang) + "sglang:num_preemptions_total": ( + "sglang:num_preemptions_total", + "sglang_num_preemptions_total", + ), + "sglang:prefix_cache_queries_total": ( + "sglang:prefix_cache_queries_total", + "sglang_prefix_cache_queries_total", + ), +} + + +def _normalize_name(name: str) -> str: + return name.replace(":", "_") + + +def parse_prometheus_rows(payload: str) -> list[tuple[str, float]]: + rows: list[tuple[str, float]] = [] + for line in payload.splitlines(): + if not line or line.startswith("#"): + continue + match = PROM_LINE_RE.match(line) + if not match: + continue + name, raw_value = match.groups() + try: + rows.append((name, float(raw_value))) + except ValueError: + continue + return rows + + +def parse_prometheus_text(payload: str) -> Dict[str, float]: + samples: Dict[str, float] = {} + for name, value in parse_prometheus_rows(payload): + samples[name] = value + return samples + + +def map_canonical_metrics(samples: Dict[str, float]) -> Dict[str, float]: + mapped: Dict[str, float] = {} + + normalized_index: Dict[str, float] = {} + for key, value in samples.items(): + normalized_index[_normalize_name(key)] = value + + for canonical_name, aliases in CANONICAL_METRICS.items(): + value = None + for alias in aliases: + if alias in samples: + value = samples[alias] + break + alias_norm = _normalize_name(alias) + if alias_norm in normalized_index: + value = normalized_index[alias_norm] + break + if value is not None: + mapped[canonical_name] = value + + return mapped + + +def fetch_metrics(metrics_url: str, timeout_s: float = 5.0) -> str: + request = 
Request(metrics_url, headers={"Accept": "text/plain"}) + with urlopen(request, timeout=timeout_s) as response: # nosec B310 + return response.read().decode("utf-8", errors="replace") + + +def _percentile(values: list[float], p: float) -> float: + if not values: + return 0.0 + if len(values) == 1: + return values[0] + sorted_values = sorted(values) + rank = (len(sorted_values) - 1) * p + lo = int(rank) + hi = min(lo + 1, len(sorted_values) - 1) + frac = rank - lo + return sorted_values[lo] * (1.0 - frac) + sorted_values[hi] * frac + + +def _build_summary(metric_values: dict[str, list[float]]) -> dict[str, dict[str, float]]: + summary: dict[str, dict[str, float]] = {} + for metric_name, values in metric_values.items(): + if not values: + continue + summary[metric_name] = { + "count": float(len(values)), + "min": min(values), + "max": max(values), + "mean": statistics.fmean(values), + "p50": _percentile(values, 0.50), + "p99": _percentile(values, 0.99), + } + return summary + + +async def scrape_loop( + metrics_url: str, + output_path: Path, + interval_s: float, + duration_s: float, + wide: bool, + summary_json_path: Path | None, +) -> None: + output_path.parent.mkdir(parents=True, exist_ok=True) + + stop_event = asyncio.Event() + + def _request_stop(*_: object) -> None: + stop_event.set() + + try: + loop = asyncio.get_running_loop() + loop.add_signal_handler(signal.SIGINT, _request_stop) + loop.add_signal_handler(signal.SIGTERM, _request_stop) + except NotImplementedError: + pass + + started_at = time.time() + metric_values: dict[str, list[float]] = {} + + wide_path = output_path.with_name("kv_metrics_wide.csv") + + with output_path.open("w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(["timestamp", "metric_name", "metric_value"]) + + wide_file = None + wide_writer = None + if wide: + wide_file = wide_path.open("w", newline="", encoding="utf-8") + wide_writer = csv.writer(wide_file) + wide_writer.writerow(["timestamp", "metric_name", "metric_value"]) + + try: + while not stop_event.is_set(): + now = time.time() + if duration_s > 0 and (now - started_at) >= duration_s: + break + + try: + raw_text = await asyncio.to_thread(fetch_metrics, metrics_url) + raw_rows = parse_prometheus_rows(raw_text) + samples = parse_prometheus_text(raw_text) + mapped = map_canonical_metrics(samples) + + if wide_writer is not None: + for raw_metric_name, raw_metric_value in raw_rows: + wide_writer.writerow( + [f"{now:.3f}", raw_metric_name, f"{raw_metric_value:.8f}"] + ) + wide_file.flush() + + for metric_name, metric_value in mapped.items(): + writer.writerow([f"{now:.3f}", metric_name, f"{metric_value:.8f}"]) + metric_values.setdefault(metric_name, []).append(metric_value) + f.flush() + except Exception as exc: + writer.writerow([f"{now:.3f}", "collector:error", repr(exc)]) + f.flush() + + await asyncio.sleep(interval_s) + finally: + if wide_file is not None: + wide_file.close() + + if summary_json_path is not None: + summary_json_path.parent.mkdir(parents=True, exist_ok=True) + summary_json_path.write_text( + json.dumps(_build_summary(metric_values), indent=2, sort_keys=True), + encoding="utf-8", + ) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Scrape Prometheus metrics into CSV") + parser.add_argument( + "--metrics-url", + default="http://0.0.0.0:8888/metrics", + help="Prometheus endpoint URL", + ) + parser.add_argument( + "--output", + default="kv_metrics.csv", + help="CSV output path", + ) + parser.add_argument( + "--interval", 
+        type=float,
+        default=2.0,
+        help="Scrape interval in seconds",
+    )
+    parser.add_argument(
+        "--duration",
+        type=float,
+        default=0.0,
+        help="Optional max duration in seconds (0 means run until interrupted)",
+    )
+    parser.add_argument(
+        "--wide",
+        action="store_true",
+        help="Also scrape all non-comment Prometheus metric lines into kv_metrics_wide.csv",
+    )
+    parser.add_argument(
+        "--summary-json",
+        nargs="?",
+        const="kv_metrics_summary.json",
+        default=None,
+        help="Write per-metric min/max/mean/p50/p99 summary JSON (disabled unless the flag is given; a bare flag uses kv_metrics_summary.json)",
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    summary_json_path = Path(args.summary_json) if args.summary_json else None
+    asyncio.run(
+        scrape_loop(
+            metrics_url=args.metrics_url,
+            output_path=Path(args.output),
+            interval_s=max(args.interval, 0.1),
+            duration_s=max(args.duration, 0.0),
+            wide=args.wide,
+            summary_json_path=summary_json_path,
+        )
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/datasets/isb1/scripts/plot_pareto.py b/datasets/isb1/scripts/plot_pareto.py
new file mode 100644
index 000000000..964696ad1
--- /dev/null
+++ b/datasets/isb1/scripts/plot_pareto.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+import sqlite3
+from pathlib import Path
+from typing import Any
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Compute Pareto frontier for KV sweep throughput vs p99 TTFT")
+    parser.add_argument("--db-path", default=None, help="SQLite DB path (benchmark_runs)")
+    parser.add_argument("--json-dir", default=None, help="Directory containing sweep summary JSON files")
+    parser.add_argument("--output-dir", required=True, help="Directory for pareto outputs")
+    return parser.parse_args()
+
+
+def _to_float(value: Any) -> float | None:
+    if value in (None, ""):
+        return None
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return None
+
+
+def load_rows_from_db(db_path: Path) -> list[dict[str, Any]]:
+    conn = sqlite3.connect(db_path)
+    conn.row_factory = sqlite3.Row
+    rows = conn.execute(
+        """
+        SELECT offload_mode, ttft_p99_ms, throughput_tok_s, max_concurrency, raw_result_json
+        FROM benchmark_runs
+        WHERE offload_mode IS NOT NULL
+          AND ttft_p99_ms IS NOT NULL
+          AND throughput_tok_s IS NOT NULL
+        ORDER BY id ASC
+        """
+    ).fetchall()
+    conn.close()
+
+    normalized: list[dict[str, Any]] = []
+    for row in rows:
+        concurrency = row["max_concurrency"]
+        if concurrency in (None, "") and row["raw_result_json"]:
+            try:
+                payload = json.loads(row["raw_result_json"])
+                concurrency = payload.get("conc") or payload.get("max_concurrency")
+            except Exception:
+                pass
+        normalized.append(
+            {
+                "offload_mode": row["offload_mode"],
+                "concurrency": int(concurrency) if concurrency not in (None, "") else None,
+                "throughput_tok_s": _to_float(row["throughput_tok_s"]),
+                "ttft_p99_ms": _to_float(row["ttft_p99_ms"]),
+                "source": "db",
+            }
+        )
+    return normalized
+
+
+def load_rows_from_json_dir(json_dir: Path) -> list[dict[str, Any]]:
+    rows: list[dict[str, Any]] = []
+    for path in sorted(json_dir.glob("*.json")):
+        try:
+            payload = json.loads(path.read_text(encoding="utf-8"))
+        except Exception:
+            continue
+
+        if isinstance(payload, dict) and isinstance(payload.get("summary"), list):
+            for row in payload["summary"]:
+                rows.append(
+                    {
+                        "offload_mode": row.get("offload_mode"),
+                        "concurrency": row.get("concurrency"),
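+                        # _to_float below tolerates numeric strings and blanks,
+                        # so summary rows may carry these metrics as numbers or strings.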
"throughput_tok_s": _to_float(row.get("throughput_tok_s")), + "ttft_p99_ms": _to_float(row.get("ttft_p99_ms")), + "source": str(path.name), + } + ) + elif isinstance(payload, list): + for row in payload: + if isinstance(row, dict): + rows.append( + { + "offload_mode": row.get("offload_mode"), + "concurrency": row.get("concurrency"), + "throughput_tok_s": _to_float(row.get("throughput_tok_s")), + "ttft_p99_ms": _to_float(row.get("ttft_p99_ms")), + "source": str(path.name), + } + ) + return rows + + +def compute_pareto_frontier(points: list[dict[str, Any]]) -> list[dict[str, Any]]: + valid = [p for p in points if p["throughput_tok_s"] is not None and p["ttft_p99_ms"] is not None] + if not valid: + return [] + + # maximize throughput, minimize ttft_p99_ms + sorted_points = sorted(valid, key=lambda p: (p["throughput_tok_s"], -p["ttft_p99_ms"]), reverse=True) + frontier: list[dict[str, Any]] = [] + best_latency = float("inf") + for point in sorted_points: + latency = point["ttft_p99_ms"] + if latency <= best_latency: + frontier.append(point) + best_latency = latency + return sorted(frontier, key=lambda p: (p["throughput_tok_s"], p["ttft_p99_ms"])) + + +def write_csv(path: Path, rows: list[dict[str, Any]], frontier_keys: set[tuple[str, int | None, float, float]]) -> None: + with path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.writer(handle) + writer.writerow(["offload_mode", "concurrency", "throughput_tok_s", "ttft_p99_ms", "is_frontier", "source"]) + for row in rows: + key = (row.get("offload_mode") or "", row.get("concurrency"), row.get("throughput_tok_s") or 0.0, row.get("ttft_p99_ms") or 0.0) + writer.writerow([ + row.get("offload_mode"), + row.get("concurrency"), + row.get("throughput_tok_s"), + row.get("ttft_p99_ms"), + key in frontier_keys, + row.get("source"), + ]) + + +def maybe_write_plot(output_path: Path, grouped_frontiers: dict[str, list[dict[str, Any]]]) -> bool: + try: + import matplotlib.pyplot as plt # type: ignore + except Exception: + return False + + plt.figure(figsize=(10, 6)) + for mode, frontier in sorted(grouped_frontiers.items()): + x = [p["throughput_tok_s"] for p in frontier] + y = [p["ttft_p99_ms"] for p in frontier] + if not x: + continue + plt.plot(x, y, marker="o", label=mode) + plt.xlabel("Throughput (tokens/sec)") + plt.ylabel("p99 TTFT (ms)") + plt.title("Pareto Frontier by Offload Mode") + plt.legend() + plt.grid(True, alpha=0.3) + output_path.parent.mkdir(parents=True, exist_ok=True) + plt.tight_layout() + plt.savefig(output_path) + plt.close() + return True + + +def main() -> int: + args = parse_args() + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + if not args.db_path and not args.json_dir: + raise SystemExit("Provide --db-path or --json-dir") + + rows: list[dict[str, Any]] = [] + if args.db_path: + rows.extend(load_rows_from_db(Path(args.db_path))) + if args.json_dir: + rows.extend(load_rows_from_json_dir(Path(args.json_dir))) + + grouped: dict[str, list[dict[str, Any]]] = {} + for row in rows: + mode = row.get("offload_mode") + if not mode: + continue + grouped.setdefault(mode, []).append(row) + + grouped_frontiers: dict[str, list[dict[str, Any]]] = {} + for mode, points in grouped.items(): + grouped_frontiers[mode] = compute_pareto_frontier(points) + + frontier_keys: set[tuple[str, int | None, float, float]] = set() + for mode, frontier in grouped_frontiers.items(): + for point in frontier: + frontier_keys.add((mode, point.get("concurrency"), point.get("throughput_tok_s") or 0.0, 
point.get("ttft_p99_ms") or 0.0)) + + csv_path = output_dir / "pareto_data.csv" + write_csv(csv_path, rows, frontier_keys) + + summary = { + "total_points": len(rows), + "offload_modes": sorted(grouped.keys()), + "frontier": {mode: frontier for mode, frontier in grouped_frontiers.items()}, + } + summary_path = output_dir / "pareto_summary.json" + summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True), encoding="utf-8") + + plot_written = maybe_write_plot(output_dir / "pareto_frontier.png", grouped_frontiers) + + print(f"Wrote: {csv_path}") + print(f"Wrote: {summary_path}") + if plot_written: + print(f"Wrote: {output_dir / 'pareto_frontier.png'}") + else: + print("Skipped pareto_frontier.png (matplotlib unavailable)") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/README.md b/experimental/README.md index f39dfc4af..8ba1ba9b5 100644 --- a/experimental/README.md +++ b/experimental/README.md @@ -1,5 +1,11 @@ # Experimental -This folder contains experimental WIP code that is mostly Claude Code generated. +This folder contains experimental WIP code and planning material. -**Warning:** Code in this directory is very basic and likely contains errors or incomplete implementations. It is not intended for production use or as part of the official InferenceMAX results. +Relevant roadmap docs: + +For the current official ISB1 support statement, use: +- `datasets/isb1/SUPPORT_MATRIX.md` +- `datasets/isb1/README.md` + +**Warning:** code and notes in this directory may be incomplete, experimental, or future-looking. They are not by themselves the official statement of supported InferenceX ISB1 capability. diff --git a/experimental/multiturn/README.md b/experimental/multiturn/README.md index 05b22f67e..fd9114b37 100644 --- a/experimental/multiturn/README.md +++ b/experimental/multiturn/README.md @@ -1,16 +1,27 @@ -## Experimental WIP: Multi turn with/without CPU KVCache Offloading - -lit review -- https://lmsys.org/blog/2025-09-10-sglang-hicache/ -- sglang calls GPU HBM as (L1) and CPU DRAM as (L2) -- https://lmsys.org/images/blog/hicache/mooncake_benchmark.png -- single turn long context Q&A https://arxiv.org/abs/2311.04939 (seems more like an shared prefix style similar to cascade attention (pre cursor to sglang radix attention )) https://flashinfer.ai/2024/02/02/cascade-inference.html -- synethic & sharegpt vllm multi turn datasets https://github.com/vllm-project/vllm/tree/main/benchmarks/multi_turn -- Production Alibiba Multi turn dataset https://arxiv.org/abs/2506.02634 (seem to not provide the acutal prompts and outputs tho, more just prompt lengths and output lengths, etc.) -- sglang synthetic multi turn benchmark script here https://github.com/sgl-project/sglang/tree/main/benchmark/hicache -- interestingly sglang blog simulates PD disagg via just setting OSL as 1 -- MT-bench https://arxiv.org/abs/2402.14762 -```bash -python3 benchmark/hicache/bench_multiturn.py --model-path $MODEL_PATH --disable-random-sample \ ---output-length 1 --request-length 2048 \ # simulate P-D disaggregation -``` +# Experimental multiturn notes + +This directory contains working notes, investigations, and planning material for multiturn and long-context benchmarking. + +## Official ISB1 replay status lives elsewhere + +Do **not** treat this directory as the source of truth for the currently supported InferenceX ISB1 surface. 
+ +For the official, reviewable statement of what is landed now, use: +- `datasets/isb1/SUPPORT_MATRIX.md` +- `datasets/isb1/README.md` +- `.github/configs/isb1-master.yaml` + +## Relevant roadmap docs + +- `ISB1_MULTITURN_LONG_CONTEXT_CANONICAL_SYNTHESIS_2026-04-09.md` — canonical synthesis for next implementation phases; use this first for planning context. +- `ISB1_INFERENCEX_PHASED_PR_ROADMAP_2026-04-09.md` — phased landing plan used to split schema/workflow/data/extension/polish work into mergeable stages. + +## Scope warning + +Files in this directory may discuss future or experimental directions such as: +- KV offload investigations +- synthetic multiturn ideas +- broader long-context expansion +- experiments outside the currently merged official replay lane + +Those notes are useful for planning, but they are **not** themselves an official support claim. diff --git a/experimental/multiturn/vllm_benchmark/.gitignore b/experimental/multiturn/vllm_benchmark/.gitignore new file mode 100644 index 000000000..5c371b81e --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/.gitignore @@ -0,0 +1,7 @@ +# Python +__pycache__/ +*.pyc + +# Generated artifacts +*.log +*.tmp diff --git a/experimental/multiturn/vllm_benchmark/README.md b/experimental/multiturn/vllm_benchmark/README.md new file mode 100644 index 000000000..b2ea6f175 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/README.md @@ -0,0 +1,33 @@ +# vLLM Benchmark (Experimental) + +This directory tracks the PR #993 parity surface for multi-turn trace replay and KV stress experiments. + +## Trace sources + +- **ISB-1 exports**: existing committed replay exports. +- **kv-cache-tester**: `kv-cache-tester/` is a placeholder for the external trace replay repo. +- **AIPerf synthetic traces**: `aiperf_traces/` provides fallback synthetic traces. + +## Analysis tools + +The parity analysis scripts live under `datasets/isb1/scripts/`: + +- `plot_pareto.py` +- `analyze_benchmark_distributions.py` +- `collect_sweep_results.py` +- `adapt_trace_replay_result.py` + +## LMCache variants + +LMCache launch helpers are under `launch/`: + +- `lmcache_vllm_h200.sh` +- `lmcache_vllm_b200.sh` + +## Per-hardware replay scripts + +Trace replay scripts are under `scripts/` for per-model/per-engine/per-hardware combinations. + +--- + +**Experimental infrastructure. 
Not part of official ISB-1 support matrix.** diff --git a/experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json b/experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json new file mode 100644 index 000000000..683556038 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json @@ -0,0 +1,5559 @@ +{ + "sessions": [ + { + "turns": [ + { + "role": "user", + "content_token_count": 4355, + "target_output_tokens": 229 + }, + { + "role": "user", + "content_token_count": 13955, + "target_output_tokens": 384 + }, + { + "role": "user", + "content_token_count": 1941, + "target_output_tokens": 89 + }, + { + "role": "user", + "content_token_count": 11403, + "target_output_tokens": 2247 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 13567, + "target_output_tokens": 663 + }, + { + "role": "user", + "content_token_count": 49742, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 13186, + "target_output_tokens": 686 + }, + { + "role": "user", + "content_token_count": 7600, + "target_output_tokens": 418 + }, + { + "role": "user", + "content_token_count": 5978, + "target_output_tokens": 385 + }, + { + "role": "user", + "content_token_count": 1998, + "target_output_tokens": 706 + }, + { + "role": "user", + "content_token_count": 1582, + "target_output_tokens": 667 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 14644, + "target_output_tokens": 467 + }, + { + "role": "user", + "content_token_count": 20321, + "target_output_tokens": 971 + }, + { + "role": "user", + "content_token_count": 2950, + "target_output_tokens": 274 + }, + { + "role": "user", + "content_token_count": 4932, + "target_output_tokens": 680 + }, + { + "role": "user", + "content_token_count": 9971, + "target_output_tokens": 706 + }, + { + "role": "user", + "content_token_count": 3348, + "target_output_tokens": 440 + }, + { + "role": "user", + "content_token_count": 13343, + "target_output_tokens": 431 + }, + { + "role": "user", + "content_token_count": 6230, + "target_output_tokens": 2231 + }, + { + "role": "user", + "content_token_count": 8168, + "target_output_tokens": 421 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1487, + "target_output_tokens": 986 + }, + { + "role": "user", + "content_token_count": 2684, + "target_output_tokens": 549 + }, + { + "role": "user", + "content_token_count": 3065, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 12135, + "target_output_tokens": 1145 + }, + { + "role": "user", + "content_token_count": 14716, + "target_output_tokens": 1074 + }, + { + "role": "user", + "content_token_count": 16644, + "target_output_tokens": 1062 + }, + { + "role": "user", + "content_token_count": 12355, + "target_output_tokens": 285 + }, + { + "role": "user", + "content_token_count": 3108, + "target_output_tokens": 291 + }, + { + "role": "user", + "content_token_count": 7234, + "target_output_tokens": 1235 + }, + { + "role": "user", + "content_token_count": 25179, + "target_output_tokens": 493 + }, + { + "role": "user", + "content_token_count": 6480, + "target_output_tokens": 431 + }, + { + "role": "user", + "content_token_count": 13902, + "target_output_tokens": 652 + }, + { + "role": "user", + "content_token_count": 6014, + "target_output_tokens": 1037 + }, + { + "role": "user", + "content_token_count": 41352, + "target_output_tokens": 649 + }, + { + "role": "user", + 
"content_token_count": 8852, + "target_output_tokens": 319 + }, + { + "role": "user", + "content_token_count": 8795, + "target_output_tokens": 736 + }, + { + "role": "user", + "content_token_count": 27778, + "target_output_tokens": 373 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6962, + "target_output_tokens": 1351 + }, + { + "role": "user", + "content_token_count": 2614, + "target_output_tokens": 248 + }, + { + "role": "user", + "content_token_count": 11529, + "target_output_tokens": 248 + }, + { + "role": "user", + "content_token_count": 5165, + "target_output_tokens": 653 + }, + { + "role": "user", + "content_token_count": 2132, + "target_output_tokens": 318 + }, + { + "role": "user", + "content_token_count": 5290, + "target_output_tokens": 614 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 23469, + "target_output_tokens": 546 + }, + { + "role": "user", + "content_token_count": 7665, + "target_output_tokens": 360 + }, + { + "role": "user", + "content_token_count": 27018, + "target_output_tokens": 1332 + }, + { + "role": "user", + "content_token_count": 1887, + "target_output_tokens": 326 + }, + { + "role": "user", + "content_token_count": 5249, + "target_output_tokens": 346 + }, + { + "role": "user", + "content_token_count": 7443, + "target_output_tokens": 828 + }, + { + "role": "user", + "content_token_count": 6496, + "target_output_tokens": 100 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 9221, + "target_output_tokens": 430 + }, + { + "role": "user", + "content_token_count": 7697, + "target_output_tokens": 1197 + }, + { + "role": "user", + "content_token_count": 5421, + "target_output_tokens": 277 + }, + { + "role": "user", + "content_token_count": 8799, + "target_output_tokens": 540 + }, + { + "role": "user", + "content_token_count": 14993, + "target_output_tokens": 768 + }, + { + "role": "user", + "content_token_count": 28612, + "target_output_tokens": 581 + }, + { + "role": "user", + "content_token_count": 42160, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 9846, + "target_output_tokens": 544 + }, + { + "role": "user", + "content_token_count": 15085, + "target_output_tokens": 302 + }, + { + "role": "user", + "content_token_count": 8267, + "target_output_tokens": 596 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 23256, + "target_output_tokens": 821 + }, + { + "role": "user", + "content_token_count": 36819, + "target_output_tokens": 183 + }, + { + "role": "user", + "content_token_count": 1590, + "target_output_tokens": 2201 + }, + { + "role": "user", + "content_token_count": 12229, + "target_output_tokens": 1265 + }, + { + "role": "user", + "content_token_count": 7483, + "target_output_tokens": 1819 + }, + { + "role": "user", + "content_token_count": 2288, + "target_output_tokens": 970 + }, + { + "role": "user", + "content_token_count": 33871, + "target_output_tokens": 703 + }, + { + "role": "user", + "content_token_count": 8650, + "target_output_tokens": 147 + }, + { + "role": "user", + "content_token_count": 10018, + "target_output_tokens": 487 + }, + { + "role": "user", + "content_token_count": 21103, + "target_output_tokens": 805 + }, + { + "role": "user", + "content_token_count": 17500, + "target_output_tokens": 493 + }, + { + "role": "user", + "content_token_count": 1678, + "target_output_tokens": 129 + }, + { + "role": "user", + "content_token_count": 29345, + "target_output_tokens": 303 + }, + { + "role": "user", + 
"content_token_count": 4555, + "target_output_tokens": 483 + }, + { + "role": "user", + "content_token_count": 39008, + "target_output_tokens": 631 + }, + { + "role": "user", + "content_token_count": 3284, + "target_output_tokens": 142 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7400, + "target_output_tokens": 948 + }, + { + "role": "user", + "content_token_count": 3992, + "target_output_tokens": 387 + }, + { + "role": "user", + "content_token_count": 8450, + "target_output_tokens": 313 + }, + { + "role": "user", + "content_token_count": 8606, + "target_output_tokens": 89 + }, + { + "role": "user", + "content_token_count": 4775, + "target_output_tokens": 3004 + }, + { + "role": "user", + "content_token_count": 44546, + "target_output_tokens": 758 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 10548, + "target_output_tokens": 522 + }, + { + "role": "user", + "content_token_count": 23492, + "target_output_tokens": 463 + }, + { + "role": "user", + "content_token_count": 2803, + "target_output_tokens": 3146 + }, + { + "role": "user", + "content_token_count": 2080, + "target_output_tokens": 257 + }, + { + "role": "user", + "content_token_count": 8416, + "target_output_tokens": 1401 + }, + { + "role": "user", + "content_token_count": 3410, + "target_output_tokens": 4096 + }, + { + "role": "user", + "content_token_count": 20886, + "target_output_tokens": 246 + }, + { + "role": "user", + "content_token_count": 16891, + "target_output_tokens": 111 + }, + { + "role": "user", + "content_token_count": 4933, + "target_output_tokens": 654 + }, + { + "role": "user", + "content_token_count": 5560, + "target_output_tokens": 634 + }, + { + "role": "user", + "content_token_count": 8380, + "target_output_tokens": 158 + }, + { + "role": "user", + "content_token_count": 17894, + "target_output_tokens": 278 + }, + { + "role": "user", + "content_token_count": 4907, + "target_output_tokens": 312 + }, + { + "role": "user", + "content_token_count": 5810, + "target_output_tokens": 1418 + }, + { + "role": "user", + "content_token_count": 6056, + "target_output_tokens": 515 + }, + { + "role": "user", + "content_token_count": 6750, + "target_output_tokens": 279 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6845, + "target_output_tokens": 83 + }, + { + "role": "user", + "content_token_count": 3847, + "target_output_tokens": 2093 + }, + { + "role": "user", + "content_token_count": 2327, + "target_output_tokens": 926 + }, + { + "role": "user", + "content_token_count": 11838, + "target_output_tokens": 453 + }, + { + "role": "user", + "content_token_count": 5787, + "target_output_tokens": 1590 + }, + { + "role": "user", + "content_token_count": 16091, + "target_output_tokens": 84 + }, + { + "role": "user", + "content_token_count": 15625, + "target_output_tokens": 168 + }, + { + "role": "user", + "content_token_count": 24568, + "target_output_tokens": 789 + }, + { + "role": "user", + "content_token_count": 25763, + "target_output_tokens": 605 + }, + { + "role": "user", + "content_token_count": 20307, + "target_output_tokens": 570 + }, + { + "role": "user", + "content_token_count": 6868, + "target_output_tokens": 294 + }, + { + "role": "user", + "content_token_count": 18094, + "target_output_tokens": 170 + }, + { + "role": "user", + "content_token_count": 4778, + "target_output_tokens": 511 + }, + { + "role": "user", + "content_token_count": 3934, + "target_output_tokens": 495 + }, + { + "role": "user", + "content_token_count": 12163, 
+ "target_output_tokens": 795 + }, + { + "role": "user", + "content_token_count": 12752, + "target_output_tokens": 3072 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 17618, + "target_output_tokens": 1691 + }, + { + "role": "user", + "content_token_count": 12217, + "target_output_tokens": 164 + }, + { + "role": "user", + "content_token_count": 31341, + "target_output_tokens": 777 + }, + { + "role": "user", + "content_token_count": 2248, + "target_output_tokens": 1106 + }, + { + "role": "user", + "content_token_count": 11819, + "target_output_tokens": 812 + }, + { + "role": "user", + "content_token_count": 5636, + "target_output_tokens": 187 + }, + { + "role": "user", + "content_token_count": 5477, + "target_output_tokens": 403 + }, + { + "role": "user", + "content_token_count": 19604, + "target_output_tokens": 390 + }, + { + "role": "user", + "content_token_count": 8663, + "target_output_tokens": 865 + }, + { + "role": "user", + "content_token_count": 16969, + "target_output_tokens": 407 + }, + { + "role": "user", + "content_token_count": 22672, + "target_output_tokens": 371 + }, + { + "role": "user", + "content_token_count": 4500, + "target_output_tokens": 257 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6952, + "target_output_tokens": 1454 + }, + { + "role": "user", + "content_token_count": 21170, + "target_output_tokens": 1383 + }, + { + "role": "user", + "content_token_count": 9252, + "target_output_tokens": 209 + }, + { + "role": "user", + "content_token_count": 6023, + "target_output_tokens": 155 + }, + { + "role": "user", + "content_token_count": 30200, + "target_output_tokens": 2025 + }, + { + "role": "user", + "content_token_count": 8146, + "target_output_tokens": 132 + }, + { + "role": "user", + "content_token_count": 15151, + "target_output_tokens": 300 + }, + { + "role": "user", + "content_token_count": 6381, + "target_output_tokens": 739 + }, + { + "role": "user", + "content_token_count": 3225, + "target_output_tokens": 454 + }, + { + "role": "user", + "content_token_count": 5177, + "target_output_tokens": 2094 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 17308, + "target_output_tokens": 484 + }, + { + "role": "user", + "content_token_count": 27306, + "target_output_tokens": 413 + }, + { + "role": "user", + "content_token_count": 24589, + "target_output_tokens": 1070 + }, + { + "role": "user", + "content_token_count": 7202, + "target_output_tokens": 256 + }, + { + "role": "user", + "content_token_count": 6018, + "target_output_tokens": 200 + }, + { + "role": "user", + "content_token_count": 3867, + "target_output_tokens": 593 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 16341, + "target_output_tokens": 1754 + }, + { + "role": "user", + "content_token_count": 4374, + "target_output_tokens": 1779 + }, + { + "role": "user", + "content_token_count": 5850, + "target_output_tokens": 290 + }, + { + "role": "user", + "content_token_count": 5391, + "target_output_tokens": 2242 + }, + { + "role": "user", + "content_token_count": 18534, + "target_output_tokens": 187 + }, + { + "role": "user", + "content_token_count": 1541, + "target_output_tokens": 1352 + }, + { + "role": "user", + "content_token_count": 512, + "target_output_tokens": 917 + }, + { + "role": "user", + "content_token_count": 6840, + "target_output_tokens": 397 + }, + { + "role": "user", + "content_token_count": 4664, + "target_output_tokens": 585 + }, + { + "role": "user", + "content_token_count": 
7184, + "target_output_tokens": 846 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7488, + "target_output_tokens": 545 + }, + { + "role": "user", + "content_token_count": 6149, + "target_output_tokens": 180 + }, + { + "role": "user", + "content_token_count": 18544, + "target_output_tokens": 1062 + }, + { + "role": "user", + "content_token_count": 23779, + "target_output_tokens": 962 + }, + { + "role": "user", + "content_token_count": 7158, + "target_output_tokens": 624 + }, + { + "role": "user", + "content_token_count": 5401, + "target_output_tokens": 264 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6126, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 10891, + "target_output_tokens": 787 + }, + { + "role": "user", + "content_token_count": 7206, + "target_output_tokens": 446 + }, + { + "role": "user", + "content_token_count": 14885, + "target_output_tokens": 534 + }, + { + "role": "user", + "content_token_count": 16761, + "target_output_tokens": 418 + }, + { + "role": "user", + "content_token_count": 8153, + "target_output_tokens": 322 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6173, + "target_output_tokens": 792 + }, + { + "role": "user", + "content_token_count": 7491, + "target_output_tokens": 360 + }, + { + "role": "user", + "content_token_count": 11004, + "target_output_tokens": 522 + }, + { + "role": "user", + "content_token_count": 30822, + "target_output_tokens": 733 + }, + { + "role": "user", + "content_token_count": 16828, + "target_output_tokens": 660 + }, + { + "role": "user", + "content_token_count": 10930, + "target_output_tokens": 2180 + }, + { + "role": "user", + "content_token_count": 9511, + "target_output_tokens": 182 + }, + { + "role": "user", + "content_token_count": 9162, + "target_output_tokens": 683 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 28818, + "target_output_tokens": 245 + }, + { + "role": "user", + "content_token_count": 6134, + "target_output_tokens": 472 + }, + { + "role": "user", + "content_token_count": 6634, + "target_output_tokens": 813 + }, + { + "role": "user", + "content_token_count": 10762, + "target_output_tokens": 182 + }, + { + "role": "user", + "content_token_count": 5519, + "target_output_tokens": 1891 + }, + { + "role": "user", + "content_token_count": 9813, + "target_output_tokens": 544 + }, + { + "role": "user", + "content_token_count": 27459, + "target_output_tokens": 1087 + }, + { + "role": "user", + "content_token_count": 11085, + "target_output_tokens": 192 + }, + { + "role": "user", + "content_token_count": 13108, + "target_output_tokens": 444 + }, + { + "role": "user", + "content_token_count": 24568, + "target_output_tokens": 203 + }, + { + "role": "user", + "content_token_count": 12813, + "target_output_tokens": 800 + }, + { + "role": "user", + "content_token_count": 6876, + "target_output_tokens": 126 + }, + { + "role": "user", + "content_token_count": 9155, + "target_output_tokens": 4096 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5653, + "target_output_tokens": 908 + }, + { + "role": "user", + "content_token_count": 2275, + "target_output_tokens": 410 + }, + { + "role": "user", + "content_token_count": 3348, + "target_output_tokens": 708 + }, + { + "role": "user", + "content_token_count": 7689, + "target_output_tokens": 448 + }, + { + "role": "user", + "content_token_count": 8998, + "target_output_tokens": 1126 + }, + { + "role": "user", 
+ "content_token_count": 1847, + "target_output_tokens": 1767 + }, + { + "role": "user", + "content_token_count": 5015, + "target_output_tokens": 484 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 37087, + "target_output_tokens": 360 + }, + { + "role": "user", + "content_token_count": 9919, + "target_output_tokens": 3052 + }, + { + "role": "user", + "content_token_count": 3728, + "target_output_tokens": 265 + }, + { + "role": "user", + "content_token_count": 13398, + "target_output_tokens": 274 + }, + { + "role": "user", + "content_token_count": 5429, + "target_output_tokens": 994 + }, + { + "role": "user", + "content_token_count": 998, + "target_output_tokens": 116 + }, + { + "role": "user", + "content_token_count": 1326, + "target_output_tokens": 718 + }, + { + "role": "user", + "content_token_count": 9401, + "target_output_tokens": 712 + }, + { + "role": "user", + "content_token_count": 9097, + "target_output_tokens": 84 + }, + { + "role": "user", + "content_token_count": 5568, + "target_output_tokens": 126 + }, + { + "role": "user", + "content_token_count": 29693, + "target_output_tokens": 361 + }, + { + "role": "user", + "content_token_count": 4150, + "target_output_tokens": 804 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 13188, + "target_output_tokens": 1389 + }, + { + "role": "user", + "content_token_count": 20963, + "target_output_tokens": 792 + }, + { + "role": "user", + "content_token_count": 15129, + "target_output_tokens": 325 + }, + { + "role": "user", + "content_token_count": 7575, + "target_output_tokens": 149 + }, + { + "role": "user", + "content_token_count": 20166, + "target_output_tokens": 668 + }, + { + "role": "user", + "content_token_count": 7192, + "target_output_tokens": 332 + }, + { + "role": "user", + "content_token_count": 10367, + "target_output_tokens": 610 + }, + { + "role": "user", + "content_token_count": 5248, + "target_output_tokens": 157 + }, + { + "role": "user", + "content_token_count": 9240, + "target_output_tokens": 216 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2873, + "target_output_tokens": 154 + }, + { + "role": "user", + "content_token_count": 10140, + "target_output_tokens": 2818 + }, + { + "role": "user", + "content_token_count": 4864, + "target_output_tokens": 1018 + }, + { + "role": "user", + "content_token_count": 10400, + "target_output_tokens": 210 + }, + { + "role": "user", + "content_token_count": 9931, + "target_output_tokens": 431 + }, + { + "role": "user", + "content_token_count": 19920, + "target_output_tokens": 1335 + }, + { + "role": "user", + "content_token_count": 12765, + "target_output_tokens": 479 + }, + { + "role": "user", + "content_token_count": 16121, + "target_output_tokens": 634 + }, + { + "role": "user", + "content_token_count": 16426, + "target_output_tokens": 303 + }, + { + "role": "user", + "content_token_count": 8657, + "target_output_tokens": 606 + }, + { + "role": "user", + "content_token_count": 3219, + "target_output_tokens": 126 + }, + { + "role": "user", + "content_token_count": 3934, + "target_output_tokens": 90 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 29139, + "target_output_tokens": 283 + }, + { + "role": "user", + "content_token_count": 11018, + "target_output_tokens": 2117 + }, + { + "role": "user", + "content_token_count": 12413, + "target_output_tokens": 123 + }, + { + "role": "user", + "content_token_count": 4620, + "target_output_tokens": 1279 + }, + { + "role": "user", 
+ "content_token_count": 14998, + "target_output_tokens": 857 + }, + { + "role": "user", + "content_token_count": 6874, + "target_output_tokens": 377 + }, + { + "role": "user", + "content_token_count": 9962, + "target_output_tokens": 369 + }, + { + "role": "user", + "content_token_count": 35116, + "target_output_tokens": 178 + }, + { + "role": "user", + "content_token_count": 9970, + "target_output_tokens": 516 + }, + { + "role": "user", + "content_token_count": 11643, + "target_output_tokens": 543 + }, + { + "role": "user", + "content_token_count": 14700, + "target_output_tokens": 547 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1351, + "target_output_tokens": 2192 + }, + { + "role": "user", + "content_token_count": 23550, + "target_output_tokens": 200 + }, + { + "role": "user", + "content_token_count": 2511, + "target_output_tokens": 347 + }, + { + "role": "user", + "content_token_count": 20677, + "target_output_tokens": 589 + }, + { + "role": "user", + "content_token_count": 3425, + "target_output_tokens": 1138 + }, + { + "role": "user", + "content_token_count": 22755, + "target_output_tokens": 1462 + }, + { + "role": "user", + "content_token_count": 6087, + "target_output_tokens": 840 + }, + { + "role": "user", + "content_token_count": 9876, + "target_output_tokens": 164 + }, + { + "role": "user", + "content_token_count": 5481, + "target_output_tokens": 787 + }, + { + "role": "user", + "content_token_count": 4935, + "target_output_tokens": 471 + }, + { + "role": "user", + "content_token_count": 4601, + "target_output_tokens": 373 + }, + { + "role": "user", + "content_token_count": 7449, + "target_output_tokens": 1129 + }, + { + "role": "user", + "content_token_count": 7437, + "target_output_tokens": 664 + }, + { + "role": "user", + "content_token_count": 18022, + "target_output_tokens": 609 + }, + { + "role": "user", + "content_token_count": 6651, + "target_output_tokens": 593 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3803, + "target_output_tokens": 185 + }, + { + "role": "user", + "content_token_count": 4171, + "target_output_tokens": 471 + }, + { + "role": "user", + "content_token_count": 2991, + "target_output_tokens": 2486 + }, + { + "role": "user", + "content_token_count": 11107, + "target_output_tokens": 846 + }, + { + "role": "user", + "content_token_count": 12672, + "target_output_tokens": 1246 + }, + { + "role": "user", + "content_token_count": 9802, + "target_output_tokens": 404 + }, + { + "role": "user", + "content_token_count": 7244, + "target_output_tokens": 665 + }, + { + "role": "user", + "content_token_count": 11618, + "target_output_tokens": 1037 + }, + { + "role": "user", + "content_token_count": 4494, + "target_output_tokens": 365 + }, + { + "role": "user", + "content_token_count": 3666, + "target_output_tokens": 262 + }, + { + "role": "user", + "content_token_count": 10055, + "target_output_tokens": 395 + }, + { + "role": "user", + "content_token_count": 5900, + "target_output_tokens": 778 + }, + { + "role": "user", + "content_token_count": 2260, + "target_output_tokens": 112 + }, + { + "role": "user", + "content_token_count": 3803, + "target_output_tokens": 1263 + }, + { + "role": "user", + "content_token_count": 38195, + "target_output_tokens": 1187 + }, + { + "role": "user", + "content_token_count": 15430, + "target_output_tokens": 304 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 15126, + "target_output_tokens": 363 + }, + { + "role": "user", + 
"content_token_count": 11997, + "target_output_tokens": 65 + }, + { + "role": "user", + "content_token_count": 12124, + "target_output_tokens": 304 + }, + { + "role": "user", + "content_token_count": 2942, + "target_output_tokens": 722 + }, + { + "role": "user", + "content_token_count": 10438, + "target_output_tokens": 1058 + }, + { + "role": "user", + "content_token_count": 11401, + "target_output_tokens": 517 + }, + { + "role": "user", + "content_token_count": 22839, + "target_output_tokens": 1334 + }, + { + "role": "user", + "content_token_count": 4480, + "target_output_tokens": 409 + }, + { + "role": "user", + "content_token_count": 8627, + "target_output_tokens": 625 + }, + { + "role": "user", + "content_token_count": 2553, + "target_output_tokens": 1775 + }, + { + "role": "user", + "content_token_count": 5008, + "target_output_tokens": 1304 + }, + { + "role": "user", + "content_token_count": 14883, + "target_output_tokens": 920 + }, + { + "role": "user", + "content_token_count": 14845, + "target_output_tokens": 188 + }, + { + "role": "user", + "content_token_count": 7446, + "target_output_tokens": 116 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1555, + "target_output_tokens": 87 + }, + { + "role": "user", + "content_token_count": 4544, + "target_output_tokens": 466 + }, + { + "role": "user", + "content_token_count": 3256, + "target_output_tokens": 560 + }, + { + "role": "user", + "content_token_count": 3753, + "target_output_tokens": 201 + }, + { + "role": "user", + "content_token_count": 12476, + "target_output_tokens": 1849 + }, + { + "role": "user", + "content_token_count": 8975, + "target_output_tokens": 1635 + }, + { + "role": "user", + "content_token_count": 2877, + "target_output_tokens": 355 + }, + { + "role": "user", + "content_token_count": 4514, + "target_output_tokens": 181 + }, + { + "role": "user", + "content_token_count": 5382, + "target_output_tokens": 458 + }, + { + "role": "user", + "content_token_count": 3729, + "target_output_tokens": 292 + }, + { + "role": "user", + "content_token_count": 23202, + "target_output_tokens": 850 + }, + { + "role": "user", + "content_token_count": 6266, + "target_output_tokens": 373 + }, + { + "role": "user", + "content_token_count": 2491, + "target_output_tokens": 651 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5699, + "target_output_tokens": 448 + }, + { + "role": "user", + "content_token_count": 8399, + "target_output_tokens": 96 + }, + { + "role": "user", + "content_token_count": 24606, + "target_output_tokens": 892 + }, + { + "role": "user", + "content_token_count": 1881, + "target_output_tokens": 404 + }, + { + "role": "user", + "content_token_count": 14270, + "target_output_tokens": 302 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2662, + "target_output_tokens": 159 + }, + { + "role": "user", + "content_token_count": 27451, + "target_output_tokens": 742 + }, + { + "role": "user", + "content_token_count": 6138, + "target_output_tokens": 752 + }, + { + "role": "user", + "content_token_count": 3040, + "target_output_tokens": 95 + }, + { + "role": "user", + "content_token_count": 3937, + "target_output_tokens": 394 + }, + { + "role": "user", + "content_token_count": 10143, + "target_output_tokens": 205 + }, + { + "role": "user", + "content_token_count": 4055, + "target_output_tokens": 665 + }, + { + "role": "user", + "content_token_count": 4486, + "target_output_tokens": 491 + } + ] + }, + { + "turns": [ + { + "role": "user", + 
"content_token_count": 11225, + "target_output_tokens": 3158 + }, + { + "role": "user", + "content_token_count": 5709, + "target_output_tokens": 206 + }, + { + "role": "user", + "content_token_count": 8289, + "target_output_tokens": 2061 + }, + { + "role": "user", + "content_token_count": 11501, + "target_output_tokens": 625 + }, + { + "role": "user", + "content_token_count": 3024, + "target_output_tokens": 131 + }, + { + "role": "user", + "content_token_count": 6949, + "target_output_tokens": 743 + }, + { + "role": "user", + "content_token_count": 3555, + "target_output_tokens": 205 + }, + { + "role": "user", + "content_token_count": 4155, + "target_output_tokens": 478 + }, + { + "role": "user", + "content_token_count": 11184, + "target_output_tokens": 279 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 15198, + "target_output_tokens": 865 + }, + { + "role": "user", + "content_token_count": 27300, + "target_output_tokens": 352 + }, + { + "role": "user", + "content_token_count": 4084, + "target_output_tokens": 694 + }, + { + "role": "user", + "content_token_count": 2879, + "target_output_tokens": 643 + }, + { + "role": "user", + "content_token_count": 8411, + "target_output_tokens": 1094 + }, + { + "role": "user", + "content_token_count": 3496, + "target_output_tokens": 845 + }, + { + "role": "user", + "content_token_count": 14540, + "target_output_tokens": 288 + }, + { + "role": "user", + "content_token_count": 4651, + "target_output_tokens": 385 + }, + { + "role": "user", + "content_token_count": 14792, + "target_output_tokens": 842 + }, + { + "role": "user", + "content_token_count": 6271, + "target_output_tokens": 317 + }, + { + "role": "user", + "content_token_count": 7613, + "target_output_tokens": 763 + }, + { + "role": "user", + "content_token_count": 5852, + "target_output_tokens": 418 + }, + { + "role": "user", + "content_token_count": 11166, + "target_output_tokens": 2196 + }, + { + "role": "user", + "content_token_count": 19005, + "target_output_tokens": 1055 + }, + { + "role": "user", + "content_token_count": 5886, + "target_output_tokens": 492 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4062, + "target_output_tokens": 1211 + }, + { + "role": "user", + "content_token_count": 2190, + "target_output_tokens": 717 + }, + { + "role": "user", + "content_token_count": 7556, + "target_output_tokens": 257 + }, + { + "role": "user", + "content_token_count": 5768, + "target_output_tokens": 1324 + }, + { + "role": "user", + "content_token_count": 5463, + "target_output_tokens": 1404 + }, + { + "role": "user", + "content_token_count": 19173, + "target_output_tokens": 808 + }, + { + "role": "user", + "content_token_count": 7797, + "target_output_tokens": 808 + }, + { + "role": "user", + "content_token_count": 4039, + "target_output_tokens": 414 + }, + { + "role": "user", + "content_token_count": 2391, + "target_output_tokens": 436 + }, + { + "role": "user", + "content_token_count": 1957, + "target_output_tokens": 1098 + }, + { + "role": "user", + "content_token_count": 16198, + "target_output_tokens": 852 + }, + { + "role": "user", + "content_token_count": 3101, + "target_output_tokens": 532 + }, + { + "role": "user", + "content_token_count": 4035, + "target_output_tokens": 833 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1220, + "target_output_tokens": 138 + }, + { + "role": "user", + "content_token_count": 14648, + "target_output_tokens": 168 + }, + { + "role": "user", + "content_token_count": 
8228, + "target_output_tokens": 537 + }, + { + "role": "user", + "content_token_count": 2352, + "target_output_tokens": 462 + }, + { + "role": "user", + "content_token_count": 7794, + "target_output_tokens": 259 + }, + { + "role": "user", + "content_token_count": 2734, + "target_output_tokens": 819 + }, + { + "role": "user", + "content_token_count": 17235, + "target_output_tokens": 1471 + }, + { + "role": "user", + "content_token_count": 1357, + "target_output_tokens": 762 + }, + { + "role": "user", + "content_token_count": 10804, + "target_output_tokens": 156 + }, + { + "role": "user", + "content_token_count": 16389, + "target_output_tokens": 983 + }, + { + "role": "user", + "content_token_count": 5074, + "target_output_tokens": 431 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 10280, + "target_output_tokens": 119 + }, + { + "role": "user", + "content_token_count": 4370, + "target_output_tokens": 817 + }, + { + "role": "user", + "content_token_count": 6854, + "target_output_tokens": 1795 + }, + { + "role": "user", + "content_token_count": 15223, + "target_output_tokens": 543 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6116, + "target_output_tokens": 309 + }, + { + "role": "user", + "content_token_count": 6257, + "target_output_tokens": 1301 + }, + { + "role": "user", + "content_token_count": 16623, + "target_output_tokens": 1520 + }, + { + "role": "user", + "content_token_count": 9563, + "target_output_tokens": 1403 + }, + { + "role": "user", + "content_token_count": 9134, + "target_output_tokens": 840 + }, + { + "role": "user", + "content_token_count": 6453, + "target_output_tokens": 388 + }, + { + "role": "user", + "content_token_count": 2951, + "target_output_tokens": 376 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3444, + "target_output_tokens": 414 + }, + { + "role": "user", + "content_token_count": 2321, + "target_output_tokens": 901 + }, + { + "role": "user", + "content_token_count": 3638, + "target_output_tokens": 1425 + }, + { + "role": "user", + "content_token_count": 7123, + "target_output_tokens": 1696 + }, + { + "role": "user", + "content_token_count": 2057, + "target_output_tokens": 351 + }, + { + "role": "user", + "content_token_count": 18346, + "target_output_tokens": 587 + }, + { + "role": "user", + "content_token_count": 9716, + "target_output_tokens": 640 + }, + { + "role": "user", + "content_token_count": 6768, + "target_output_tokens": 388 + }, + { + "role": "user", + "content_token_count": 3788, + "target_output_tokens": 250 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2734, + "target_output_tokens": 1979 + }, + { + "role": "user", + "content_token_count": 4136, + "target_output_tokens": 2452 + }, + { + "role": "user", + "content_token_count": 7721, + "target_output_tokens": 550 + }, + { + "role": "user", + "content_token_count": 1881, + "target_output_tokens": 648 + }, + { + "role": "user", + "content_token_count": 6673, + "target_output_tokens": 406 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6955, + "target_output_tokens": 1459 + }, + { + "role": "user", + "content_token_count": 1014, + "target_output_tokens": 1007 + }, + { + "role": "user", + "content_token_count": 13098, + "target_output_tokens": 1459 + }, + { + "role": "user", + "content_token_count": 4876, + "target_output_tokens": 947 + }, + { + "role": "user", + "content_token_count": 9889, + "target_output_tokens": 1563 + }, + { + "role": "user", + 
"content_token_count": 2544, + "target_output_tokens": 3149 + }, + { + "role": "user", + "content_token_count": 9006, + "target_output_tokens": 245 + }, + { + "role": "user", + "content_token_count": 18694, + "target_output_tokens": 1384 + }, + { + "role": "user", + "content_token_count": 1467, + "target_output_tokens": 1471 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 17406, + "target_output_tokens": 286 + }, + { + "role": "user", + "content_token_count": 3679, + "target_output_tokens": 636 + }, + { + "role": "user", + "content_token_count": 2184, + "target_output_tokens": 321 + }, + { + "role": "user", + "content_token_count": 7967, + "target_output_tokens": 187 + }, + { + "role": "user", + "content_token_count": 6174, + "target_output_tokens": 654 + }, + { + "role": "user", + "content_token_count": 7180, + "target_output_tokens": 270 + }, + { + "role": "user", + "content_token_count": 10946, + "target_output_tokens": 95 + }, + { + "role": "user", + "content_token_count": 2518, + "target_output_tokens": 430 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6603, + "target_output_tokens": 646 + }, + { + "role": "user", + "content_token_count": 10518, + "target_output_tokens": 1096 + }, + { + "role": "user", + "content_token_count": 14848, + "target_output_tokens": 408 + }, + { + "role": "user", + "content_token_count": 2262, + "target_output_tokens": 499 + }, + { + "role": "user", + "content_token_count": 6591, + "target_output_tokens": 662 + }, + { + "role": "user", + "content_token_count": 5042, + "target_output_tokens": 540 + }, + { + "role": "user", + "content_token_count": 14974, + "target_output_tokens": 3408 + }, + { + "role": "user", + "content_token_count": 5658, + "target_output_tokens": 1060 + }, + { + "role": "user", + "content_token_count": 5558, + "target_output_tokens": 1785 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3100, + "target_output_tokens": 849 + }, + { + "role": "user", + "content_token_count": 12776, + "target_output_tokens": 945 + }, + { + "role": "user", + "content_token_count": 2376, + "target_output_tokens": 1003 + }, + { + "role": "user", + "content_token_count": 6865, + "target_output_tokens": 462 + }, + { + "role": "user", + "content_token_count": 3111, + "target_output_tokens": 509 + }, + { + "role": "user", + "content_token_count": 16078, + "target_output_tokens": 342 + }, + { + "role": "user", + "content_token_count": 16493, + "target_output_tokens": 733 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 8957, + "target_output_tokens": 307 + }, + { + "role": "user", + "content_token_count": 19094, + "target_output_tokens": 427 + }, + { + "role": "user", + "content_token_count": 2869, + "target_output_tokens": 405 + }, + { + "role": "user", + "content_token_count": 18384, + "target_output_tokens": 185 + }, + { + "role": "user", + "content_token_count": 6443, + "target_output_tokens": 1522 + }, + { + "role": "user", + "content_token_count": 5348, + "target_output_tokens": 662 + }, + { + "role": "user", + "content_token_count": 3869, + "target_output_tokens": 175 + }, + { + "role": "user", + "content_token_count": 5106, + "target_output_tokens": 761 + }, + { + "role": "user", + "content_token_count": 16260, + "target_output_tokens": 2221 + }, + { + "role": "user", + "content_token_count": 3983, + "target_output_tokens": 90 + }, + { + "role": "user", + "content_token_count": 2900, + "target_output_tokens": 809 + } + ] + }, + { + "turns": [ + 
{ + "role": "user", + "content_token_count": 4829, + "target_output_tokens": 226 + }, + { + "role": "user", + "content_token_count": 2384, + "target_output_tokens": 491 + }, + { + "role": "user", + "content_token_count": 26292, + "target_output_tokens": 659 + }, + { + "role": "user", + "content_token_count": 12843, + "target_output_tokens": 692 + }, + { + "role": "user", + "content_token_count": 3004, + "target_output_tokens": 300 + }, + { + "role": "user", + "content_token_count": 21070, + "target_output_tokens": 1321 + }, + { + "role": "user", + "content_token_count": 12368, + "target_output_tokens": 129 + }, + { + "role": "user", + "content_token_count": 6159, + "target_output_tokens": 1480 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5460, + "target_output_tokens": 249 + }, + { + "role": "user", + "content_token_count": 9185, + "target_output_tokens": 229 + }, + { + "role": "user", + "content_token_count": 29343, + "target_output_tokens": 319 + }, + { + "role": "user", + "content_token_count": 7542, + "target_output_tokens": 1027 + }, + { + "role": "user", + "content_token_count": 3182, + "target_output_tokens": 248 + }, + { + "role": "user", + "content_token_count": 9888, + "target_output_tokens": 1865 + }, + { + "role": "user", + "content_token_count": 7401, + "target_output_tokens": 854 + }, + { + "role": "user", + "content_token_count": 6561, + "target_output_tokens": 654 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6488, + "target_output_tokens": 77 + }, + { + "role": "user", + "content_token_count": 6158, + "target_output_tokens": 374 + }, + { + "role": "user", + "content_token_count": 12575, + "target_output_tokens": 1325 + }, + { + "role": "user", + "content_token_count": 18730, + "target_output_tokens": 325 + }, + { + "role": "user", + "content_token_count": 2581, + "target_output_tokens": 1027 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 1888 + }, + { + "role": "user", + "content_token_count": 1787, + "target_output_tokens": 970 + }, + { + "role": "user", + "content_token_count": 7304, + "target_output_tokens": 181 + }, + { + "role": "user", + "content_token_count": 4038, + "target_output_tokens": 2854 + }, + { + "role": "user", + "content_token_count": 9441, + "target_output_tokens": 985 + }, + { + "role": "user", + "content_token_count": 5386, + "target_output_tokens": 550 + }, + { + "role": "user", + "content_token_count": 895, + "target_output_tokens": 550 + }, + { + "role": "user", + "content_token_count": 3238, + "target_output_tokens": 467 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 9749, + "target_output_tokens": 594 + }, + { + "role": "user", + "content_token_count": 6586, + "target_output_tokens": 303 + }, + { + "role": "user", + "content_token_count": 13734, + "target_output_tokens": 1592 + }, + { + "role": "user", + "content_token_count": 4723, + "target_output_tokens": 2155 + }, + { + "role": "user", + "content_token_count": 19342, + "target_output_tokens": 161 + }, + { + "role": "user", + "content_token_count": 7921, + "target_output_tokens": 130 + }, + { + "role": "user", + "content_token_count": 26045, + "target_output_tokens": 613 + }, + { + "role": "user", + "content_token_count": 9327, + "target_output_tokens": 158 + }, + { + "role": "user", + "content_token_count": 5054, + "target_output_tokens": 652 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 753 + }, + { + "role": "user", + 
"content_token_count": 13763, + "target_output_tokens": 501 + }, + { + "role": "user", + "content_token_count": 7809, + "target_output_tokens": 618 + }, + { + "role": "user", + "content_token_count": 1780, + "target_output_tokens": 1609 + }, + { + "role": "user", + "content_token_count": 13566, + "target_output_tokens": 219 + }, + { + "role": "user", + "content_token_count": 8244, + "target_output_tokens": 707 + }, + { + "role": "user", + "content_token_count": 3690, + "target_output_tokens": 2575 + }, + { + "role": "user", + "content_token_count": 8579, + "target_output_tokens": 289 + }, + { + "role": "user", + "content_token_count": 13461, + "target_output_tokens": 835 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7460, + "target_output_tokens": 564 + }, + { + "role": "user", + "content_token_count": 12306, + "target_output_tokens": 643 + }, + { + "role": "user", + "content_token_count": 4237, + "target_output_tokens": 436 + }, + { + "role": "user", + "content_token_count": 2239, + "target_output_tokens": 1437 + }, + { + "role": "user", + "content_token_count": 4323, + "target_output_tokens": 1610 + }, + { + "role": "user", + "content_token_count": 8322, + "target_output_tokens": 628 + }, + { + "role": "user", + "content_token_count": 8307, + "target_output_tokens": 321 + }, + { + "role": "user", + "content_token_count": 8038, + "target_output_tokens": 221 + }, + { + "role": "user", + "content_token_count": 9312, + "target_output_tokens": 119 + }, + { + "role": "user", + "content_token_count": 8570, + "target_output_tokens": 1070 + }, + { + "role": "user", + "content_token_count": 43634, + "target_output_tokens": 801 + }, + { + "role": "user", + "content_token_count": 9896, + "target_output_tokens": 559 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 11595, + "target_output_tokens": 458 + }, + { + "role": "user", + "content_token_count": 8292, + "target_output_tokens": 942 + }, + { + "role": "user", + "content_token_count": 3946, + "target_output_tokens": 490 + }, + { + "role": "user", + "content_token_count": 2955, + "target_output_tokens": 712 + }, + { + "role": "user", + "content_token_count": 4839, + "target_output_tokens": 272 + }, + { + "role": "user", + "content_token_count": 4011, + "target_output_tokens": 335 + }, + { + "role": "user", + "content_token_count": 5086, + "target_output_tokens": 315 + }, + { + "role": "user", + "content_token_count": 5209, + "target_output_tokens": 764 + }, + { + "role": "user", + "content_token_count": 6710, + "target_output_tokens": 146 + }, + { + "role": "user", + "content_token_count": 2382, + "target_output_tokens": 277 + }, + { + "role": "user", + "content_token_count": 18762, + "target_output_tokens": 312 + }, + { + "role": "user", + "content_token_count": 3554, + "target_output_tokens": 393 + }, + { + "role": "user", + "content_token_count": 10240, + "target_output_tokens": 130 + }, + { + "role": "user", + "content_token_count": 10301, + "target_output_tokens": 986 + }, + { + "role": "user", + "content_token_count": 4008, + "target_output_tokens": 461 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 21422, + "target_output_tokens": 346 + }, + { + "role": "user", + "content_token_count": 5246, + "target_output_tokens": 217 + }, + { + "role": "user", + "content_token_count": 13646, + "target_output_tokens": 499 + }, + { + "role": "user", + "content_token_count": 5532, + "target_output_tokens": 249 + }, + { + "role": "user", + "content_token_count": 5178, + 
"target_output_tokens": 149 + }, + { + "role": "user", + "content_token_count": 1034, + "target_output_tokens": 316 + }, + { + "role": "user", + "content_token_count": 3570, + "target_output_tokens": 318 + }, + { + "role": "user", + "content_token_count": 9334, + "target_output_tokens": 1761 + }, + { + "role": "user", + "content_token_count": 4071, + "target_output_tokens": 227 + }, + { + "role": "user", + "content_token_count": 11734, + "target_output_tokens": 340 + }, + { + "role": "user", + "content_token_count": 5927, + "target_output_tokens": 302 + }, + { + "role": "user", + "content_token_count": 7918, + "target_output_tokens": 337 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2647, + "target_output_tokens": 301 + }, + { + "role": "user", + "content_token_count": 14271, + "target_output_tokens": 1313 + }, + { + "role": "user", + "content_token_count": 5670, + "target_output_tokens": 954 + }, + { + "role": "user", + "content_token_count": 5014, + "target_output_tokens": 2103 + }, + { + "role": "user", + "content_token_count": 14137, + "target_output_tokens": 997 + }, + { + "role": "user", + "content_token_count": 8872, + "target_output_tokens": 1332 + }, + { + "role": "user", + "content_token_count": 2096, + "target_output_tokens": 4096 + }, + { + "role": "user", + "content_token_count": 16766, + "target_output_tokens": 587 + }, + { + "role": "user", + "content_token_count": 5742, + "target_output_tokens": 493 + }, + { + "role": "user", + "content_token_count": 21664, + "target_output_tokens": 696 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3432, + "target_output_tokens": 203 + }, + { + "role": "user", + "content_token_count": 4013, + "target_output_tokens": 79 + }, + { + "role": "user", + "content_token_count": 23484, + "target_output_tokens": 220 + }, + { + "role": "user", + "content_token_count": 1546, + "target_output_tokens": 289 + }, + { + "role": "user", + "content_token_count": 4542, + "target_output_tokens": 515 + }, + { + "role": "user", + "content_token_count": 5260, + "target_output_tokens": 378 + }, + { + "role": "user", + "content_token_count": 5487, + "target_output_tokens": 654 + }, + { + "role": "user", + "content_token_count": 7881, + "target_output_tokens": 380 + }, + { + "role": "user", + "content_token_count": 3358, + "target_output_tokens": 687 + }, + { + "role": "user", + "content_token_count": 11898, + "target_output_tokens": 180 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 38833, + "target_output_tokens": 534 + }, + { + "role": "user", + "content_token_count": 5781, + "target_output_tokens": 725 + }, + { + "role": "user", + "content_token_count": 7261, + "target_output_tokens": 165 + }, + { + "role": "user", + "content_token_count": 1280, + "target_output_tokens": 129 + }, + { + "role": "user", + "content_token_count": 5792, + "target_output_tokens": 466 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 10544, + "target_output_tokens": 692 + }, + { + "role": "user", + "content_token_count": 15136, + "target_output_tokens": 836 + }, + { + "role": "user", + "content_token_count": 5686, + "target_output_tokens": 1758 + }, + { + "role": "user", + "content_token_count": 12712, + "target_output_tokens": 2240 + }, + { + "role": "user", + "content_token_count": 4875, + "target_output_tokens": 482 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 60523, + "target_output_tokens": 271 + }, + { + "role": "user", + 
"content_token_count": 10297, + "target_output_tokens": 631 + }, + { + "role": "user", + "content_token_count": 16059, + "target_output_tokens": 648 + }, + { + "role": "user", + "content_token_count": 20684, + "target_output_tokens": 487 + }, + { + "role": "user", + "content_token_count": 6343, + "target_output_tokens": 637 + }, + { + "role": "user", + "content_token_count": 29821, + "target_output_tokens": 436 + }, + { + "role": "user", + "content_token_count": 2615, + "target_output_tokens": 187 + }, + { + "role": "user", + "content_token_count": 4564, + "target_output_tokens": 980 + }, + { + "role": "user", + "content_token_count": 7889, + "target_output_tokens": 907 + }, + { + "role": "user", + "content_token_count": 14777, + "target_output_tokens": 361 + }, + { + "role": "user", + "content_token_count": 5646, + "target_output_tokens": 1521 + }, + { + "role": "user", + "content_token_count": 13268, + "target_output_tokens": 554 + }, + { + "role": "user", + "content_token_count": 10637, + "target_output_tokens": 1013 + }, + { + "role": "user", + "content_token_count": 5757, + "target_output_tokens": 1339 + }, + { + "role": "user", + "content_token_count": 5184, + "target_output_tokens": 628 + }, + { + "role": "user", + "content_token_count": 12479, + "target_output_tokens": 792 + }, + { + "role": "user", + "content_token_count": 18012, + "target_output_tokens": 167 + }, + { + "role": "user", + "content_token_count": 14643, + "target_output_tokens": 532 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1938, + "target_output_tokens": 1098 + }, + { + "role": "user", + "content_token_count": 685, + "target_output_tokens": 986 + }, + { + "role": "user", + "content_token_count": 3023, + "target_output_tokens": 292 + }, + { + "role": "user", + "content_token_count": 26370, + "target_output_tokens": 332 + }, + { + "role": "user", + "content_token_count": 7935, + "target_output_tokens": 179 + }, + { + "role": "user", + "content_token_count": 2052, + "target_output_tokens": 99 + }, + { + "role": "user", + "content_token_count": 5165, + "target_output_tokens": 747 + }, + { + "role": "user", + "content_token_count": 13734, + "target_output_tokens": 435 + }, + { + "role": "user", + "content_token_count": 979, + "target_output_tokens": 760 + }, + { + "role": "user", + "content_token_count": 4084, + "target_output_tokens": 604 + }, + { + "role": "user", + "content_token_count": 19546, + "target_output_tokens": 183 + }, + { + "role": "user", + "content_token_count": 1609, + "target_output_tokens": 191 + }, + { + "role": "user", + "content_token_count": 3857, + "target_output_tokens": 1024 + }, + { + "role": "user", + "content_token_count": 21131, + "target_output_tokens": 1830 + }, + { + "role": "user", + "content_token_count": 4129, + "target_output_tokens": 343 + }, + { + "role": "user", + "content_token_count": 30740, + "target_output_tokens": 635 + }, + { + "role": "user", + "content_token_count": 10871, + "target_output_tokens": 995 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 8416, + "target_output_tokens": 664 + }, + { + "role": "user", + "content_token_count": 6856, + "target_output_tokens": 360 + }, + { + "role": "user", + "content_token_count": 12991, + "target_output_tokens": 1554 + }, + { + "role": "user", + "content_token_count": 2681, + "target_output_tokens": 1392 + }, + { + "role": "user", + "content_token_count": 2083, + "target_output_tokens": 1322 + }, + { + "role": "user", + "content_token_count": 2529, + 
"target_output_tokens": 862 + }, + { + "role": "user", + "content_token_count": 4854, + "target_output_tokens": 412 + }, + { + "role": "user", + "content_token_count": 5826, + "target_output_tokens": 904 + }, + { + "role": "user", + "content_token_count": 1412, + "target_output_tokens": 197 + }, + { + "role": "user", + "content_token_count": 16884, + "target_output_tokens": 319 + }, + { + "role": "user", + "content_token_count": 2209, + "target_output_tokens": 370 + }, + { + "role": "user", + "content_token_count": 6010, + "target_output_tokens": 1294 + }, + { + "role": "user", + "content_token_count": 19805, + "target_output_tokens": 2855 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7510, + "target_output_tokens": 354 + }, + { + "role": "user", + "content_token_count": 20508, + "target_output_tokens": 390 + }, + { + "role": "user", + "content_token_count": 14364, + "target_output_tokens": 234 + }, + { + "role": "user", + "content_token_count": 5578, + "target_output_tokens": 672 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7461, + "target_output_tokens": 2138 + }, + { + "role": "user", + "content_token_count": 8915, + "target_output_tokens": 721 + }, + { + "role": "user", + "content_token_count": 827, + "target_output_tokens": 458 + }, + { + "role": "user", + "content_token_count": 5858, + "target_output_tokens": 252 + }, + { + "role": "user", + "content_token_count": 3199, + "target_output_tokens": 864 + }, + { + "role": "user", + "content_token_count": 17479, + "target_output_tokens": 387 + }, + { + "role": "user", + "content_token_count": 6488, + "target_output_tokens": 768 + }, + { + "role": "user", + "content_token_count": 11265, + "target_output_tokens": 797 + }, + { + "role": "user", + "content_token_count": 6991, + "target_output_tokens": 802 + }, + { + "role": "user", + "content_token_count": 12962, + "target_output_tokens": 559 + }, + { + "role": "user", + "content_token_count": 6638, + "target_output_tokens": 2509 + }, + { + "role": "user", + "content_token_count": 2297, + "target_output_tokens": 803 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 11614, + "target_output_tokens": 248 + }, + { + "role": "user", + "content_token_count": 3234, + "target_output_tokens": 64 + }, + { + "role": "user", + "content_token_count": 18001, + "target_output_tokens": 64 + }, + { + "role": "user", + "content_token_count": 17797, + "target_output_tokens": 792 + }, + { + "role": "user", + "content_token_count": 15525, + "target_output_tokens": 341 + }, + { + "role": "user", + "content_token_count": 11380, + "target_output_tokens": 308 + }, + { + "role": "user", + "content_token_count": 20150, + "target_output_tokens": 336 + }, + { + "role": "user", + "content_token_count": 10705, + "target_output_tokens": 149 + }, + { + "role": "user", + "content_token_count": 5871, + "target_output_tokens": 432 + }, + { + "role": "user", + "content_token_count": 5526, + "target_output_tokens": 406 + }, + { + "role": "user", + "content_token_count": 7675, + "target_output_tokens": 1587 + }, + { + "role": "user", + "content_token_count": 2277, + "target_output_tokens": 1478 + }, + { + "role": "user", + "content_token_count": 9244, + "target_output_tokens": 168 + }, + { + "role": "user", + "content_token_count": 9135, + "target_output_tokens": 141 + }, + { + "role": "user", + "content_token_count": 6477, + "target_output_tokens": 847 + }, + { + "role": "user", + "content_token_count": 5213, + "target_output_tokens": 381 + 
} + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 11902, + "target_output_tokens": 363 + }, + { + "role": "user", + "content_token_count": 4133, + "target_output_tokens": 763 + }, + { + "role": "user", + "content_token_count": 34974, + "target_output_tokens": 595 + }, + { + "role": "user", + "content_token_count": 3005, + "target_output_tokens": 748 + }, + { + "role": "user", + "content_token_count": 13140, + "target_output_tokens": 1585 + }, + { + "role": "user", + "content_token_count": 10800, + "target_output_tokens": 451 + }, + { + "role": "user", + "content_token_count": 7703, + "target_output_tokens": 308 + }, + { + "role": "user", + "content_token_count": 6180, + "target_output_tokens": 421 + }, + { + "role": "user", + "content_token_count": 7095, + "target_output_tokens": 2469 + }, + { + "role": "user", + "content_token_count": 27521, + "target_output_tokens": 645 + }, + { + "role": "user", + "content_token_count": 14207, + "target_output_tokens": 615 + }, + { + "role": "user", + "content_token_count": 7467, + "target_output_tokens": 736 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 20561, + "target_output_tokens": 111 + }, + { + "role": "user", + "content_token_count": 1000, + "target_output_tokens": 934 + }, + { + "role": "user", + "content_token_count": 32461, + "target_output_tokens": 115 + }, + { + "role": "user", + "content_token_count": 7010, + "target_output_tokens": 128 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 567 + }, + { + "role": "user", + "content_token_count": 9176, + "target_output_tokens": 146 + }, + { + "role": "user", + "content_token_count": 11138, + "target_output_tokens": 2089 + }, + { + "role": "user", + "content_token_count": 24757, + "target_output_tokens": 204 + }, + { + "role": "user", + "content_token_count": 6580, + "target_output_tokens": 1229 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4856, + "target_output_tokens": 587 + }, + { + "role": "user", + "content_token_count": 4192, + "target_output_tokens": 631 + }, + { + "role": "user", + "content_token_count": 7377, + "target_output_tokens": 358 + }, + { + "role": "user", + "content_token_count": 4030, + "target_output_tokens": 437 + }, + { + "role": "user", + "content_token_count": 8482, + "target_output_tokens": 404 + }, + { + "role": "user", + "content_token_count": 10934, + "target_output_tokens": 397 + }, + { + "role": "user", + "content_token_count": 5271, + "target_output_tokens": 105 + }, + { + "role": "user", + "content_token_count": 1504, + "target_output_tokens": 207 + }, + { + "role": "user", + "content_token_count": 12542, + "target_output_tokens": 497 + }, + { + "role": "user", + "content_token_count": 3169, + "target_output_tokens": 418 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 34022, + "target_output_tokens": 920 + }, + { + "role": "user", + "content_token_count": 4306, + "target_output_tokens": 383 + }, + { + "role": "user", + "content_token_count": 3490, + "target_output_tokens": 1086 + }, + { + "role": "user", + "content_token_count": 3939, + "target_output_tokens": 1038 + }, + { + "role": "user", + "content_token_count": 26508, + "target_output_tokens": 1136 + }, + { + "role": "user", + "content_token_count": 7044, + "target_output_tokens": 3317 + }, + { + "role": "user", + "content_token_count": 2441, + "target_output_tokens": 962 + }, + { + "role": "user", + "content_token_count": 2360, + "target_output_tokens": 442 + 
} + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 13707, + "target_output_tokens": 159 + }, + { + "role": "user", + "content_token_count": 3362, + "target_output_tokens": 495 + }, + { + "role": "user", + "content_token_count": 3014, + "target_output_tokens": 156 + }, + { + "role": "user", + "content_token_count": 9534, + "target_output_tokens": 430 + }, + { + "role": "user", + "content_token_count": 8037, + "target_output_tokens": 724 + }, + { + "role": "user", + "content_token_count": 12462, + "target_output_tokens": 814 + }, + { + "role": "user", + "content_token_count": 18227, + "target_output_tokens": 371 + }, + { + "role": "user", + "content_token_count": 2077, + "target_output_tokens": 867 + }, + { + "role": "user", + "content_token_count": 10950, + "target_output_tokens": 412 + }, + { + "role": "user", + "content_token_count": 12169, + "target_output_tokens": 331 + }, + { + "role": "user", + "content_token_count": 4436, + "target_output_tokens": 260 + }, + { + "role": "user", + "content_token_count": 2961, + "target_output_tokens": 952 + }, + { + "role": "user", + "content_token_count": 21323, + "target_output_tokens": 1066 + }, + { + "role": "user", + "content_token_count": 14035, + "target_output_tokens": 1134 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 14500, + "target_output_tokens": 1813 + }, + { + "role": "user", + "content_token_count": 4751, + "target_output_tokens": 1726 + }, + { + "role": "user", + "content_token_count": 14083, + "target_output_tokens": 444 + }, + { + "role": "user", + "content_token_count": 2668, + "target_output_tokens": 199 + }, + { + "role": "user", + "content_token_count": 6391, + "target_output_tokens": 3392 + }, + { + "role": "user", + "content_token_count": 33050, + "target_output_tokens": 2319 + }, + { + "role": "user", + "content_token_count": 19617, + "target_output_tokens": 401 + }, + { + "role": "user", + "content_token_count": 9052, + "target_output_tokens": 220 + }, + { + "role": "user", + "content_token_count": 21741, + "target_output_tokens": 1047 + }, + { + "role": "user", + "content_token_count": 19064, + "target_output_tokens": 340 + }, + { + "role": "user", + "content_token_count": 1184, + "target_output_tokens": 804 + }, + { + "role": "user", + "content_token_count": 50708, + "target_output_tokens": 1268 + }, + { + "role": "user", + "content_token_count": 1043, + "target_output_tokens": 528 + }, + { + "role": "user", + "content_token_count": 7976, + "target_output_tokens": 600 + }, + { + "role": "user", + "content_token_count": 2967, + "target_output_tokens": 193 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4241, + "target_output_tokens": 1292 + }, + { + "role": "user", + "content_token_count": 8073, + "target_output_tokens": 1244 + }, + { + "role": "user", + "content_token_count": 21650, + "target_output_tokens": 603 + }, + { + "role": "user", + "content_token_count": 30704, + "target_output_tokens": 109 + }, + { + "role": "user", + "content_token_count": 3793, + "target_output_tokens": 486 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 455 + }, + { + "role": "user", + "content_token_count": 12867, + "target_output_tokens": 244 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5205, + "target_output_tokens": 190 + }, + { + "role": "user", + "content_token_count": 9530, + "target_output_tokens": 323 + }, + { + "role": "user", + "content_token_count": 5813, + "target_output_tokens": 
662 + }, + { + "role": "user", + "content_token_count": 6079, + "target_output_tokens": 710 + }, + { + "role": "user", + "content_token_count": 3766, + "target_output_tokens": 319 + }, + { + "role": "user", + "content_token_count": 10983, + "target_output_tokens": 419 + }, + { + "role": "user", + "content_token_count": 38098, + "target_output_tokens": 897 + }, + { + "role": "user", + "content_token_count": 7410, + "target_output_tokens": 1273 + }, + { + "role": "user", + "content_token_count": 6534, + "target_output_tokens": 439 + }, + { + "role": "user", + "content_token_count": 2603, + "target_output_tokens": 363 + }, + { + "role": "user", + "content_token_count": 4395, + "target_output_tokens": 72 + }, + { + "role": "user", + "content_token_count": 6739, + "target_output_tokens": 424 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 23588, + "target_output_tokens": 164 + }, + { + "role": "user", + "content_token_count": 17832, + "target_output_tokens": 506 + }, + { + "role": "user", + "content_token_count": 22461, + "target_output_tokens": 198 + }, + { + "role": "user", + "content_token_count": 10329, + "target_output_tokens": 1380 + }, + { + "role": "user", + "content_token_count": 16613, + "target_output_tokens": 523 + }, + { + "role": "user", + "content_token_count": 18924, + "target_output_tokens": 1091 + }, + { + "role": "user", + "content_token_count": 6640, + "target_output_tokens": 936 + }, + { + "role": "user", + "content_token_count": 5752, + "target_output_tokens": 1079 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 16422, + "target_output_tokens": 611 + }, + { + "role": "user", + "content_token_count": 8736, + "target_output_tokens": 1393 + }, + { + "role": "user", + "content_token_count": 30989, + "target_output_tokens": 357 + }, + { + "role": "user", + "content_token_count": 32378, + "target_output_tokens": 365 + }, + { + "role": "user", + "content_token_count": 4826, + "target_output_tokens": 1142 + }, + { + "role": "user", + "content_token_count": 7705, + "target_output_tokens": 2254 + }, + { + "role": "user", + "content_token_count": 1630, + "target_output_tokens": 1219 + }, + { + "role": "user", + "content_token_count": 5323, + "target_output_tokens": 838 + }, + { + "role": "user", + "content_token_count": 21581, + "target_output_tokens": 654 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 8355, + "target_output_tokens": 529 + }, + { + "role": "user", + "content_token_count": 33639, + "target_output_tokens": 650 + }, + { + "role": "user", + "content_token_count": 9794, + "target_output_tokens": 355 + }, + { + "role": "user", + "content_token_count": 5952, + "target_output_tokens": 608 + }, + { + "role": "user", + "content_token_count": 7696, + "target_output_tokens": 163 + }, + { + "role": "user", + "content_token_count": 8151, + "target_output_tokens": 108 + }, + { + "role": "user", + "content_token_count": 11377, + "target_output_tokens": 486 + }, + { + "role": "user", + "content_token_count": 2795, + "target_output_tokens": 765 + }, + { + "role": "user", + "content_token_count": 8478, + "target_output_tokens": 361 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3254, + "target_output_tokens": 524 + }, + { + "role": "user", + "content_token_count": 13573, + "target_output_tokens": 1371 + }, + { + "role": "user", + "content_token_count": 4347, + "target_output_tokens": 538 + }, + { + "role": "user", + "content_token_count": 52807, + 
"target_output_tokens": 1303 + }, + { + "role": "user", + "content_token_count": 6319, + "target_output_tokens": 278 + }, + { + "role": "user", + "content_token_count": 4295, + "target_output_tokens": 640 + }, + { + "role": "user", + "content_token_count": 2030, + "target_output_tokens": 358 + }, + { + "role": "user", + "content_token_count": 13300, + "target_output_tokens": 504 + }, + { + "role": "user", + "content_token_count": 4151, + "target_output_tokens": 1040 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 10729, + "target_output_tokens": 621 + }, + { + "role": "user", + "content_token_count": 6674, + "target_output_tokens": 433 + }, + { + "role": "user", + "content_token_count": 11618, + "target_output_tokens": 156 + }, + { + "role": "user", + "content_token_count": 13713, + "target_output_tokens": 934 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 9731, + "target_output_tokens": 318 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 507 + }, + { + "role": "user", + "content_token_count": 3019, + "target_output_tokens": 450 + }, + { + "role": "user", + "content_token_count": 10288, + "target_output_tokens": 668 + }, + { + "role": "user", + "content_token_count": 22301, + "target_output_tokens": 815 + }, + { + "role": "user", + "content_token_count": 5283, + "target_output_tokens": 275 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3544, + "target_output_tokens": 843 + }, + { + "role": "user", + "content_token_count": 7783, + "target_output_tokens": 332 + }, + { + "role": "user", + "content_token_count": 2684, + "target_output_tokens": 845 + }, + { + "role": "user", + "content_token_count": 10549, + "target_output_tokens": 275 + }, + { + "role": "user", + "content_token_count": 9460, + "target_output_tokens": 608 + }, + { + "role": "user", + "content_token_count": 3164, + "target_output_tokens": 542 + }, + { + "role": "user", + "content_token_count": 3760, + "target_output_tokens": 494 + }, + { + "role": "user", + "content_token_count": 5991, + "target_output_tokens": 458 + }, + { + "role": "user", + "content_token_count": 3873, + "target_output_tokens": 800 + }, + { + "role": "user", + "content_token_count": 4054, + "target_output_tokens": 400 + }, + { + "role": "user", + "content_token_count": 3102, + "target_output_tokens": 2786 + }, + { + "role": "user", + "content_token_count": 5452, + "target_output_tokens": 3343 + }, + { + "role": "user", + "content_token_count": 2904, + "target_output_tokens": 483 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2269, + "target_output_tokens": 738 + }, + { + "role": "user", + "content_token_count": 18252, + "target_output_tokens": 64 + }, + { + "role": "user", + "content_token_count": 16077, + "target_output_tokens": 369 + }, + { + "role": "user", + "content_token_count": 2591, + "target_output_tokens": 1498 + }, + { + "role": "user", + "content_token_count": 955, + "target_output_tokens": 964 + }, + { + "role": "user", + "content_token_count": 15421, + "target_output_tokens": 1148 + }, + { + "role": "user", + "content_token_count": 26417, + "target_output_tokens": 282 + }, + { + "role": "user", + "content_token_count": 2450, + "target_output_tokens": 641 + }, + { + "role": "user", + "content_token_count": 3723, + "target_output_tokens": 1544 + }, + { + "role": "user", + "content_token_count": 24848, + "target_output_tokens": 1652 + }, + { + "role": "user", + "content_token_count": 1198, + 
"target_output_tokens": 303 + }, + { + "role": "user", + "content_token_count": 3660, + "target_output_tokens": 378 + }, + { + "role": "user", + "content_token_count": 8385, + "target_output_tokens": 971 + }, + { + "role": "user", + "content_token_count": 17089, + "target_output_tokens": 146 + }, + { + "role": "user", + "content_token_count": 13626, + "target_output_tokens": 1436 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6980, + "target_output_tokens": 779 + }, + { + "role": "user", + "content_token_count": 14266, + "target_output_tokens": 998 + }, + { + "role": "user", + "content_token_count": 19395, + "target_output_tokens": 931 + }, + { + "role": "user", + "content_token_count": 27605, + "target_output_tokens": 864 + }, + { + "role": "user", + "content_token_count": 7245, + "target_output_tokens": 462 + }, + { + "role": "user", + "content_token_count": 3242, + "target_output_tokens": 90 + }, + { + "role": "user", + "content_token_count": 2781, + "target_output_tokens": 1296 + }, + { + "role": "user", + "content_token_count": 1676, + "target_output_tokens": 1609 + }, + { + "role": "user", + "content_token_count": 9287, + "target_output_tokens": 1339 + }, + { + "role": "user", + "content_token_count": 7842, + "target_output_tokens": 686 + }, + { + "role": "user", + "content_token_count": 7397, + "target_output_tokens": 133 + }, + { + "role": "user", + "content_token_count": 12946, + "target_output_tokens": 579 + }, + { + "role": "user", + "content_token_count": 6842, + "target_output_tokens": 1282 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 14195, + "target_output_tokens": 466 + }, + { + "role": "user", + "content_token_count": 4463, + "target_output_tokens": 558 + }, + { + "role": "user", + "content_token_count": 1089, + "target_output_tokens": 2126 + }, + { + "role": "user", + "content_token_count": 9114, + "target_output_tokens": 483 + }, + { + "role": "user", + "content_token_count": 4745, + "target_output_tokens": 810 + }, + { + "role": "user", + "content_token_count": 11648, + "target_output_tokens": 395 + }, + { + "role": "user", + "content_token_count": 2438, + "target_output_tokens": 444 + }, + { + "role": "user", + "content_token_count": 15094, + "target_output_tokens": 357 + }, + { + "role": "user", + "content_token_count": 5004, + "target_output_tokens": 1692 + }, + { + "role": "user", + "content_token_count": 17422, + "target_output_tokens": 161 + }, + { + "role": "user", + "content_token_count": 18830, + "target_output_tokens": 350 + }, + { + "role": "user", + "content_token_count": 3203, + "target_output_tokens": 1336 + }, + { + "role": "user", + "content_token_count": 4912, + "target_output_tokens": 1071 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 10200, + "target_output_tokens": 315 + }, + { + "role": "user", + "content_token_count": 43481, + "target_output_tokens": 953 + }, + { + "role": "user", + "content_token_count": 6381, + "target_output_tokens": 473 + }, + { + "role": "user", + "content_token_count": 2352, + "target_output_tokens": 361 + }, + { + "role": "user", + "content_token_count": 11246, + "target_output_tokens": 486 + }, + { + "role": "user", + "content_token_count": 38916, + "target_output_tokens": 252 + }, + { + "role": "user", + "content_token_count": 29292, + "target_output_tokens": 332 + }, + { + "role": "user", + "content_token_count": 7163, + "target_output_tokens": 737 + }, + { + "role": "user", + "content_token_count": 4145, + "target_output_tokens": 
316 + }, + { + "role": "user", + "content_token_count": 4769, + "target_output_tokens": 298 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5594, + "target_output_tokens": 1686 + }, + { + "role": "user", + "content_token_count": 4311, + "target_output_tokens": 398 + }, + { + "role": "user", + "content_token_count": 13684, + "target_output_tokens": 419 + }, + { + "role": "user", + "content_token_count": 33855, + "target_output_tokens": 188 + }, + { + "role": "user", + "content_token_count": 2118, + "target_output_tokens": 1128 + }, + { + "role": "user", + "content_token_count": 2030, + "target_output_tokens": 184 + }, + { + "role": "user", + "content_token_count": 10739, + "target_output_tokens": 561 + }, + { + "role": "user", + "content_token_count": 5555, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 16640, + "target_output_tokens": 668 + }, + { + "role": "user", + "content_token_count": 23253, + "target_output_tokens": 884 + }, + { + "role": "user", + "content_token_count": 3965, + "target_output_tokens": 740 + }, + { + "role": "user", + "content_token_count": 8551, + "target_output_tokens": 1807 + }, + { + "role": "user", + "content_token_count": 3578, + "target_output_tokens": 766 + }, + { + "role": "user", + "content_token_count": 4639, + "target_output_tokens": 1157 + }, + { + "role": "user", + "content_token_count": 6212, + "target_output_tokens": 437 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5004, + "target_output_tokens": 178 + }, + { + "role": "user", + "content_token_count": 5596, + "target_output_tokens": 867 + }, + { + "role": "user", + "content_token_count": 12366, + "target_output_tokens": 1221 + }, + { + "role": "user", + "content_token_count": 5092, + "target_output_tokens": 167 + }, + { + "role": "user", + "content_token_count": 11259, + "target_output_tokens": 286 + }, + { + "role": "user", + "content_token_count": 18357, + "target_output_tokens": 1419 + }, + { + "role": "user", + "content_token_count": 12445, + "target_output_tokens": 425 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1753, + "target_output_tokens": 457 + }, + { + "role": "user", + "content_token_count": 4410, + "target_output_tokens": 138 + }, + { + "role": "user", + "content_token_count": 3759, + "target_output_tokens": 295 + }, + { + "role": "user", + "content_token_count": 11816, + "target_output_tokens": 830 + }, + { + "role": "user", + "content_token_count": 16209, + "target_output_tokens": 141 + }, + { + "role": "user", + "content_token_count": 46023, + "target_output_tokens": 2056 + }, + { + "role": "user", + "content_token_count": 5420, + "target_output_tokens": 422 + }, + { + "role": "user", + "content_token_count": 2445, + "target_output_tokens": 2119 + }, + { + "role": "user", + "content_token_count": 3724, + "target_output_tokens": 1277 + }, + { + "role": "user", + "content_token_count": 3168, + "target_output_tokens": 391 + }, + { + "role": "user", + "content_token_count": 9061, + "target_output_tokens": 1199 + }, + { + "role": "user", + "content_token_count": 4255, + "target_output_tokens": 1880 + }, + { + "role": "user", + "content_token_count": 20542, + "target_output_tokens": 449 + }, + { + "role": "user", + "content_token_count": 18541, + "target_output_tokens": 211 + }, + { + "role": "user", + "content_token_count": 17405, + "target_output_tokens": 878 + }, + { + "role": "user", + "content_token_count": 7086, + "target_output_tokens": 396 + }, + { + 
"role": "user", + "content_token_count": 4469, + "target_output_tokens": 189 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4594, + "target_output_tokens": 567 + }, + { + "role": "user", + "content_token_count": 15961, + "target_output_tokens": 276 + }, + { + "role": "user", + "content_token_count": 18817, + "target_output_tokens": 296 + }, + { + "role": "user", + "content_token_count": 8980, + "target_output_tokens": 446 + }, + { + "role": "user", + "content_token_count": 13739, + "target_output_tokens": 476 + }, + { + "role": "user", + "content_token_count": 4954, + "target_output_tokens": 1124 + }, + { + "role": "user", + "content_token_count": 7155, + "target_output_tokens": 2553 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 8108, + "target_output_tokens": 337 + }, + { + "role": "user", + "content_token_count": 7213, + "target_output_tokens": 198 + }, + { + "role": "user", + "content_token_count": 6441, + "target_output_tokens": 932 + }, + { + "role": "user", + "content_token_count": 25889, + "target_output_tokens": 494 + }, + { + "role": "user", + "content_token_count": 5672, + "target_output_tokens": 322 + }, + { + "role": "user", + "content_token_count": 6174, + "target_output_tokens": 984 + }, + { + "role": "user", + "content_token_count": 13080, + "target_output_tokens": 594 + }, + { + "role": "user", + "content_token_count": 23119, + "target_output_tokens": 64 + }, + { + "role": "user", + "content_token_count": 10812, + "target_output_tokens": 939 + }, + { + "role": "user", + "content_token_count": 27801, + "target_output_tokens": 925 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3640, + "target_output_tokens": 108 + }, + { + "role": "user", + "content_token_count": 2053, + "target_output_tokens": 655 + }, + { + "role": "user", + "content_token_count": 16255, + "target_output_tokens": 1911 + }, + { + "role": "user", + "content_token_count": 13439, + "target_output_tokens": 629 + }, + { + "role": "user", + "content_token_count": 25472, + "target_output_tokens": 1323 + }, + { + "role": "user", + "content_token_count": 10114, + "target_output_tokens": 674 + }, + { + "role": "user", + "content_token_count": 1708, + "target_output_tokens": 1493 + }, + { + "role": "user", + "content_token_count": 5384, + "target_output_tokens": 1587 + }, + { + "role": "user", + "content_token_count": 6730, + "target_output_tokens": 408 + }, + { + "role": "user", + "content_token_count": 1746, + "target_output_tokens": 413 + }, + { + "role": "user", + "content_token_count": 1684, + "target_output_tokens": 1349 + }, + { + "role": "user", + "content_token_count": 22551, + "target_output_tokens": 426 + }, + { + "role": "user", + "content_token_count": 10297, + "target_output_tokens": 772 + }, + { + "role": "user", + "content_token_count": 13002, + "target_output_tokens": 1444 + }, + { + "role": "user", + "content_token_count": 16737, + "target_output_tokens": 1199 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7675, + "target_output_tokens": 354 + }, + { + "role": "user", + "content_token_count": 5654, + "target_output_tokens": 220 + }, + { + "role": "user", + "content_token_count": 946, + "target_output_tokens": 515 + }, + { + "role": "user", + "content_token_count": 6573, + "target_output_tokens": 1712 + }, + { + "role": "user", + "content_token_count": 47344, + "target_output_tokens": 554 + }, + { + "role": "user", + "content_token_count": 10099, + "target_output_tokens": 1064 + } + 
] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4184, + "target_output_tokens": 213 + }, + { + "role": "user", + "content_token_count": 20020, + "target_output_tokens": 727 + }, + { + "role": "user", + "content_token_count": 5788, + "target_output_tokens": 464 + }, + { + "role": "user", + "content_token_count": 16426, + "target_output_tokens": 188 + }, + { + "role": "user", + "content_token_count": 6170, + "target_output_tokens": 1080 + }, + { + "role": "user", + "content_token_count": 12316, + "target_output_tokens": 659 + }, + { + "role": "user", + "content_token_count": 2817, + "target_output_tokens": 148 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 14649, + "target_output_tokens": 769 + }, + { + "role": "user", + "content_token_count": 13707, + "target_output_tokens": 314 + }, + { + "role": "user", + "content_token_count": 1901, + "target_output_tokens": 480 + }, + { + "role": "user", + "content_token_count": 4892, + "target_output_tokens": 562 + }, + { + "role": "user", + "content_token_count": 18481, + "target_output_tokens": 195 + }, + { + "role": "user", + "content_token_count": 3762, + "target_output_tokens": 564 + }, + { + "role": "user", + "content_token_count": 8463, + "target_output_tokens": 286 + }, + { + "role": "user", + "content_token_count": 11078, + "target_output_tokens": 90 + }, + { + "role": "user", + "content_token_count": 1106, + "target_output_tokens": 2149 + }, + { + "role": "user", + "content_token_count": 3393, + "target_output_tokens": 1477 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 285 + }, + { + "role": "user", + "content_token_count": 11370, + "target_output_tokens": 417 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 19821, + "target_output_tokens": 217 + }, + { + "role": "user", + "content_token_count": 20454, + "target_output_tokens": 689 + }, + { + "role": "user", + "content_token_count": 6158, + "target_output_tokens": 495 + }, + { + "role": "user", + "content_token_count": 10407, + "target_output_tokens": 172 + }, + { + "role": "user", + "content_token_count": 6777, + "target_output_tokens": 244 + }, + { + "role": "user", + "content_token_count": 52928, + "target_output_tokens": 476 + }, + { + "role": "user", + "content_token_count": 42478, + "target_output_tokens": 223 + }, + { + "role": "user", + "content_token_count": 4347, + "target_output_tokens": 593 + }, + { + "role": "user", + "content_token_count": 12237, + "target_output_tokens": 123 + }, + { + "role": "user", + "content_token_count": 17586, + "target_output_tokens": 598 + }, + { + "role": "user", + "content_token_count": 2461, + "target_output_tokens": 501 + }, + { + "role": "user", + "content_token_count": 4825, + "target_output_tokens": 168 + }, + { + "role": "user", + "content_token_count": 2679, + "target_output_tokens": 2852 + }, + { + "role": "user", + "content_token_count": 7837, + "target_output_tokens": 492 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 277 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5214, + "target_output_tokens": 2004 + }, + { + "role": "user", + "content_token_count": 11163, + "target_output_tokens": 2005 + }, + { + "role": "user", + "content_token_count": 25193, + "target_output_tokens": 211 + }, + { + "role": "user", + "content_token_count": 2010, + "target_output_tokens": 256 + }, + { + "role": "user", + "content_token_count": 9992, + "target_output_tokens": 1115 + 
}, + { + "role": "user", + "content_token_count": 12896, + "target_output_tokens": 623 + }, + { + "role": "user", + "content_token_count": 3791, + "target_output_tokens": 998 + }, + { + "role": "user", + "content_token_count": 8003, + "target_output_tokens": 338 + }, + { + "role": "user", + "content_token_count": 4495, + "target_output_tokens": 552 + }, + { + "role": "user", + "content_token_count": 1634, + "target_output_tokens": 2271 + }, + { + "role": "user", + "content_token_count": 5760, + "target_output_tokens": 97 + }, + { + "role": "user", + "content_token_count": 10434, + "target_output_tokens": 609 + }, + { + "role": "user", + "content_token_count": 23376, + "target_output_tokens": 112 + }, + { + "role": "user", + "content_token_count": 8046, + "target_output_tokens": 544 + }, + { + "role": "user", + "content_token_count": 1341, + "target_output_tokens": 1666 + }, + { + "role": "user", + "content_token_count": 12979, + "target_output_tokens": 341 + }, + { + "role": "user", + "content_token_count": 8061, + "target_output_tokens": 463 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 14288, + "target_output_tokens": 1379 + }, + { + "role": "user", + "content_token_count": 7502, + "target_output_tokens": 164 + }, + { + "role": "user", + "content_token_count": 2894, + "target_output_tokens": 68 + }, + { + "role": "user", + "content_token_count": 28437, + "target_output_tokens": 318 + }, + { + "role": "user", + "content_token_count": 9110, + "target_output_tokens": 780 + }, + { + "role": "user", + "content_token_count": 7833, + "target_output_tokens": 1300 + }, + { + "role": "user", + "content_token_count": 35537, + "target_output_tokens": 227 + }, + { + "role": "user", + "content_token_count": 6575, + "target_output_tokens": 341 + }, + { + "role": "user", + "content_token_count": 5057, + "target_output_tokens": 747 + }, + { + "role": "user", + "content_token_count": 1020, + "target_output_tokens": 566 + }, + { + "role": "user", + "content_token_count": 29797, + "target_output_tokens": 461 + }, + { + "role": "user", + "content_token_count": 6275, + "target_output_tokens": 244 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5975, + "target_output_tokens": 713 + }, + { + "role": "user", + "content_token_count": 4182, + "target_output_tokens": 813 + }, + { + "role": "user", + "content_token_count": 31157, + "target_output_tokens": 394 + }, + { + "role": "user", + "content_token_count": 5352, + "target_output_tokens": 628 + }, + { + "role": "user", + "content_token_count": 5323, + "target_output_tokens": 468 + }, + { + "role": "user", + "content_token_count": 8404, + "target_output_tokens": 603 + }, + { + "role": "user", + "content_token_count": 10457, + "target_output_tokens": 528 + }, + { + "role": "user", + "content_token_count": 21616, + "target_output_tokens": 1002 + }, + { + "role": "user", + "content_token_count": 11231, + "target_output_tokens": 266 + }, + { + "role": "user", + "content_token_count": 3555, + "target_output_tokens": 981 + }, + { + "role": "user", + "content_token_count": 2347, + "target_output_tokens": 311 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 512, + "target_output_tokens": 1289 + }, + { + "role": "user", + "content_token_count": 14824, + "target_output_tokens": 595 + }, + { + "role": "user", + "content_token_count": 2459, + "target_output_tokens": 491 + }, + { + "role": "user", + "content_token_count": 5155, + "target_output_tokens": 854 + }, + { + "role": "user", + 
"content_token_count": 1706, + "target_output_tokens": 335 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4693, + "target_output_tokens": 552 + }, + { + "role": "user", + "content_token_count": 3717, + "target_output_tokens": 321 + }, + { + "role": "user", + "content_token_count": 11640, + "target_output_tokens": 525 + }, + { + "role": "user", + "content_token_count": 7120, + "target_output_tokens": 1424 + }, + { + "role": "user", + "content_token_count": 6218, + "target_output_tokens": 1656 + }, + { + "role": "user", + "content_token_count": 11256, + "target_output_tokens": 3945 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6313, + "target_output_tokens": 1528 + }, + { + "role": "user", + "content_token_count": 5148, + "target_output_tokens": 196 + }, + { + "role": "user", + "content_token_count": 15406, + "target_output_tokens": 461 + }, + { + "role": "user", + "content_token_count": 2451, + "target_output_tokens": 404 + }, + { + "role": "user", + "content_token_count": 9688, + "target_output_tokens": 847 + }, + { + "role": "user", + "content_token_count": 14736, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 8049, + "target_output_tokens": 1021 + }, + { + "role": "user", + "content_token_count": 5751, + "target_output_tokens": 3843 + }, + { + "role": "user", + "content_token_count": 11137, + "target_output_tokens": 390 + }, + { + "role": "user", + "content_token_count": 34636, + "target_output_tokens": 895 + }, + { + "role": "user", + "content_token_count": 11915, + "target_output_tokens": 599 + }, + { + "role": "user", + "content_token_count": 8409, + "target_output_tokens": 86 + }, + { + "role": "user", + "content_token_count": 3406, + "target_output_tokens": 2233 + }, + { + "role": "user", + "content_token_count": 15118, + "target_output_tokens": 677 + }, + { + "role": "user", + "content_token_count": 11251, + "target_output_tokens": 203 + }, + { + "role": "user", + "content_token_count": 7848, + "target_output_tokens": 198 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 19708, + "target_output_tokens": 526 + }, + { + "role": "user", + "content_token_count": 6199, + "target_output_tokens": 262 + }, + { + "role": "user", + "content_token_count": 5688, + "target_output_tokens": 957 + }, + { + "role": "user", + "content_token_count": 8993, + "target_output_tokens": 1558 + }, + { + "role": "user", + "content_token_count": 14718, + "target_output_tokens": 207 + }, + { + "role": "user", + "content_token_count": 10274, + "target_output_tokens": 744 + }, + { + "role": "user", + "content_token_count": 10756, + "target_output_tokens": 330 + }, + { + "role": "user", + "content_token_count": 55245, + "target_output_tokens": 171 + }, + { + "role": "user", + "content_token_count": 14177, + "target_output_tokens": 343 + }, + { + "role": "user", + "content_token_count": 11266, + "target_output_tokens": 370 + }, + { + "role": "user", + "content_token_count": 5359, + "target_output_tokens": 1273 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1649, + "target_output_tokens": 218 + }, + { + "role": "user", + "content_token_count": 8871, + "target_output_tokens": 629 + }, + { + "role": "user", + "content_token_count": 11623, + "target_output_tokens": 247 + }, + { + "role": "user", + "content_token_count": 17643, + "target_output_tokens": 536 + }, + { + "role": "user", + "content_token_count": 1355, + "target_output_tokens": 127 + }, + { + "role": 
"user", + "content_token_count": 10824, + "target_output_tokens": 363 + }, + { + "role": "user", + "content_token_count": 3760, + "target_output_tokens": 810 + }, + { + "role": "user", + "content_token_count": 13120, + "target_output_tokens": 179 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2614, + "target_output_tokens": 270 + }, + { + "role": "user", + "content_token_count": 4555, + "target_output_tokens": 271 + }, + { + "role": "user", + "content_token_count": 5387, + "target_output_tokens": 216 + }, + { + "role": "user", + "content_token_count": 3338, + "target_output_tokens": 694 + }, + { + "role": "user", + "content_token_count": 9274, + "target_output_tokens": 488 + }, + { + "role": "user", + "content_token_count": 41006, + "target_output_tokens": 1179 + }, + { + "role": "user", + "content_token_count": 11764, + "target_output_tokens": 336 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4551, + "target_output_tokens": 391 + }, + { + "role": "user", + "content_token_count": 7744, + "target_output_tokens": 590 + }, + { + "role": "user", + "content_token_count": 6922, + "target_output_tokens": 1285 + }, + { + "role": "user", + "content_token_count": 15085, + "target_output_tokens": 881 + }, + { + "role": "user", + "content_token_count": 23696, + "target_output_tokens": 380 + }, + { + "role": "user", + "content_token_count": 13825, + "target_output_tokens": 1441 + }, + { + "role": "user", + "content_token_count": 7353, + "target_output_tokens": 686 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4844, + "target_output_tokens": 520 + }, + { + "role": "user", + "content_token_count": 11126, + "target_output_tokens": 170 + }, + { + "role": "user", + "content_token_count": 2742, + "target_output_tokens": 549 + }, + { + "role": "user", + "content_token_count": 4533, + "target_output_tokens": 309 + } + ] + } + ] +} \ No newline at end of file diff --git a/experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py b/experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py new file mode 100644 index 000000000..ccc51ca7a --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +"""Generate synthetic AIPerf-style trace sessions for kv-cache-tester-compatible replay.""" + +from __future__ import annotations + +import argparse +import json +import math +import random +from pathlib import Path + + +def lognormal_sigma(p50: float, p95: float) -> float: + return math.log(p95 / p50) / 1.645 + + +def sample_tokens(rng: random.Random, p50: float, p95: float, min_v: int, max_v: int) -> int: + sigma = lognormal_sigma(p50, p95) + mu = math.log(p50) + sampled = int(round(rng.lognormvariate(mu, sigma))) + return max(min_v, min(max_v, sampled)) + + +def generate_sessions(count: int, seed: int) -> dict: + rng = random.Random(seed) + sessions = [] + + # Target coding-workload distributions: + # ISL p50~8k, p95~32k + # OSL p50~512, p95~2k + for _ in range(count): + num_turns = rng.randint(4, 18) + turns = [] + for _ in range(num_turns): + turns.append( + { + "role": "user", + "content_token_count": sample_tokens( + rng, + p50=8000, + p95=32000, + min_v=512, + max_v=65536, + ), + "target_output_tokens": sample_tokens( + rng, + p50=512, + p95=2000, + min_v=64, + max_v=4096, + ), + } + ) + sessions.append({"turns": turns}) + + return {"sessions": sessions} + + +def parse_args() -> argparse.Namespace: + 
diff --git a/experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py b/experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py
new file mode 100644
index 000000000..ccc51ca7a
--- /dev/null
+++ b/experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+"""Generate synthetic AIPerf-style trace sessions for kv-cache-tester-compatible replay."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import random
+from pathlib import Path
+
+
+def lognormal_sigma(p50: float, p95: float) -> float:
+    return math.log(p95 / p50) / 1.645
+
+
+def sample_tokens(rng: random.Random, p50: float, p95: float, min_v: int, max_v: int) -> int:
+    sigma = lognormal_sigma(p50, p95)
+    mu = math.log(p50)
+    sampled = int(round(rng.lognormvariate(mu, sigma)))
+    return max(min_v, min(max_v, sampled))
+
+
+def generate_sessions(count: int, seed: int) -> dict:
+    rng = random.Random(seed)
+    sessions = []
+
+    # Target coding-workload distributions:
+    #   ISL p50~8k,  p95~32k
+    #   OSL p50~512, p95~2k
+    for _ in range(count):
+        num_turns = rng.randint(4, 18)
+        turns = []
+        for _ in range(num_turns):
+            turns.append(
+                {
+                    "role": "user",
+                    "content_token_count": sample_tokens(
+                        rng,
+                        p50=8000,
+                        p95=32000,
+                        min_v=512,
+                        max_v=65536,
+                    ),
+                    "target_output_tokens": sample_tokens(
+                        rng,
+                        p50=512,
+                        p95=2000,
+                        min_v=64,
+                        max_v=4096,
+                    ),
+                }
+            )
+        sessions.append({"turns": turns})
+
+    return {"sessions": sessions}
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Generate synthetic AIPerf traces")
+    parser.add_argument("--sessions", type=int, default=100, help="Number of sessions")
+    parser.add_argument("--seed", type=int, default=993, help="Random seed")
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=Path(__file__).with_name("aiperf_synthetic_traces.json"),
+        help="Output JSON path",
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    payload = generate_sessions(args.sessions, args.seed)
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester/README.md b/experimental/multiturn/vllm_benchmark/kv-cache-tester/README.md
new file mode 100644
index 000000000..94731fd42
--- /dev/null
+++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester/README.md
@@ -0,0 +1,11 @@
+# kv-cache-tester placeholder
+
+This directory should be populated with the external `kv-cache-tester` repository.
+
+The expected structure includes trace replay tooling and the real trace assets used by the experimental multiturn benchmarks.
+
+## Initialization
+
+If/when access is available, initialize this directory by checking out the kv-cache-tester repo contents here (for example via an approved submodule setup or a direct-clone workflow owned by maintainers).
+
+Do not replace this placeholder with unapproved external URLs in this branch.
diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/.gitkeep b/experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/.gitkeep
new file mode 100644
index 000000000..e69de29bb
diff --git a/experimental/multiturn/vllm_benchmark/launch/README.md b/experimental/multiturn/vllm_benchmark/launch/README.md
new file mode 100644
index 000000000..00d33ecba
--- /dev/null
+++ b/experimental/multiturn/vllm_benchmark/launch/README.md
@@ -0,0 +1,8 @@
+# LMCache launch scripts (experimental)
+
+These scripts launch vLLM with LMCache KV transfer enabled:
+
+- `lmcache_vllm_h200.sh`
+- `lmcache_vllm_b200.sh`
+
+They are experimental parity utilities and are not wired into the standard InferenceX benchmark dispatch lanes.
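A note on the `1.645` constant in `lognormal_sigma`: for a lognormal with `mu = ln(p50)`, the median is exactly `p50`, and the 95th percentile is `exp(mu + z * sigma)` where `z = Phi^-1(0.95) ~= 1.6449`; solving for `sigma` gives `ln(p95 / p50) / 1.645`. The clamp to `[min_v, max_v]` in `sample_tokens` then trims both tails slightly. A standalone sanity check (not part of the patch) against the coding-workload ISL targets:

```python
#!/usr/bin/env python3
"""Standalone check of the lognormal parameterization (not part of the patch)."""
import math
import random

p50, p95 = 8000.0, 32000.0           # ISL targets from generate_sessions()
sigma = math.log(p95 / p50) / 1.645  # same formula as lognormal_sigma()
mu = math.log(p50)

rng = random.Random(0)
draws = sorted(rng.lognormvariate(mu, sigma) for _ in range(200_000))

# Empirical quantiles should land near the targets (before any clamping).
print(round(draws[len(draws) // 2]))         # ~8000  (median)
print(round(draws[int(0.95 * len(draws))]))  # ~32000 (95th percentile)
```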
diff --git a/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_b200.sh b/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_b200.sh
new file mode 100755
index 000000000..f83b4b7f2
--- /dev/null
+++ b/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_b200.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh"
+
+check_env_vars MODEL TP
+
+PORT=${PORT:-8888}
+SERVER_LOG=/workspace/server.log
+CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272}
+
+cat > config.yaml << EOF
+kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8}
+max-cudagraph-capture-size: 2048
+max-num-batched-tokens: 8192
+max-model-len: $CALCULATED_MAX_MODEL_LEN
+EOF
+
+python3 -m pip install -q lmcache
+
+launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code --kv-transfer-config '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}'
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+echo "LMCache vLLM server running (PID=$SERVER_PID, log=$SERVER_LOG)"
+wait "$SERVER_PID"
diff --git a/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh b/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh
new file mode 100755
index 000000000..f83b4b7f2
--- /dev/null
+++ b/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh"
+
+check_env_vars MODEL TP
+
+PORT=${PORT:-8888}
+SERVER_LOG=/workspace/server.log
+CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272}
+
+cat > config.yaml << EOF
+kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8}
+max-cudagraph-capture-size: 2048
+max-num-batched-tokens: 8192
+max-model-len: $CALCULATED_MAX_MODEL_LEN
+EOF
+
+python3 -m pip install -q lmcache
+
+launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code --kv-transfer-config '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}'
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+echo "LMCache vLLM server running (PID=$SERVER_PID, log=$SERVER_LOG)"
+wait "$SERVER_PID"
diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_b200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_b200_vllm.sh
new file mode 100755
index 000000000..f917c03c3
--- /dev/null
+++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_b200_vllm.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC RESULT_FILENAME
+
+PORT=${PORT:-8888}
+TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces}
+BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800}
+SERVER_LOG=/workspace/server.log
+
+CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272}
+cat > config.yaml << EOF
+kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8}
+max-cudagraph-capture-size: 2048
+max-num-batched-tokens: 8192
+max-model-len: $CALCULATED_MAX_MODEL_LEN
+EOF
+
+launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+start_gpu_monitor
+start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0
+
+set -x
+python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color
+set +x
+
+stop_kv_metrics_collector
+stop_gpu_monitor
+
+python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME"
diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_h200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_h200_vllm.sh
new file mode 100755
index 000000000..f917c03c3
--- /dev/null
+++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_h200_vllm.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC RESULT_FILENAME
+
+PORT=${PORT:-8888}
+TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces}
+BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800}
+SERVER_LOG=/workspace/server.log
+
+CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272}
+cat > config.yaml << EOF
+kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8}
+max-cudagraph-capture-size: 2048
+max-num-batched-tokens: 8192
+max-model-len: $CALCULATED_MAX_MODEL_LEN
+EOF
+
+launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+start_gpu_monitor
+start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0
+
+set -x
+python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color
+set +x
+
+stop_kv_metrics_collector
+stop_gpu_monitor
+
+python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME"
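These replay scripts all share one contract: `check_env_vars` makes `MODEL`, `TP`, `CONC`, and `RESULT_FILENAME` mandatory, while `PORT`, `TRACE_DIR`, `BENCHMARK_DURATION_S`, `MAX_MODEL_LEN`, `KV_CACHE_DTYPE`, `REQUEST_MODE`, and `SUPPORT_STATUS` fall back to the defaults shown above. A hypothetical driver illustrating that contract (the model id, TP, concurrency, and duration here are placeholder values, not a recommended configuration):

```python
#!/usr/bin/env python3
"""Hypothetical driver for one trace replay script (values are illustrative)."""
import os
import subprocess

env = dict(
    os.environ,
    MODEL="deepseek-ai/DeepSeek-R1",  # placeholder model id
    TP="8",                           # tensor-parallel degree
    CONC="32",                        # fixed concurrency (start == max users)
    RESULT_FILENAME="dsr1_fp8_h200_c32",
    BENCHMARK_DURATION_S="600",       # shorter smoke run than the 1800 s default
)

# Run the replay end to end: server launch, replay, metrics, result adaptation.
subprocess.run(
    ["bash", "experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_h200_vllm.sh"],
    env=env,
    check=True,
)
```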
--max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_sglang.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_sglang.sh new file mode 100755 index 000000000..7c46b0c31 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_sglang.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} 
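+# Replay knobs, all overridable from the environment: TRACE_DIR points at the
+# kv-cache-tester trace set, and BENCHMARK_DURATION_S bounds the fixed-concurrency
+# replay window (1800 s by default, matching the other trace replay scripts).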
+TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-131272} +RADIX_CACHE_ARGS="" +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +launch_sglang_server "$MODEL" "$PORT" --trust-remote-code --ep-size "${EP_SIZE:-1}" --reasoning-parser "${SGLANG_REASONING_PARSER:-gpt-oss}" --max-running-requests "${SGLANG_MAX_RUNNING_REQUESTS:-256}" --cuda-graph-max-bs "${SGLANG_CUDA_GRAPH_MAX_BS:-256}" --chunked-prefill-size "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" --max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" 
--support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_sglang.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_sglang.sh new file mode 100755 index 000000000..7c46b0c31 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_sglang.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-131272} +RADIX_CACHE_ARGS="" +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +launch_sglang_server "$MODEL" "$PORT" --trust-remote-code --ep-size "${EP_SIZE:-1}" --reasoning-parser "${SGLANG_REASONING_PARSER:-gpt-oss}" --max-running-requests "${SGLANG_MAX_RUNNING_REQUESTS:-256}" --cuda-graph-max-bs "${SGLANG_CUDA_GRAPH_MAX_BS:-256}" --chunked-prefill-size "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" --max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x 
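+# --start-users and --max-users are both pinned to CONC so concurrency stays
+# constant for the whole run; --seed 42 keeps session scheduling reproducible,
+# so result files stay comparable across engines and GPUs.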
+python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_sglang.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_sglang.sh new file mode 100755 index 000000000..7c46b0c31 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_sglang.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-131272} +RADIX_CACHE_ARGS="" +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +launch_sglang_server "$MODEL" "$PORT" --trust-remote-code --ep-size "${EP_SIZE:-1}" --reasoning-parser "${SGLANG_REASONING_PARSER:-gpt-oss}" --max-running-requests "${SGLANG_MAX_RUNNING_REQUESTS:-256}" --cuda-graph-max-bs "${SGLANG_CUDA_GRAPH_MAX_BS:-256}" --chunked-prefill-size "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" --max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} 
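+# The config.yaml written below mirrors the other *_vllm replay scripts: fp8 KV
+# cache by default, an 8192-token batched-token budget, and a 131272-token max
+# model length unless MAX_MODEL_LEN overrides it.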
+BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 847b7ee80..644b2c3a4 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -1,5 +1,7 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + # System-specific configuration for B200 DGXC Slurm cluster SLURM_PARTITION="gpu" SLURM_ACCOUNT="benchmark" @@ -215,8 +217,7 @@ else HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache" SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') - SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') + SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "b200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 LOCK_FILE="${SQUASH_FILE}.lock" salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" @@ -243,5 +244,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh + bash "$SCRIPT_PATH" + + scancel $JOB_ID fi diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f8c614936..caa1e8364 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -1,8 +1,9 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') -SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "b200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 PORT=8888 # Create unique cache directory based on model parameters @@ -30,13 +31,17 @@ docker run --rm --init --network host --name $server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e 
DP_ATTENTION \ +-e SPEC_DECODING -e DISAGG \ +-e BENCHMARK_TYPE -e EXPORT_FILE -e RUNTIME_STACK_ID -e HARDWARE_PROFILE_ID -e CANONICAL_MODEL_ID -e REQUEST_MODE -e MAX_CONCURRENCY \ +-e SUPPORT_STATUS -e VLLM_CPU_OFFLOAD_GB -e VLLM_SWAP_SPACE_GB -e SGLANG_MEM_FRACTION_OVERRIDE -e SGLANG_CHUNKED_PREFILL_OVERRIDE \ +-e MAX_SESSIONS -e MAX_TURNS_PER_SESSION -e MAX_OUTPUT_LEN -e NUM_WARMUP_SESSIONS -e IGNORE_WAITS -e IGNORE_EOS \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ -benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" +"$SCRIPT_PATH" # Try graceful first docker stop -t 90 "$server_name" || true diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index c321ee0f9..cbcc7469b 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -1,9 +1,10 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" PARTITION="main" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') -SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "b200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 UCX_NET_DEVICES=eth0 @@ -17,4 +18,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh \ No newline at end of file +bash "$SCRIPT_PATH" \ No newline at end of file diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 5100419b9..44c46600d 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -1,7 +1,10 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + HF_HUB_CACHE_MOUNT="/home/ubuntu/hf_hub_cache/" PORT=8888 +SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h100" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 server_name="bmk-server" @@ -10,9 +13,13 @@ docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e EP_SIZE -e DP_ATTENTION -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ +-e SPEC_DECODING -e DISAGG \ +-e BENCHMARK_TYPE -e EXPORT_FILE -e RUNTIME_STACK_ID -e HARDWARE_PROFILE_ID -e CANONICAL_MODEL_ID -e REQUEST_MODE -e MAX_CONCURRENCY \ +-e SUPPORT_STATUS -e VLLM_CPU_OFFLOAD_GB -e VLLM_SWAP_SPACE_GB -e SGLANG_MEM_FRACTION_OVERRIDE -e SGLANG_CHUNKED_PREFILL_OVERRIDE \ +-e MAX_SESSIONS -e MAX_TURNS_PER_SESSION -e 
MAX_OUTPUT_LEN -e NUM_WARMUP_SESSIONS -e IGNORE_WAITS -e IGNORE_EOS \ -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ -benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_h100.sh" +"$SCRIPT_PATH" diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 49a42e981..bb10dcb6d 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -1,9 +1,12 @@ #!/usr/bin/env bash +source "$(dirname "$0")/lib_single_node_script.sh" + export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" PARTITION="h100" SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" LOCK_FILE="${SQUASH_FILE}.lock" +SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h100" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 set -x @@ -31,7 +34,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh +bash "$SCRIPT_PATH" rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index bb0335955..11570289a 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -1,5 +1,7 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + # System-specific configuration for H100 DGXC Slurm cluster SLURM_PARTITION="hpc-gpu-1" SLURM_ACCOUNT="customer" @@ -230,6 +232,7 @@ else HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/" SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h100" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 salloc --exclude="$SLURM_EXCLUDED_NODELIST" --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) @@ -247,7 +250,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh + bash "$SCRIPT_PATH" scancel $JOB_ID diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 657f84792..5a49efcc6 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -1,11 +1,12 @@ #!/usr/bin/env bash +source "$(dirname "$0")/lib_single_node_script.sh" + export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') -SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +SCRIPT_PATH=$(resolve_single_node_benchmark_script "$MODEL_CODE" "$PRECISION" "h200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 PARTITION="h200" SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" @@ -44,7 +45,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash "$SCRIPT_PATH" rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff 
--git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 9b3b771a5..a6f4d2986 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -1,5 +1,7 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + # System-specific configuration for H200 DGXC Slurm cluster SLURM_PARTITION="main" SLURM_ACCOUNT="sa-shared" @@ -233,6 +235,7 @@ else # Convert pyxis image format (nvcr.io#path) to docker format (nvcr.io/path) for enroot import DOCKER_IMAGE=$(echo "$IMAGE" | sed 's/#/\//g') LOCK_FILE="${SQUASH_FILE}.lock" + SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) @@ -258,7 +261,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h200$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt')$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp').sh + bash "$SCRIPT_PATH" scancel $JOB_ID diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 9d157a858..3b697fb51 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -1,11 +1,12 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + export HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') -SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +SCRIPT_PATH=$(resolve_single_node_benchmark_script "$MODEL_CODE" "$PRECISION" "h200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 PARTITION="main" @@ -19,4 +20,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash "$SCRIPT_PATH" diff --git a/runners/lib_single_node_script.sh b/runners/lib_single_node_script.sh new file mode 100644 index 000000000..194668856 --- /dev/null +++ b/runners/lib_single_node_script.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +resolve_single_node_benchmark_script() { + local model_code="$1" + local precision="$2" + local runner_code="$3" + local framework="${4:-}" + local spec_decoding="${5:-}" + local script_base="benchmarks/single_node/${model_code}_${precision}_${runner_code}" + + if [[ "${BENCHMARK_TYPE:-}" == "isb1_replay" ]] && [[ "$framework" == "sglang" || "$framework" == "vllm" ]]; then + local runtime_candidate="${script_base}_${framework}.sh" + if [[ -f "$runtime_candidate" ]]; then + printf '%s\n' "$runtime_candidate" + return 0 + fi + fi + + local framework_suffix="" + local spec_suffix="" + if [[ "$framework" == "trt" ]]; then + framework_suffix="_trt" + fi + if [[ "$spec_decoding" == "mtp" ]]; then + spec_suffix="_mtp" + fi + + local legacy_candidate="${script_base}${framework_suffix}${spec_suffix}.sh" + if [[ -f "$legacy_candidate" ]]; then + printf '%s\n' "$legacy_candidate" + return 0 + fi + + echo "ERROR: Could not resolve single-node benchmark script." 
>&2 + echo " model=$model_code precision=$precision runner=$runner_code framework=${framework:-} spec_decoding=${spec_decoding:-} benchmark_type=${BENCHMARK_TYPE:-}" >&2 + if [[ "${BENCHMARK_TYPE:-}" == "isb1_replay" ]] && [[ "$framework" == "sglang" || "$framework" == "vllm" ]]; then + echo " checked runtime-aware candidate: ${script_base}_${framework}.sh" >&2 + fi + echo " checked legacy candidate: $legacy_candidate" >&2 + return 1 +} diff --git a/utils/bench_serving/benchmark_export_replay.py b/utils/bench_serving/benchmark_export_replay.py new file mode 100644 index 000000000..c67a5fd41 --- /dev/null +++ b/utils/bench_serving/benchmark_export_replay.py @@ -0,0 +1,1536 @@ +# SPDX-License-Identifier: Apache-2.0 +r"""Replay ISB1 export sessions against OpenAI-compatible inference servers. + +Supported export formats: + - ``inferencex_multiturn`` (direct-ingest session turns) + - ``inferencex_trace_replay`` (event-based trace replay) + +Supported request modes: + - ``chat``: send full message history to ``/v1/chat/completions`` + - ``completions``: project the message history into a single tagged prompt + and send it to ``/v1/completions`` + - ``auto``: prefer chat for standalone vLLM/SGLang cells and completions + for TRT / Dynamo projection cells +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import math +import os +import random +import sys +import time +import warnings +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Any, Callable, Optional + +import aiohttp +import numpy as np +from tqdm.asyncio import tqdm + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60, sock_read=5 * 60) +DEFAULT_IMAGE_TOKEN_ESTIMATE = 2048 +DEFAULT_FALLBACK_OUTPUT_LEN = 256 +CHAT_NATIVE_RUNTIMES = {"standalone:vllm", "standalone:sglang"} +COMPLETIONS_PREFERRED_RUNTIMES = { + "standalone:trt_llm", + "dynamo:vllm", + "dynamo:sglang", + "dynamo:trt_llm", +} +ROLE_LABELS = { + "system": "SYSTEM", + "user": "USER", + "assistant": "ASSISTANT", + "tool": "TOOL", + "retrieval": "RETRIEVAL", + "execution": "EXECUTION", +} +MODULE_DIR = Path(__file__).resolve().parent +if str(MODULE_DIR) not in sys.path: + sys.path.insert(0, str(MODULE_DIR)) + + +@dataclass +class TurnResult: + turn_idx: int + context_len: int + output_len: int + ttft: float = 0.0 + tpot: float = 0.0 + e2el: float = 0.0 + itl: list[float] = field(default_factory=list) + success: bool = True + error: str = "" + request_mode: str = "chat" + actual_context_len: int = 0 + + +@dataclass +class SessionResult: + session_id: str + turns: list[TurnResult] = field(default_factory=list) + total_input_tokens: int = 0 + total_actual_input_tokens: int = 0 + total_output_tokens: int = 0 + total_duration: float = 0.0 + + +@dataclass +class ReplayTurn: + turn_idx: int + turn_id: Any + output_len: int + wait_before_s: float + context_len: int + actual_context_len: int + chat_messages: list[dict[str, Any]] + completion_prompt: str + + +@dataclass +class ReplaySession: + session_id: str + trace_id: str + runtime_stack_id: str + hardware_profile_id: str + canonical_model_id: str + support_status: str + benchmark_certification_status: str + request_mode: str + adapter_id: str + turns: list[ReplayTurn] + + +def _csv_values(raw: Optional[str]) -> set[str] | None: + if raw is None: + return None + values = 
{item.strip() for item in raw.split(",") if item.strip()} + return values or None + + +def _matches_filter(value: str, allowed: set[str] | None) -> bool: + return allowed is None or value in allowed + + +def _fallback_text_token_count(text: str) -> int: + stripped = (text or "").strip() + if not stripped: + return 0 + return max(1, math.ceil(len(stripped) / 4)) + + +def build_text_token_counter( + tokenizer_id: Optional[str], + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, +) -> Callable[[str], int]: + if not tokenizer_id: + return _fallback_text_token_count + + try: + from backend_request_func import get_tokenizer + + tokenizer = get_tokenizer( + tokenizer_id, + tokenizer_mode=tokenizer_mode, + trust_remote_code=trust_remote_code, + ) + except Exception as exc: + warnings.warn( + "Falling back to approximate token counting because tokenizer load " + f"failed for {tokenizer_id!r}: {exc}", + stacklevel=2, + ) + return _fallback_text_token_count + + def _count(text: str) -> int: + return len(tokenizer.encode(text or "", add_special_tokens=False)) + + return _count + + +def _render_block_as_text(block: dict[str, Any]) -> str: + block_type = str(block.get("type", "text")) + text = (block.get("text") or "").strip() + if block_type == "text": + return text + if block_type == "code": + return f"[CODE]\n{text}" if text else "[CODE]" + if block_type == "log": + return f"[LOG]\n{text}" if text else "[LOG]" + if block_type == "document": + label = block.get("asset_path") or block.get("uri") or "" + if text and label: + return f"[DOCUMENT: {label}]\n{text}" + if text: + return f"[DOCUMENT]\n{text}" + return f"[DOCUMENT: {label}]" if label else "[DOCUMENT]" + if block_type == "table": + return f"[TABLE]\n{text}" if text else "[TABLE]" + if block_type == "image": + label = block.get("uri") or block.get("asset_path") or text or "image" + return f"[IMAGE: {label}]" + return text or f"[{block_type.upper()}]" + + +def _extract_message_text(message: dict[str, Any]) -> str: + if isinstance(message.get("content"), str): + body = message["content"] + elif isinstance(message.get("content"), list): + parts: list[str] = [] + for part in message["content"]: + part_type = str(part.get("type", "text")) + if part_type == "text": + parts.append((part.get("text") or "").strip()) + elif part_type == "image_url": + url = "" + if isinstance(part.get("image_url"), dict): + url = part["image_url"].get("url") or "" + parts.append(f"[IMAGE: {url or 'image'}]") + body = "\n\n".join(item for item in parts if item) + else: + content_blocks = message.get("content_blocks") or [] + body = "\n\n".join( + filter(None, (_render_block_as_text(block) for block in content_blocks)) + ) + + role = str(message.get("role", "user")) + if role in {"tool", "retrieval", "execution"}: + prefix = f"[{ROLE_LABELS.get(role, role.upper())} RESULT]" + return f"{prefix}\n{body}" if body else prefix + return body + + +def _message_to_chat_payload(message: dict[str, Any]) -> dict[str, Any]: + role = str(message.get("role", "user")) + projected_role = role if role in {"system", "user", "assistant"} else "user" + content_blocks = message.get("content_blocks") or [] + + if not content_blocks: + return {"role": projected_role, "content": _extract_message_text(message)} + + parts: list[dict[str, Any]] = [] + if role not in {"system", "user", "assistant"}: + parts.append( + { + "type": "text", + "text": f"[{ROLE_LABELS.get(role, role.upper())} RESULT]", + } + ) + + for block in content_blocks: + block_type = str(block.get("type", "text")) + 
if block_type == "image" and block.get("uri"): + parts.append( + { + "type": "image_url", + "image_url": {"url": block["uri"]}, + } + ) + continue + + text = _render_block_as_text(block) + if text: + parts.append({"type": "text", "text": text}) + + if not parts: + return {"role": projected_role, "content": ""} + if len(parts) == 1 and parts[0]["type"] == "text": + return {"role": projected_role, "content": parts[0]["text"]} + return {"role": projected_role, "content": parts} + + +def _message_token_estimate( + message: dict[str, Any], + count_text_tokens: Callable[[str], int], + image_token_estimate: int, +) -> int: + content_blocks = message.get("content_blocks") or [] + if not content_blocks: + return count_text_tokens(_extract_message_text(message)) + + total = 0 + role = str(message.get("role", "user")) + if role in {"tool", "retrieval", "execution"}: + total += count_text_tokens(f"[{ROLE_LABELS.get(role, role.upper())} RESULT]") + + for block in content_blocks: + block_type = str(block.get("type", "text")) + if block_type == "image": + total += int( + block.get("asset_token_count") + or block.get("metadata", {}).get("token_count") + or image_token_estimate + ) + continue + if block.get("asset_token_count") and block.get("asset_path"): + total += int(block["asset_token_count"]) + continue + total += count_text_tokens(_render_block_as_text(block)) + return total + + +def _chat_payload_token_count( + chat_messages: list[dict[str, Any]], + count_text_tokens: Callable[[str], int], +) -> int: + """Count tokens in the rendered chat payload that will actually be sent over HTTP.""" + total = 0 + for msg in chat_messages: + content = msg.get("content", "") + if isinstance(content, str): + total += count_text_tokens(content) + elif isinstance(content, list): + for part in content: + if part.get("type") == "text": + total += count_text_tokens(part.get("text", "")) + elif part.get("type") == "image_url": + total += DEFAULT_IMAGE_TOKEN_ESTIMATE + return total + + +def _messages_to_completion_prompt(messages: list[dict[str, Any]]) -> str: + prompt_parts: list[str] = [] + for message in messages: + role = ROLE_LABELS.get(str(message.get("role", "user")), "USER") + body = _extract_message_text(message).strip() + prompt_parts.append(f"{role}:\n{body}" if body else f"{role}:") + prompt_parts.append("ASSISTANT:\n") + return "\n\n".join(prompt_parts) + + +def resolve_request_mode(runtime_stack_id: str, requested_mode: str) -> str: + if requested_mode != "auto": + return requested_mode + if runtime_stack_id in CHAT_NATIVE_RUNTIMES: + return "chat" + if runtime_stack_id in COMPLETIONS_PREFERRED_RUNTIMES: + return "completions" + return "chat" + + +def _parse_prometheus_sample(line: str) -> tuple[str, float] | None: + """Parse a Prometheus sample line into ``(metric_name, value)``.""" + raw_line = line.strip() + if not raw_line or raw_line.startswith("#"): + return None + + try: + metric_with_labels, raw_value = raw_line.rsplit(maxsplit=1) + metric_name = metric_with_labels.split("{", 1)[0] + return metric_name, float(raw_value) + except (TypeError, ValueError): + return None + + +def _resolve_output_len( + raw_output_len: Any, + fallback_output_len: int, + output_len_cap: Optional[int], +) -> int: + try: + output_len = int(raw_output_len) + except (TypeError, ValueError): + output_len = fallback_output_len + if output_len <= 0: + output_len = fallback_output_len + if output_len_cap is not None: + output_len = min(output_len, output_len_cap) + return output_len + + +def _build_turn_from_messages( + turn_idx: 
int, + turn_id: Any, + messages: list[dict[str, Any]], + output_len: int, + wait_before_s: float, + request_mode: str, + count_text_tokens: Callable[[str], int], + image_token_estimate: int, +) -> ReplayTurn: + chat_messages = [_message_to_chat_payload(message) for message in messages] + completion_prompt = _messages_to_completion_prompt(messages) + if request_mode == "chat": + context_len = sum( + _message_token_estimate(message, count_text_tokens, image_token_estimate) + for message in messages + ) + actual_context_len = _chat_payload_token_count(chat_messages, count_text_tokens) + else: + context_len = count_text_tokens(completion_prompt) + actual_context_len = context_len # completions mode already uses rendered text + return ReplayTurn( + turn_idx=turn_idx, + turn_id=turn_id, + output_len=output_len, + wait_before_s=wait_before_s, + context_len=context_len, + actual_context_len=actual_context_len, + chat_messages=chat_messages, + completion_prompt=completion_prompt, + ) + + +def _build_session_from_multiturn_cell( + cell: dict[str, Any], + request_mode: str, + count_text_tokens: Callable[[str], int], + image_token_estimate: int, + ignore_waits: bool, + fallback_output_len: int, + output_len_cap: Optional[int], + max_turns_per_session: Optional[int], +) -> ReplaySession: + session = cell["session"] + turns: list[ReplayTurn] = [] + for raw_turn in session.get("turns", []): + turns.append( + _build_turn_from_messages( + turn_idx=int(raw_turn.get("turn_idx", len(turns))), + turn_id=raw_turn.get("turn_id"), + messages=list(raw_turn.get("messages", [])), + output_len=_resolve_output_len( + raw_turn.get("expected_output_tokens"), + fallback_output_len, + output_len_cap, + ), + wait_before_s=0.0 + if ignore_waits + else float(raw_turn.get("wait_before_ms", 0)) / 1000.0, + request_mode=request_mode, + count_text_tokens=count_text_tokens, + image_token_estimate=image_token_estimate, + ) + ) + if max_turns_per_session is not None and len(turns) >= max_turns_per_session: + break + + return ReplaySession( + session_id=str(session.get("session_id", cell["trace_id"])), + trace_id=str(cell["trace_id"]), + runtime_stack_id=str(cell["runtime_stack_id"]), + hardware_profile_id=str(cell["hardware_profile_id"]), + canonical_model_id=str(cell["canonical_model_id"]), + support_status=str(cell.get("support_status", "unknown")), + benchmark_certification_status=str( + cell.get("benchmark_certification_status", "unknown") + ), + request_mode=request_mode, + adapter_id="inferencex_multiturn", + turns=turns, + ) + + +def _build_session_from_trace_replay_cell( + cell: dict[str, Any], + request_mode: str, + count_text_tokens: Callable[[str], int], + image_token_estimate: int, + ignore_waits: bool, + fallback_output_len: int, + output_len_cap: Optional[int], + max_turns_per_session: Optional[int], +) -> ReplaySession: + turns: list[ReplayTurn] = [] + prior_offset_ms = 0 + for index, event in enumerate(cell.get("events", [])): + offset_ms = int(event.get("arrival_time_offset_ms", 0) or 0) + wait_before_ms = 0 if index == 0 else max(0, offset_ms - prior_offset_ms) + prior_offset_ms = offset_ms + turns.append( + _build_turn_from_messages( + turn_idx=index, + turn_id=event.get("turn_id"), + messages=list(event.get("input_messages", [])), + output_len=_resolve_output_len( + event.get("target_output_tokens"), + fallback_output_len, + output_len_cap, + ), + wait_before_s=0.0 if ignore_waits else wait_before_ms / 1000.0, + request_mode=request_mode, + count_text_tokens=count_text_tokens, + 
image_token_estimate=image_token_estimate, + ) + ) + if max_turns_per_session is not None and len(turns) >= max_turns_per_session: + break + + return ReplaySession( + session_id=str(cell.get("trace_metadata", {}).get("session_id", cell["trace_id"])), + trace_id=str(cell["trace_id"]), + runtime_stack_id=str(cell["runtime_stack_id"]), + hardware_profile_id=str(cell["hardware_profile_id"]), + canonical_model_id=str(cell["canonical_model_id"]), + support_status=str(cell.get("support_status", "unknown")), + benchmark_certification_status=str( + cell.get("benchmark_certification_status", "unknown") + ), + request_mode=request_mode, + adapter_id="inferencex_trace_replay", + turns=turns, + ) + + +def load_replay_sessions( + export_file: str, + count_text_tokens: Callable[[str], int], + runtime_stack_ids: set[str] | None = None, + hardware_profile_ids: set[str] | None = None, + canonical_model_ids: set[str] | None = None, + trace_ids: set[str] | None = None, + support_statuses: set[str] | None = None, + request_mode: str = "auto", + image_token_estimate: int = DEFAULT_IMAGE_TOKEN_ESTIMATE, + ignore_waits: bool = False, + fallback_output_len: int = DEFAULT_FALLBACK_OUTPUT_LEN, + output_len_cap: Optional[int] = None, + session_offset: int = 0, + max_sessions: Optional[int] = None, + max_turns_per_session: Optional[int] = None, + shuffle_sessions: bool = False, + seed: int = 0, + allow_mixed_selection: bool = False, +) -> tuple[list[ReplaySession], dict[str, Any]]: + payload = json.loads(Path(export_file).read_text()) + adapter_id = str(payload.get("adapter_id", "unknown")) + export_cells = list(payload.get("exports", [])) + if adapter_id not in {"inferencex_multiturn", "inferencex_trace_replay"}: + raise ValueError( + f"Unsupported export adapter {adapter_id!r}. Expected " + "'inferencex_multiturn' or 'inferencex_trace_replay'." + ) + + selected_cells = [ + cell + for cell in export_cells + if _matches_filter(str(cell.get("runtime_stack_id", "")), runtime_stack_ids) + and _matches_filter(str(cell.get("hardware_profile_id", "")), hardware_profile_ids) + and _matches_filter(str(cell.get("canonical_model_id", "")), canonical_model_ids) + and _matches_filter(str(cell.get("trace_id", "")), trace_ids) + and _matches_filter(str(cell.get("support_status", "")), support_statuses) + ] + if not selected_cells: + raise ValueError( + "No export cells matched the requested filters. " + "Check runtime_stack_id / hardware_profile_id / canonical_model_id / " + "trace_id / support_status." + ) + + if shuffle_sessions: + random.Random(seed).shuffle(selected_cells) + + if session_offset: + selected_cells = selected_cells[session_offset:] + if max_sessions is not None: + selected_cells = selected_cells[:max_sessions] + if not selected_cells: + raise ValueError("Selection became empty after applying session_offset/max_sessions.") + + uniqueness = { + "runtime_stack_id": sorted({str(cell["runtime_stack_id"]) for cell in selected_cells}), + "hardware_profile_id": sorted({str(cell["hardware_profile_id"]) for cell in selected_cells}), + "canonical_model_id": sorted({str(cell["canonical_model_id"]) for cell in selected_cells}), + } + if not allow_mixed_selection: + mixed_fields = [field for field, values in uniqueness.items() if len(values) > 1] + if mixed_fields: + details = ", ".join(f"{field}={uniqueness[field]}" for field in mixed_fields) + raise ValueError( + "Selected export cells span multiple target server identities; " + f"filter more narrowly or pass --allow-mixed-selection. 
Mixed fields: {details}" + ) + + sessions: list[ReplaySession] = [] + for cell in selected_cells: + resolved_mode = resolve_request_mode(str(cell["runtime_stack_id"]), request_mode) + if adapter_id == "inferencex_multiturn": + sessions.append( + _build_session_from_multiturn_cell( + cell=cell, + request_mode=resolved_mode, + count_text_tokens=count_text_tokens, + image_token_estimate=image_token_estimate, + ignore_waits=ignore_waits, + fallback_output_len=fallback_output_len, + output_len_cap=output_len_cap, + max_turns_per_session=max_turns_per_session, + ) + ) + else: + sessions.append( + _build_session_from_trace_replay_cell( + cell=cell, + request_mode=resolved_mode, + count_text_tokens=count_text_tokens, + image_token_estimate=image_token_estimate, + ignore_waits=ignore_waits, + fallback_output_len=fallback_output_len, + output_len_cap=output_len_cap, + max_turns_per_session=max_turns_per_session, + ) + ) + + selection_metadata = { + "adapter_id": adapter_id, + "export_file": str(export_file), + "selected_sessions": len(sessions), + "trace_ids": [session.trace_id for session in sessions], + "runtime_stack_ids": sorted({session.runtime_stack_id for session in sessions}), + "hardware_profile_ids": sorted({session.hardware_profile_id for session in sessions}), + "canonical_model_ids": sorted({session.canonical_model_id for session in sessions}), + "support_statuses": sorted({session.support_status for session in sessions}), + "support_status_counts": { + status: sum(1 for session in sessions if session.support_status == status) + for status in sorted({session.support_status for session in sessions}) + }, + "benchmark_certification_statuses": sorted( + {session.benchmark_certification_status for session in sessions} + ), + "benchmark_certification_status_counts": { + status: sum( + 1 + for session in sessions + if session.benchmark_certification_status == status + ) + for status in sorted( + {session.benchmark_certification_status for session in sessions} + ) + }, + "request_mode_mix": { + mode: sum(1 for session in sessions if session.request_mode == mode) + for mode in sorted({session.request_mode for session in sessions}) + }, + } + return sessions, selection_metadata + + +async def _iter_sse_lines( + response: aiohttp.ClientResponse, +): + """Yield individual SSE data payloads from a streaming response. + + Buffers partial lines across TCP chunks and splits multi-line chunks. + Handles the common case where multiple ``data: {...}`` frames arrive + in a single TCP read, or a single frame is split across reads. 
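+ SSE comments (lines starting with ``:``) are skipped, iteration stops at the
+ ``[DONE]`` sentinel, and any bytes still buffered when the stream closes are
+ flushed as a final frame.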
+ """ + buffer = b"" + async for chunk in response.content: + buffer += chunk + while b"\n" in buffer: + line, buffer = buffer.split(b"\n", 1) + line = line.strip() + if not line: + continue + decoded = line.decode("utf-8") + if decoded.startswith(":"): + continue # SSE comment / keep-alive + if decoded.startswith("data: "): + payload_str = decoded[6:].strip() + elif decoded.startswith("data:"): + payload_str = decoded[5:].strip() + else: + continue + if payload_str == "[DONE]": + return + yield payload_str + # Flush remaining buffer + remaining = buffer.strip() + if remaining: + decoded = remaining.decode("utf-8") + for prefix in ("data: ", "data:"): + if decoded.startswith(prefix): + payload_str = decoded[len(prefix):].strip() + if payload_str and payload_str != "[DONE]": + yield payload_str + break + + +async def _stream_chat_request( + api_url: str, + payload: dict[str, Any], + headers: dict[str, str], + context_len: int, + count_text_tokens: Callable[[str], int], + request_mode: str, +) -> tuple[TurnResult, int]: + turn = TurnResult( + turn_idx=-1, + context_len=context_len, + output_len=0, + success=False, + request_mode=request_mode, + ) + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + + async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: + async with session.post(url=api_url, json=payload, headers=headers) as response: + if response.status != 200: + error_text = (await response.text()).strip() + turn.error = f"HTTP {response.status}: {error_text or response.reason}" + return turn, response.status + + async for sse_payload in _iter_sse_lines(response): + data = json.loads(sse_payload) + if choices := data.get("choices"): + delta = choices[0].get("delta", {}) + content = delta.get("content") + if isinstance(content, list): + content = "".join( + part.get("text", "") + for part in content + if isinstance(part, dict) and part.get("type") == "text" + ) + if content: + timestamp = time.perf_counter() + if ttft == 0.0: + ttft = timestamp - st + turn.ttft = ttft + else: + turn.itl.append(timestamp - most_recent_timestamp) + most_recent_timestamp = timestamp + generated_text += content + elif usage := data.get("usage"): + turn.output_len = int(usage.get("completion_tokens") or 0) + + turn.e2el = max(0.0, most_recent_timestamp - st) + turn.success = True + if turn.output_len == 0 and generated_text: + turn.output_len = count_text_tokens(generated_text) + if turn.output_len > 1: + turn.tpot = (turn.e2el - turn.ttft) / (turn.output_len - 1) + return turn, 200 + + +async def _send_chat_turn( + chat_messages: list[dict[str, Any]], + model_id: str, + model_name: Optional[str], + api_url: str, + output_len: int, + context_len: int, + count_text_tokens: Callable[[str], int], + ignore_eos: bool = False, +) -> TurnResult: + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', 'EMPTY')}", + } + payload_base = { + "model": model_name or model_id, + "messages": chat_messages, + "temperature": 0.0, + "stream": True, + "stream_options": {"include_usage": True}, + } + if ignore_eos: + payload_base["ignore_eos"] = True + + errors: list[str] = [] + for max_tokens_key in ("max_completion_tokens", "max_tokens"): + payload = {**payload_base, max_tokens_key: output_len} + turn, status = await _stream_chat_request( + api_url=api_url, + payload=payload, + headers=headers, + context_len=context_len, + count_text_tokens=count_text_tokens, + request_mode="chat", + ) + if 
turn.success: + return turn + errors.append(turn.error) + if status not in {400, 404, 422}: + break + + return TurnResult( + turn_idx=-1, + context_len=context_len, + output_len=0, + success=False, + error=" | ".join(error for error in errors if error), + request_mode="chat", + ) + + +async def _send_completion_turn( + prompt: str, + model_id: str, + model_name: Optional[str], + api_url: str, + output_len: int, + context_len: int, + count_text_tokens: Callable[[str], int], + ignore_eos: bool = False, +) -> TurnResult: + payload = { + "model": model_name or model_id, + "prompt": prompt, + "temperature": 0.0, + "max_tokens": output_len, + "stream": True, + "stream_options": {"include_usage": True}, + } + if ignore_eos: + payload["ignore_eos"] = True + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', 'EMPTY')}", + } + + turn = TurnResult( + turn_idx=-1, + context_len=context_len, + output_len=0, + success=False, + request_mode="completions", + ) + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + + try: + async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: + async with session.post(url=api_url, json=payload, headers=headers) as response: + if response.status != 200: + error_text = (await response.text()).strip() + turn.error = f"HTTP {response.status}: {error_text or response.reason}" + return turn + + async for sse_payload in _iter_sse_lines(response): + data = json.loads(sse_payload) + if choices := data.get("choices"): + choice = choices[0] + content = choice.get("text") + if content is None: + delta = choice.get("delta", {}) + content = delta.get("content") + if isinstance(content, list): + content = "".join( + part.get("text", "") + for part in content + if isinstance(part, dict) and part.get("type") == "text" + ) + if content: + timestamp = time.perf_counter() + if ttft == 0.0: + ttft = timestamp - st + turn.ttft = ttft + else: + turn.itl.append(timestamp - most_recent_timestamp) + most_recent_timestamp = timestamp + generated_text += content + elif usage := data.get("usage"): + turn.output_len = int(usage.get("completion_tokens") or 0) + except Exception as exc: + turn.error = str(exc) + return turn + + turn.e2el = max(0.0, most_recent_timestamp - st) + turn.success = True + if turn.output_len == 0 and generated_text: + turn.output_len = count_text_tokens(generated_text) + if turn.output_len > 1: + turn.tpot = (turn.e2el - turn.ttft) / (turn.output_len - 1) + return turn + + +async def poll_server_metrics(api_url: str, interval: float = 2.0) -> list[dict[str, float]]: + """Poll ``/metrics`` periodically to capture KV / cache status.""" + import urllib.parse + + parsed = urllib.parse.urlparse(api_url) + metrics_url = f"{parsed.scheme}://{parsed.netloc}/metrics" + metrics_history: list[dict[str, float]] = [] + + try: + async with aiohttp.ClientSession(trust_env=True) as session: + while True: + try: + async with session.get(metrics_url, timeout=aiohttp.ClientTimeout(total=5.0)) as response: + if response.status == 200: + text = await response.text() + snapshot: dict[str, float] = {} + for line in text.split("\n"): + parsed_line = _parse_prometheus_sample(line) + if parsed_line is None: + continue + metric_name, metric_value = parsed_line + if metric_name == "vllm:gpu_cache_usage_perc": + snapshot["vllm_gpu_cache_usage"] = metric_value + elif metric_name == "vllm:cpu_cache_usage_perc": + snapshot["vllm_cpu_cache_usage"] = metric_value + elif metric_name 
== "sglang:cache_hit_rate": + snapshot["sglang_cache_hit_rate"] = metric_value + elif metric_name == "sglang:kv_cache_usage": + snapshot["sglang_kv_cache_usage"] = metric_value + elif metric_name == "sglang:token_usage": + snapshot["sglang_token_usage"] = metric_value + elif metric_name == "vllm:num_preemptions_total": + snapshot["vllm_preemptions_total"] = metric_value + elif metric_name == "vllm:num_requests_running": + snapshot["vllm_requests_running"] = metric_value + elif metric_name == "vllm:num_requests_waiting": + snapshot["vllm_requests_waiting"] = metric_value + if snapshot: + metrics_history.append(snapshot) + except Exception: + pass + await asyncio.sleep(interval) + except asyncio.CancelledError: + pass + + return metrics_history + + +def _percentile(values: list[float], percentile: float) -> float: + if not values: + return 0.0 + return float(np.percentile(values, percentile)) + + +def calculate_multiturn_metrics( + session_results: list[SessionResult], + max_turns: int, + selected_percentiles: list[float], +) -> dict[str, Any]: + ms = 1000.0 + per_turn: dict[str, dict[str, Any]] = {} + + for turn_index in range(max_turns): + ttfts: list[float] = [] + tpots: list[float] = [] + e2els: list[float] = [] + context_lens: list[int] = [] + actual_context_lens: list[int] = [] + output_lens: list[int] = [] + successes = 0 + for session in session_results: + if turn_index < len(session.turns): + turn = session.turns[turn_index] + if turn.success: + ttfts.append(turn.ttft) + tpots.append(turn.tpot) + e2els.append(turn.e2el) + context_lens.append(turn.context_len) + actual_context_lens.append(turn.actual_context_len) + output_lens.append(turn.output_len) + successes += 1 + + key = f"turn_{turn_index + 1}" + metrics: dict[str, Any] = { + "completed": successes, + "mean_context_len": float(np.mean(context_lens)) if context_lens else 0.0, + "mean_actual_context_len": float(np.mean(actual_context_lens)) if actual_context_lens else 0.0, + "mean_output_len": float(np.mean(output_lens)) if output_lens else 0.0, + } + for label, values in (("ttft", ttfts), ("tpot", tpots), ("e2el", e2els)): + metrics[f"mean_{label}_ms"] = float(np.mean(values)) * ms if values else 0.0 + metrics[f"median_{label}_ms"] = float(np.median(values)) * ms if values else 0.0 + metrics[f"std_{label}_ms"] = float(np.std(values)) * ms if values else 0.0 + for percentile in selected_percentiles: + percentile_label = str(int(percentile)) if int(percentile) == percentile else str(percentile) + metrics[f"p{percentile_label}_{label}_ms"] = _percentile(values, percentile) * ms + per_turn[key] = metrics + + all_ttfts: list[float] = [] + all_tpots: list[float] = [] + all_e2els: list[float] = [] + total_input = 0 + total_actual_input = 0 + total_output = 0 + completed_sessions = 0 + total_wall = 0.0 + max_actual_context_per_turn = 0 + + for session in session_results: + if session.turns and all(turn.success for turn in session.turns): + completed_sessions += 1 + total_input += session.total_input_tokens + total_actual_input += session.total_actual_input_tokens + total_output += session.total_output_tokens + total_wall = max(total_wall, session.total_duration) + for turn in session.turns: + if turn.success: + all_ttfts.append(turn.ttft) + all_tpots.append(turn.tpot) + all_e2els.append(turn.e2el) + if turn.actual_context_len > max_actual_context_per_turn: + max_actual_context_per_turn = turn.actual_context_len + + aggregate: dict[str, Any] = { + "completed_sessions": completed_sessions, + "total_sessions": len(session_results), + 
"total_input_tokens": total_input, + "total_actual_input_tokens": total_actual_input, + "max_actual_context_len_per_turn": max_actual_context_per_turn, + "total_output_tokens": total_output, + "total_wall_time_s": total_wall, + "session_throughput_sps": completed_sessions / total_wall if total_wall > 0 else 0.0, + "output_throughput_tps": total_output / total_wall if total_wall > 0 else 0.0, + "total_token_throughput_tps": (total_input + total_output) / total_wall if total_wall > 0 else 0.0, + } + for label, values in (("ttft", all_ttfts), ("tpot", all_tpots), ("e2el", all_e2els)): + aggregate[f"mean_{label}_ms"] = float(np.mean(values)) * ms if values else 0.0 + aggregate[f"median_{label}_ms"] = float(np.median(values)) * ms if values else 0.0 + aggregate[f"std_{label}_ms"] = float(np.std(values)) * ms if values else 0.0 + for percentile in selected_percentiles: + percentile_label = str(int(percentile)) if int(percentile) == percentile else str(percentile) + aggregate[f"p{percentile_label}_{label}_ms"] = _percentile(values, percentile) * ms + + return {"per_turn_metrics": per_turn, "aggregate_metrics": aggregate} + + +async def _run_replay_session( + session: ReplaySession, + model_id: str, + model_name: Optional[str], + chat_api_url: str, + completion_api_url: str, + count_text_tokens: Callable[[str], int], + pbar: Optional[tqdm], + ignore_eos: bool, +) -> SessionResult: + result = SessionResult(session_id=session.session_id) + start = time.perf_counter() + + for replay_turn in session.turns: + if replay_turn.wait_before_s > 0: + await asyncio.sleep(replay_turn.wait_before_s) + + if session.request_mode == "chat": + turn_result = await _send_chat_turn( + chat_messages=replay_turn.chat_messages, + model_id=model_id, + model_name=model_name, + api_url=chat_api_url, + output_len=replay_turn.output_len, + context_len=replay_turn.context_len, + count_text_tokens=count_text_tokens, + ignore_eos=ignore_eos, + ) + else: + turn_result = await _send_completion_turn( + prompt=replay_turn.completion_prompt, + model_id=model_id, + model_name=model_name, + api_url=completion_api_url, + output_len=replay_turn.output_len, + context_len=replay_turn.context_len, + count_text_tokens=count_text_tokens, + ignore_eos=ignore_eos, + ) + + turn_result.turn_idx = replay_turn.turn_idx + turn_result.actual_context_len = replay_turn.actual_context_len + result.turns.append(turn_result) + if turn_result.success: + result.total_input_tokens += turn_result.context_len + result.total_actual_input_tokens += turn_result.actual_context_len + result.total_output_tokens += turn_result.output_len + if pbar is not None: + pbar.update(1) + + result.total_duration = time.perf_counter() - start + return result + + +async def _run_warmup_sessions( + sessions: list[ReplaySession], + model_id: str, + model_name: Optional[str], + chat_api_url: str, + completion_api_url: str, + count_text_tokens: Callable[[str], int], + num_warmup_sessions: int, + ignore_eos: bool, +) -> None: + if num_warmup_sessions <= 0 or not sessions: + return + + print(f"Running {num_warmup_sessions} warmup session(s) (results discarded) ...") + warmup_jobs: list[asyncio.Task[SessionResult]] = [] + for index in range(num_warmup_sessions): + source = sessions[index % len(sessions)] + warmup_turns = [ + ReplayTurn( + turn_idx=turn.turn_idx, + turn_id=turn.turn_id, + output_len=turn.output_len, + wait_before_s=0.0, + context_len=turn.context_len, + actual_context_len=turn.actual_context_len, + chat_messages=turn.chat_messages, + 
completion_prompt=turn.completion_prompt, + ) + for turn in source.turns[: min(2, len(source.turns))] + ] + warmup_jobs.append( + asyncio.create_task( + _run_replay_session( + session=ReplaySession( + session_id=f"warmup-{index}", + trace_id=source.trace_id, + runtime_stack_id=source.runtime_stack_id, + hardware_profile_id=source.hardware_profile_id, + canonical_model_id=source.canonical_model_id, + support_status=source.support_status, + benchmark_certification_status=source.benchmark_certification_status, + request_mode=source.request_mode, + adapter_id=source.adapter_id, + turns=warmup_turns, + ), + model_id=model_id, + model_name=model_name, + chat_api_url=chat_api_url, + completion_api_url=completion_api_url, + count_text_tokens=count_text_tokens, + pbar=None, + ignore_eos=ignore_eos, + ) + ) + ) + + results = await asyncio.gather(*warmup_jobs, return_exceptions=True) + succeeded = sum( + 1 + for result in results + if isinstance(result, SessionResult) and any(turn.success for turn in result.turns) + ) + failed = num_warmup_sessions - succeeded + if failed: + print( + f" ⚠️ {failed}/{num_warmup_sessions} warmup session(s) failed. " + "Check the server endpoint and selected export cell." + ) + else: + print(f" ✅ {succeeded} warmup session(s) completed successfully.") + print() + + +async def run_export_replay_benchmark( + sessions: list[ReplaySession], + selection_metadata: dict[str, Any], + model_id: str, + model_name: Optional[str], + chat_api_url: str, + completion_api_url: str, + count_text_tokens: Callable[[str], int], + max_concurrency: int, + selected_percentiles: list[float], + disable_tqdm: bool, + num_warmup_sessions: int = 1, + ignore_eos: bool = False, +) -> dict[str, Any]: + if not sessions: + raise ValueError("No replay sessions were selected.") + + max_turns = max(len(session.turns) for session in sessions) + total_turns = sum(len(session.turns) for session in sessions) + + print("============================================================") + print(" Export Replay Selection") + print("============================================================") + print(f" Adapter: {selection_metadata['adapter_id']}") + print(f" Sessions selected: {selection_metadata['selected_sessions']}") + print(f" Runtime stack(s): {', '.join(selection_metadata['runtime_stack_ids'])}") + print(f" Hardware profile(s): {', '.join(selection_metadata['hardware_profile_ids'])}") + print(f" Canonical model(s): {', '.join(selection_metadata['canonical_model_ids'])}") + print( + " Support status(es): " + f"{', '.join(selection_metadata['support_statuses'])}" + ) + print( + " Certification status: " + f"{', '.join(selection_metadata['benchmark_certification_statuses'])}" + ) + print(f" Request mode mix: {selection_metadata['request_mode_mix']}") + print(f" Total turns: {total_turns}") + print("============================================================") + print() + + await _run_warmup_sessions( + sessions=sessions, + model_id=model_id, + model_name=model_name, + chat_api_url=chat_api_url, + completion_api_url=completion_api_url, + count_text_tokens=count_text_tokens, + num_warmup_sessions=num_warmup_sessions, + ignore_eos=ignore_eos, + ) + + pbar = None if disable_tqdm else tqdm(total=total_turns, desc="turns") + semaphore = asyncio.Semaphore(max_concurrency) + + async def _limited_run(session: ReplaySession) -> SessionResult: + async with semaphore: + return await _run_replay_session( + session=session, + model_id=model_id, + model_name=model_name, + chat_api_url=chat_api_url, + 
completion_api_url=completion_api_url, + count_text_tokens=count_text_tokens, + pbar=pbar, + ignore_eos=ignore_eos, + ) + + print( + f"Starting export replay benchmark: {len(sessions)} sessions, " + f"max_turns={max_turns}, max_concurrency={max_concurrency}" + ) + benchmark_start = time.perf_counter() + metrics_task = asyncio.create_task(poll_server_metrics(chat_api_url, interval=2.0)) + jobs = [asyncio.create_task(_limited_run(session)) for session in sessions] + session_results = await asyncio.gather(*jobs) + benchmark_duration = time.perf_counter() - benchmark_start + + metrics_task.cancel() + try: + server_metrics = await metrics_task + except asyncio.CancelledError: + server_metrics = [] + + if pbar is not None: + pbar.close() + + metrics = calculate_multiturn_metrics( + session_results=session_results, + max_turns=max_turns, + selected_percentiles=selected_percentiles, + ) + aggregate = metrics["aggregate_metrics"] + per_turn = metrics["per_turn_metrics"] + + cache_usage_avg = 0.0 + cache_hit_rate_avg = 0.0 + gpu_cache_usage_avg = 0.0 + gpu_cache_usage_peak = 0.0 + cpu_cache_usage_avg = 0.0 + cpu_cache_usage_peak = 0.0 + gpu_cache_metric_name: str | None = None + cpu_cache_metric_name: str | None = None + observability_status = "no_cache_metrics" + cpu_samples: list[float] = [] + kv_offload_observed = False + if server_metrics: + vllm_gpu_samples = [ + item["vllm_gpu_cache_usage"] + for item in server_metrics + if "vllm_gpu_cache_usage" in item + ] + sglang_gpu_samples: list[float] = [] + saw_sglang_kv_metric = False + saw_sglang_token_metric = False + for item in server_metrics: + if "sglang_kv_cache_usage" in item: + sglang_gpu_samples.append(item["sglang_kv_cache_usage"]) + saw_sglang_kv_metric = True + elif "sglang_token_usage" in item: + sglang_gpu_samples.append(item["sglang_token_usage"]) + saw_sglang_token_metric = True + + if saw_sglang_kv_metric: + gpu_cache_metric_name = "sglang:kv_cache_usage" + elif saw_sglang_token_metric: + gpu_cache_metric_name = "sglang:token_usage" + + if vllm_gpu_samples: + gpu_samples = vllm_gpu_samples + gpu_cache_metric_name = "vllm:gpu_cache_usage_perc" + else: + gpu_samples = sglang_gpu_samples + + cpu_samples = [ + item["vllm_cpu_cache_usage"] + for item in server_metrics + if "vllm_cpu_cache_usage" in item + ] + if cpu_samples: + cpu_cache_metric_name = "vllm:cpu_cache_usage_perc" + cache_hit_samples = [ + item["sglang_cache_hit_rate"] + for item in server_metrics + if "sglang_cache_hit_rate" in item + ] + + if gpu_samples: + gpu_cache_usage_avg = float(np.mean(gpu_samples)) + gpu_cache_usage_peak = float(np.max(gpu_samples)) + cache_usage_avg = gpu_cache_usage_avg + if cpu_samples: + cpu_cache_usage_avg = float(np.mean(cpu_samples)) + cpu_cache_usage_peak = float(np.max(cpu_samples)) + kv_offload_observed = any(sample > 0.0 for sample in cpu_samples) + if cache_hit_samples: + cache_hit_rate_avg = float(np.mean(cache_hit_samples)) + if cpu_samples: + observability_status = "direct_cpu_cache_metric" + elif gpu_samples or cache_hit_samples: + observability_status = "indirect_without_cpu_cache_metric" + + print() + print("{s:{c}^{n}}".format(s=" Export Replay Benchmark Result ", n=60, c="=")) + print(f" {'Completed sessions:':<35} {aggregate['completed_sessions']}/{aggregate['total_sessions']}") + print(f" {'Benchmark duration (s):':<35} {benchmark_duration:.2f}") + print(f" {'Total input tokens (estimated):':<35} {aggregate['total_input_tokens']}") + print(f" {'Total input tokens (actual sent):':<35} {aggregate['total_actual_input_tokens']}") 
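+    # The next figure is the largest actual_context_len observed on any single
+    # turn, i.e. roughly the deepest per-request context the server had to hold.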
+ print(f" {'Max actual context/turn:':<35} {aggregate['max_actual_context_len_per_turn']}") + print(f" {'Total output tokens:':<35} {aggregate['total_output_tokens']}") + print(f" {'Session throughput (sessions/s):':<35} {aggregate['session_throughput_sps']:.2f}") + print(f" {'Output throughput (tok/s):':<35} {aggregate['output_throughput_tps']:.2f}") + print(f" {'Total throughput (tok/s):':<35} {aggregate['total_token_throughput_tps']:.2f}") + if server_metrics: + print() + print(f" {'Server KV Cache Usage (avg):':<35} {cache_usage_avg:.1%}") + if cpu_cache_metric_name: + print(f" {'Server CPU Cache Usage (avg):':<35} {cpu_cache_usage_avg:.1%}") + if cache_hit_rate_avg > 0: + print(f" {'Prefix Cache Hit Rate (avg):':<35} {cache_hit_rate_avg:.1%}") + if observability_status == "indirect_without_cpu_cache_metric": + print( + f" {'Offload observability:':<35} " + "indirect only (no direct CPU cache metric)" + ) + print() + print("{s:{c}^{n}}".format(s=" Per-Turn TTFT Progression ", n=60, c="-")) + print(f" {'Turn':<8} {'Est Ctx':<10} {'Act Ctx':<10} {'Mean TTFT':<14} {'P99 TTFT':<14} {'Mean E2EL':<14}") + print(f" {'─'*8} {'─'*10} {'─'*10} {'─'*14} {'─'*14} {'─'*14}") + for turn_index in range(max_turns): + key = f"turn_{turn_index + 1}" + if key not in per_turn: + continue + turn_metrics = per_turn[key] + print( + f" {turn_index + 1:<8} " + f"{turn_metrics['mean_context_len']:<10.0f} " + f"{turn_metrics.get('mean_actual_context_len', 0.0):<10.0f} " + f"{turn_metrics['mean_ttft_ms']:<14.1f} " + f"{turn_metrics.get('p99_ttft_ms', 0.0):<14.1f} " + f"{turn_metrics['mean_e2el_ms']:<14.1f}" + ) + print("=" * 60) + + return { + "mode": "export_replay", + "adapter_id": selection_metadata["adapter_id"], + "selection": selection_metadata, + "duration": benchmark_duration, + "num_sessions": len(sessions), + "max_turns": max_turns, + "max_concurrency": max_concurrency, + "num_warmup_sessions": num_warmup_sessions, + "server_metrics_summary": { + "cache_usage_avg": cache_usage_avg, + "cache_hit_rate_avg": cache_hit_rate_avg, + "gpu_cache_usage_avg": gpu_cache_usage_avg, + "gpu_cache_usage_peak": gpu_cache_usage_peak, + "gpu_cache_metric_name": gpu_cache_metric_name, + "cpu_cache_usage_avg": cpu_cache_usage_avg, + "cpu_cache_usage_peak": cpu_cache_usage_peak, + "cpu_cache_metric_name": cpu_cache_metric_name, + "cpu_cache_metric_available": bool(cpu_samples), + "observability_status": observability_status, + # Observability-only signal; not a certification or quality claim. 
+ "kv_offload_observed": kv_offload_observed, + "samples": len(server_metrics), + "preemption_count": int( + max( + (item.get("vllm_preemptions_total", 0.0) for item in server_metrics), + default=0.0, + ) + ) if server_metrics else 0, + "peak_requests_running": float( + max( + (item.get("vllm_requests_running", 0.0) for item in server_metrics), + default=0.0, + ) + ) if server_metrics else 0.0, + "peak_requests_waiting": float( + max( + (item.get("vllm_requests_waiting", 0.0) for item in server_metrics), + default=0.0, + ) + ) if server_metrics else 0.0, + }, + "depth_telemetry": { + "total_estimated_input_tokens": aggregate["total_input_tokens"], + "total_actual_input_tokens": aggregate["total_actual_input_tokens"], + "max_actual_context_len_per_turn": aggregate["max_actual_context_len_per_turn"], + }, + **metrics, + } + + +def main(args: argparse.Namespace) -> None: + random.seed(args.seed) + np.random.seed(args.seed) + + base_url = args.base_url or f"http://{args.host}:{args.port}" + base_url = base_url.rstrip("/") + chat_api_url = args.chat_api_url or f"{base_url}{args.chat_endpoint}" + completion_api_url = args.completion_api_url or f"{base_url}{args.completion_endpoint}" + + tokenizer_id = None if args.skip_tokenizer_load else (args.tokenizer or args.model) + count_text_tokens = build_text_token_counter( + tokenizer_id=tokenizer_id, + tokenizer_mode=args.tokenizer_mode, + trust_remote_code=args.trust_remote_code, + ) + sessions, selection_metadata = load_replay_sessions( + export_file=args.export_file, + count_text_tokens=count_text_tokens, + runtime_stack_ids=_csv_values(args.runtime_stack_id), + hardware_profile_ids=_csv_values(args.hardware_profile_id), + canonical_model_ids=_csv_values(args.canonical_model_id), + trace_ids=_csv_values(args.trace_id), + support_statuses=_csv_values(args.support_status), + request_mode=args.request_mode, + image_token_estimate=args.image_token_estimate, + ignore_waits=args.ignore_waits, + fallback_output_len=args.fallback_output_len, + output_len_cap=args.max_output_len, + session_offset=args.session_offset, + max_sessions=args.max_sessions, + max_turns_per_session=args.max_turns_per_session, + shuffle_sessions=args.shuffle_sessions, + seed=args.seed, + allow_mixed_selection=args.allow_mixed_selection, + ) + + result = asyncio.run( + run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection_metadata, + model_id=args.model, + model_name=args.served_model_name, + chat_api_url=chat_api_url, + completion_api_url=completion_api_url, + count_text_tokens=count_text_tokens, + max_concurrency=args.max_concurrency, + selected_percentiles=[float(item) for item in args.metric_percentiles.split(",")], + disable_tqdm=args.disable_tqdm, + num_warmup_sessions=args.num_warmup_sessions, + ignore_eos=args.ignore_eos, + ) + ) + + if args.save_result: + result_json: dict[str, Any] = { + "date": datetime.now().strftime("%Y%m%d-%H%M%S"), + "model_id": args.model, + } + if tokenizer_id is not None: + result_json["tokenizer_id"] = tokenizer_id + if args.metadata: + for item in args.metadata: + if "=" in item: + key, value = item.split("=", 1) + result_json[key.strip()] = value.strip() + result_json = {**result_json, **result} + + file_name = args.result_filename or f"export-replay-{Path(args.export_file).stem}.json" + if args.result_dir: + os.makedirs(args.result_dir, exist_ok=True) + file_name = os.path.join(args.result_dir, file_name) + + with open(file_name, "w", encoding="utf-8") as handle: + json.dump(result_json, handle, indent=2) + 
print(f"\nResults saved to {file_name}") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description=( + "Replay ISB1 export sessions against an OpenAI-compatible server. " + "Supports chat-completions replay for standalone vLLM/SGLang and " + "prompt-projected completions replay for TRT / Dynamo-style cells." + ) + ) + + parser.add_argument("--export-file", type=str, required=True, + help="Path to an inferencex_multiturn or inferencex_trace_replay export JSON") + parser.add_argument("--base-url", type=str, default=None, + help="Server base URL, e.g. http://0.0.0.0:8000") + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--chat-endpoint", type=str, default="/v1/chat/completions") + parser.add_argument("--completion-endpoint", type=str, default="/v1/completions") + parser.add_argument("--chat-api-url", type=str, default=None, + help="Override the full chat endpoint URL") + parser.add_argument("--completion-api-url", type=str, default=None, + help="Override the full completions endpoint URL") + + parser.add_argument("--model", type=str, required=True, + help="Model identifier sent to the target server") + parser.add_argument("--served-model-name", type=str, default=None, + help="Served model name if different from --model") + parser.add_argument("--tokenizer", type=str, default=None, + help="Tokenizer name/path if different from --model") + parser.add_argument("--tokenizer-mode", type=str, default="auto", + choices=["auto", "slow", "mistral", "custom"]) + parser.add_argument("--trust-remote-code", action="store_true") + parser.add_argument("--skip-tokenizer-load", action="store_true", + help="Use approximate token counting instead of loading a tokenizer") + + parser.add_argument("--runtime-stack-id", type=str, default=None, + help="Comma-separated runtime_stack_id filter(s)") + parser.add_argument("--hardware-profile-id", type=str, default=None, + help="Comma-separated hardware_profile_id filter(s)") + parser.add_argument("--canonical-model-id", type=str, default=None, + help="Comma-separated canonical_model_id filter(s)") + parser.add_argument("--trace-id", type=str, default=None, + help="Comma-separated trace_id filter(s)") + parser.add_argument("--support-status", type=str, default=None, + help="Comma-separated support_status filter(s)") + parser.add_argument("--request-mode", type=str, default="auto", + choices=["auto", "chat", "completions"]) + parser.add_argument("--allow-mixed-selection", action="store_true", + help="Allow multiple runtime/model/hardware identities in one run") + parser.add_argument("--shuffle-sessions", action="store_true") + parser.add_argument("--session-offset", type=int, default=0) + parser.add_argument("--max-sessions", type=int, default=None) + parser.add_argument("--max-turns-per-session", type=int, default=None) + parser.add_argument("--ignore-waits", action="store_true", + help="Ignore export wait_before/arrival-time gaps") + parser.add_argument("--fallback-output-len", type=int, default=DEFAULT_FALLBACK_OUTPUT_LEN, + help="Fallback output length when export metadata is missing") + parser.add_argument("--max-output-len", type=int, default=None, + help="Optional cap applied to each exported target output length") + parser.add_argument("--image-token-estimate", type=int, default=DEFAULT_IMAGE_TOKEN_ESTIMATE, + help="Approximate token cost for image blocks when no explicit token count exists") + + parser.add_argument("--max-concurrency", type=int, 
default=8, + help="Maximum concurrently active replay sessions") + parser.add_argument("--num-warmup-sessions", type=int, default=1, + help="Warmup sessions to prime KV/prefix cache before measurement") + parser.add_argument("--ignore-eos", action="store_true") + + parser.add_argument("--save-result", action="store_true") + parser.add_argument("--result-dir", type=str, default=None) + parser.add_argument("--result-filename", type=str, default=None) + parser.add_argument("--metadata", metavar="KEY=VALUE", nargs="*") + parser.add_argument("--metric-percentiles", type=str, default="90,99,99.9") + + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--disable-tqdm", action="store_true") + + main(parser.parse_args()) diff --git a/utils/gate_isb1.py b/utils/gate_isb1.py new file mode 100644 index 000000000..e223e8c29 --- /dev/null +++ b/utils/gate_isb1.py @@ -0,0 +1,298 @@ +import argparse +import json +from pathlib import Path +from typing import Any, Callable + + +Row = dict[str, Any] +Criterion = tuple[str, Callable[[Row], bool]] + +EXPECTED_131K_COVERAGE = { + ("b200", "vllm"), + ("b200", "sglang"), + ("h100", "vllm"), + ("h100", "sglang"), + ("h200", "vllm"), + ("h200", "sglang"), +} +EXPECTED_1M_COVERAGE = { + ("b200", "vllm"), + ("b200", "sglang"), +} + + +def normalize_hw_label(hw: str | None) -> str: + """Normalize runner labels like h200-cw-1 to coverage labels like h200.""" + if not hw: + return "" + return hw.split("-", 1)[0] + + +def load_rows(report_path: Path) -> list[Row]: + """Load aggregated ISB1 rows from JSON.""" + payload = json.loads(report_path.read_text()) + if isinstance(payload, list): + return [row for row in payload if isinstance(row, dict)] + if isinstance(payload, dict): + return [payload] + raise ValueError(f"Unsupported ISB1 payload type: {type(payload)!r}") + + +def build_row_reference(row: Row, failed_criteria: list[str] | None = None) -> Row: + """Build a concise row reference for gate reports.""" + reference: Row = { + "result_filename": row.get("result_filename"), + "artifact_stems": row.get("artifact_stems") or {}, + "hw": row.get("hw"), + "framework": row.get("framework"), + "infmax_model_prefix": row.get("infmax_model_prefix"), + "support_status": row.get("support_status"), + "context_pressure_status": (row.get("context_pressure_signal") or {}).get("status"), + } + if failed_criteria: + reference["failed_criteria"] = failed_criteria + return reference + + +def completed_sessions_match(row: Row) -> bool: + return row.get("completed_sessions") == row.get("total_sessions") + + +def throughput_positive(row: Row) -> bool: + return float(row.get("session_throughput_sps") or 0.0) > 0.0 + + +def certification_verified(row: Row) -> bool: + return row.get("benchmark_certification_status") == "dataset_replay_verified" + + +def context_not_suspicious(row: Row) -> bool: + return not bool(row.get("context_pressure_suspicious")) + + +def vllm_context_ok(row: Row) -> bool: + if row.get("framework") != "vllm": + return True + signal = row.get("context_pressure_signal") or {} + return signal.get("status") == "ok" and not bool(row.get("context_pressure_suspicious")) + + +def get_present_coverage(rows: list[Row]) -> set[tuple[str, str]]: + return { + (normalize_hw_label(row.get("hw")), row.get("framework", "")) + for row in rows + } + + +def evaluate_gate( + gate_id: str, + label: str, + rows: list[Row], + criteria: list[Criterion], + *, + expected_coverage: set[tuple[str, str]] | None = None, + exact_coverage: bool = False, +) -> Row: + """Evaluate a 
gate definition over matching rows.""" + if not rows: + return { + "id": gate_id, + "label": label, + "status": "no_rows", + "matched_rows": 0, + "failing_rows": [], + "review_required_rows": [], + "missing_coverage": [], + "unexpected_coverage": [], + } + + failing_rows = [] + review_required_rows = [] + for row in rows: + failed_criteria = [description for description, checker in criteria if not checker(row)] + if failed_criteria: + failing_rows.append(build_row_reference(row, failed_criteria)) + signal = row.get("context_pressure_signal") or {} + if signal.get("requires_log_review"): + review_required_rows.append(build_row_reference(row)) + + missing_coverage: list[list[str]] = [] + unexpected_coverage: list[list[str]] = [] + if expected_coverage is not None: + present_coverage = get_present_coverage(rows) + missing_coverage = [list(item) for item in sorted(expected_coverage - present_coverage)] + if exact_coverage: + unexpected_coverage = [list(item) for item in sorted(present_coverage - expected_coverage)] + + status = "pass" + if failing_rows or missing_coverage or unexpected_coverage: + status = "fail" + + return { + "id": gate_id, + "label": label, + "status": status, + "matched_rows": len(rows), + "failing_rows": failing_rows, + "review_required_rows": review_required_rows, + "missing_coverage": missing_coverage, + "unexpected_coverage": unexpected_coverage, + } + + +def build_gate_report(rows: list[Row], advisory: bool = True) -> Row: + """Build the full advisory gate report for an aggregated ISB1 result set.""" + gates = [ + evaluate_gate( + "control_lanes", + "DSR1/GPT-OSS control lanes", + [ + row + for row in rows + if row.get("infmax_model_prefix") in {"dsr1", "gptoss"} + and row.get("support_status") == "supported" + ], + [ + ("completed_sessions == total_sessions", completed_sessions_match), + ("session_throughput_sps > 0", throughput_positive), + ], + ), + evaluate_gate( + "qwen_131k", + "Qwen 131k preview lanes", + [ + row + for row in rows + if row.get("infmax_model_prefix") == "qwen3.5" + and row.get("support_status") == "reviewed_preview" + and (row.get("effective_max_context_depth") or 0) < 200000 + ], + [ + ("completed_sessions == total_sessions", completed_sessions_match), + ("session_throughput_sps > 0", throughput_positive), + ], + expected_coverage=EXPECTED_131K_COVERAGE, + ), + evaluate_gate( + "qwen_500k", + "Qwen 500k preview lanes", + [ + row + for row in rows + if row.get("infmax_model_prefix") == "qwen3.5" + and row.get("effective_max_context_depth") == 524288 + and row.get("context_pressure_class") == "extended_500k" + ], + [ + ("completed_sessions == total_sessions", completed_sessions_match), + ( + "benchmark_certification_status == dataset_replay_verified", + certification_verified, + ), + ("context_pressure_suspicious == false", context_not_suspicious), + ("vllm context_pressure_signal.status == ok", vllm_context_ok), + ], + ), + evaluate_gate( + "qwen_1m", + "Qwen 1M preview lanes", + [ + row + for row in rows + if row.get("infmax_model_prefix") == "qwen3.5" + and row.get("effective_max_context_depth") == 1048576 + and row.get("context_pressure_class") == "extended_1m" + ], + [ + ("completed_sessions == total_sessions", completed_sessions_match), + ("context_pressure_suspicious == false", context_not_suspicious), + ("vllm context_pressure_signal.status == ok", vllm_context_ok), + ], + expected_coverage=EXPECTED_1M_COVERAGE, + exact_coverage=True, + ), + ] + + statuses = {gate["status"] for gate in gates} + if "fail" in statuses: + overall = "fail" + 
elif statuses == {"pass"}: + overall = "pass" + else: + overall = "partial" + + return { + "gates": gates, + "overall": overall, + "advisory": advisory, + } + + +def render_markdown(report: Row) -> str: + """Render a concise markdown advisory summary for workflow step summaries.""" + lines = [ + "## ISB1 Advisory Gates", + "", + f"Overall: **{report['overall'].upper()}** ({'advisory' if report['advisory'] else 'strict'})", + "", + ] + + for gate in report["gates"]: + lines.append(f"### {gate['label']} — {gate['status'].upper()}") + lines.append("") + lines.append(f"- Matched rows: {gate['matched_rows']}") + if gate["missing_coverage"]: + formatted = ", ".join(f"{hw}/{framework}" for hw, framework in gate["missing_coverage"]) + lines.append(f"- Missing coverage: {formatted}") + if gate["unexpected_coverage"]: + formatted = ", ".join( + f"{hw}/{framework}" for hw, framework in gate["unexpected_coverage"] + ) + lines.append(f"- Unexpected coverage: {formatted}") + if gate["failing_rows"]: + lines.append("- Failing rows:") + for row in gate["failing_rows"]: + failed_criteria = ", ".join(row.get("failed_criteria", [])) or "unknown" + lines.append( + f" - `{row.get('result_filename', 'unknown')}` ({row.get('hw', '-')}/" + f"{row.get('framework', '-')}) failed: {failed_criteria}" + ) + elif gate["matched_rows"]: + lines.append("- No failing rows.") + if gate["review_required_rows"]: + review_rows = ", ".join( + f"`{row.get('result_filename', 'unknown')}`" for row in gate["review_required_rows"] + ) + lines.append( + "- Manual log review still required for: " + f"{review_rows}" + ) + lines.append("") + + return "\n".join(lines).rstrip() + "\n" + + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Evaluate advisory ISB1 gates.") + parser.add_argument("report_path", type=Path) + parser.add_argument("--strict", action="store_true") + parser.add_argument("--format", choices=["json", "markdown"], default="json") + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = parse_args(argv) + report = build_gate_report(load_rows(args.report_path), advisory=not args.strict) + + if args.format == "markdown": + print(render_markdown(report)) + else: + print(json.dumps(report, indent=2)) + + if args.strict and report["overall"] == "fail": + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index bc4562415..14c69d3e9 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -10,7 +10,11 @@ from validation import ( validate_matrix_entry, + validate_isb1_matrix_entry, + validate_isb1_kv_stress_matrix_entry, load_config_files, + load_isb1_config_files, + load_isb1_kv_stress_config_files, load_runner_file, Fields ) @@ -374,6 +378,243 @@ def generate_full_sweep(args, all_config_data, runner_data): return matrix_values +def generate_isb1_sweep(args, all_config_data, runner_data): + """Generate ISB1 replay sweep configurations with optional filtering.""" + if args.runner_type: + valid_runner_types = set(runner_data.keys()) + invalid_runners = set(args.runner_type) - valid_runner_types + if invalid_runners: + raise ValueError( + f"Invalid runner type(s): {invalid_runners}. 
" + f"Valid runner types are: {', '.join(sorted(valid_runner_types))}" + ) + + matrix_values = [] + + for _, val in all_config_data.items(): + if args.model_prefix and val[Fields.MODEL_PREFIX.value] not in args.model_prefix: + continue + + if args.precision and val[Fields.PRECISION.value] not in args.precision: + continue + + if args.framework and val[Fields.FRAMEWORK.value] not in args.framework: + continue + + if args.runner_type and val[Fields.RUNNER.value] not in args.runner_type: + continue + + image = val[Fields.IMAGE.value] + model = val[Fields.MODEL.value] + model_code = val[Fields.MODEL_PREFIX.value] + precision = val[Fields.PRECISION.value] + framework = val[Fields.FRAMEWORK.value] + runner = val[Fields.RUNNER.value] + benchmark_type = val[Fields.BENCHMARK_TYPE.value] + runtime_stack_id = val[Fields.RUNTIME_STACK_ID.value] + hardware_profile_id = val[Fields.HARDWARE_PROFILE_ID.value] + canonical_model_id = val[Fields.CANONICAL_MODEL_ID.value] + max_model_len = val.get(Fields.MAX_MODEL_LEN.value) + + runner_nodes_to_use = None + if args.runner_node_filter: + runner_nodes = runner_data.get(runner, []) + runner_nodes_to_use = [ + node for node in runner_nodes if args.runner_node_filter in node + ] + if not runner_nodes_to_use: + continue + + replay_configs = val[Fields.REPLAY_CONFIGS.value] + for replay_config in replay_configs: + export_file = replay_config[Fields.EXPORT_FILE.value] + request_mode = replay_config[Fields.REQUEST_MODE.value] + support_status = replay_config.get(Fields.SUPPORT_STATUS.value) + + for replay_space in replay_config[Fields.SEARCH_SPACE.value]: + max_concurrency = replay_space[Fields.MAX_CONCURRENCY.value] + + if args.max_concurrency is not None: + if args.max_concurrency <= 0: + continue + max_concurrency = min(max_concurrency, args.max_concurrency) + + runners_for_entry = ( + runner_nodes_to_use if runner_nodes_to_use else [runner] + ) + for runner_value in runners_for_entry: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner_value, + Fields.BENCHMARK_TYPE.value: benchmark_type, + Fields.EXPORT_FILE.value: export_file, + Fields.RUNTIME_STACK_ID.value: runtime_stack_id, + Fields.HARDWARE_PROFILE_ID.value: hardware_profile_id, + Fields.CANONICAL_MODEL_ID.value: canonical_model_id, + Fields.SUPPORT_STATUS.value: support_status, + Fields.REQUEST_MODE.value: request_mode, + Fields.MAX_CONCURRENCY.value: max_concurrency, + Fields.MAX_SESSIONS.value: replay_space.get(Fields.MAX_SESSIONS.value), + Fields.MAX_TURNS_PER_SESSION.value: replay_space.get(Fields.MAX_TURNS_PER_SESSION.value), + Fields.MAX_OUTPUT_LEN.value: replay_space.get(Fields.MAX_OUTPUT_LEN.value), + Fields.NUM_WARMUP_SESSIONS.value: replay_space.get( + Fields.NUM_WARMUP_SESSIONS.value, 0 + ), + Fields.IGNORE_WAITS.value: replay_space.get( + Fields.IGNORE_WAITS.value, False + ), + Fields.IGNORE_EOS.value: replay_space.get( + Fields.IGNORE_EOS.value, False + ), + Fields.MAX_MODEL_LEN.value: max_model_len, + Fields.OFFLOAD_MODE.value: val.get(Fields.OFFLOAD_MODE.value), + Fields.KV_CACHE_DTYPE.value: val.get(Fields.KV_CACHE_DTYPE.value), + Fields.DISABLE_PREFIX_CACHING.value: val.get( + Fields.DISABLE_PREFIX_CACHING.value + ), + 'benchmark-duration-s': replay_space.get('benchmark-duration-s'), + Fields.EXP_NAME.value: f"{model_code}_isb1", + } + validate_isb1_matrix_entry(entry) + matrix_values.append(entry) + + return matrix_values + + 
+def generate_isb1_kv_stress_sweep(args, all_config_data, runner_data): + """Generate ISB1 KV stress sweep configurations with optional filtering.""" + if args.runner_type: + valid_runner_types = set(runner_data.keys()) + invalid_runners = set(args.runner_type) - valid_runner_types + if invalid_runners: + raise ValueError( + f"Invalid runner type(s): {invalid_runners}. " + f"Valid runner types are: {', '.join(sorted(valid_runner_types))}" + ) + + matrix_values = [] + + for _, val in all_config_data.items(): + if args.model_prefix and val[Fields.MODEL_PREFIX.value] not in args.model_prefix: + continue + + if args.precision and val[Fields.PRECISION.value] not in args.precision: + continue + + if args.framework and val[Fields.FRAMEWORK.value] not in args.framework: + continue + + if args.runner_type and val[Fields.RUNNER.value] not in args.runner_type: + continue + + image = val[Fields.IMAGE.value] + model = val[Fields.MODEL.value] + model_code = val[Fields.MODEL_PREFIX.value] + precision = val[Fields.PRECISION.value] + framework = val[Fields.FRAMEWORK.value] + runner = val[Fields.RUNNER.value] + benchmark_type = val[Fields.BENCHMARK_TYPE.value] + runtime_stack_id = val[Fields.RUNTIME_STACK_ID.value] + hardware_profile_id = val[Fields.HARDWARE_PROFILE_ID.value] + canonical_model_id = val[Fields.CANONICAL_MODEL_ID.value] + max_model_len = val.get(Fields.MAX_MODEL_LEN.value) + kv_cache_dtype = val[Fields.KV_CACHE_DTYPE.value] + + runner_nodes_to_use = None + if args.runner_node_filter: + runner_nodes = runner_data.get(runner, []) + runner_nodes_to_use = [ + node for node in runner_nodes if args.runner_node_filter in node + ] + if not runner_nodes_to_use: + continue + + kv_stress_configs = val[Fields.KV_STRESS_CONFIGS.value] + for kv_stress_config in kv_stress_configs: + export_file = kv_stress_config[Fields.EXPORT_FILE.value] + request_mode = kv_stress_config[Fields.REQUEST_MODE.value] + support_status = kv_stress_config.get(Fields.SUPPORT_STATUS.value) + workload_type = kv_stress_config[Fields.WORKLOAD_TYPE.value] + + runners_for_entry = ( + runner_nodes_to_use if runner_nodes_to_use else [runner] + ) + + def _append_kv_stress_entry( + max_concurrency: int, + offload_mode: str, + duration_s: int, + *, + tp: int | None = None, + ep: int | None = None, + ) -> None: + disable_prefix_caching = offload_mode == "noprefix" + for runner_value in runners_for_entry: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner_value, + Fields.BENCHMARK_TYPE.value: benchmark_type, + Fields.EXPORT_FILE.value: export_file, + Fields.RUNTIME_STACK_ID.value: runtime_stack_id, + Fields.HARDWARE_PROFILE_ID.value: hardware_profile_id, + Fields.CANONICAL_MODEL_ID.value: canonical_model_id, + Fields.SUPPORT_STATUS.value: support_status, + Fields.REQUEST_MODE.value: request_mode, + Fields.MAX_CONCURRENCY.value: max_concurrency, + Fields.OFFLOAD_MODE.value: offload_mode, + Fields.KV_CACHE_DTYPE.value: kv_cache_dtype, + Fields.DISABLE_PREFIX_CACHING.value: disable_prefix_caching, + 'benchmark-duration-s': duration_s, + Fields.WORKLOAD_TYPE.value: workload_type, + Fields.MAX_MODEL_LEN.value: max_model_len, + Fields.EXP_NAME.value: f"{model_code}_isb1_kv_stress", + } + if tp is not None: + entry[Fields.TP.value] = tp + if ep is not None: + entry[Fields.EP.value] = ep + validate_isb1_kv_stress_matrix_entry(entry) + matrix_values.append(entry) + + tp_configs = 
kv_stress_config.get('tp-configs') + if tp_configs: + for tp_config in tp_configs: + tp_value = tp_config[Fields.TP.value] + ep_value = tp_config.get(Fields.EP.value, 1) + users = tp_config[Fields.USERS.value] + offload_modes = tp_config[Fields.OFFLOAD_MODES.value] + duration_s = tp_config[Fields.DURATION_S.value] + + for max_concurrency in users: + for offload_mode in offload_modes: + _append_kv_stress_entry( + max_concurrency, + offload_mode, + duration_s, + tp=tp_value, + ep=ep_value, + ) + else: + for stress_space in kv_stress_config[Fields.SEARCH_SPACE.value]: + users = stress_space[Fields.USERS.value] + offload_modes = stress_space[Fields.OFFLOAD_MODES.value] + duration_s = stress_space[Fields.DURATION_S.value] + + for max_concurrency in users: + for offload_mode in offload_modes: + _append_kv_stress_entry(max_concurrency, offload_mode, duration_s) + + return matrix_values + + def generate_runner_model_sweep_config(args, all_config_data, runner_data): """Generate runner-model sweep configurations. @@ -885,6 +1126,86 @@ def main(): help='Show this help message and exit' ) + # Subcommand: isb1-sweep + isb1_sweep_parser = subparsers.add_parser( + 'isb1-sweep', + parents=[parent_parser], + add_help=False, + help='Generate ISB1 replay sweep configurations' + ) + isb1_sweep_parser.add_argument( + '--model-prefix', + nargs='+', + required=False, + help='Model prefix(es) to filter configurations (optional, can specify multiple)' + ) + isb1_sweep_parser.add_argument( + '--precision', + nargs='+', + required=False, + help='Precision(s) to filter by (optional, can specify multiple)' + ) + isb1_sweep_parser.add_argument( + '--framework', + nargs='+', + required=False, + help='Framework(s) to filter by (optional, can specify multiple)' + ) + isb1_sweep_parser.add_argument( + '--runner-type', + nargs='+', + required=False, + help='Runner type(s) to filter by (e.g., h200, b200) (optional, can specify multiple)' + ) + isb1_sweep_parser.add_argument( + '--max-concurrency', + type=int, + required=False, + help='Maximum replay concurrency value to include (caps higher values)' + ) + isb1_sweep_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + + # Subcommand: isb1-kv-stress-sweep + isb1_kv_stress_sweep_parser = subparsers.add_parser( + 'isb1-kv-stress-sweep', + parents=[parent_parser], + add_help=False, + help='Generate ISB1 KV stress sweep configurations' + ) + isb1_kv_stress_sweep_parser.add_argument( + '--model-prefix', + nargs='+', + required=False, + help='Model prefix(es) to filter configurations (optional, can specify multiple)' + ) + isb1_kv_stress_sweep_parser.add_argument( + '--precision', + nargs='+', + required=False, + help='Precision(s) to filter by (optional, can specify multiple)' + ) + isb1_kv_stress_sweep_parser.add_argument( + '--framework', + nargs='+', + required=False, + help='Framework(s) to filter by (optional, can specify multiple)' + ) + isb1_kv_stress_sweep_parser.add_argument( + '--runner-type', + nargs='+', + required=False, + help='Runner type(s) to filter by (e.g., h200, b200) (optional, can specify multiple)' + ) + isb1_kv_stress_sweep_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + # Subcommand: test-config test_config_keys_parser = subparsers.add_parser( 'test-config', @@ -915,7 +1236,12 @@ def main(): apply_node_type_defaults(args) # Load and validate configuration files (validation happens by default in load functions) - all_config_data = 
load_config_files(args.config_files) + if args.command == 'isb1-sweep': + all_config_data = load_isb1_config_files(args.config_files) + elif args.command == 'isb1-kv-stress-sweep': + all_config_data = load_isb1_kv_stress_config_files(args.config_files) + else: + all_config_data = load_config_files(args.config_files) runner_data = load_runner_file(args.runner_config) # Route to appropriate function based on subcommand @@ -924,13 +1250,17 @@ def main(): elif args.command == 'runner-model-sweep': matrix_values = generate_runner_model_sweep_config( args, all_config_data, runner_data) + elif args.command == 'isb1-sweep': + matrix_values = generate_isb1_sweep(args, all_config_data, runner_data) + elif args.command == 'isb1-kv-stress-sweep': + matrix_values = generate_isb1_kv_stress_sweep(args, all_config_data, runner_data) elif args.command == 'test-config': matrix_values = generate_test_config_sweep(args, all_config_data) else: parser.error(f"Unknown command: {args.command}") # Handle eval options (mutually exclusive: --no-evals or --evals-only) - if not args.no_evals: + if args.command not in ('isb1-sweep', 'isb1-kv-stress-sweep') and not args.no_evals: matrix_values = mark_eval_entries(matrix_values) if args.evals_only: matrix_values = [e for e in matrix_values if e.get(Fields.RUN_EVAL.value, False)] diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py index d05299472..cbee3f0a6 100644 --- a/utils/matrix_logic/test_generate_sweep_configs.py +++ b/utils/matrix_logic/test_generate_sweep_configs.py @@ -1,22 +1,73 @@ """Comprehensive tests for generate_sweep_configs.py""" import pytest import argparse +import json +from pathlib import Path from generate_sweep_configs import ( seq_len_stoi, seq_len_itos, seq_len_to_str, generate_full_sweep, + generate_isb1_sweep, + generate_isb1_kv_stress_sweep, generate_runner_model_sweep_config, apply_node_type_defaults, expand_config_keys, mark_eval_entries, ) +from validation import ( + load_config_files, + load_isb1_config_files, + load_isb1_kv_stress_config_files, +) # ============================================================================= # Test Fixtures # ============================================================================= + +def _write_isb1_export_fixture( + root: Path, + relative_path: str, + *, + runtime_stack_id: str, + hardware_profile_id: str, + canonical_model_id: str, + support_status: str, + benchmark_certification_status: str = "dataset_replay_verified", +) -> None: + export_path = root / relative_path + export_path.parent.mkdir(parents=True, exist_ok=True) + export_path.write_text( + json.dumps( + { + "adapter_id": "inferencex_multiturn", + "exports": [ + { + "trace_id": f"{export_path.stem}-trace", + "runtime_stack_id": runtime_stack_id, + "hardware_profile_id": hardware_profile_id, + "canonical_model_id": canonical_model_id, + "support_status": support_status, + "benchmark_certification_status": benchmark_certification_status, + "session": { + "session_id": "fixture-session", + "turns": [ + { + "turn_idx": 0, + "turn_id": 0, + "messages": [{"role": "user", "content": "hi"}], + "expected_output_tokens": 8, + } + ], + }, + } + ], + } + ) + ) + @pytest.fixture def sample_single_node_config(): """Single node config based on dsr1-fp8-mi300x-sglang.""" @@ -149,6 +200,161 @@ def full_sweep_args_multi_node(): return args +@pytest.fixture +def sample_isb1_config(): + """ISB1 replay config based on NVIDIA H200 replay lane.""" + return { + "dsr1-isb1-h200-vllm": { + "image": 
"vllm/vllm-openai:v0.8.5", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "benchmark-type": "isb1_replay", + "runtime-stack-id": "vllm-0.8.5-h200", + "hardware-profile-id": "h200-8gpu", + "canonical-model-id": "deepseek-r1-0528", + "max-model-len": 16384, + "replay-configs": [ + { + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "request-mode": "multi-turn", + "support-status": "supported", + "search-space": [ + { + "max-concurrency": 4, + "max-sessions": 2, + "max-turns-per-session": 6, + "max-output-len": 512, + "num-warmup-sessions": 1, + "ignore-waits": True, + "ignore-eos": False, + }, + {"max-concurrency": 8}, + {"max-concurrency": 16}, + ], + }, + { + "export-file": "datasets/isb1/exports/core/code_8k1k.json", + "request-mode": "multi-turn", + "support-status": "supported", + "search-space": [ + {"max-concurrency": 4}, + {"max-concurrency": 8}, + ], + }, + ], + } + } + + +@pytest.fixture +def isb1_sweep_args(): + """Args for isb1-sweep command.""" + args = argparse.Namespace() + args.model_prefix = None + args.precision = None + args.framework = None + args.runner_type = None + args.max_concurrency = None + args.runner_node_filter = None + return args + + +@pytest.fixture +def sample_isb1_kv_stress_config(): + """ISB1 KV stress config with users/offload-mode search space.""" + return { + "gptoss-fp4-h200-isb1-kv-stress-vllm-code": { + "image": "vllm/vllm-openai:v0.18.0", + "model": "openai/gpt-oss-120b", + "model-prefix": "gptoss", + "precision": "fp4", + "framework": "vllm", + "runner": "h200", + "benchmark-type": "isb1_kv_stress", + "runtime-stack-id": "standalone:vllm", + "hardware-profile-id": "nvidia:h200_sxm_141gb", + "canonical-model-id": "gpt_oss_120b", + "max-model-len": 131272, + "kv-cache-dtype": "fp8", + "kv-stress-configs": [ + { + "export-file": "datasets/isb1/exports/extension_131k/vllm/code_131k1k.json", + "request-mode": "multi-turn", + "support-status": "reviewed_preview", + "workload-type": "code", + "search-space": [ + { + "users": [2, 4, 8], + "offload-modes": ["on", "off", "noprefix"], + "duration-s": 1800, + } + ], + } + ], + } + } + + +@pytest.fixture +def sample_isb1_kv_stress_tp_config(): + """ISB1 KV stress config using per-TP expansion.""" + return { + "gptoss-fp4-h200-isb1-kv-stress-vllm-code-tp": { + "image": "vllm/vllm-openai:v0.18.0", + "model": "openai/gpt-oss-120b", + "model-prefix": "gptoss", + "precision": "fp4", + "framework": "vllm", + "runner": "h200", + "benchmark-type": "isb1_kv_stress", + "runtime-stack-id": "standalone:vllm", + "hardware-profile-id": "nvidia:h200_sxm_141gb", + "canonical-model-id": "gpt_oss_120b", + "max-model-len": 131272, + "kv-cache-dtype": "fp8", + "kv-stress-configs": [ + { + "export-file": "datasets/isb1/exports/extension_131k/vllm/code_131k1k.json", + "request-mode": "multi-turn", + "support-status": "reviewed_preview", + "workload-type": "code", + "search-space": [ + { + "users": [1], + "offload-modes": ["off"], + "duration-s": 10, + } + ], + "tp-configs": [ + { + "tp": 8, + "ep": 1, + "users": [2, 4, 8], + "offload-modes": ["on", "off", "noprefix"], + "duration-s": 1800, + } + ], + } + ], + } + } + + +@pytest.fixture +def isb1_kv_stress_sweep_args(): + """Args for isb1-kv-stress-sweep command.""" + args = argparse.Namespace() + args.model_prefix = None + args.precision = None + args.framework = None + args.runner_type = None + args.runner_node_filter = None + return args + + # 
============================================================================= # Test seq_len mappings # ============================================================================= @@ -181,6 +387,573 @@ def test_unknown_sequence_lengths(self): assert seq_len_to_str(4096, 1024) == "4096_1024" +# ============================================================================= +# Test generate_isb1_sweep +# ============================================================================= + +class TestGenerateISB1Sweep: + """Tests for generate_isb1_sweep.""" + + def test_basic_sweep_generation(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + + def test_matrix_entry_structure(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + entry = result[0] + assert entry["benchmark-type"] == "isb1_replay" + assert entry["export-file"].endswith("chat_8k1k.json") + assert entry["runtime-stack-id"] == "vllm-0.8.5-h200" + assert entry["hardware-profile-id"] == "h200-8gpu" + assert entry["canonical-model-id"] == "deepseek-r1-0528" + assert entry["support-status"] == "supported" + assert entry["request-mode"] == "multi-turn" + assert entry["max-concurrency"] == 4 + assert entry["max-sessions"] == 2 + assert entry["max-turns-per-session"] == 6 + assert entry["max-output-len"] == 512 + assert entry["num-warmup-sessions"] == 1 + assert entry["ignore-waits"] is True + assert entry["ignore-eos"] is False + assert entry["max-model-len"] == 16384 + assert entry["exp-name"] == "dsr1_isb1" + assert "run-eval" not in entry + + def test_filter_by_model_prefix(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.model_prefix = ["dsr1"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + + isb1_sweep_args.model_prefix = ["gptoss"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def test_filter_by_precision(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.precision = ["fp8"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + + isb1_sweep_args.precision = ["fp4"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def test_filter_by_framework(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.framework = ["vllm"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + + isb1_sweep_args.framework = ["sglang"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def test_filter_by_runner_type(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.runner_type = ["h200"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + + isb1_sweep_args.runner_type = ["h100"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def 
test_invalid_runner_type_raises_error(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.runner_type = ["not-a-runner"] + with pytest.raises(ValueError, match="Invalid runner type"): + generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + + def test_max_concurrency_cap(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.max_concurrency = 6 + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + assert sorted(entry["max-concurrency"] for entry in result) == [4, 4, 6, 6, 6] + + def test_non_positive_max_concurrency_skips_all(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.max_concurrency = 0 + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def test_max_model_len_passthrough_optional(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert all(entry["max-model-len"] == 16384 for entry in result) + + sample_isb1_config["dsr1-isb1-h200-vllm"].pop("max-model-len") + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert all(entry["max-model-len"] is None for entry in result) + + def test_runner_node_filter_expands_runner_nodes(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.runner_node_filter = "cw" + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 10 + assert all(entry["runner"].startswith("h200-cw") for entry in result) + + def test_runner_node_filter_no_match_returns_empty(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.runner_node_filter = "does-not-exist" + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def test_main_routes_isb1_sweep(self, tmp_path, sample_isb1_config, sample_runner_config, monkeypatch): + import yaml + import sys + from generate_sweep_configs import main + + sample_entry = sample_isb1_config["dsr1-isb1-h200-vllm"] + for replay_config in sample_entry["replay-configs"]: + _write_isb1_export_fixture( + tmp_path, + replay_config["export-file"], + runtime_stack_id=sample_entry["runtime-stack-id"], + hardware_profile_id=sample_entry["hardware-profile-id"], + canonical_model_id=sample_entry["canonical-model-id"], + support_status=replay_config["support-status"], + ) + + config_file = tmp_path / "isb1.yaml" + runner_file = tmp_path / "runners.yaml" + config_file.write_text(yaml.dump(sample_isb1_config)) + runner_file.write_text(yaml.dump(sample_runner_config)) + + monkeypatch.setattr( + sys, + "argv", + [ + "generate_sweep_configs.py", + "isb1-sweep", + "--config-files", + str(config_file), + "--runner-config", + str(runner_file), + ], + ) + + result = main() + assert len(result) == 5 + assert all(entry["benchmark-type"] == "isb1_replay" for entry in result) + + +class TestKVStressSweep: + """Tests for generate_isb1_kv_stress_sweep.""" + + def test_basic_kv_stress_sweep_generation( + self, + sample_isb1_kv_stress_config, + sample_runner_config, + isb1_kv_stress_sweep_args, + ): + result = generate_isb1_kv_stress_sweep( + isb1_kv_stress_sweep_args, + sample_isb1_kv_stress_config, + sample_runner_config, + ) 
+ # users(3) * offload-modes(3) = 9 flattened rows + assert len(result) == 9 + + def test_flatten_users_x_offload_modes( + self, + sample_isb1_kv_stress_config, + sample_runner_config, + isb1_kv_stress_sweep_args, + ): + result = generate_isb1_kv_stress_sweep( + isb1_kv_stress_sweep_args, + sample_isb1_kv_stress_config, + sample_runner_config, + ) + + assert all(entry["benchmark-type"] == "isb1_kv_stress" for entry in result) + assert all(isinstance(entry["max-concurrency"], int) for entry in result) + assert all(isinstance(entry["offload-mode"], str) for entry in result) + assert all(entry["benchmark-duration-s"] == 1800 for entry in result) + assert all(entry["kv-cache-dtype"] == "fp8" for entry in result) + assert all(entry["workload-type"] == "code" for entry in result) + + pairs = {(entry["max-concurrency"], entry["offload-mode"]) for entry in result} + assert pairs == { + (2, "on"), + (2, "off"), + (2, "noprefix"), + (4, "on"), + (4, "off"), + (4, "noprefix"), + (8, "on"), + (8, "off"), + (8, "noprefix"), + } + + def test_tp_config_expansion_produces_expected_rows( + self, + sample_isb1_kv_stress_tp_config, + sample_runner_config, + isb1_kv_stress_sweep_args, + ): + result = generate_isb1_kv_stress_sweep( + isb1_kv_stress_sweep_args, + sample_isb1_kv_stress_tp_config, + sample_runner_config, + ) + + # users(3) * offload-modes(3) = 9 rows from tp-configs expansion + assert len(result) == 9 + assert {entry["tp"] for entry in result} == {8} + assert {entry["ep"] for entry in result} == {1} + + def test_repo_kv_stress_config_loads_and_expands(self, isb1_kv_stress_sweep_args): + repo_root = Path(__file__).resolve().parents[2] + config_data = load_isb1_kv_stress_config_files( + [str(repo_root / ".github/configs/isb1-kv-stress.yaml")] + ) + runner_data = { + "b200": ["b200-nb_0"], + "h200": ["h200-cw_2"], + } + + matrix = generate_isb1_kv_stress_sweep( + isb1_kv_stress_sweep_args, + config_data, + runner_data, + ) + + # 4 configs (gptoss/qwen * b200/h200) * 8 users * 3 offload modes + assert len(matrix) == 96 + assert all(entry["benchmark-type"] == "isb1_kv_stress" for entry in matrix) + assert all("tp" not in entry for entry in matrix) + assert all("ep" not in entry for entry in matrix) + + +class TestISB1SweepIsolation: + """Tests for ISB1 sweep isolation from throughput config lane.""" + + def test_repo_isb1_master_includes_runtime_expansion_cells(self, isb1_sweep_args): + repo_root = Path(__file__).resolve().parents[2] + config_data = load_isb1_config_files( + [str(repo_root / ".github/configs/isb1-master.yaml")] + ) + runner_data = { + "b200": ["b200-nb_0"], + "h100": ["h100-cw_0"], + "h200": ["h200-cw_2"], + } + + matrix = generate_isb1_sweep(isb1_sweep_args, config_data, runner_data) + config_keys = set(config_data) + matrix_key_triples = { + (entry["model-prefix"], entry["framework"], entry["runner"]) + for entry in matrix + } + + assert "dsr1-fp8-b200-isb1-vllm" in config_keys + assert "dsr1-fp8-h200-isb1-vllm" in config_keys + assert "gptoss-fp4-b200-isb1-sglang" in config_keys + assert "gptoss-fp4-h100-isb1-sglang" in config_keys + assert "gptoss-fp4-h200-isb1-sglang" in config_keys + assert "gptoss-fp4-h100-isb1-sglang-offload-core-preview-chat" in config_keys + assert "gptoss-fp4-h100-isb1-vllm-offload-core-preview-code" in config_keys + assert "gptoss-fp4-h100-isb1-sglang-500k-preview-code" in config_keys + assert "gptoss-fp4-h100-isb1-vllm-500k-preview-code" in config_keys + assert "qwen3.5-fp8-b200-isb1-sglang-500k-preview-code" in config_keys + assert 
"qwen3.5-fp8-h100-isb1-sglang-500k-preview-code" in config_keys + assert "qwen3.5-fp8-h200-isb1-sglang-500k-preview-code" in config_keys + assert "qwen3.5-fp8-b200-isb1-vllm-500k-preview-code" in config_keys + assert "qwen3.5-fp8-h100-isb1-vllm-500k-preview-code" in config_keys + assert "qwen3.5-fp8-h200-isb1-vllm-500k-preview-code" in config_keys + assert "qwen3.5-fp8-b200-isb1-sglang-extension" in config_keys + assert "qwen3.5-fp8-h100-isb1-sglang-extension" in config_keys + assert "qwen3.5-fp8-h200-isb1-sglang-extension" in config_keys + assert "qwen3.5-fp8-b200-isb1-vllm-extension" in config_keys + assert "qwen3.5-fp8-h100-isb1-vllm-extension" in config_keys + assert "qwen3.5-fp8-h200-isb1-vllm-extension" in config_keys + + assert ("dsr1", "vllm", "b200") in matrix_key_triples + assert ("dsr1", "vllm", "h200") in matrix_key_triples + assert ("gptoss", "sglang", "b200") in matrix_key_triples + assert ("gptoss", "sglang", "h100") in matrix_key_triples + assert ("gptoss", "sglang", "h200") in matrix_key_triples + assert ("qwen3.5", "sglang", "b200") in matrix_key_triples + assert ("qwen3.5", "sglang", "h100") in matrix_key_triples + assert ("qwen3.5", "sglang", "h200") in matrix_key_triples + assert ("qwen3.5", "vllm", "b200") in matrix_key_triples + assert ("qwen3.5", "vllm", "h100") in matrix_key_triples + assert ("qwen3.5", "vllm", "h200") in matrix_key_triples + + assert "dsr1-fp8-h100-isb1-sglang" not in config_keys + assert "dsr1-fp8-h100-isb1-vllm" not in config_keys + + assert any( + entry["export-file"].endswith("extension_32k/vllm/chat_32k1k.json") + and entry["support-status"] == "supported" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("core/vllm/code_8k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert not any( + entry["export-file"].endswith("core/vllm/code_8k1k.json") + and entry["support-status"] == "supported" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_32k/vllm/code_32k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_64k/vllm/code_64k1k.json") + and entry["support-status"] == "supported" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_64k/sglang/chat_64k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + "preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json" + in entry["export-file"] + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_131k/sglang/chat_131k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_131k/sglang/code_131k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_131k/vllm/chat_131k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_131k/vllm/code_131k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + qwen_sglang_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "extension_131k/sglang/code_131k1k_qwen3.5.json" + ) + ] + assert len(qwen_sglang_entries) == 6 + assert all(entry["model-prefix"] == "qwen3.5" 
for entry in qwen_sglang_entries) + assert all(entry["framework"] == "sglang" for entry in qwen_sglang_entries) + assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_sglang_entries) + assert {entry["max-concurrency"] for entry in qwen_sglang_entries} == {2, 4} + + qwen_vllm_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "extension_131k/vllm/code_131k1k_qwen3.5.json" + ) + ] + assert len(qwen_vllm_entries) == 6 + assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_vllm_entries) + assert all(entry["framework"] == "vllm" for entry in qwen_vllm_entries) + assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_vllm_entries) + assert {entry["max-concurrency"] for entry in qwen_vllm_entries} == {2, 4} + + sglang_500k_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "preview/long_context_500k/" + "inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json" + ) + ] + assert len(sglang_500k_entries) == 3 + assert all(entry["support-status"] == "reviewed_preview" for entry in sglang_500k_entries) + assert all(entry["max-model-len"] == 524288 for entry in sglang_500k_entries) + assert all(entry["max-concurrency"] == 1 for entry in sglang_500k_entries) + + vllm_500k_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "preview/long_context_500k/" + "inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json" + ) + ] + assert len(vllm_500k_entries) == 3 + assert all(entry["support-status"] == "reviewed_preview" for entry in vllm_500k_entries) + assert all(entry["max-model-len"] == 524288 for entry in vllm_500k_entries) + assert all(entry["max-concurrency"] == 1 for entry in vllm_500k_entries) + + qwen_sglang_500k_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json" + ) + ] + assert len(qwen_sglang_500k_entries) == 3 + assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_sglang_500k_entries) + assert all(entry["framework"] == "sglang" for entry in qwen_sglang_500k_entries) + assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_sglang_500k_entries) + assert all(entry["max-model-len"] == 524288 for entry in qwen_sglang_500k_entries) + assert all(entry["max-concurrency"] == 1 for entry in qwen_sglang_500k_entries) + + qwen_vllm_500k_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json" + ) + ] + assert len(qwen_vllm_500k_entries) == 3 + assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_vllm_500k_entries) + assert all(entry["framework"] == "vllm" for entry in qwen_vllm_500k_entries) + assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_vllm_500k_entries) + assert all(entry["max-model-len"] == 524288 for entry in qwen_vllm_500k_entries) + assert all(entry["max-concurrency"] == 1 for entry in qwen_vllm_500k_entries) + + assert not any( + entry["export-file"].endswith( + "preview/long_context_1m/" + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json" + ) + or entry["export-file"].endswith( + "preview/long_context_1m/" + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__sglang.json" + ) + for entry in matrix + ) + + def test_repo_qwen_1m_preview_config_is_manual_and_separate(self, 
isb1_sweep_args): + repo_root = Path(__file__).resolve().parents[2] + config_data = load_isb1_config_files( + [str(repo_root / ".github/configs/isb1-qwen-1m-preview.yaml")] + ) + runner_data = { + "b200": ["b200-nb_0"], + "h100": ["h100-cw_0"], + "h200": ["h200-cw_2"], + } + + matrix = generate_isb1_sweep(isb1_sweep_args, config_data, runner_data) + config_keys = set(config_data) + + assert config_keys == { + "qwen3.5-fp8-b200-isb1-sglang-1m-gated-preview-code", + "qwen3.5-fp8-b200-isb1-vllm-1m-gated-preview-code", + } + assert len(matrix) == 2 + assert {entry["runner"] for entry in matrix} == {"b200"} + assert {entry["framework"] for entry in matrix} == {"sglang", "vllm"} + assert {entry["model-prefix"] for entry in matrix} == {"qwen3.5"} + assert {entry["support-status"] for entry in matrix} == {"reviewed_preview"} + assert {entry["max-model-len"] for entry in matrix} == {1048576} + assert {entry["max-concurrency"] for entry in matrix} == {1} + assert {entry["max-sessions"] for entry in matrix} == {1} + assert {entry["max-turns-per-session"] for entry in matrix} == {3} + assert { + entry["canonical-model-id"] for entry in matrix + } == {"qwen3_5_397b_a17b"} + assert { + entry["export-file"] for entry in matrix + } == { + "datasets/isb1/exports/preview/long_context_1m/" + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__sglang.json", + "datasets/isb1/exports/preview/long_context_1m/" + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json", + } + assert all((repo_root / entry["export-file"]).exists() for entry in matrix) + + + def test_isb1_config_does_not_validate_as_throughput(self, tmp_path, sample_isb1_config): + import yaml + + config_file = tmp_path / "isb1.yaml" + config_file.write_text(yaml.dump(sample_isb1_config)) + + with pytest.raises(ValueError): + load_config_files([str(config_file)]) + + def test_throughput_config_does_not_validate_as_isb1(self, tmp_path, sample_single_node_config): + import yaml + + config_file = tmp_path / "throughput.yaml" + config_file.write_text(yaml.dump(sample_single_node_config)) + + with pytest.raises(ValueError): + load_isb1_config_files([str(config_file)]) + + # ============================================================================= # Test generate_full_sweep for single-node # ============================================================================= diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py index 0f1f44c27..06267da22 100644 --- a/utils/matrix_logic/test_validation.py +++ b/utils/matrix_logic/test_validation.py @@ -1,20 +1,31 @@ """Comprehensive tests for validation.py""" +import json +from pathlib import Path + import pytest +import yaml from validation import ( Fields, SingleNodeMatrixEntry, MultiNodeMatrixEntry, + ISB1ReplayMatrixEntry, WorkerConfig, SingleNodeSearchSpaceEntry, MultiNodeSearchSpaceEntry, + ISB1ReplaySearchSpaceEntry, + ISB1ReplayConfigEntry, SingleNodeSeqLenConfig, MultiNodeSeqLenConfig, SingleNodeMasterConfigEntry, MultiNodeMasterConfigEntry, + ISB1MasterConfigEntry, validate_matrix_entry, + validate_isb1_matrix_entry, validate_master_config, + validate_isb1_master_config, validate_runner_config, load_config_files, + load_isb1_config_files, load_runner_file, ) @@ -23,6 +34,68 @@ # Test Fixtures # ============================================================================= + +def _write_isb1_export_fixture( + root: Path, + relative_path: str, + *, + runtime_stack_id: str, + hardware_profile_id: str, + canonical_model_id: str, + 
support_status: str, + benchmark_certification_status: str = "dataset_replay_verified", +) -> None: + export_path = root / relative_path + export_path.parent.mkdir(parents=True, exist_ok=True) + export_path.write_text( + json.dumps( + { + "adapter_id": "inferencex_multiturn", + "exports": [ + { + "trace_id": f"{export_path.stem}-trace", + "runtime_stack_id": runtime_stack_id, + "hardware_profile_id": hardware_profile_id, + "canonical_model_id": canonical_model_id, + "support_status": support_status, + "benchmark_certification_status": benchmark_certification_status, + "session": { + "session_id": "fixture-session", + "turns": [ + { + "turn_idx": 0, + "turn_id": 0, + "messages": [{"role": "user", "content": "hello"}], + "expected_output_tokens": 8, + } + ], + }, + } + ], + } + ) + ) + + +def _write_manifest_fixture( + root: Path, + relative_path: str, + *, + export_file: str, + max_model_len: int, +) -> None: + manifest_path = root / relative_path + manifest_path.parent.mkdir(parents=True, exist_ok=True) + manifest_path.write_text( + json.dumps( + { + "manifest_version": "0.1.0", + "max_model_len": max_model_len, + "exports": [{"export_file": export_file}], + } + ) + ) + @pytest.fixture def valid_single_node_matrix_entry(): """Valid single node matrix entry based on dsr1-fp4-mi355x-sglang config.""" @@ -159,6 +232,74 @@ def valid_multinode_master_config(): } +@pytest.fixture +def valid_isb1_master_config(): + """Valid ISB1 replay master config for NVIDIA PR1a.""" + return { + "image": "vllm/vllm-openai:v0.8.5", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "benchmark-type": "isb1_replay", + "runtime-stack-id": "vllm-0.8.5-h200", + "hardware-profile-id": "h200-8gpu", + "canonical-model-id": "deepseek-r1-0528", + "max-model-len": 16384, + "replay-configs": [ + { + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "request-mode": "multi-turn", + "support-status": "supported", + "search-space": [ + { + "max-concurrency": 4, + "max-sessions": 2, + "max-turns-per-session": 6, + "max-output-len": 512, + "num-warmup-sessions": 1, + "ignore-waits": True, + "ignore-eos": False, + }, + { + "max-concurrency": 8, + }, + ], + } + ], + } + + +@pytest.fixture +def valid_isb1_matrix_entry(valid_isb1_master_config): + """Valid ISB1 replay matrix entry.""" + return { + "image": valid_isb1_master_config["image"], + "model": valid_isb1_master_config["model"], + "model-prefix": valid_isb1_master_config["model-prefix"], + "precision": valid_isb1_master_config["precision"], + "framework": valid_isb1_master_config["framework"], + "runner": valid_isb1_master_config["runner"], + "benchmark-type": valid_isb1_master_config["benchmark-type"], + "export-file": valid_isb1_master_config["replay-configs"][0]["export-file"], + "runtime-stack-id": valid_isb1_master_config["runtime-stack-id"], + "hardware-profile-id": valid_isb1_master_config["hardware-profile-id"], + "canonical-model-id": valid_isb1_master_config["canonical-model-id"], + "support-status": valid_isb1_master_config["replay-configs"][0]["support-status"], + "request-mode": valid_isb1_master_config["replay-configs"][0]["request-mode"], + "max-concurrency": 4, + "max-sessions": 2, + "max-turns-per-session": 6, + "max-output-len": 512, + "num-warmup-sessions": 1, + "ignore-waits": True, + "ignore-eos": False, + "max-model-len": valid_isb1_master_config["max-model-len"], + "exp-name": "dsr1_isb1", + } + + @pytest.fixture def valid_runner_config(): """Valid runner 
config based on .github/configs/runners.yaml.""" @@ -193,6 +334,10 @@ def test_key_fields_exist(self): assert Fields.SPEC_DECODING.value == "spec-decoding" assert Fields.PREFILL.value == "prefill" assert Fields.DECODE.value == "decode" + assert Fields.BENCHMARK_TYPE.value == "benchmark-type" + assert Fields.SUPPORT_STATUS.value == "support-status" + assert Fields.MAX_CONCURRENCY.value == "max-concurrency" + assert Fields.REPLAY_CONFIGS.value == "replay-configs" # ============================================================================= @@ -658,6 +803,153 @@ def test_disagg_default_false(self, valid_single_node_master_config): assert config.disagg is False +# ============================================================================= +# Test ISB1 replay models +# ============================================================================= + +class TestISB1ReplaySearchSpaceEntry: + """Tests for ISB1ReplaySearchSpaceEntry model.""" + + def test_valid_with_required_only(self): + config = ISB1ReplaySearchSpaceEntry(**{ + "max-concurrency": 4, + }) + assert config.max_concurrency == 4 + assert config.num_warmup_sessions == 0 + assert config.ignore_waits is False + assert config.ignore_eos is False + + def test_valid_with_all_fields(self): + config = ISB1ReplaySearchSpaceEntry(**{ + "max-concurrency": 8, + "max-sessions": 2, + "max-turns-per-session": 6, + "max-output-len": 512, + "num-warmup-sessions": 1, + "ignore-waits": True, + "ignore-eos": True, + }) + assert config.max_sessions == 2 + assert config.max_turns_per_session == 6 + assert config.max_output_len == 512 + assert config.num_warmup_sessions == 1 + assert config.ignore_waits is True + assert config.ignore_eos is True + + def test_missing_required_field(self): + with pytest.raises(Exception): + ISB1ReplaySearchSpaceEntry(**{ + "max-sessions": 2, + }) + + def test_extra_field_forbidden(self): + with pytest.raises(Exception): + ISB1ReplaySearchSpaceEntry(**{ + "max-concurrency": 4, + "unknown-field": "value", + }) + + +class TestISB1ReplayConfigEntry: + """Tests for ISB1ReplayConfigEntry model.""" + + def test_valid_entry(self): + config = ISB1ReplayConfigEntry(**{ + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "request-mode": "multi-turn", + "support-status": "supported", + "search-space": [{"max-concurrency": 4}], + }) + assert config.export_file.endswith("chat_8k1k.json") + assert config.request_mode == "multi-turn" + assert config.support_status == "supported" + assert len(config.search_space) == 1 + + def test_invalid_support_status(self): + with pytest.raises(Exception): + ISB1ReplayConfigEntry(**{ + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "request-mode": "multi-turn", + "support-status": "definitely_supported", + "search-space": [{"max-concurrency": 4}], + }) + + def test_missing_export_file(self): + with pytest.raises(Exception): + ISB1ReplayConfigEntry(**{ + "request-mode": "multi-turn", + "search-space": [{"max-concurrency": 4}], + }) + + def test_missing_request_mode(self): + with pytest.raises(Exception): + ISB1ReplayConfigEntry(**{ + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "search-space": [{"max-concurrency": 4}], + }) + + def test_empty_search_space(self): + with pytest.raises(Exception): + ISB1ReplayConfigEntry(**{ + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "request-mode": "multi-turn", + "search-space": [], + }) + + +class TestISB1MasterConfigEntry: + """Tests for ISB1MasterConfigEntry model.""" + + def 
test_valid_isb1_master_config(self, valid_isb1_master_config): + config = ISB1MasterConfigEntry(**valid_isb1_master_config) + assert config.benchmark_type == "isb1_replay" + assert config.model_prefix == "dsr1" + assert config.runner == "h200" + assert config.max_model_len == 16384 + assert len(config.replay_configs) == 1 + + def test_max_model_len_optional(self, valid_isb1_master_config): + del valid_isb1_master_config["max-model-len"] + config = ISB1MasterConfigEntry(**valid_isb1_master_config) + assert config.max_model_len is None + + def test_benchmark_type_must_match(self, valid_isb1_master_config): + valid_isb1_master_config["benchmark-type"] = "throughput" + with pytest.raises(Exception): + ISB1MasterConfigEntry(**valid_isb1_master_config) + + def test_throughput_only_field_rejected(self, valid_isb1_master_config): + valid_isb1_master_config["multinode"] = False + with pytest.raises(Exception): + ISB1MasterConfigEntry(**valid_isb1_master_config) + + def test_missing_required_field(self, valid_isb1_master_config): + del valid_isb1_master_config["runtime-stack-id"] + with pytest.raises(Exception): + ISB1MasterConfigEntry(**valid_isb1_master_config) + + +class TestISB1ReplayMatrixEntry: + """Tests for ISB1ReplayMatrixEntry model.""" + + def test_valid_entry(self, valid_isb1_matrix_entry): + entry = ISB1ReplayMatrixEntry(**valid_isb1_matrix_entry) + assert entry.benchmark_type == "isb1_replay" + assert entry.support_status == "supported" + assert entry.max_concurrency == 4 + assert entry.exp_name == "dsr1_isb1" + + def test_missing_required_field(self, valid_isb1_matrix_entry): + del valid_isb1_matrix_entry["export-file"] + with pytest.raises(Exception): + ISB1ReplayMatrixEntry(**valid_isb1_matrix_entry) + + def test_extra_throughput_field_forbidden(self, valid_isb1_matrix_entry): + valid_isb1_matrix_entry["tp"] = 8 + with pytest.raises(Exception): + ISB1ReplayMatrixEntry(**valid_isb1_matrix_entry) + + # ============================================================================= # Test validate_master_config function # ============================================================================= @@ -696,6 +988,37 @@ def test_invalid_config_raises_valueerror(self, valid_single_node_master_config) assert "failed validation" in str(exc_info.value) +class TestValidateISB1MasterConfig: + """Tests for validate_isb1_master_config function.""" + + def test_valid_isb1_config(self, valid_isb1_master_config): + configs = {"dsr1-isb1-h200-vllm": valid_isb1_master_config} + result = validate_isb1_master_config(configs) + assert result == configs + + def test_invalid_isb1_config_raises_valueerror(self, valid_isb1_master_config): + del valid_isb1_master_config["model"] + configs = {"broken-isb1-config": valid_isb1_master_config} + with pytest.raises(ValueError) as exc_info: + validate_isb1_master_config(configs) + assert "broken-isb1-config" in str(exc_info.value) + assert "failed validation" in str(exc_info.value) + + +class TestValidateISB1MatrixEntry: + """Tests for validate_isb1_matrix_entry function.""" + + def test_valid_entry(self, valid_isb1_matrix_entry): + result = validate_isb1_matrix_entry(valid_isb1_matrix_entry) + assert result == valid_isb1_matrix_entry + + def test_invalid_entry_raises_valueerror(self, valid_isb1_matrix_entry): + del valid_isb1_matrix_entry["benchmark-type"] + with pytest.raises(ValueError) as exc_info: + validate_isb1_matrix_entry(valid_isb1_matrix_entry) + assert "failed validation" in str(exc_info.value) + + # 
============================================================================= # Test validate_runner_config function # ============================================================================= @@ -823,6 +1146,224 @@ def test_validation_runs_by_default(self, tmp_path): assert "failed validation" in str(exc_info.value) +class TestLoadISB1ConfigFiles: + """Tests for load_isb1_config_files function.""" + + def test_load_single_file_with_validation(self, tmp_path, valid_isb1_master_config): + config_file = tmp_path / "isb1-config.yaml" + _write_isb1_export_fixture( + tmp_path, + valid_isb1_master_config["replay-configs"][0]["export-file"], + runtime_stack_id=valid_isb1_master_config["runtime-stack-id"], + hardware_profile_id=valid_isb1_master_config["hardware-profile-id"], + canonical_model_id=valid_isb1_master_config["canonical-model-id"], + support_status=valid_isb1_master_config["replay-configs"][0]["support-status"], + ) + + config_file.write_text( + yaml.dump({"dsr1-isb1-h200-vllm": valid_isb1_master_config}) + ) + result = load_isb1_config_files([str(config_file)]) + assert "dsr1-isb1-h200-vllm" in result + assert result["dsr1-isb1-h200-vllm"]["benchmark-type"] == "isb1_replay" + + def test_export_contract_rejects_mismatched_support_status( + self, tmp_path, valid_isb1_master_config + ): + config_file = tmp_path / "isb1-config.yaml" + _write_isb1_export_fixture( + tmp_path, + valid_isb1_master_config["replay-configs"][0]["export-file"], + runtime_stack_id=valid_isb1_master_config["runtime-stack-id"], + hardware_profile_id=valid_isb1_master_config["hardware-profile-id"], + canonical_model_id=valid_isb1_master_config["canonical-model-id"], + support_status="reviewed_preview", + ) + config_file.write_text( + yaml.dump({"dsr1-isb1-h200-vllm": valid_isb1_master_config}) + ) + + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files([str(config_file)]) + assert "support-status" in str(exc_info.value) + assert "Available support tiers" in str(exc_info.value) + + def test_export_contract_requires_dataset_replay_verified_certification( + self, tmp_path, valid_isb1_master_config + ): + config_file = tmp_path / "isb1-config.yaml" + _write_isb1_export_fixture( + tmp_path, + valid_isb1_master_config["replay-configs"][0]["export-file"], + runtime_stack_id=valid_isb1_master_config["runtime-stack-id"], + hardware_profile_id=valid_isb1_master_config["hardware-profile-id"], + canonical_model_id=valid_isb1_master_config["canonical-model-id"], + support_status=valid_isb1_master_config["replay-configs"][0]["support-status"], + benchmark_certification_status="pending_review", + ) + config_file.write_text( + yaml.dump({"dsr1-isb1-h200-vllm": valid_isb1_master_config}) + ) + + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files([str(config_file)]) + assert "benchmark_certification_status" in str(exc_info.value) + assert "dataset_replay_verified" in str(exc_info.value) + + def test_export_contract_requires_max_model_len_for_preview_style_export( + self, tmp_path, valid_isb1_master_config + ): + config_file = tmp_path / "isb1-config.yaml" + preview_config = { + **valid_isb1_master_config, + "replay-configs": [ + { + **valid_isb1_master_config["replay-configs"][0], + "export-file": ( + "datasets/isb1/exports/preview/offload_core/" + "inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json" + ), + "support-status": "reviewed_preview", + } + ], + } + del preview_config["max-model-len"] + + _write_isb1_export_fixture( + tmp_path, + 
preview_config["replay-configs"][0]["export-file"], + runtime_stack_id=preview_config["runtime-stack-id"], + hardware_profile_id=preview_config["hardware-profile-id"], + canonical_model_id=preview_config["canonical-model-id"], + support_status="reviewed_preview", + ) + config_file.write_text(yaml.dump({"preview-row": preview_config})) + + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files([str(config_file)]) + assert "max-model-len" in str(exc_info.value) + + def test_export_contract_accepts_preview_style_export_with_explicit_max_model_len( + self, tmp_path, valid_isb1_master_config + ): + config_file = tmp_path / "isb1-config.yaml" + preview_config = { + **valid_isb1_master_config, + "runtime-stack-id": "standalone:vllm", + "hardware-profile-id": "nvidia:h100_sxm_80gb", + "canonical-model-id": "gpt_oss_120b", + "max-model-len": 524288, + "replay-configs": [ + { + **valid_isb1_master_config["replay-configs"][0], + "export-file": ( + "datasets/isb1/exports/preview/long_context_500k/" + "inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json" + ), + "support-status": "reviewed_preview", + } + ], + } + + _write_isb1_export_fixture( + tmp_path, + preview_config["replay-configs"][0]["export-file"], + runtime_stack_id=preview_config["runtime-stack-id"], + hardware_profile_id=preview_config["hardware-profile-id"], + canonical_model_id=preview_config["canonical-model-id"], + support_status="reviewed_preview", + ) + config_file.write_text(yaml.dump({"preview-row": preview_config})) + + result = load_isb1_config_files([str(config_file)]) + assert "preview-row" in result + + def test_export_contract_warns_when_manifest_max_model_len_mismatches_config( + self, tmp_path, valid_isb1_master_config + ): + config_file = tmp_path / "isb1-config.yaml" + preview_config = { + **valid_isb1_master_config, + "runtime-stack-id": "standalone:vllm", + "hardware-profile-id": "nvidia:h100_sxm_80gb", + "canonical-model-id": "qwen3_5_397b_a17b", + "max-model-len": 524288, + "replay-configs": [ + { + **valid_isb1_master_config["replay-configs"][0], + "export-file": ( + "datasets/isb1/exports/preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json" + ), + "support-status": "reviewed_preview", + } + ], + } + + export_file = preview_config["replay-configs"][0]["export-file"] + _write_isb1_export_fixture( + tmp_path, + export_file, + runtime_stack_id=preview_config["runtime-stack-id"], + hardware_profile_id=preview_config["hardware-profile-id"], + canonical_model_id=preview_config["canonical-model-id"], + support_status="reviewed_preview", + ) + _write_manifest_fixture( + tmp_path, + "datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json", + export_file=export_file, + max_model_len=1048576, + ) + config_file.write_text(yaml.dump({"preview-row": preview_config})) + + with pytest.warns(UserWarning, match="max-model-len"): + result = load_isb1_config_files([str(config_file)]) + assert "preview-row" in result + + def test_load_single_file_without_validation(self, tmp_path): + config_file = tmp_path / "isb1-config.yaml" + config_file.write_text(""" +test-isb1: + image: test-image + benchmark-type: isb1_replay +""") + result = load_isb1_config_files([str(config_file)], validate=False) + assert "test-isb1" in result + assert result["test-isb1"]["benchmark-type"] == "isb1_replay" + + def test_validation_runs_by_default(self, tmp_path): + config_file = tmp_path / "isb1-config.yaml" + config_file.write_text(""" +invalid-isb1: + image: 
test-image + benchmark-type: isb1_replay +""") + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files([str(config_file)]) + assert "failed validation" in str(exc_info.value) + + def test_duplicate_keys_raise_error(self, tmp_path): + config1 = tmp_path / "config1.yaml" + config1.write_text(""" +duplicate-key: + benchmark-type: isb1_replay +""") + config2 = tmp_path / "config2.yaml" + config2.write_text(""" +duplicate-key: + benchmark-type: isb1_replay +""") + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files([str(config1), str(config2)], validate=False) + assert "Duplicate configuration keys" in str(exc_info.value) + + def test_nonexistent_file_raises_error(self): + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files(["nonexistent-isb1.yaml"]) + assert "does not exist" in str(exc_info.value) + + # ============================================================================= # Test load_runner_file # ============================================================================= diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index 312952b96..331e374b4 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -2,8 +2,12 @@ from typing import List, Optional, Union, Literal from enum import Enum +import json import pprint +import re +import warnings import yaml +from pathlib import Path """ The below class defines the field names expected to be present in the JSON entries @@ -55,6 +59,31 @@ class Fields(Enum): RUN_EVAL = 'run-eval' EVAL_ONLY = 'eval-only' + # ISB1 replay fields + BENCHMARK_TYPE = 'benchmark-type' + EXPORT_FILE = 'export-file' + RUNTIME_STACK_ID = 'runtime-stack-id' + HARDWARE_PROFILE_ID = 'hardware-profile-id' + CANONICAL_MODEL_ID = 'canonical-model-id' + REQUEST_MODE = 'request-mode' + MAX_CONCURRENCY = 'max-concurrency' + SUPPORT_STATUS = 'support-status' + MAX_SESSIONS = 'max-sessions' + MAX_TURNS_PER_SESSION = 'max-turns-per-session' + MAX_OUTPUT_LEN = 'max-output-len' + NUM_WARMUP_SESSIONS = 'num-warmup-sessions' + IGNORE_WAITS = 'ignore-waits' + IGNORE_EOS = 'ignore-eos' + REPLAY_CONFIGS = 'replay-configs' + KV_STRESS_CONFIGS = 'kv-stress-configs' + OFFLOAD_MODE = 'offload-mode' + OFFLOAD_MODES = 'offload-modes' + KV_CACHE_DTYPE = 'kv-cache-dtype' + DISABLE_PREFIX_CACHING = 'disable-prefix-caching' + USERS = 'users' + DURATION_S = 'duration-s' + WORKLOAD_TYPE = 'workload-type' + """ Below is the validation logic for the OUTPUT of utils/matrix_logic/generate_sweep_configs.py, i.e., @@ -147,6 +176,119 @@ def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: return entry +class ISB1ReplayMatrixEntry(BaseModel): + """Pydantic model for validating ISB1 replay matrix entry structure.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + benchmark_type: Literal["isb1_replay"] = Field( + alias=Fields.BENCHMARK_TYPE.value + ) + export_file: str = Field(alias=Fields.EXPORT_FILE.value) + runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) + hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) + canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) + support_status: Optional[ + Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] + ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) + request_mode: str = 
Field(alias=Fields.REQUEST_MODE.value) + max_concurrency: int = Field(alias=Fields.MAX_CONCURRENCY.value, gt=0) + max_sessions: Optional[int] = Field( + default=None, alias=Fields.MAX_SESSIONS.value, gt=0 + ) + max_turns_per_session: Optional[int] = Field( + default=None, alias=Fields.MAX_TURNS_PER_SESSION.value, gt=0 + ) + max_output_len: Optional[int] = Field( + default=None, alias=Fields.MAX_OUTPUT_LEN.value, gt=0 + ) + num_warmup_sessions: int = Field( + default=0, alias=Fields.NUM_WARMUP_SESSIONS.value, ge=0 + ) + ignore_waits: bool = Field(default=False, alias=Fields.IGNORE_WAITS.value) + ignore_eos: bool = Field(default=False, alias=Fields.IGNORE_EOS.value) + max_model_len: Optional[int] = Field( + default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 + ) + offload_mode: Optional[Literal["on", "off", "noprefix", "legacy"]] = Field( + default=None, alias=Fields.OFFLOAD_MODE.value + ) + kv_cache_dtype: Optional[Literal["auto", "fp8"]] = Field( + default=None, alias=Fields.KV_CACHE_DTYPE.value + ) + disable_prefix_caching: Optional[bool] = Field( + default=None, alias=Fields.DISABLE_PREFIX_CACHING.value + ) + benchmark_duration_s: Optional[int] = Field( + default=None, alias='benchmark-duration-s', gt=0 + ) + exp_name: str = Field(alias=Fields.EXP_NAME.value) + + +def validate_isb1_matrix_entry(entry: dict) -> dict: + """Validate that ISB1 replay matrix entries match the expected structure.""" + try: + ISB1ReplayMatrixEntry(**entry) + except ValidationError as e: + raise ValueError( + f"The following ISB1 matrix entry failed validation:\n{pprint.pformat(entry)}\n{e}" + ) + return entry + + +class ISB1KVStressMatrixEntry(BaseModel): + """Pydantic model for validating ISB1 KV stress matrix entry structure.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + benchmark_type: Literal["isb1_kv_stress"] = Field( + alias=Fields.BENCHMARK_TYPE.value + ) + export_file: str = Field(alias=Fields.EXPORT_FILE.value) + runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) + hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) + canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) + support_status: Optional[ + Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] + ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) + request_mode: str = Field(alias=Fields.REQUEST_MODE.value) + max_concurrency: int = Field(alias=Fields.MAX_CONCURRENCY.value, gt=0) + offload_mode: Literal["on", "off", "noprefix", "legacy"] = Field( + alias=Fields.OFFLOAD_MODE.value + ) + kv_cache_dtype: Literal["auto", "fp8"] = Field(alias=Fields.KV_CACHE_DTYPE.value) + disable_prefix_caching: bool = Field(alias=Fields.DISABLE_PREFIX_CACHING.value) + benchmark_duration_s: int = Field(alias='benchmark-duration-s', gt=0) + workload_type: Literal["chat", "code"] = Field(alias=Fields.WORKLOAD_TYPE.value) + tp: Optional[int] = Field(default=None, alias=Fields.TP.value, gt=0) + ep: Optional[int] = Field(default=None, alias=Fields.EP.value, gt=0) + max_model_len: Optional[int] = Field( + default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 + ) + exp_name: str = Field(alias=Fields.EXP_NAME.value) + + +def validate_isb1_kv_stress_matrix_entry(entry: dict) -> dict: + """Validate that ISB1 KV stress matrix entries match the expected structure.""" + try: + ISB1KVStressMatrixEntry(**entry) + except 
ValidationError as e: + raise ValueError( + f"The following ISB1 KV stress matrix entry failed validation:\n{pprint.pformat(entry)}\n{e}" + ) + return entry + + """ Below is the validation logic for the INPUT to utils/matrix_logic/generate_sweep_configs.py, i.e., the master configuration files found in .github/configs. The validation enforces a strict set of @@ -237,6 +379,89 @@ def validate_conc_fields(self): return _validate_conc_fields(self) +class ISB1ReplaySearchSpaceEntry(BaseModel): + """ISB1 replay search space configuration.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + max_concurrency: int = Field(alias=Fields.MAX_CONCURRENCY.value, gt=0) + max_sessions: Optional[int] = Field( + default=None, alias=Fields.MAX_SESSIONS.value, gt=0 + ) + max_turns_per_session: Optional[int] = Field( + default=None, alias=Fields.MAX_TURNS_PER_SESSION.value, gt=0 + ) + max_output_len: Optional[int] = Field( + default=None, alias=Fields.MAX_OUTPUT_LEN.value, gt=0 + ) + num_warmup_sessions: int = Field( + default=0, alias=Fields.NUM_WARMUP_SESSIONS.value, ge=0 + ) + ignore_waits: bool = Field(default=False, alias=Fields.IGNORE_WAITS.value) + ignore_eos: bool = Field(default=False, alias=Fields.IGNORE_EOS.value) + benchmark_duration_s: Optional[int] = Field( + default=None, alias='benchmark-duration-s', gt=0 + ) + + +class ISB1ReplayConfigEntry(BaseModel): + """Per-export replay configuration for ISB1.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + export_file: str = Field(alias=Fields.EXPORT_FILE.value) + request_mode: str = Field(alias=Fields.REQUEST_MODE.value) + support_status: Optional[ + Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] + ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) + search_space: List[ISB1ReplaySearchSpaceEntry] = Field( + alias=Fields.SEARCH_SPACE.value, min_length=1 + ) + + +class ISB1KVStressSearchSpaceEntry(BaseModel): + """ISB1 KV stress search space configuration.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + users: List[int] = Field(alias=Fields.USERS.value, min_length=1) + offload_modes: List[Literal["on", "off", "noprefix", "legacy"]] = Field( + alias=Fields.OFFLOAD_MODES.value, + min_length=1, + ) + duration_s: int = Field(alias=Fields.DURATION_S.value, gt=0) + + +class ISB1KVStressTPConfig(BaseModel): + """Per-TP KV stress configuration for ISB1 parity sweeps.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + tp: int = Field(gt=0) + ep: int = Field(default=1, gt=0) + users: List[int] = Field(alias=Fields.USERS.value, min_length=1) + offload_modes: List[Literal["on", "off", "noprefix", "legacy"]] = Field( + alias=Fields.OFFLOAD_MODES.value, + min_length=1, + ) + duration_s: int = Field(alias=Fields.DURATION_S.value, gt=0) + + +class ISB1KVStressConfigEntry(BaseModel): + """Per-export KV stress configuration for ISB1.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + export_file: str = Field(alias=Fields.EXPORT_FILE.value) + request_mode: str = Field(alias=Fields.REQUEST_MODE.value) + support_status: Optional[ + Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] + ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) + workload_type: Literal["chat", "code"] = Field(alias=Fields.WORKLOAD_TYPE.value) + search_space: List[ISB1KVStressSearchSpaceEntry] = Field( + alias=Fields.SEARCH_SPACE.value, min_length=1 + ) + tp_configs: Optional[List[ISB1KVStressTPConfig]] = 
Field( + default=None, + alias='tp-configs', + ) + + class SingleNodeSeqLenConfig(BaseModel): """Single node sequence length configuration.""" model_config = ConfigDict(extra='forbid', populate_by_name=True) @@ -289,6 +514,335 @@ class MultiNodeMasterConfigEntry(BaseModel): alias=Fields.SEQ_LEN_CONFIGS.value) + +class ISB1MasterConfigEntry(BaseModel): + """Top-level ISB1 replay master configuration entry.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + benchmark_type: Literal["isb1_replay"] = Field( + alias=Fields.BENCHMARK_TYPE.value + ) + runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) + hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) + canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) + max_model_len: Optional[int] = Field( + default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 + ) + offload_mode: Optional[Literal["on", "off", "noprefix", "legacy"]] = Field( + default=None, alias=Fields.OFFLOAD_MODE.value + ) + kv_cache_dtype: Optional[Literal["auto", "fp8"]] = Field( + default=None, alias=Fields.KV_CACHE_DTYPE.value + ) + disable_prefix_caching: Optional[bool] = Field( + default=None, alias=Fields.DISABLE_PREFIX_CACHING.value + ) + replay_configs: List[ISB1ReplayConfigEntry] = Field( + alias=Fields.REPLAY_CONFIGS.value, min_length=1 + ) + + +class ISB1KVStressMasterConfigEntry(BaseModel): + """Top-level ISB1 KV stress master configuration entry.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + benchmark_type: Literal["isb1_kv_stress"] = Field( + alias=Fields.BENCHMARK_TYPE.value + ) + runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) + hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) + canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) + max_model_len: Optional[int] = Field( + default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 + ) + kv_cache_dtype: Literal["auto", "fp8"] = Field(alias=Fields.KV_CACHE_DTYPE.value) + kv_stress_configs: List[ISB1KVStressConfigEntry] = Field( + alias=Fields.KV_STRESS_CONFIGS.value, + min_length=1, + ) + + +ISB1_SHAPE_STEM_RE = re.compile(r"(?P<isl>\d+)k(?P<osl>\d+)k") +ISB1_RUNNABLE_CERTIFICATION_STATUSES = ["dataset_replay_verified"] + + +def _candidate_config_roots(config_file: str) -> list[Path]: + """Return candidate repo roots for resolving relative export-file paths.""" + config_path = Path(config_file).resolve() + parent_candidates = [config_path.parents[i] for i in range(min(3, len(config_path.parents)))] + candidates = [ + config_path.parent, + *parent_candidates, + Path.cwd().resolve(), + ] + + unique_candidates: list[Path] = [] + for candidate in candidates: + if candidate not in unique_candidates: + unique_candidates.append(candidate) + return unique_candidates + + +def _resolve_export_path(config_file: str, export_file: str) -> Path: + """Resolve an export file relative to the config file or current repo root.""" + export_path = Path(export_file) + if export_path.is_absolute(): + return export_path + + candidate_roots = _candidate_config_roots(config_file) + for candidate_root in candidate_roots: + candidate = candidate_root / export_path + if candidate.exists(): + return candidate + + return candidate_roots[0] / 
export_path + + +def _load_export_payload(export_path: Path) -> dict: + """Load an ISB1 export payload from disk.""" + try: + with export_path.open("r") as handle: + payload = json.load(handle) + except FileNotFoundError as exc: + raise ValueError(f"Referenced ISB1 export file does not exist: '{export_path}'.") from exc + except json.JSONDecodeError as exc: + raise ValueError(f"Referenced ISB1 export file is not valid JSON: '{export_path}'.") from exc + + exports = payload.get("exports") + if not isinstance(exports, list) or not exports: + raise ValueError( + f"Referenced ISB1 export file must contain a non-empty 'exports' list: '{export_path}'." + ) + return payload + + +def _identity_cells(payload: dict, entry: dict) -> list[dict]: + """Return export cells matching the configured runtime/hardware/model identity.""" + return [ + cell + for cell in payload["exports"] + if cell.get("runtime_stack_id") == entry[Fields.RUNTIME_STACK_ID.value] + and cell.get("hardware_profile_id") == entry[Fields.HARDWARE_PROFILE_ID.value] + and cell.get("canonical_model_id") == entry[Fields.CANONICAL_MODEL_ID.value] + ] + + +def _warn_manifest_max_model_len_mismatch( + *, + export_path: Path, + export_file: str, + max_model_len: Optional[int], + key: str, +) -> None: + """Emit advisory warning if sibling manifest max_model_len disagrees with config.""" + if max_model_len is None: + return + + for manifest_path in sorted(export_path.parent.glob("manifest*.json")): + try: + manifest_payload = json.loads(manifest_path.read_text()) + except (OSError, json.JSONDecodeError): + continue + + manifest_exports = manifest_payload.get("exports") + if isinstance(manifest_exports, list): + export_files = { + item.get("export_file") + for item in manifest_exports + if isinstance(item, dict) and isinstance(item.get("export_file"), str) + } + if export_files and export_file not in export_files: + continue + + manifest_max_model_len = manifest_payload.get("max_model_len") + if manifest_max_model_len is None: + continue + + try: + manifest_max_model_len = int(manifest_max_model_len) + except (TypeError, ValueError): + continue + + if manifest_max_model_len != max_model_len: + warnings.warn( + f"ISB1 master config entry '{key}' sets '{Fields.MAX_MODEL_LEN.value}'=" + f"{max_model_len} for export '{export_file}', but sibling manifest " + f"'{manifest_path}' declares max_model_len={manifest_max_model_len}.", + stacklevel=2, + ) + + +def certify_isb1_replay_contract(master_configs: dict, config_file: str) -> dict: + """Validate that every replay-config resolves to a real, runnable export selection.""" + for key, entry in master_configs.items(): + max_model_len = entry.get(Fields.MAX_MODEL_LEN.value) + + for replay_config in entry[Fields.REPLAY_CONFIGS.value]: + export_file = replay_config[Fields.EXPORT_FILE.value] + support_status = replay_config.get(Fields.SUPPORT_STATUS.value) + export_path = _resolve_export_path(config_file, export_file) + payload = _load_export_payload(export_path) + _warn_manifest_max_model_len_mismatch( + export_path=export_path, + export_file=export_file, + max_model_len=max_model_len, + key=key, + ) + + if not ISB1_SHAPE_STEM_RE.search(export_path.stem) and max_model_len is None: + raise ValueError( + f"ISB1 master config entry '{key}' references mixed-shape export " + f"'{export_file}' without '{Fields.MAX_MODEL_LEN.value}'." 
+ ) + + identity_cells = _identity_cells(payload, entry) + identity_statuses = sorted( + { + cell.get("support_status") + for cell in identity_cells + if cell.get("support_status") is not None + } + ) + matching_cells = [ + cell + for cell in identity_cells + if support_status is None or cell.get("support_status") == support_status + ] + + if support_status is None and len(identity_statuses) > 1: + raise ValueError( + f"ISB1 master config entry '{key}' must pin " + f"'{Fields.SUPPORT_STATUS.value}' for export '{export_file}'. " + f"Matching cells span multiple tiers: {identity_statuses}." + ) + + if not matching_cells: + available_statuses = identity_statuses or ["<none>"] + raise ValueError( + f"ISB1 master config entry '{key}' requests export '{export_file}' " + f"with support-status '{support_status}', but no export cell matches " + f"runtime_stack_id='{entry[Fields.RUNTIME_STACK_ID.value]}', " + f"hardware_profile_id='{entry[Fields.HARDWARE_PROFILE_ID.value]}', " + f"canonical_model_id='{entry[Fields.CANONICAL_MODEL_ID.value]}'. " + f"Available support tiers for that identity: {available_statuses}." + ) + + certification_statuses = sorted( + { + cell.get("benchmark_certification_status") + for cell in matching_cells + if cell.get("benchmark_certification_status") is not None + } + ) + if not certification_statuses: + raise ValueError( + f"ISB1 master config entry '{key}' requests export '{export_file}' " + "but the selected export cells do not declare " + "'benchmark_certification_status'." + ) + if certification_statuses != ISB1_RUNNABLE_CERTIFICATION_STATUSES: + raise ValueError( + f"ISB1 master config entry '{key}' requests export '{export_file}' " + "with runnable support tier selection, but the selected export cells " + f"have benchmark_certification_status values {certification_statuses}. " + "Current InferenceX consumer lanes only accept " + f"{ISB1_RUNNABLE_CERTIFICATION_STATUSES}." + ) + + return master_configs + + +def certify_isb1_kv_stress_contract(master_configs: dict, config_file: str) -> dict: + """Validate that every kv-stress-config resolves to a real, runnable export selection.""" + for key, entry in master_configs.items(): + max_model_len = entry.get(Fields.MAX_MODEL_LEN.value) + + for kv_stress_config in entry[Fields.KV_STRESS_CONFIGS.value]: + export_file = kv_stress_config[Fields.EXPORT_FILE.value] + support_status = kv_stress_config.get(Fields.SUPPORT_STATUS.value) + export_path = _resolve_export_path(config_file, export_file) + payload = _load_export_payload(export_path) + _warn_manifest_max_model_len_mismatch( + export_path=export_path, + export_file=export_file, + max_model_len=max_model_len, + key=key, + ) + + if not ISB1_SHAPE_STEM_RE.search(export_path.stem) and max_model_len is None: + raise ValueError( + f"ISB1 KV stress config entry '{key}' references mixed-shape export " + f"'{export_file}' without '{Fields.MAX_MODEL_LEN.value}'." + ) + + identity_cells = _identity_cells(payload, entry) + identity_statuses = sorted( + { + cell.get("support_status") + for cell in identity_cells + if cell.get("support_status") is not None + } + ) + matching_cells = [ + cell + for cell in identity_cells + if support_status is None or cell.get("support_status") == support_status + ] + + if support_status is None and len(identity_statuses) > 1: + raise ValueError( + f"ISB1 KV stress config entry '{key}' must pin " + f"'{Fields.SUPPORT_STATUS.value}' for export '{export_file}'. " + f"Matching cells span multiple tiers: {identity_statuses}." 
+ ) + + if not matching_cells: + available_statuses = identity_statuses or ["<none>"] + raise ValueError( + f"ISB1 KV stress config entry '{key}' requests export '{export_file}' " + f"with support-status '{support_status}', but no export cell matches " + f"runtime_stack_id='{entry[Fields.RUNTIME_STACK_ID.value]}', " + f"hardware_profile_id='{entry[Fields.HARDWARE_PROFILE_ID.value]}', " + f"canonical_model_id='{entry[Fields.CANONICAL_MODEL_ID.value]}'. " + f"Available support tiers for that identity: {available_statuses}." + ) + + certification_statuses = sorted( + { + cell.get("benchmark_certification_status") + for cell in matching_cells + if cell.get("benchmark_certification_status") is not None + } + ) + if not certification_statuses: + raise ValueError( + f"ISB1 KV stress config entry '{key}' requests export '{export_file}' " + "but the selected export cells do not declare " + "'benchmark_certification_status'." + ) + if certification_statuses != ISB1_RUNNABLE_CERTIFICATION_STATUSES: + raise ValueError( + f"ISB1 KV stress config entry '{key}' requests export '{export_file}' " + "with runnable support tier selection, but the selected export cells " + f"have benchmark_certification_status values {certification_statuses}. " + "Current InferenceX consumer lanes only accept " + f"{ISB1_RUNNABLE_CERTIFICATION_STATUSES}." + ) + + return master_configs + + +def validate_master_config(master_configs: dict) -> List[dict]: """Validate input master configuration structure.""" for key, entry in master_configs.items(): @@ -304,6 +858,30 @@ def validate_master_config(master_configs: dict) -> List[dict]: f"Master config entry '{key}' failed validation:\n{e}") return master_configs + +def validate_isb1_master_config(master_configs: dict) -> List[dict]: + """Validate ISB1 replay master configuration structure.""" + for key, entry in master_configs.items(): + try: + ISB1MasterConfigEntry(**entry) + except ValidationError as e: + raise ValueError( + f"ISB1 master config entry '{key}' failed validation:\n{e}" + ) + return master_configs + + +def validate_isb1_kv_stress_master_config(master_configs: dict) -> List[dict]: + """Validate ISB1 KV stress master configuration structure.""" + for key, entry in master_configs.items(): + try: + ISB1KVStressMasterConfigEntry(**entry) + except ValidationError as e: + raise ValueError( + f"ISB1 KV stress master config entry '{key}' failed validation:\n{e}" + ) + return master_configs + # Runner Config Validation @@ -371,26 +949,17 @@ class ChangelogMatrixEntry(BaseModel): # ============================================================================= -def load_config_files(config_files: List[str], validate: bool = True) -> dict: - """Load and merge configuration files. - - Args: - config_files: List of paths to YAML configuration files. - validate: If True, run validate_master_config on loaded data. Defaults to True. - - Returns: - Merged configuration dictionary. - - Raises: - ValueError: If file doesn't exist, isn't a dict, or has duplicate keys. 
- """ +def _load_and_merge_yaml_files(config_files: List[str]) -> dict: + """Load and merge YAML configuration files.""" all_config_data = {} for config_file in config_files: try: with open(config_file, 'r') as f: config_data = yaml.safe_load(f) - assert isinstance( - config_data, dict), f"Config file '{config_file}' must contain a dictionary" + if not isinstance(config_data, dict): + raise ValueError( + f"Config file '{config_file}' must contain a dictionary" + ) # Don't allow '*' wildcard in master config keys as we need to reserve these # for expansion in process_changelog.py @@ -411,12 +980,60 @@ def load_config_files(config_files: List[str], validate: bool = True) -> dict: except FileNotFoundError: raise ValueError(f"Input file '{config_file}' does not exist.") + return all_config_data + + +def load_config_files(config_files: List[str], validate: bool = True) -> dict: + """Load and merge throughput configuration files. + + Args: + config_files: List of paths to YAML configuration files. + validate: If True, run validate_master_config on loaded data. Defaults to True. + + Returns: + Merged configuration dictionary. + + Raises: + ValueError: If file doesn't exist, isn't a dict, or has duplicate keys. + """ + all_config_data = _load_and_merge_yaml_files(config_files) + if validate: validate_master_config(all_config_data) return all_config_data +def load_isb1_config_files(config_files: List[str], validate: bool = True) -> dict: + """Load and merge ISB1 replay configuration files.""" + all_config_data = _load_and_merge_yaml_files(config_files) + + if validate: + validate_isb1_master_config(all_config_data) + for config_file in config_files: + certify_isb1_replay_contract( + _load_and_merge_yaml_files([config_file]), + config_file=config_file, + ) + + return all_config_data + + +def load_isb1_kv_stress_config_files(config_files: List[str], validate: bool = True) -> dict: + """Load and merge ISB1 KV stress configuration files.""" + all_config_data = _load_and_merge_yaml_files(config_files) + + if validate: + validate_isb1_kv_stress_master_config(all_config_data) + for config_file in config_files: + certify_isb1_kv_stress_contract( + _load_and_merge_yaml_files([config_file]), + config_file=config_file, + ) + + return all_config_data + + def load_runner_file(runner_file: str, validate: bool = True) -> dict: """Load runner configuration file. diff --git a/utils/process_result.py b/utils/process_result.py index 0a84a1f18..e680239d1 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -4,6 +4,15 @@ from pathlib import Path +def fail_if_isb1_replay_requested(): + """Guard against sending ISB1 replay results through the throughput processor.""" + if os.environ.get('BENCHMARK_TYPE') == 'isb1_replay': + raise SystemExit( + 'process_result.py does not support ISB1 replay results. ' + 'Use utils/process_result_isb1.py instead.' 
+ ) + + def get_required_env_vars(required_vars): """Load and validate required environment variables.""" env_values = {} @@ -22,6 +31,8 @@ def get_required_env_vars(required_vars): return env_values + +fail_if_isb1_replay_requested() + # Base required env vars base_env = get_required_env_vars([ 'RUNNER_TYPE', 'FRAMEWORK', 'PRECISION', 'SPEC_DECODING', @@ -42,6 +53,12 @@ def get_required_env_vars(required_vars): with open(f'{result_filename}.json') as f: bmk_result = json.load(f) +if 'aggregate_metrics' in bmk_result and 'total_token_throughput_tps' in bmk_result['aggregate_metrics']: + raise SystemExit( + 'Detected an ISB1 replay-style result payload in process_result.py. ' + 'Use utils/process_result_isb1.py instead.' + ) + data = { 'hw': hw, 'conc': int(bmk_result['max_concurrency']), diff --git a/utils/process_result_isb1.py b/utils/process_result_isb1.py new file mode 100644 index 000000000..7f338ab2c --- /dev/null +++ b/utils/process_result_isb1.py @@ -0,0 +1,490 @@ +import json +import os +import re +import sys +from pathlib import Path +from typing import Any, Optional, Tuple + +ISB1_RUNNABLE_CERTIFICATION_STATUSES = ["dataset_replay_verified"] + + +def get_required_env_vars(required_vars): + """Load and validate required environment variables.""" + env_values = {} + missing_env_vars = [] + + for var_name in required_vars: + value = os.environ.get(var_name) + if value is None: + missing_env_vars.append(var_name) + env_values[var_name] = value + + if missing_env_vars: + raise EnvironmentError( + f"Missing required environment variables: {', '.join(missing_env_vars)}" + ) + + return env_values + + +def parse_export_shape(export_file: str) -> Tuple[int, int, Optional[str], str, dict[str, Any]]: + """Derive ISL/OSL plus export lane/surface and preview metadata from the export path/file.""" + export_path = Path(export_file) + match = re.search(r"(?P<isl>\d+)k(?P<osl>\d+)k", export_path.stem) + + isl = int(os.environ.get("ISL", "0") or 0) + osl = int(os.environ.get("OSL", "0") or 0) + surface = export_path.stem + metadata: dict[str, Any] = {} + + if match: + isl = int(match.group("isl")) * 1024 + osl = int(match.group("osl")) * 1024 + surface = export_path.stem[: match.start()].rstrip("_-") or export_path.stem + + lane = None + if "exports" in export_path.parts: + exports_idx = export_path.parts.index("exports") + if exports_idx + 1 < len(export_path.parts): + lane = export_path.parts[exports_idx + 1] + if lane == "preview" and exports_idx + 2 < len(export_path.parts): + lane = f"preview/{export_path.parts[exports_idx + 2]}" + + try: + payload = json.loads(export_path.read_text()) + except (FileNotFoundError, json.JSONDecodeError): + payload = None + + if payload is not None: + served_shape = payload.get("served_shape") or {} + isl = int(served_shape.get("isl", isl) or isl) + osl = int(served_shape.get("osl", osl) or osl) + surface = payload.get("surface") or payload.get("adapter_surface") or surface + + context_bands = sorted( + { + cell.get("context_band") + for cell in payload.get("exports", []) + if cell.get("context_band") + } + ) + metadata = { + "adapter_id": payload.get("adapter_id"), + "bundle_id": payload.get("bundle_id"), + "profile_id": payload.get("profile_id"), + "duration_tier": payload.get("duration_tier"), + "context_bands": context_bands, + "adapter_support_status": payload.get("adapter_support_status"), + "profile_tier": payload.get("tier"), + } + producer_handoff = payload.get("producer_handoff_metadata") or {} + if producer_handoff: + metadata["producer_handoff_class"] = 
producer_handoff.get("class") + metadata["producer_claim_boundary"] = producer_handoff.get("claim_boundary") + + # Extract producer KV expectations from first export cell trace_metadata + first_cell = (payload.get("exports") or [{}])[0] if payload.get("exports") else {} + trace_metadata = first_cell.get("trace_metadata", {}) + if trace_metadata: + metadata["producer_estimated_kv_bytes_peak"] = trace_metadata.get("estimated_kv_bytes_peak") + pressure_profile = trace_metadata.get("context_pressure_profile", {}) + metadata["producer_expected_offload_mode"] = ( + pressure_profile.get("expected_offload_mode") + or trace_metadata.get("expected_offload_mode") + ) + + return isl, osl, lane, surface, metadata + + +def validate_support_status_selection( + expected_support_status: Optional[str], selection: dict[str, Any] +) -> None: + """Ensure processed ISB1 output is labeled with the tier actually selected by the harness.""" + if not expected_support_status: + return + + selected_statuses = selection.get("support_statuses") or [] + if not selected_statuses: + raise ValueError( + "ISB1 replay result is missing selection.support_statuses; " + "cannot certify the processed support tier." + ) + + unique_statuses = sorted(set(selected_statuses)) + if unique_statuses != [expected_support_status]: + raise ValueError( + "ISB1 replay result support-status mismatch: " + f"workflow requested '{expected_support_status}' but harness selected {unique_statuses}." + ) + + +def validate_certification_selection(selection: dict[str, Any]) -> None: + """Ensure processed ISB1 output carries the expected runnable certification.""" + selected_statuses = selection.get("benchmark_certification_statuses") or [] + if not selected_statuses: + raise ValueError( + "ISB1 replay result is missing selection.benchmark_certification_statuses; " + "cannot certify the processed replay result." + ) + + unique_statuses = sorted(set(selected_statuses)) + if unique_statuses != ISB1_RUNNABLE_CERTIFICATION_STATUSES: + raise ValueError( + "ISB1 replay result benchmark-certification mismatch: " + "current consumer lanes require " + f"{ISB1_RUNNABLE_CERTIFICATION_STATUSES}, but harness selected {unique_statuses}." 
+ ) + + +def build_context_pressure_signal( + context_pressure_class: str, + kv_offload_observed: bool, + peak_cpu_cache_usage: float, + cpu_cache_metric_available: bool, + depth_coverage_ratio: Optional[float] = None, + max_actual_context_len: Optional[int] = None, +) -> dict[str, Any]: + """Emit a machine-readable status for preview-lane context-pressure validation.""" + if context_pressure_class == "standard": + status = "not_applicable" + reason = "standard_context" + requires_log_review = False + elif depth_coverage_ratio is not None and depth_coverage_ratio < 0.1: + status = "depth_mismatch" + reason = "configured_depth_not_exercised" + requires_log_review = True + elif not cpu_cache_metric_available: + status = "observability_gap" + reason = "no_direct_cpu_cache_metric" + requires_log_review = True + elif not kv_offload_observed and peak_cpu_cache_usage == 0.0: + status = "suspicious" + reason = "high_context_without_cpu_cache_usage" + requires_log_review = True + else: + status = "ok" + reason = "cpu_cache_signal_present" + requires_log_review = False + + result = { + "status": status, + "reason": reason, + "requires_log_review": requires_log_review, + "cpu_cache_metric_available": cpu_cache_metric_available, + } + if depth_coverage_ratio is not None: + result["depth_coverage_ratio"] = round(depth_coverage_ratio, 4) + if max_actual_context_len is not None: + result["max_actual_context_len"] = max_actual_context_len + return result + + +def build_runtime_overrides(replay_result: dict[str, Any]) -> dict[str, Optional[str]]: + """Return a stable runtime-overrides payload for aggregated ISB1 results.""" + override_mapping = { + "vllm_cpu_offload_gb": "VLLM_CPU_OFFLOAD_GB", + "vllm_swap_space_gb": "VLLM_SWAP_SPACE_GB", + "sglang_mem_fraction_override": "SGLANG_MEM_FRACTION_OVERRIDE", + "sglang_chunked_prefill_override": "SGLANG_CHUNKED_PREFILL_OVERRIDE", + } + runtime_overrides: dict[str, Optional[str]] = {} + + for result_key, env_var in override_mapping.items(): + value = replay_result.get(result_key) + if value in (None, ""): + value = os.environ.get(env_var) + runtime_overrides[result_key] = value if value not in (None, "") else None + + return runtime_overrides + + +def build_artifact_stems(result_filename: str) -> dict[str, str]: + """Return artifact names emitted by benchmark-isb1-tmpl.yml for this result stem.""" + return { + "processed": f"isb1_{result_filename}", + "raw_replay": f"replay_{result_filename}", + "server_logs": f"server_logs_{result_filename}", + "gpu_metrics": f"gpu_metrics_{result_filename}", + } + + +def build_dispatch_ref() -> Optional[str]: + """Return the best available workflow dispatch ref for traceability.""" + for env_var in ("DISPATCH_REF", "INPUT_REF", "GITHUB_REF"): + value = os.environ.get(env_var) + if value not in (None, ""): + return value + return None + + +base_env = get_required_env_vars( + [ + "RUNNER_TYPE", + "FRAMEWORK", + "PRECISION", + "RESULT_FILENAME", + "MODEL_PREFIX", + "IMAGE", + "TP", + "EP_SIZE", + "DP_ATTENTION", + "BENCHMARK_TYPE", + "EXPORT_FILE", + "RUNTIME_STACK_ID", + "HARDWARE_PROFILE_ID", + "CANONICAL_MODEL_ID", + "REQUEST_MODE", + "MAX_CONCURRENCY", + ] +) + +result_filename = base_env["RESULT_FILENAME"] +with open(f"{result_filename}.json") as f: + replay_result = json.load(f) + +aggregate = replay_result["aggregate_metrics"] +tp_size = int(base_env["TP"]) +ep_size = int(base_env["EP_SIZE"]) +validate_support_status_selection( + os.environ.get("SUPPORT_STATUS") or None, + replay_result.get("selection", {}), +) 
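+# Both selection checks here are deliberately strict: a processed row is only
+# trusted when every replayed trace carried the single tier the workflow asked
+# for. For example, a harness selection of {"support_statuses": ["supported",
+# "reviewed_preview"]} fails even though "supported" is present, because mixed
+# tiers cannot be attributed to one lane.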
+validate_certification_selection(replay_result.get("selection", {})) +isl, osl, export_lane, benchmark_surface, export_metadata = parse_export_shape( + base_env["EXPORT_FILE"] +) + +total_tput = float(aggregate["total_token_throughput_tps"]) +output_tput = float(aggregate["output_throughput_tps"]) + +server_metrics_summary = replay_result.get("server_metrics_summary", {}) +cpu_cache_metric_available_raw = server_metrics_summary.get("cpu_cache_metric_available") +cpu_cache_metric_available = bool(cpu_cache_metric_available_raw) +if cpu_cache_metric_available_raw is None: + # Backward-compatibility shim for older replay outputs that predate the + # explicit availability field. Presence of the metric name/fields is a + # better signal than the sampled value because a real metric can be present + # and legitimately report 0.0. + cpu_cache_metric_available = bool(server_metrics_summary.get("cpu_cache_metric_name")) or any( + metric_name in server_metrics_summary + for metric_name in ("cpu_cache_usage_avg", "cpu_cache_usage_peak") + ) + +data = { + "hw": base_env["RUNNER_TYPE"], + "conc": int(replay_result.get("max_concurrency", base_env["MAX_CONCURRENCY"])), + "image": base_env["IMAGE"], + "model": replay_result["model_id"], + "infmax_model_prefix": base_env["MODEL_PREFIX"], + "framework": base_env["FRAMEWORK"], + "precision": base_env["PRECISION"], + "spec_decoding": os.environ.get("SPEC_DECODING", "none"), + "disagg": False, + "isl": isl, + "osl": osl, + "is_multinode": False, + "tp": tp_size, + "ep": ep_size, + "dp_attention": base_env["DP_ATTENTION"], + "tput_per_gpu": total_tput / tp_size, + "output_tput_per_gpu": output_tput / tp_size, + "input_tput_per_gpu": (total_tput - output_tput) / tp_size, + "benchmark_type": base_env["BENCHMARK_TYPE"], + "result_filename": result_filename, + "artifact_stems": build_artifact_stems(result_filename), + "dispatch_ref": build_dispatch_ref(), + "export_file": base_env["EXPORT_FILE"], + "export_lane": export_lane, + "benchmark_surface": benchmark_surface, + "adapter_id": export_metadata.get("adapter_id"), + "bundle_id": export_metadata.get("bundle_id"), + "profile_id": export_metadata.get("profile_id"), + "duration_tier": export_metadata.get("duration_tier"), + "context_bands": export_metadata.get("context_bands", []), + "adapter_support_status": export_metadata.get("adapter_support_status"), + "profile_tier": export_metadata.get("profile_tier"), + "producer_handoff_class": export_metadata.get("producer_handoff_class"), + "producer_claim_boundary": export_metadata.get("producer_claim_boundary"), + "runtime_stack_id": base_env["RUNTIME_STACK_ID"], + "hardware_profile_id": base_env["HARDWARE_PROFILE_ID"], + "canonical_model_id": base_env["CANONICAL_MODEL_ID"], + "support_status": os.environ.get("SUPPORT_STATUS") or None, + "benchmark_certification_status": replay_result.get("selection", {}).get( + "benchmark_certification_statuses", [None] + )[0], + "request_mode": base_env["REQUEST_MODE"], + "workload_type": os.environ.get("WORKLOAD_TYPE") or benchmark_surface, + "benchmark_duration_s": ( + float(os.environ["BENCHMARK_DURATION_S"]) + if os.environ.get("BENCHMARK_DURATION_S") not in (None, "") + else None + ), + "campaign_class": ( + "kv_stress" + if base_env["BENCHMARK_TYPE"] == "isb1_kv_stress" + else "replay" + ), + "harness_request_mode": replay_result.get("harness_request_mode", "auto"), + "mode": replay_result.get("mode"), + "selection": replay_result.get("selection", {}), + "aggregate_metrics": aggregate, + "per_turn_metrics": 
replay_result.get("per_turn_metrics", {}), + "server_metrics_summary": server_metrics_summary, + "cache_observability_status": server_metrics_summary.get("observability_status"), + "gpu_cache_metric_name": server_metrics_summary.get("gpu_cache_metric_name"), + "cpu_cache_metric_name": server_metrics_summary.get("cpu_cache_metric_name"), + "cpu_cache_metric_available": cpu_cache_metric_available, + "kv_offload_observed": bool(server_metrics_summary.get("kv_offload_observed", False)), + "peak_gpu_cache_usage": float(server_metrics_summary.get("gpu_cache_usage_peak", 0.0)), + "peak_cpu_cache_usage": float(server_metrics_summary.get("cpu_cache_usage_peak", 0.0)), + "session_throughput_sps": float(aggregate.get("session_throughput_sps", 0.0)), + "completed_sessions": int(aggregate.get("completed_sessions", 0)), + "total_sessions": int(aggregate.get("total_sessions", 0)), + "num_sessions": replay_result.get("num_sessions"), + "max_turns": replay_result.get("max_turns"), + "num_warmup_sessions": replay_result.get( + "num_warmup_sessions", int(os.environ.get("NUM_WARMUP_SESSIONS", "0") or 0) + ), + "max_model_len": ( + int(os.environ["MAX_MODEL_LEN"]) + if os.environ.get("MAX_MODEL_LEN") not in (None, "") + else None + ), + "max_sessions": ( + int(os.environ["MAX_SESSIONS"]) + if os.environ.get("MAX_SESSIONS") not in (None, "") + else None + ), + "max_turns_per_session": ( + int(os.environ["MAX_TURNS_PER_SESSION"]) + if os.environ.get("MAX_TURNS_PER_SESSION") not in (None, "") + else None + ), + "max_output_len": ( + int(os.environ["MAX_OUTPUT_LEN"]) + if os.environ.get("MAX_OUTPUT_LEN") not in (None, "") + else None + ), + "ignore_waits": os.environ.get("IGNORE_WAITS", "false").lower() == "true", + "ignore_eos": os.environ.get("IGNORE_EOS", "false").lower() == "true", + "offload_mode": os.environ.get("OFFLOAD_MODE") or None, + "kv_cache_dtype": os.environ.get("KV_CACHE_DTYPE") or None, + "disable_prefix_caching": os.environ.get("DISABLE_PREFIX_CACHING", "false").lower() == "true", + "runtime_overrides": build_runtime_overrides(replay_result), +} + +effective_max_context_depth = data["max_model_len"] or (isl + osl + 200) +data["effective_max_context_depth"] = effective_max_context_depth +if effective_max_context_depth > 600000: + data["context_pressure_class"] = "extended_1m" +elif effective_max_context_depth > 200000: + data["context_pressure_class"] = "extended_500k" +else: + data["context_pressure_class"] = "standard" + +# Depth telemetry: actual vs configured context depth +depth_telemetry = replay_result.get("depth_telemetry", {}) +max_actual_context_len = int(depth_telemetry.get("max_actual_context_len_per_turn") or 0) or None +total_actual_input_tokens = int(depth_telemetry.get("total_actual_input_tokens") or 0) or None +depth_coverage_ratio = None +if max_actual_context_len and effective_max_context_depth > 0: + depth_coverage_ratio = max_actual_context_len / effective_max_context_depth + +data["total_actual_input_tokens"] = total_actual_input_tokens +data["max_actual_context_len_per_turn"] = max_actual_context_len +data["depth_coverage_ratio"] = round(depth_coverage_ratio, 4) if depth_coverage_ratio is not None else None +data["depth_gap_tokens"] = ( + effective_max_context_depth - max_actual_context_len + if max_actual_context_len is not None else None +) + +# Depth coverage classification +if depth_coverage_ratio is not None: + if depth_coverage_ratio >= 0.9: + data["depth_coverage_class"] = "full" + elif depth_coverage_ratio >= 0.5: + data["depth_coverage_class"] = "partial" + elif 
depth_coverage_ratio >= 0.1: + data["depth_coverage_class"] = "bounded_preview" + else: + data["depth_coverage_class"] = "configuration_only" +else: + data["depth_coverage_class"] = None + +# Producer expectation comparison +producer_estimated_kv_bytes_peak = export_metadata.get("producer_estimated_kv_bytes_peak") +producer_expected_offload_mode = export_metadata.get("producer_expected_offload_mode") +data["producer_estimated_kv_bytes_peak"] = producer_estimated_kv_bytes_peak +data["producer_expected_offload_mode"] = producer_expected_offload_mode + +offload_mode_match = None +if producer_expected_offload_mode and data["context_pressure_class"] != "standard": + if producer_expected_offload_mode in ("hard_offload", "soft_offload"): + offload_mode_match = data["kv_offload_observed"] + elif producer_expected_offload_mode == "none": + offload_mode_match = True +data["producer_expectation_validation"] = { + "offload_mode_match": offload_mode_match, + "kv_bytes_validation": "not_available", + "depth_exercised": bool(depth_coverage_ratio and depth_coverage_ratio >= 0.5), +} + +# Preemption count from server metrics +data["preemption_count"] = int( + server_metrics_summary.get("preemption_count") + or replay_result.get("preemption_count") + or 0 +) + +context_pressure_signal = build_context_pressure_signal( + context_pressure_class=data["context_pressure_class"], + kv_offload_observed=data["kv_offload_observed"], + peak_cpu_cache_usage=data["peak_cpu_cache_usage"], + cpu_cache_metric_available=data["cpu_cache_metric_available"], + depth_coverage_ratio=depth_coverage_ratio, + max_actual_context_len=max_actual_context_len, +) +data["context_pressure_signal"] = context_pressure_signal +data["context_pressure_suspicious"] = context_pressure_signal["status"] == "suspicious" + +if data["context_pressure_suspicious"]: + print( + "WARNING: Preview lane at " + f"max-model-len={effective_max_context_depth} saw no CPU cache usage. " + "The server may have silently capped context or failed to activate KV offload. " + "Check server.log for OOM or context truncation.", + file=sys.stderr, + ) +elif context_pressure_signal["status"] == "depth_mismatch": + print( + "WARNING: Preview lane at " + f"max-model-len={effective_max_context_depth} had max actual context of " + f"{max_actual_context_len} tokens (depth_coverage_ratio=" + f"{depth_coverage_ratio:.4f}). The server was configured for " + f"{data['context_pressure_class'].replace('extended_', '')} but requests only exercised " + f"{max_actual_context_len} tokens. This is expected for file-backed replay previews; " + "it does not prove KV pressure at the configured depth.", + file=sys.stderr, + ) +elif context_pressure_signal["status"] == "observability_gap": + print( + "WARNING: Preview lane at " + f"max-model-len={effective_max_context_depth} lacks a direct CPU cache metric " + "for this framework. 
Inspect server.log and operator tuning notes before " + "treating the run as credible long-context evidence.", + file=sys.stderr, + ) + +for key, value in aggregate.items(): + if key.endswith("_ms"): + data[key.replace("_ms", "")] = float(value) / 1000.0 + if "tpot" in key: + metric_value = float(value) + data[key.replace("_ms", "").replace("tpot", "intvty")] = ( + 1000.0 / metric_value if metric_value > 0 else 0.0 + ) + +print(json.dumps(data, indent=2)) + +with open(f"agg_{result_filename}.json", "w") as f: + json.dump(data, f, indent=2) diff --git a/utils/summarize_isb1.py b/utils/summarize_isb1.py new file mode 100644 index 000000000..3c2428a4b --- /dev/null +++ b/utils/summarize_isb1.py @@ -0,0 +1,238 @@ +import argparse +import json +from pathlib import Path +from typing import Any + +try: + from tabulate import tabulate as _tabulate +except ImportError: # pragma: no cover - fallback for minimal local environments + _tabulate = None + + +SUPPORT_STATUS_ORDER = { + "supported": 0, + "reviewed_preview": 1, + "gated": 2, + "artifact_only": 3, + "unsupported": 4, + None: 5, +} + + +def load_isb1_rows(results_dir: Path) -> list[dict[str, Any]]: + """Load processed ISB1 rows from a results directory.""" + rows: list[dict[str, Any]] = [] + for result_path in results_dir.rglob("*.json"): + try: + payload = json.loads(result_path.read_text()) + except (OSError, json.JSONDecodeError): + continue + + candidates = payload if isinstance(payload, list) else [payload] + for candidate in candidates: + if isinstance(candidate, dict) and candidate.get("benchmark_type") == "isb1_replay": + rows.append(candidate) + return rows + + +def sort_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Sort rows in an operator-friendly order.""" + return sorted( + rows, + key=lambda row: ( + SUPPORT_STATUS_ORDER.get(row.get("support_status"), 99), + row.get("infmax_model_prefix", ""), + row.get("hw", ""), + row.get("framework", ""), + row.get("effective_max_context_depth", 0) or 0, + row.get("result_filename", ""), + ), + ) + + +def format_float(value: Any, precision: int = 2) -> str: + """Format a numeric value for markdown tables.""" + if value is None: + return "-" + try: + return f"{float(value):.{precision}f}" + except (TypeError, ValueError): + return str(value) + + +def format_bool(value: Any) -> str: + """Format a truthy value as yes/no for operators.""" + return "yes" if bool(value) else "no" + + +def render_table(headers: list[str], rows: list[list[Any]], tablefmt: str) -> str: + """Render a markdown/plain table with a lightweight fallback if tabulate is absent.""" + normalized_rows = [[str(cell) for cell in row] for row in rows] + if _tabulate is not None: + return _tabulate(normalized_rows, headers=headers, tablefmt=tablefmt) + + widths = [len(header) for header in headers] + for row in normalized_rows: + for index, cell in enumerate(row): + widths[index] = max(widths[index], len(cell)) + + def render_row(row: list[str]) -> str: + cells = [cell.ljust(widths[index]) for index, cell in enumerate(row)] + return f"| {' | '.join(cells)} |" + + divider = f"| {' | '.join('-' * width for width in widths)} |" + lines = [render_row(headers), divider] + lines.extend(render_row(row) for row in normalized_rows) + return "\n".join(lines) + + +def build_lane_summary_table(rows: list[dict[str, Any]], tablefmt: str) -> str: + """Render the main operator lane summary table.""" + headers = [ + "Lane", + "Model", + "HW", + "Framework", + "Support", + "Cert", + "Max Ctx", + "Context Class", + "Sessions", + 
"Session Tput", + "TTFT Median (s)", + "Ctx Pressure", + "Log Review", + "KV Offload", + "GPU Cache Peak", + "CPU Cache Peak", + ] + table_rows = [ + [ + row.get("result_filename", "-"), + row.get("infmax_model_prefix", "-"), + row.get("hw", "-"), + row.get("framework", "-"), + row.get("support_status", "-"), + row.get("benchmark_certification_status", "-"), + row.get("effective_max_context_depth", "-"), + row.get("context_pressure_class", "-"), + f"{row.get('completed_sessions', 0)}/{row.get('total_sessions', 0)}", + format_float(row.get("session_throughput_sps"), 2), + format_float(row.get("median_ttft"), 3), + (row.get("context_pressure_signal") or {}).get("status", "-"), + format_bool((row.get("context_pressure_signal") or {}).get("requires_log_review")), + format_bool(row.get("kv_offload_observed")), + format_float(row.get("peak_gpu_cache_usage"), 2), + format_float(row.get("peak_cpu_cache_usage"), 2), + ] + for row in rows + ] + return render_table(headers, table_rows, tablefmt) + + +def build_runtime_override_table(rows: list[dict[str, Any]], tablefmt: str) -> str | None: + """Render the runtime override table when any override is present.""" + override_rows = [] + for row in rows: + runtime_overrides = row.get("runtime_overrides") or {} + if not any(value not in (None, "") for value in runtime_overrides.values()): + continue + override_rows.append( + [ + row.get("result_filename", "-"), + row.get("infmax_model_prefix", "-"), + row.get("hw", "-"), + row.get("framework", "-"), + runtime_overrides.get("vllm_cpu_offload_gb") or "-", + runtime_overrides.get("vllm_swap_space_gb") or "-", + runtime_overrides.get("sglang_mem_fraction_override") or "-", + runtime_overrides.get("sglang_chunked_prefill_override") or "-", + row.get("dispatch_ref") or "-", + ] + ) + + if not override_rows: + return None + + headers = [ + "Lane", + "Model", + "HW", + "Framework", + "VLLM CPU Offload GB", + "VLLM Swap GB", + "SGLang Mem Fraction", + "SGLang Chunked Prefill", + "Dispatch Ref", + ] + return render_table(headers, override_rows, tablefmt) + + +def build_action_items(rows: list[dict[str, Any]]) -> list[str]: + """Build operator action items for suspicious or manual-review rows.""" + items: list[str] = [] + for row in rows: + signal = row.get("context_pressure_signal") or {} + if not row.get("context_pressure_suspicious") and not signal.get("requires_log_review"): + continue + + artifact_stems = row.get("artifact_stems") or {} + items.append( + "- " + f"`{row.get('result_filename', 'unknown')}` ({row.get('infmax_model_prefix', '-')}/" + f"{row.get('hw', '-')}/{row.get('framework', '-')}) " + f"requires follow-up: context pressure `{signal.get('status', 'unknown')}`; " + f"review replay `{artifact_stems.get('raw_replay', '-')}`, " + f"logs `{artifact_stems.get('server_logs', '-')}`, " + f"GPU metrics `{artifact_stems.get('gpu_metrics', '-')}`" + + ( + f", dispatch `{row.get('dispatch_ref')}`" + if row.get("dispatch_ref") + else "" + ) + + "." 
+ ) + return items + + +def generate_summary(results_dir: Path, tablefmt: str = "github") -> str: + """Generate an ISB1-specific operator summary in markdown/plain text.""" + rows = sort_rows(load_isb1_rows(results_dir)) + sections = ["## ISB1 Operator Summary", ""] + + if not rows: + sections.append("No ISB1 replay rows found.") + return "\n".join(sections).rstrip() + "\n" + + sections.extend(["### Lane Summary", "", build_lane_summary_table(rows, tablefmt), ""]) + + runtime_override_table = build_runtime_override_table(rows, tablefmt) + if runtime_override_table: + sections.extend(["### Runtime Overrides", "", runtime_override_table, ""]) + + action_items = build_action_items(rows) + sections.append("### Action Items") + sections.append("") + if action_items: + sections.extend(action_items) + else: + sections.append("- None. No suspicious or manual-log-review rows were detected.") + + return "\n".join(sections).rstrip() + "\n" + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Generate an ISB1-specific operator summary.") + parser.add_argument("results_dir", type=Path) + parser.add_argument("--format", choices=["github", "plain"], default="github") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + print(generate_summary(args.results_dir, tablefmt=args.format)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/utils/test_benchmark_export_replay.py b/utils/test_benchmark_export_replay.py new file mode 100644 index 000000000..31e4dc656 --- /dev/null +++ b/utils/test_benchmark_export_replay.py @@ -0,0 +1,766 @@ +import asyncio +import json +from pathlib import Path + +from aiohttp import web + +from bench_serving.benchmark_export_replay import ( + load_replay_sessions, + run_export_replay_benchmark, +) + + +def _count_tokens(text: str) -> int: + return max(1, len((text or "").split())) if text else 0 + + +def _multiturn_payload(runtime_stack_id: str = "standalone:sglang") -> dict: + return { + "adapter_id": "inferencex_multiturn", + "exports": [ + { + "trace_id": "trace-chat-1", + "runtime_stack_id": runtime_stack_id, + "hardware_profile_id": "nvidia:h200_sxm_141gb", + "canonical_model_id": "qwen3_30b_a3b", + "support_status": "supported", + "benchmark_certification_status": "dataset_replay_verified", + "session": { + "session_id": "session-chat-1", + "turns": [ + { + "turn_idx": 0, + "turn_id": 0, + "messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Investigate the flaky test."} + ], + } + ], + "expected_output_tokens": 8, + "wait_before_ms": 0, + }, + { + "turn_idx": 1, + "turn_id": 1, + "messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Investigate the flaky test."} + ], + }, + { + "role": "assistant", + "content_blocks": [ + {"type": "text", "text": "I found a race in the setup."} + ], + }, + { + "role": "tool", + "content_blocks": [ + {"type": "log", "text": "pytest -k flaky_test -> failed"} + ], + }, + ], + "expected_output_tokens": 6, + "wait_before_ms": 10, + }, + ], + }, + } + ], + } + + +def _trace_replay_payload(runtime_stack_id: str = "standalone:trt_llm") -> dict: + return { + "adapter_id": "inferencex_trace_replay", + "exports": [ + { + "trace_id": "trace-replay-1", + "runtime_stack_id": runtime_stack_id, + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "gpt_oss_120b", + "support_status": "supported", + "benchmark_certification_status": "dataset_replay_verified", + "trace_metadata": 
{"session_id": "session-replay-1"}, + "events": [ + { + "turn_id": 0, + "arrival_time_offset_ms": 0, + "input_messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Summarize the incident report."} + ], + } + ], + "target_output_tokens": 7, + }, + { + "turn_id": 1, + "arrival_time_offset_ms": 25, + "input_messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Summarize the incident report."} + ], + }, + { + "role": "assistant", + "content_blocks": [ + {"type": "text", "text": "The outage started after deploy."} + ], + }, + ], + "target_output_tokens": 5, + }, + ], + } + ], + } + + +async def _start_mock_server( + sse_mode: str = "normal", + metrics_text: str | None = None, +) -> tuple[web.AppRunner, str]: + """Start a mock OpenAI-compatible server. + + sse_mode controls how SSE frames are written to the wire: + - "normal": one data frame per write (default) + - "multiline": multiple data frames packed into a single write + - "split": a single data frame split across two writes + """ + + async def _stream_response(request: web.Request, chunks: list[dict]) -> web.StreamResponse: + response = web.StreamResponse( + status=200, + headers={"Content-Type": "text/event-stream"}, + ) + await response.prepare(request) + + if sse_mode == "multiline": + # Pack ALL data frames into a single TCP write + blob = b"" + for chunk in chunks: + blob += f"data: {json.dumps(chunk)}\n\n".encode() + blob += b"data: [DONE]\n\n" + await response.write(blob) + elif sse_mode == "split": + # Split the first frame across two writes + for idx, chunk in enumerate(chunks): + frame = f"data: {json.dumps(chunk)}\n\n".encode() + if idx == 0: + mid = len(frame) // 2 + await response.write(frame[:mid]) + await asyncio.sleep(0.005) + await response.write(frame[mid:]) + else: + await response.write(frame) + await asyncio.sleep(0.005) + await response.write(b"data: [DONE]\n\n") + else: + for chunk in chunks: + await response.write(f"data: {json.dumps(chunk)}\n\n".encode()) + await asyncio.sleep(0.005) + await response.write(b"data: [DONE]\n\n") + + await response.write_eof() + return response + + async def chat_handler(request: web.Request) -> web.StreamResponse: + payload = await request.json() + # Verify the fallback from max_completion_tokens -> max_tokens. 
+ if "max_completion_tokens" in payload: + return web.json_response({"error": "unsupported field"}, status=400) + assert payload["messages"] + return await _stream_response( + request, + [ + {"choices": [{"delta": {"content": "patched"}}]}, + {"usage": {"completion_tokens": 2}}, + ], + ) + + async def completions_handler(request: web.Request) -> web.StreamResponse: + payload = await request.json() + assert payload["prompt"].startswith("USER:") + return await _stream_response( + request, + [ + {"choices": [{"text": "resolved"}]}, + {"usage": {"completion_tokens": 2}}, + ], + ) + + async def metrics_handler(_: web.Request) -> web.Response: + return web.Response( + text=metrics_text + or ( + "vllm:gpu_cache_usage_perc 0.42\n" + "vllm:cpu_cache_usage_perc 0.25\n" + "sglang:cache_hit_rate 0.8\n" + ) + ) + + app = web.Application() + app.router.add_post("/v1/chat/completions", chat_handler) + app.router.add_post("/v1/completions", completions_handler) + app.router.add_get("/metrics", metrics_handler) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, host="127.0.0.1", port=0) + await site.start() + sockets = getattr(site, "_server").sockets + port = sockets[0].getsockname()[1] + return runner, f"http://127.0.0.1:{port}" + + +def test_load_replay_sessions_multiturn_chat(tmp_path: Path) -> None: + export_file = tmp_path / "multiturn.json" + export_file.write_text(json.dumps(_multiturn_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="auto", + ignore_waits=False, + ) + + assert len(sessions) == 1 + assert sessions[0].request_mode == "chat" + assert sessions[0].turns[1].wait_before_s == 0.01 + assert selection["support_statuses"] == ["supported"] + assert selection["support_status_counts"] == {"supported": 1} + assert selection["benchmark_certification_statuses"] == ["dataset_replay_verified"] + assert selection["benchmark_certification_status_counts"] == { + "dataset_replay_verified": 1 + } + assert selection["request_mode_mix"] == {"chat": 1} + + +def test_load_replay_sessions_trace_replay_auto_uses_completions(tmp_path: Path) -> None: + export_file = tmp_path / "trace_replay.json" + export_file.write_text(json.dumps(_trace_replay_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:trt_llm"}, + hardware_profile_ids={"nvidia:b200_sxm_180gb"}, + canonical_model_ids={"gpt_oss_120b"}, + request_mode="auto", + ) + + assert len(sessions) == 1 + assert sessions[0].request_mode == "completions" + assert sessions[0].turns[1].wait_before_s == 0.025 + assert sessions[0].turns[0].completion_prompt.startswith("USER:") + assert selection["support_statuses"] == ["supported"] + assert selection["benchmark_certification_statuses"] == ["dataset_replay_verified"] + assert selection["request_mode_mix"] == {"completions": 1} + + +def test_load_replay_sessions_support_status_filter(tmp_path: Path) -> None: + payload = _multiturn_payload() + payload["exports"].append( + { + **payload["exports"][0], + "trace_id": "trace-chat-preview", + "support_status": "reviewed_preview", + } + ) + export_file = tmp_path / "multiturn_mixed_status.json" + export_file.write_text(json.dumps(payload)) + + sessions, selection = load_replay_sessions( + 
export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + support_statuses={"supported"}, + request_mode="auto", + ignore_waits=False, + ) + + assert [session.trace_id for session in sessions] == ["trace-chat-1"] + assert selection["support_statuses"] == ["supported"] + assert selection["support_status_counts"] == {"supported": 1} + assert selection["benchmark_certification_statuses"] == ["dataset_replay_verified"] + + +def test_run_export_replay_benchmark_chat(tmp_path: Path) -> None: + export_file = tmp_path / "multiturn.json" + export_file.write_text(json.dumps(_multiturn_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await _start_mock_server() + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=1, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + assert result["aggregate_metrics"]["completed_sessions"] == 1 + assert result["selection"]["request_mode_mix"] == {"chat": 1} + assert result["server_metrics_summary"]["samples"] >= 0 + assert result["server_metrics_summary"]["gpu_cache_usage_peak"] == 0.42 + assert result["server_metrics_summary"]["cpu_cache_usage_peak"] == 0.25 + assert result["server_metrics_summary"]["gpu_cache_metric_name"] == "vllm:gpu_cache_usage_perc" + assert result["server_metrics_summary"]["cpu_cache_metric_name"] == "vllm:cpu_cache_usage_perc" + assert result["server_metrics_summary"]["cpu_cache_metric_available"] is True + assert result["server_metrics_summary"]["observability_status"] == "direct_cpu_cache_metric" + assert result["server_metrics_summary"]["kv_offload_observed"] is True + + +def test_run_export_replay_benchmark_completions(tmp_path: Path) -> None: + export_file = tmp_path / "trace_replay.json" + export_file.write_text(json.dumps(_trace_replay_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:trt_llm"}, + hardware_profile_ids={"nvidia:b200_sxm_180gb"}, + canonical_model_ids={"gpt_oss_120b"}, + request_mode="completions", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await _start_mock_server() + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="gpt-oss-120b", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + assert result["aggregate_metrics"]["completed_sessions"] == 1 + assert result["selection"]["request_mode_mix"] == {"completions": 1} + + +def 
test_run_export_replay_benchmark_sglang_token_usage_metrics(tmp_path: Path) -> None: + export_file = tmp_path / "multiturn_sglang_metrics.json" + export_file.write_text(json.dumps(_multiturn_payload(runtime_stack_id="standalone:sglang"))) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await _start_mock_server( + metrics_text=( + 'sglang:token_usage{model_name="Qwen/Qwen3-30B-A3B"} 0.61\n' + 'sglang:cache_hit_rate{model_name="Qwen/Qwen3-30B-A3B"} 0.8\n' + ) + ) + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + summary = result["server_metrics_summary"] + assert result["aggregate_metrics"]["completed_sessions"] == 1 + assert summary["samples"] >= 0 + assert summary["gpu_cache_usage_peak"] == 0.61 + assert summary["gpu_cache_metric_name"] == "sglang:token_usage" + assert summary["cpu_cache_metric_name"] is None + assert summary["cpu_cache_metric_available"] is False + assert summary["cache_hit_rate_avg"] == 0.8 + assert summary["observability_status"] == "indirect_without_cpu_cache_metric" + assert summary["kv_offload_observed"] is False + + +def test_sse_multiline_chunks(tmp_path: Path) -> None: + """Verify replay works when the server packs multiple SSE frames into one TCP write.""" + export_file = tmp_path / "multiturn.json" + export_file.write_text(json.dumps(_multiturn_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await _start_mock_server(sse_mode="multiline") + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + assert result["aggregate_metrics"]["completed_sessions"] == 1 + + +def test_sse_split_across_chunks(tmp_path: Path) -> None: + """Verify replay works when a single SSE frame is split across TCP writes.""" + export_file = tmp_path / "multiturn.json" + export_file.write_text(json.dumps(_multiturn_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await 
_start_mock_server(sse_mode="split")
+        try:
+            return await run_export_replay_benchmark(
+                sessions=sessions,
+                selection_metadata=selection,
+                model_id="Qwen/Qwen3-30B-A3B",
+                model_name=None,
+                chat_api_url=f"{base_url}/v1/chat/completions",
+                completion_api_url=f"{base_url}/v1/completions",
+                count_text_tokens=_count_tokens,
+                max_concurrency=1,
+                selected_percentiles=[99],
+                disable_tqdm=True,
+                num_warmup_sessions=0,
+            )
+        finally:
+            await runner.cleanup()
+
+    result = asyncio.run(_run())
+    assert result["aggregate_metrics"]["completed_sessions"] == 1
+
+
+def test_empty_content_no_phantom_itl(tmp_path: Path) -> None:
+    """Verify that SSE chunks with empty/null content don't inflate ITL counts."""
+    export_file = tmp_path / "multiturn.json"
+    # Use a single-turn export to isolate ITL counting
+    single_turn_payload = {
+        "adapter_id": "inferencex_multiturn",
+        "exports": [
+            {
+                "trace_id": "trace-itl-1",
+                "runtime_stack_id": "standalone:sglang",
+                "hardware_profile_id": "nvidia:h200_sxm_141gb",
+                "canonical_model_id": "qwen3_30b_a3b",
+                "support_status": "supported",
+                "session": {
+                    "session_id": "session-itl-1",
+                    "turns": [
+                        {
+                            "turn_idx": 0,
+                            "turn_id": 0,
+                            "messages": [
+                                {
+                                    "role": "user",
+                                    "content_blocks": [
+                                        {"type": "text", "text": "Hello"}
+                                    ],
+                                }
+                            ],
+                            "expected_output_tokens": 4,
+                            "wait_before_ms": 0,
+                        },
+                    ],
+                },
+            }
+        ],
+    }
+    export_file.write_text(json.dumps(single_turn_payload))
+
+    sessions, selection = load_replay_sessions(
+        export_file=str(export_file),
+        count_text_tokens=_count_tokens,
+        runtime_stack_ids={"standalone:sglang"},
+        hardware_profile_ids={"nvidia:h200_sxm_141gb"},
+        canonical_model_ids={"qwen3_30b_a3b"},
+        request_mode="chat",
+        ignore_waits=True,
+    )
+
+    async def _run() -> dict:
+        # Custom server that sends empty-content chunks between real ones
+        async def _chat_with_empty(request: web.Request) -> web.StreamResponse:
+            payload = await request.json()
+            if "max_completion_tokens" in payload:
+                return web.json_response({"error": "unsupported"}, status=400)
+
+            response = web.StreamResponse(
+                status=200,
+                headers={"Content-Type": "text/event-stream"},
+            )
+            await response.prepare(request)
+            # Frame 1: real content
+            await response.write(
+                'data: {"choices": [{"delta": {"content": "hello"}}]}\n\n'.encode()
+            )
+            await asyncio.sleep(0.005)
+            # Frame 2: empty content (should not generate ITL entry)
+            await response.write(
+                'data: {"choices": [{"delta": {"content": ""}}]}\n\n'.encode()
+            )
+            await asyncio.sleep(0.005)
+            # Frame 3: null content (should not generate ITL entry)
+            await response.write(
+                'data: {"choices": [{"delta": {}}]}\n\n'.encode()
+            )
+            await asyncio.sleep(0.005)
+            # Frame 4: real content
+            await response.write(
+                'data: {"choices": [{"delta": {"content": " world"}}]}\n\n'.encode()
+            )
+            await asyncio.sleep(0.005)
+            # Usage frame
+            await response.write(
+                'data: {"usage": {"completion_tokens": 2}}\n\n'.encode()
+            )
+            await response.write(b"data: [DONE]\n\n")
+            await response.write_eof()
+            return response
+
+        async def _empty_metrics(_: web.Request) -> web.Response:
+            # aiohttp handlers must be coroutines; a bare lambda returning a
+            # Response would fail at request time.
+            return web.Response(text="")
+
+        app = web.Application()
+        app.router.add_post("/v1/chat/completions", _chat_with_empty)
+        app.router.add_get("/metrics", _empty_metrics)
+
+        runner = web.AppRunner(app)
+        await runner.setup()
+        site = web.TCPSite(runner, host="127.0.0.1", port=0)
+        await site.start()
+        sockets = getattr(site, "_server").sockets
+        port = sockets[0].getsockname()[1]
+        base_url = f"http://127.0.0.1:{port}"
+
+        try:
+            return await run_export_replay_benchmark(
+                sessions=sessions,
+                
selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + agg = result["aggregate_metrics"] + assert agg["completed_sessions"] == 1 + # With 2 real content chunks, ITL should have exactly 1 entry + # (first content = TTFT, second content = 1 ITL). Empty/null chunks + # must not inflate this count. + turn_metrics = result["per_turn_metrics"]["turn_1"] + assert turn_metrics["completed"] == 1 + + +def test_actual_context_len_for_file_backed_assets(tmp_path: Path) -> None: + """Verify that actual_context_len counts rendered payload tokens, not asset metadata.""" + payload = { + "adapter_id": "inferencex_trace_replay", + "exports": [ + { + "trace_id": "test-asset-trace", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:h200_sxm_141gb", + "canonical_model_id": "gpt_oss_120b", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "context_band": "xlc2_384k_512k", + "trace_metadata": { + "session_id": "test-session", + "estimated_kv_bytes_peak": 27000000000, + "expected_offload_mode": "soft_offload", + }, + "events": [ + { + "event_id": "evt-0", + "trace_id": "test-asset-trace", + "session_id": "test-session", + "turn_id": 0, + "arrival_time_offset_ms": 0, + "input_messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Analyze this codebase"}, + { + "type": "table", + "text": None, + "asset_path": "synthetic_v0/context_assets/big_file.md", + "asset_token_count": 500000, + "asset_byte_count": 2500000, + }, + ], + } + ], + "output": {"output_token_count": 100}, + } + ], + } + ], + } + export_file = tmp_path / "asset_test.json" + export_file.write_text(json.dumps(payload)) + + sessions, _ = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:vllm"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"gpt_oss_120b"}, + request_mode="chat", + ignore_waits=True, + ) + + assert len(sessions) == 1 + turn = sessions[0].turns[0] + + # Estimated context_len should include the 500k asset_token_count + assert turn.context_len >= 500000 + + # Actual context_len should be much smaller — just the rendered text + # "[TABLE]" is ~1 token + "Analyze this codebase" is ~3 tokens + assert turn.actual_context_len < 100 + assert turn.actual_context_len > 0 + + # The gap proves the measurement works + assert turn.context_len > turn.actual_context_len * 100 + + +def test_depth_telemetry_in_benchmark_result(tmp_path: Path) -> None: + """Verify depth_telemetry block is emitted in benchmark results.""" + export_file = tmp_path / "multiturn.json" + export_file.write_text(json.dumps(_multiturn_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await _start_mock_server() + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + 
model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + + # depth_telemetry block must exist + assert "depth_telemetry" in result + dt = result["depth_telemetry"] + assert "total_estimated_input_tokens" in dt + assert "total_actual_input_tokens" in dt + assert "max_actual_context_len_per_turn" in dt + assert dt["total_actual_input_tokens"] > 0 + assert dt["max_actual_context_len_per_turn"] > 0 + + # Aggregate metrics must also carry actual input tokens + agg = result["aggregate_metrics"] + assert "total_actual_input_tokens" in agg + assert "max_actual_context_len_per_turn" in agg + + # Per-turn metrics should have actual context length + for turn_key, turn_metrics in result["per_turn_metrics"].items(): + assert "mean_actual_context_len" in turn_metrics diff --git a/utils/test_gate_isb1.py b/utils/test_gate_isb1.py new file mode 100644 index 000000000..3a9e590e0 --- /dev/null +++ b/utils/test_gate_isb1.py @@ -0,0 +1,218 @@ +import json +from pathlib import Path + +from gate_isb1 import build_gate_report, load_rows, main + + +def make_row( + *, + result_filename: str, + model: str, + hw: str, + framework: str, + support_status: str, + effective_max_context_depth: int, + context_pressure_class: str, + context_status: str, + requires_log_review: bool = False, + context_pressure_suspicious: bool = False, + completed_sessions: int = 2, + total_sessions: int = 2, + session_throughput_sps: float = 1.0, + benchmark_certification_status: str = "dataset_replay_verified", +): + return { + "benchmark_type": "isb1_replay", + "result_filename": result_filename, + "artifact_stems": { + "processed": f"isb1_{result_filename}", + "raw_replay": f"replay_{result_filename}", + "server_logs": f"server_logs_{result_filename}", + "gpu_metrics": f"gpu_metrics_{result_filename}", + }, + "infmax_model_prefix": model, + "hw": hw, + "framework": framework, + "support_status": support_status, + "effective_max_context_depth": effective_max_context_depth, + "context_pressure_class": context_pressure_class, + "context_pressure_signal": { + "status": context_status, + "requires_log_review": requires_log_review, + }, + "context_pressure_suspicious": context_pressure_suspicious, + "completed_sessions": completed_sessions, + "total_sessions": total_sessions, + "session_throughput_sps": session_throughput_sps, + "benchmark_certification_status": benchmark_certification_status, + } + + +def test_build_gate_report_passes_with_sglang_observability_gap(): + rows = [ + make_row( + result_filename="dsr1_control_b200_vllm", + model="dsr1", + hw="b200-cw-1", + framework="vllm", + support_status="supported", + effective_max_context_depth=9416, + context_pressure_class="standard", + context_status="not_applicable", + ), + make_row( + result_filename="gptoss_control_h100_vllm", + model="gptoss", + hw="h100-cw-1", + framework="vllm", + support_status="supported", + effective_max_context_depth=9416, + context_pressure_class="standard", + context_status="not_applicable", + ), + ] + + for hw in ("b200-cw-1", "h100-cw-1", "h200-cw-1"): + for framework in ("vllm", "sglang"): + rows.append( + make_row( + result_filename=f"qwen_131k_{hw}_{framework}", + model="qwen3.5", + hw=hw, + framework=framework, + support_status="reviewed_preview", + effective_max_context_depth=131272, + 
context_pressure_class="standard", + context_status="not_applicable", + ) + ) + rows.append( + make_row( + result_filename=f"qwen_500k_{hw}_{framework}", + model="qwen3.5", + hw=hw, + framework=framework, + support_status="reviewed_preview", + effective_max_context_depth=524288, + context_pressure_class="extended_500k", + context_status="ok" if framework == "vllm" else "observability_gap", + requires_log_review=framework == "sglang", + ) + ) + + rows.extend( + [ + make_row( + result_filename="qwen_1m_b200_vllm", + model="qwen3.5", + hw="b200-cw-1", + framework="vllm", + support_status="reviewed_preview", + effective_max_context_depth=1048576, + context_pressure_class="extended_1m", + context_status="ok", + ), + make_row( + result_filename="qwen_1m_b200_sglang", + model="qwen3.5", + hw="b200-cw-1", + framework="sglang", + support_status="reviewed_preview", + effective_max_context_depth=1048576, + context_pressure_class="extended_1m", + context_status="observability_gap", + requires_log_review=True, + ), + ] + ) + + report = build_gate_report(rows) + + assert report["overall"] == "pass" + assert all(gate["status"] == "pass" for gate in report["gates"]) + qwen_500k_gate = next(gate for gate in report["gates"] if gate["id"] == "qwen_500k") + assert qwen_500k_gate["review_required_rows"] + assert any( + row["result_filename"] == "qwen_500k_b200-cw-1_sglang" + for row in qwen_500k_gate["review_required_rows"] + ) + + +def test_build_gate_report_fails_control_lane_and_preserves_artifact_refs(): + rows = [ + make_row( + result_filename="dsr1_control_b200_vllm", + model="dsr1", + hw="b200-cw-1", + framework="vllm", + support_status="supported", + effective_max_context_depth=9416, + context_pressure_class="standard", + context_status="not_applicable", + completed_sessions=1, + total_sessions=2, + session_throughput_sps=0.0, + ) + ] + + report = build_gate_report(rows) + + assert report["overall"] == "fail" + control_gate = next(gate for gate in report["gates"] if gate["id"] == "control_lanes") + assert control_gate["status"] == "fail" + assert control_gate["failing_rows"][0]["result_filename"] == "dsr1_control_b200_vllm" + assert control_gate["failing_rows"][0]["artifact_stems"]["server_logs"] == "server_logs_dsr1_control_b200_vllm" + assert "completed_sessions == total_sessions" in control_gate["failing_rows"][0]["failed_criteria"] + assert "session_throughput_sps > 0" in control_gate["failing_rows"][0]["failed_criteria"] + + +def test_build_gate_report_fails_when_qwen_131k_coverage_is_missing(): + rows = [ + make_row( + result_filename="qwen_131k_b200_vllm", + model="qwen3.5", + hw="b200-cw-1", + framework="vllm", + support_status="reviewed_preview", + effective_max_context_depth=131272, + context_pressure_class="standard", + context_status="not_applicable", + ) + ] + + report = build_gate_report(rows) + + assert report["overall"] == "fail" + qwen_131k_gate = next(gate for gate in report["gates"] if gate["id"] == "qwen_131k") + assert qwen_131k_gate["status"] == "fail" + assert ["b200", "sglang"] in qwen_131k_gate["missing_coverage"] + assert ["h200", "vllm"] in qwen_131k_gate["missing_coverage"] + + +def test_build_gate_report_handles_no_rows(): + report = build_gate_report([]) + + assert report["overall"] == "partial" + assert all(gate["status"] == "no_rows" for gate in report["gates"]) + + +def test_gate_main_strict_returns_nonzero_on_failure(tmp_path): + payload = [ + make_row( + result_filename="dsr1_control_b200_vllm", + model="dsr1", + hw="b200-cw-1", + framework="vllm", + 
support_status="supported", + effective_max_context_depth=9416, + context_pressure_class="standard", + context_status="not_applicable", + completed_sessions=1, + total_sessions=2, + ) + ] + report_path = tmp_path / "agg_isb1.json" + report_path.write_text(json.dumps(payload)) + + assert load_rows(report_path)[0]["result_filename"] == "dsr1_control_b200_vllm" + assert main([str(report_path), "--strict"]) == 1 diff --git a/utils/test_process_result.py b/utils/test_process_result.py index 2a6389a78..8bc51d593 100644 --- a/utils/test_process_result.py +++ b/utils/test_process_result.py @@ -47,6 +47,7 @@ def base_env_vars(): "OSL": "1024", "DISAGG": "false", "MODEL_PREFIX": "dsr1", + "IMAGE": "lmsysorg/sglang:v0.4.6.post5-cu126", } @@ -299,6 +300,32 @@ def test_missing_result_file(self, tmp_path, single_node_env_vars): assert result.returncode != 0 + def test_isb1_replay_env_guard(self, tmp_path, sample_benchmark_result, single_node_env_vars): + """ISB1 replay runs should fail fast with a helpful processor redirect.""" + env = single_node_env_vars.copy() + env["BENCHMARK_TYPE"] = "isb1_replay" + + result = run_script(tmp_path, env, sample_benchmark_result) + + assert result.returncode != 0 + assert "Use utils/process_result_isb1.py instead" in result.stderr + + def test_isb1_replay_payload_guard(self, tmp_path, single_node_env_vars): + """Replay-shaped payloads should be rejected even without BENCHMARK_TYPE set.""" + replay_like_result = { + "model_id": "test-model", + "max_concurrency": 4, + "aggregate_metrics": { + "total_token_throughput_tps": 1000.0, + "output_throughput_tps": 800.0, + }, + } + + result = run_script(tmp_path, single_node_env_vars, replay_like_result) + + assert result.returncode != 0 + assert "Detected an ISB1 replay-style result payload" in result.stderr + # ============================================================================= # Test latency and throughput calculations diff --git a/utils/test_process_result_isb1.py b/utils/test_process_result_isb1.py new file mode 100644 index 000000000..f2a4f06fb --- /dev/null +++ b/utils/test_process_result_isb1.py @@ -0,0 +1,1006 @@ +import json +import subprocess +import sys +from pathlib import Path + +import pytest + +SCRIPT_PATH = Path(__file__).parent / "process_result_isb1.py" + + +def write_export_fixture(tmp_path: Path, relative_path: str, payload: dict) -> str: + export_path = tmp_path / relative_path + export_path.parent.mkdir(parents=True, exist_ok=True) + export_path.write_text(json.dumps(payload)) + return str(export_path.relative_to(tmp_path)) + + +@pytest.fixture +def sample_replay_result(): + return { + "model_id": "deepseek-ai/DeepSeek-R1-0528", + "mode": "export_replay", + "max_concurrency": 8, + "num_sessions": 2, + "max_turns": 4, + "num_warmup_sessions": 1, + "harness_request_mode": "auto", + "selection": { + "adapter_id": "inferencex_multiturn", + "selected_sessions": 2, + "runtime_stack_ids": ["vllm-0.8.5-h200"], + "hardware_profile_ids": ["h200-8gpu"], + "canonical_model_ids": ["deepseek-r1-0528"], + "support_statuses": ["supported"], + "support_status_counts": {"supported": 2}, + "benchmark_certification_statuses": ["dataset_replay_verified"], + "benchmark_certification_status_counts": { + "dataset_replay_verified": 2 + }, + "request_mode_mix": {"chat": 2}, + }, + "server_metrics_summary": { + "cache_usage_avg": 0.45, + "cache_hit_rate_avg": 0.15, + "gpu_cache_usage_avg": 0.45, + "gpu_cache_usage_peak": 0.78, + "gpu_cache_metric_name": "vllm:gpu_cache_usage_perc", + "cpu_cache_usage_avg": 0.12, + 
"cpu_cache_usage_peak": 0.31, + "cpu_cache_metric_name": "vllm:cpu_cache_usage_perc", + "cpu_cache_metric_available": True, + "observability_status": "direct_cpu_cache_metric", + "kv_offload_observed": True, + "samples": 5, + }, + "per_turn_metrics": { + "turn_1": { + "completed": 2, + "mean_context_len": 8192.0, + "mean_ttft_ms": 180.0, + "p99_ttft_ms": 300.0, + "mean_e2el_ms": 1000.0, + } + }, + "aggregate_metrics": { + "completed_sessions": 2, + "total_sessions": 2, + "total_input_tokens": 1000, + "total_output_tokens": 300, + "total_wall_time_s": 2.0, + "session_throughput_sps": 1.0, + "output_throughput_tps": 150.0, + "total_token_throughput_tps": 650.0, + "mean_ttft_ms": 200.0, + "median_ttft_ms": 180.0, + "p99_ttft_ms": 500.0, + "mean_tpot_ms": 20.0, + "median_tpot_ms": 25.0, + "p99_tpot_ms": 50.0, + "mean_e2el_ms": 1200.0, + "median_e2el_ms": 1100.0, + "p99_e2el_ms": 2000.0, + }, + } + + +@pytest.fixture +def base_env(): + return { + "RUNNER_TYPE": "h200-cw-1", + "FRAMEWORK": "vllm", + "PRECISION": "fp8", + "RESULT_FILENAME": "isb1_result", + "MODEL_PREFIX": "dsr1", + "IMAGE": "vllm/vllm-openai:v0.8.5", + "TP": "8", + "EP_SIZE": "1", + "DP_ATTENTION": "false", + "BENCHMARK_TYPE": "isb1_replay", + "EXPORT_FILE": "datasets/isb1/exports/core/chat_8k1k.json", + "RUNTIME_STACK_ID": "vllm-0.8.5-h200", + "HARDWARE_PROFILE_ID": "h200-8gpu", + "CANONICAL_MODEL_ID": "deepseek-r1-0528", + "SUPPORT_STATUS": "supported", + "REQUEST_MODE": "multi-turn", + "MAX_CONCURRENCY": "8", + "SPEC_DECODING": "none", + "IGNORE_WAITS": "true", + "GITHUB_REF": "refs/heads/test-isb1-traceability", + } + + +def run_script(tmp_path, env, replay_result, result_filename="isb1_result"): + result_file = tmp_path / f"{result_filename}.json" + result_file.write_text(json.dumps(replay_result)) + + env = env.copy() + env["RESULT_FILENAME"] = result_filename + + return subprocess.run( + [sys.executable, str(SCRIPT_PATH)], + cwd=tmp_path, + env=env, + capture_output=True, + text=True, + ) + + +def assert_traceability_fields( + output_data: dict, result_filename: str, dispatch_ref: str = "refs/heads/test-isb1-traceability" +): + assert output_data["result_filename"] == result_filename + assert output_data["artifact_stems"] == { + "processed": f"isb1_{result_filename}", + "raw_replay": f"replay_{result_filename}", + "server_logs": f"server_logs_{result_filename}", + "gpu_metrics": f"gpu_metrics_{result_filename}", + } + assert output_data["dispatch_ref"] == dispatch_ref + + +def test_isb1_replay_processing(tmp_path, sample_replay_result, base_env): + export_file = write_export_fixture( + tmp_path, + "datasets/isb1/exports/core/chat_8k1k.json", + { + "adapter_id": "inferencex_multiturn", + "bundle_id": "bundle-core-chat", + "surface": "chat", + "exports": [ + { + "trace_id": "trace-1", + "runtime_stack_id": "vllm-0.8.5-h200", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "deepseek-r1-0528", + "support_status": "supported", + } + ], + }, + ) + env = base_env.copy() + env["EXPORT_FILE"] = export_file + + result = run_script(tmp_path, env, sample_replay_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + + assert output_data["benchmark_type"] == "isb1_replay" + assert output_data["request_mode"] == "multi-turn" + assert output_data["harness_request_mode"] == "auto" + assert output_data["isl"] == 8192 + assert output_data["osl"] == 1024 + assert output_data["export_lane"] == "core" + assert output_data["benchmark_surface"] == "chat" + assert 
output_data["support_status"] == "supported" + assert output_data["benchmark_certification_status"] == "dataset_replay_verified" + assert output_data["effective_max_context_depth"] == 8192 + 1024 + 200 + assert output_data["context_pressure_class"] == "standard" + assert output_data["context_pressure_signal"]["status"] == "not_applicable" + assert output_data["context_pressure_suspicious"] is False + assert output_data["completed_sessions"] == 2 + assert output_data["session_throughput_sps"] == pytest.approx(1.0) + assert output_data["tput_per_gpu"] == pytest.approx(650.0 / 8) + assert output_data["output_tput_per_gpu"] == pytest.approx(150.0 / 8) + assert output_data["input_tput_per_gpu"] == pytest.approx((650.0 - 150.0) / 8) + assert output_data["median_ttft"] == pytest.approx(0.18) + assert output_data["median_intvty"] == pytest.approx(40.0) + assert output_data["median_e2el"] == pytest.approx(1.1) + assert output_data["kv_offload_observed"] is True + assert output_data["peak_gpu_cache_usage"] == pytest.approx(0.78) + assert output_data["peak_cpu_cache_usage"] == pytest.approx(0.31) + assert output_data["selection"]["request_mode_mix"] == {"chat": 2} + assert output_data["selection"]["support_status_counts"] == {"supported": 2} + assert output_data["per_turn_metrics"]["turn_1"]["completed"] == 2 + assert output_data["runtime_overrides"] == { + "vllm_cpu_offload_gb": None, + "vllm_swap_space_gb": None, + "sglang_mem_fraction_override": None, + "sglang_chunked_prefill_override": None, + } + assert_traceability_fields(output_data, "isb1_result") + + output_file = tmp_path / "agg_isb1_result.json" + assert output_file.exists() + persisted_output = json.loads(output_file.read_text()) + assert_traceability_fields(persisted_output, "isb1_result") + + +def test_offload_mode_env_propagation(tmp_path, sample_replay_result, base_env): + export_file = write_export_fixture( + tmp_path, + "datasets/isb1/exports/core/chat_8k1k.json", + { + "adapter_id": "inferencex_multiturn", + "surface": "chat", + "exports": [ + { + "trace_id": "trace-1", + "runtime_stack_id": "vllm-0.8.5-h200", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "deepseek-r1-0528", + "support_status": "supported", + } + ], + }, + ) + env = base_env.copy() + env["EXPORT_FILE"] = export_file + env["OFFLOAD_MODE"] = "noprefix" + env["KV_CACHE_DTYPE"] = "fp8" + env["DISABLE_PREFIX_CACHING"] = "true" + + result = run_script(tmp_path, env, sample_replay_result, result_filename="isb1_offload_env") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["offload_mode"] == "noprefix" + assert output_data["kv_cache_dtype"] == "fp8" + assert output_data["disable_prefix_caching"] is True + + +def test_support_status_mismatch_fails(tmp_path, sample_replay_result, base_env): + export_file = write_export_fixture( + tmp_path, + "datasets/isb1/exports/core/chat_8k1k.json", + { + "adapter_id": "inferencex_multiturn", + "surface": "chat", + "exports": [ + { + "trace_id": "trace-1", + "runtime_stack_id": "vllm-0.8.5-h200", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "deepseek-r1-0528", + "support_status": "supported", + } + ], + }, + ) + replay_result = { + **sample_replay_result, + "selection": { + **sample_replay_result["selection"], + "support_statuses": ["supported"], + "support_status_counts": {"supported": 2}, + }, + } + env = base_env.copy() + env["EXPORT_FILE"] = export_file + env["SUPPORT_STATUS"] = "reviewed_preview" + + result = 
run_script(tmp_path, env, replay_result, result_filename="isb1_mismatch") + assert result.returncode != 0 + assert "support-status mismatch" in result.stderr + + +def test_certification_status_mismatch_fails(tmp_path, sample_replay_result, base_env): + export_file = write_export_fixture( + tmp_path, + "datasets/isb1/exports/core/chat_8k1k.json", + { + "adapter_id": "inferencex_multiturn", + "surface": "chat", + "exports": [ + { + "trace_id": "trace-1", + "runtime_stack_id": "vllm-0.8.5-h200", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "deepseek-r1-0528", + "support_status": "supported", + "benchmark_certification_status": "dataset_replay_verified", + } + ], + }, + ) + replay_result = { + **sample_replay_result, + "selection": { + **sample_replay_result["selection"], + "benchmark_certification_statuses": ["pending_review"], + "benchmark_certification_status_counts": {"pending_review": 2}, + }, + } + env = base_env.copy() + env["EXPORT_FILE"] = export_file + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_cert_mismatch") + assert result.returncode != 0 + assert "benchmark-certification mismatch" in result.stderr + + +def test_missing_required_env_vars_fails(tmp_path, sample_replay_result): + result_file = tmp_path / "isb1_result.json" + result_file.write_text(json.dumps(sample_replay_result)) + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH)], + cwd=tmp_path, + env={"PATH": "/usr/bin", "RESULT_FILENAME": "isb1_result"}, + capture_output=True, + text=True, + ) + + assert result.returncode != 0 + assert "Missing required environment variables" in result.stderr + + +def test_dispatch_ref_prefers_explicit_override(tmp_path, sample_replay_result, base_env): + export_file = write_export_fixture( + tmp_path, + "datasets/isb1/exports/core/chat_8k1k.json", + { + "adapter_id": "inferencex_multiturn", + "bundle_id": "bundle-core-chat", + "surface": "chat", + "exports": [ + { + "trace_id": "trace-1", + "runtime_stack_id": "vllm-0.8.5-h200", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "deepseek-r1-0528", + "support_status": "supported", + } + ], + }, + ) + env = base_env.copy() + env["EXPORT_FILE"] = export_file + env["DISPATCH_REF"] = "refs/tags/isb1-dispatch-override" + + result = run_script(tmp_path, env, sample_replay_result, result_filename="isb1_dispatch_override") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert_traceability_fields( + output_data, + "isb1_dispatch_override", + dispatch_ref="refs/tags/isb1-dispatch-override", + ) + + +def test_preview_offload_core_processing(tmp_path, sample_replay_result, base_env): + preview_export = ( + write_export_fixture( + tmp_path, + "datasets/isb1/exports/preview/offload_core/" + "inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json", + { + "adapter_id": "inferencex_multiturn", + "profile_id": "chat_hopper_blackwell_offload_core_v1", + "duration_tier": "smoke", + "adapter_surface": "chat", + "tier": "reviewed_preview", + "adapter_support_status": "reviewed_preview", + "exports": [ + { + "context_band": "lc1_8k_16k", + }, + { + "context_band": "lc3_96k_128k", + }, + ], + "producer_handoff_metadata": { + "class": "phase_2_offload_core_preview", + "claim_boundary": "Not blanket certification.", + }, + }, + ) + ) + + env = base_env.copy() + env["EXPORT_FILE"] = preview_export + env["SUPPORT_STATUS"] = "reviewed_preview" + env["MAX_MODEL_LEN"] = "131272" + replay_result = { + 
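+ # The env above pins SUPPORT_STATUS=reviewed_preview, so the selection override
+ # below must report the same tier or the processor's mismatch guard would fire.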
**sample_replay_result, + "selection": { + **sample_replay_result["selection"], + "support_statuses": ["reviewed_preview"], + "support_status_counts": {"reviewed_preview": 2}, + }, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_preview") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["export_lane"] == "preview/offload_core" + assert output_data["benchmark_surface"] == "chat" + assert output_data["profile_id"] == "chat_hopper_blackwell_offload_core_v1" + assert output_data["duration_tier"] == "smoke" + assert output_data["context_bands"] == ["lc1_8k_16k", "lc3_96k_128k"] + assert output_data["producer_handoff_class"] == "phase_2_offload_core_preview" + assert output_data["support_status"] == "reviewed_preview" + assert output_data["isl"] == 0 + assert output_data["osl"] == 0 + assert_traceability_fields(output_data, "isb1_preview") + + +def test_qwen_500k_preview_processing_preserves_served_shape_and_context_band( + tmp_path, sample_replay_result, base_env +): + preview_export = write_export_fixture( + tmp_path, + "datasets/isb1/exports/preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json", + { + "adapter_id": "inferencex_trace_replay", + "bundle_id": "isb1_preview_long_context_500k_vllm_code_xlc2_qwen3_5", + "profile_id": "coding_qwen3.5_xlc2_500k_preview_v1", + "duration_tier": "standard", + "surface": "code", + "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, + "tier": "reviewed_preview", + "adapter_support_status": "reviewed_preview", + "producer_handoff_metadata": { + "class": "bounded_500k_class", + "claim_boundary": "Replay-derived 500k preview only.", + }, + "exports": [ + { + "context_band": "xlc2_384k_512k", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + }, + { + "context_band": "xlc2_384k_512k", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:h100_sxm_80gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + }, + { + "context_band": "xlc2_384k_512k", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:h200_sxm_141gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + }, + ], + }, + ) + + env = base_env.copy() + env.update( + { + "RUNNER_TYPE": "b200-cw-1", + "FRAMEWORK": "vllm", + "MODEL_PREFIX": "qwen3.5", + "IMAGE": "vllm/vllm-openai:v0.8.5", + "EXPORT_FILE": preview_export, + "RUNTIME_STACK_ID": "standalone:vllm", + "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", + "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", + "SUPPORT_STATUS": "reviewed_preview", + "MAX_MODEL_LEN": "524288", + "VLLM_CPU_OFFLOAD_GB": "120", + "VLLM_SWAP_SPACE_GB": "24", + } + ) + replay_result = { + **sample_replay_result, + "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", + "vllm_cpu_offload_gb": "128", + "vllm_swap_space_gb": "32", + "selection": { + **sample_replay_result["selection"], + "runtime_stack_ids": ["standalone:vllm"], + "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], + "canonical_model_ids": 
["qwen3_5_397b_a17b"], + "support_statuses": ["reviewed_preview"], + "support_status_counts": {"reviewed_preview": 3}, + "request_mode_mix": {"code": 3}, + }, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["export_lane"] == "preview/long_context_500k" + assert output_data["benchmark_surface"] == "code" + assert output_data["profile_id"] == "coding_qwen3.5_xlc2_500k_preview_v1" + assert output_data["context_bands"] == ["xlc2_384k_512k"] + assert output_data["producer_handoff_class"] == "bounded_500k_class" + assert output_data["support_status"] == "reviewed_preview" + assert output_data["benchmark_certification_status"] == "dataset_replay_verified" + assert output_data["isl"] == 131072 + assert output_data["osl"] == 1024 + assert output_data["max_model_len"] == 524288 + assert output_data["effective_max_context_depth"] == 524288 + assert output_data["context_pressure_class"] == "extended_500k" + assert output_data["context_pressure_signal"]["status"] == "ok" + assert output_data["context_pressure_suspicious"] is False + assert output_data["kv_offload_observed"] is True + assert output_data["runtime_overrides"] == { + "vllm_cpu_offload_gb": "128", + "vllm_swap_space_gb": "32", + "sglang_mem_fraction_override": None, + "sglang_chunked_prefill_override": None, + } + assert_traceability_fields(output_data, "isb1_qwen_500k") + + +def test_qwen_1m_preview_processing_preserves_8k_served_shape_and_offload_metadata( + tmp_path, sample_replay_result, base_env +): + preview_export = write_export_fixture( + tmp_path, + "datasets/isb1/exports/preview/long_context_1m/" + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json", + { + "adapter_id": "inferencex_trace_replay", + "bundle_id": "isb1_preview_long_context_1m_vllm_code_ulc2_qwen3_5", + "profile_id": "coding_qwen3.5_ulc2_1m_preview_v1", + "duration_tier": "standard", + "surface": "code", + "served_shape": {"shape_family": "8k1k", "isl": 8192, "osl": 1024}, + "tier": "reviewed_preview", + "adapter_support_status": "reviewed_preview", + "producer_handoff_metadata": { + "class": "bounded_1m_class", + "claim_boundary": "Manual 1M preview only.", + }, + "exports": [ + { + "context_band": "ulc2_1m_plus", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + } + ], + }, + ) + + env = base_env.copy() + env.update( + { + "RUNNER_TYPE": "b200-cw-1", + "FRAMEWORK": "vllm", + "MODEL_PREFIX": "qwen3.5", + "IMAGE": "vllm/vllm-openai:v0.8.5", + "EXPORT_FILE": preview_export, + "RUNTIME_STACK_ID": "standalone:vllm", + "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", + "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", + "SUPPORT_STATUS": "reviewed_preview", + "MAX_MODEL_LEN": "1048576", + "MAX_SESSIONS": "1", + "MAX_TURNS_PER_SESSION": "3", + } + ) + replay_result = { + **sample_replay_result, + "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", + "selection": { + **sample_replay_result["selection"], + "runtime_stack_ids": ["standalone:vllm"], + "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], + "canonical_model_ids": ["qwen3_5_397b_a17b"], + "support_statuses": ["reviewed_preview"], + "support_status_counts": {"reviewed_preview": 1}, + "request_mode_mix": {"code": 1}, + }, + 
} + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_1m") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["export_lane"] == "preview/long_context_1m" + assert output_data["benchmark_surface"] == "code" + assert output_data["profile_id"] == "coding_qwen3.5_ulc2_1m_preview_v1" + assert output_data["context_bands"] == ["ulc2_1m_plus"] + assert output_data["producer_handoff_class"] == "bounded_1m_class" + assert output_data["support_status"] == "reviewed_preview" + assert output_data["benchmark_certification_status"] == "dataset_replay_verified" + assert output_data["isl"] == 8192 + assert output_data["osl"] == 1024 + assert output_data["max_model_len"] == 1048576 + assert output_data["effective_max_context_depth"] == 1048576 + assert output_data["context_pressure_class"] == "extended_1m" + assert output_data["context_pressure_signal"]["status"] == "ok" + assert output_data["context_pressure_suspicious"] is False + assert output_data["max_sessions"] == 1 + assert output_data["max_turns_per_session"] == 3 + assert output_data["kv_offload_observed"] is True + assert_traceability_fields(output_data, "isb1_qwen_1m") + + +def test_context_pressure_warning_on_high_context_without_cpu_cache( + tmp_path, sample_replay_result, base_env +): + preview_export = write_export_fixture( + tmp_path, + "datasets/isb1/exports/preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json", + { + "adapter_id": "inferencex_trace_replay", + "bundle_id": "isb1_preview_long_context_500k_vllm_code_xlc2_qwen3_5", + "profile_id": "coding_qwen3.5_xlc2_500k_preview_v1", + "duration_tier": "standard", + "surface": "code", + "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, + "tier": "reviewed_preview", + "adapter_support_status": "reviewed_preview", + "exports": [ + { + "context_band": "xlc2_384k_512k", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + } + ], + }, + ) + + env = base_env.copy() + env.update( + { + "RUNNER_TYPE": "b200-cw-1", + "FRAMEWORK": "vllm", + "MODEL_PREFIX": "qwen3.5", + "IMAGE": "vllm/vllm-openai:v0.8.5", + "EXPORT_FILE": preview_export, + "RUNTIME_STACK_ID": "standalone:vllm", + "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", + "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", + "SUPPORT_STATUS": "reviewed_preview", + "MAX_MODEL_LEN": "524288", + } + ) + replay_result = { + **sample_replay_result, + "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", + "selection": { + **sample_replay_result["selection"], + "runtime_stack_ids": ["standalone:vllm"], + "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], + "canonical_model_ids": ["qwen3_5_397b_a17b"], + "support_statuses": ["reviewed_preview"], + "support_status_counts": {"reviewed_preview": 1}, + "request_mode_mix": {"code": 1}, + }, + "server_metrics_summary": { + "cache_usage_avg": 0.45, + "cache_hit_rate_avg": 0.15, + "gpu_cache_usage_avg": 0.45, + "gpu_cache_usage_peak": 0.91, + "gpu_cache_metric_name": "vllm:gpu_cache_usage_perc", + "cpu_cache_usage_avg": 0.0, + "cpu_cache_usage_peak": 0.0, + "cpu_cache_metric_name": "vllm:cpu_cache_usage_perc", + "cpu_cache_metric_available": True, + "observability_status": "direct_cpu_cache_metric", + "kv_offload_observed": False, + 
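+ # A 0.91 GPU cache peak with zero CPU cache movement is the "suspicious"
+ # shape this test expects the processor to flag.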
"samples": 5, + }, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k_warn") + assert result.returncode == 0, f"Script failed: {result.stderr}" + assert "saw no CPU cache usage" in result.stderr + + output_data = json.loads(result.stdout) + assert output_data["context_pressure_signal"]["status"] == "suspicious" + assert output_data["context_pressure_suspicious"] is True + assert_traceability_fields(output_data, "isb1_qwen_500k_warn") + + +def test_context_pressure_signal_marks_sglang_observability_gap( + tmp_path, sample_replay_result, base_env +): + preview_export = write_export_fixture( + tmp_path, + "datasets/isb1/exports/preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json", + { + "adapter_id": "inferencex_trace_replay", + "bundle_id": "isb1_preview_long_context_500k_sglang_code_xlc2_qwen3_5", + "profile_id": "coding_qwen3.5_xlc2_500k_preview_v1", + "duration_tier": "standard", + "surface": "code", + "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, + "tier": "reviewed_preview", + "adapter_support_status": "reviewed_preview", + "exports": [ + { + "context_band": "xlc2_384k_512k", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:sglang", + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + } + ], + }, + ) + + env = base_env.copy() + env.update( + { + "RUNNER_TYPE": "b200-cw-1", + "FRAMEWORK": "sglang", + "MODEL_PREFIX": "qwen3.5", + "IMAGE": "lmsysorg/sglang:v0.5.9-cu130", + "EXPORT_FILE": preview_export, + "RUNTIME_STACK_ID": "standalone:sglang", + "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", + "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", + "SUPPORT_STATUS": "reviewed_preview", + "MAX_MODEL_LEN": "524288", + "SGLANG_MEM_FRACTION_OVERRIDE": "0.77", + "SGLANG_CHUNKED_PREFILL_OVERRIDE": "65536", + } + ) + replay_result = { + **sample_replay_result, + "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", + "selection": { + **sample_replay_result["selection"], + "runtime_stack_ids": ["standalone:sglang"], + "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], + "canonical_model_ids": ["qwen3_5_397b_a17b"], + "support_statuses": ["reviewed_preview"], + "support_status_counts": {"reviewed_preview": 1}, + "request_mode_mix": {"code": 1}, + }, + "server_metrics_summary": { + "cache_usage_avg": 0.52, + "cache_hit_rate_avg": 0.23, + "gpu_cache_usage_avg": 0.52, + "gpu_cache_usage_peak": 0.88, + "gpu_cache_metric_name": "sglang:token_usage", + "cpu_cache_usage_avg": 0.0, + "cpu_cache_usage_peak": 0.0, + "cpu_cache_metric_name": None, + "cpu_cache_metric_available": False, + "observability_status": "indirect_without_cpu_cache_metric", + "kv_offload_observed": False, + "samples": 5, + }, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k_sglang") + assert result.returncode == 0, f"Script failed: {result.stderr}" + assert "lacks a direct CPU cache metric" in result.stderr + + output_data = json.loads(result.stdout) + assert output_data["context_pressure_signal"]["status"] == "observability_gap" + assert output_data["context_pressure_signal"]["requires_log_review"] is True + assert output_data["context_pressure_suspicious"] is False + assert output_data["runtime_overrides"] == { + "vllm_cpu_offload_gb": None, + "vllm_swap_space_gb": None, + "sglang_mem_fraction_override": "0.77", + 
"sglang_chunked_prefill_override": "65536", + } + assert_traceability_fields(output_data, "isb1_qwen_500k_sglang") + + +def test_depth_coverage_ratio_for_500k_preview(tmp_path, base_env, sample_replay_result): + """Verify depth coverage ratio and class for a 500k preview with 131k actual tokens.""" + export_payload = { + "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, + "surface": "code", + "exports": [ + { + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "qwen3_5_397b_a17b", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "context_band": "xlc2_384k_512k", + "trace_metadata": { + "estimated_kv_bytes_peak": 27294647296, + "context_pressure_profile": { + "expected_offload_mode": "soft_offload", + }, + "expected_offload_mode": "soft_offload", + }, + } + ], + } + export_file = write_export_fixture( + tmp_path, "datasets/isb1/exports/preview/long_context_500k/test_500k.json", export_payload + ) + + env = base_env.copy() + env["EXPORT_FILE"] = export_file + env["MODEL_PREFIX"] = "qwen3.5" + env["CANONICAL_MODEL_ID"] = "qwen3_5_397b_a17b" + env["SUPPORT_STATUS"] = "reviewed_preview" + env["MAX_MODEL_LEN"] = "524288" + env["FRAMEWORK"] = "vllm" + + replay_result = sample_replay_result.copy() + replay_result["selection"] = { + **replay_result["selection"], + "support_statuses": ["reviewed_preview"], + } + replay_result["server_metrics_summary"] = { + "gpu_cache_usage_avg": 0.35, + "gpu_cache_usage_peak": 0.42, + "cpu_cache_usage_avg": 0.15, + "cpu_cache_usage_peak": 0.25, + "cpu_cache_metric_available": True, + "observability_status": "direct_cpu_cache_metric", + "kv_offload_observed": True, + "samples": 10, + } + replay_result["depth_telemetry"] = { + "total_estimated_input_tokens": 500000, + "total_actual_input_tokens": 131072, + "max_actual_context_len_per_turn": 131072, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k_depth") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + + # Depth coverage ratio: 131072 / 524288 ≈ 0.25 + assert output_data["depth_coverage_ratio"] is not None + assert 0.24 < output_data["depth_coverage_ratio"] < 0.26 + assert output_data["depth_coverage_class"] == "bounded_preview" + assert output_data["max_actual_context_len_per_turn"] == 131072 + assert output_data["depth_gap_tokens"] == 524288 - 131072 + + # Producer expectation validation + assert output_data["producer_estimated_kv_bytes_peak"] == 27294647296 + assert output_data["producer_expected_offload_mode"] == "soft_offload" + assert output_data["producer_expectation_validation"]["offload_mode_match"] is True + assert output_data["producer_expectation_validation"]["depth_exercised"] is False + + # Preemption count + assert output_data["preemption_count"] == 0 + + +def test_depth_mismatch_warning_for_configuration_only(tmp_path, base_env, sample_replay_result): + """Verify depth_mismatch status when actual context is <10% of configured.""" + export_payload = { + "served_shape": {"shape_family": "8k1k", "isl": 8192, "osl": 1024}, + "surface": "code", + "exports": [ + { + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "qwen3_5_397b_a17b", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "context_band": "ulc2_1m_plus", + "trace_metadata": { + "estimated_kv_bytes_peak": 
39500000000, + "expected_offload_mode": "hard_offload", + }, + } + ], + } + export_file = write_export_fixture( + tmp_path, "datasets/isb1/exports/preview/long_context_1m/test_1m.json", export_payload + ) + + env = base_env.copy() + env["EXPORT_FILE"] = export_file + env["MODEL_PREFIX"] = "qwen3.5" + env["CANONICAL_MODEL_ID"] = "qwen3_5_397b_a17b" + env["SUPPORT_STATUS"] = "reviewed_preview" + env["MAX_MODEL_LEN"] = "1048576" + env["FRAMEWORK"] = "vllm" + + replay_result = sample_replay_result.copy() + replay_result["selection"] = { + **replay_result["selection"], + "support_statuses": ["reviewed_preview"], + } + replay_result["server_metrics_summary"] = { + "gpu_cache_usage_avg": 0.10, + "gpu_cache_usage_peak": 0.15, + "cpu_cache_usage_avg": 0.05, + "cpu_cache_usage_peak": 0.10, + "cpu_cache_metric_available": True, + "observability_status": "direct_cpu_cache_metric", + "kv_offload_observed": True, + "samples": 5, + } + # 1M preview sends only 8k actual tokens + replay_result["depth_telemetry"] = { + "total_estimated_input_tokens": 1600000, + "total_actual_input_tokens": 8192, + "max_actual_context_len_per_turn": 8192, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_1m_depth") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + + # 8192 / 1048576 ≈ 0.0078 — less than 0.1 threshold + assert output_data["depth_coverage_ratio"] < 0.01 + assert output_data["depth_coverage_class"] == "configuration_only" + assert output_data["context_pressure_signal"]["status"] == "depth_mismatch" + assert output_data["context_pressure_signal"]["reason"] == "configured_depth_not_exercised" + assert "depth_coverage_ratio" in output_data["context_pressure_signal"] + assert "configured for" in result.stderr + + +def test_producer_expectation_offload_mismatch(tmp_path, base_env, sample_replay_result): + """Verify producer expectation validation when offload is expected but not observed.""" + export_payload = { + "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, + "surface": "code", + "exports": [ + { + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "gpt_oss_120b", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "context_band": "xlc2_384k_512k", + "trace_metadata": { + "estimated_kv_bytes_peak": 27000000000, + "context_pressure_profile": { + "expected_offload_mode": "hard_offload", + }, + }, + } + ], + } + export_file = write_export_fixture( + tmp_path, "datasets/isb1/exports/preview/long_context_500k/test_mismatch.json", export_payload + ) + + env = base_env.copy() + env["EXPORT_FILE"] = export_file + env["MODEL_PREFIX"] = "gptoss" + env["CANONICAL_MODEL_ID"] = "gpt_oss_120b" + env["SUPPORT_STATUS"] = "reviewed_preview" + env["MAX_MODEL_LEN"] = "524288" + + replay_result = sample_replay_result.copy() + replay_result["selection"] = { + **replay_result["selection"], + "support_statuses": ["reviewed_preview"], + } + replay_result["server_metrics_summary"] = { + "gpu_cache_usage_avg": 0.50, + "gpu_cache_usage_peak": 0.60, + "cpu_cache_usage_avg": 0.0, + "cpu_cache_usage_peak": 0.0, + "cpu_cache_metric_available": True, + "observability_status": "direct_cpu_cache_metric", + "kv_offload_observed": False, + "samples": 10, + } + replay_result["depth_telemetry"] = { + "total_estimated_input_tokens": 400000, + "total_actual_input_tokens": 131072, + "max_actual_context_len_per_turn": 131072, + } + 
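+ # The summary above reports kv_offload_observed=False while the producer expected
+ # hard_offload, so offload_mode_match should come back False without failing the run.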
+ result = run_script(tmp_path, env, replay_result, result_filename="isb1_mismatch") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + + # Producer expected hard_offload, but kv_offload_observed is False + assert output_data["producer_expectation_validation"]["offload_mode_match"] is False + assert output_data["producer_expected_offload_mode"] == "hard_offload" + assert output_data["kv_offload_observed"] is False diff --git a/utils/test_summarize_isb1.py b/utils/test_summarize_isb1.py new file mode 100644 index 000000000..3f4320594 --- /dev/null +++ b/utils/test_summarize_isb1.py @@ -0,0 +1,105 @@ +import json +from pathlib import Path + +from summarize_isb1 import generate_summary + + +def write_result(path: Path, payload: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload)) + + +def make_row(**overrides): + row = { + "benchmark_type": "isb1_replay", + "result_filename": "isb1_control_vllm_b200", + "artifact_stems": { + "processed": "isb1_isb1_control_vllm_b200", + "raw_replay": "replay_isb1_control_vllm_b200", + "server_logs": "server_logs_isb1_control_vllm_b200", + "gpu_metrics": "gpu_metrics_isb1_control_vllm_b200", + }, + "dispatch_ref": "refs/heads/test-summary", + "infmax_model_prefix": "dsr1", + "hw": "b200-cw-1", + "framework": "vllm", + "support_status": "supported", + "benchmark_certification_status": "dataset_replay_verified", + "effective_max_context_depth": 9416, + "context_pressure_class": "standard", + "context_pressure_signal": { + "status": "not_applicable", + "requires_log_review": False, + }, + "context_pressure_suspicious": False, + "completed_sessions": 2, + "total_sessions": 2, + "session_throughput_sps": 1.25, + "median_ttft": 0.18, + "kv_offload_observed": True, + "peak_gpu_cache_usage": 0.78, + "peak_cpu_cache_usage": 0.31, + "runtime_overrides": { + "vllm_cpu_offload_gb": None, + "vllm_swap_space_gb": None, + "sglang_mem_fraction_override": None, + "sglang_chunked_prefill_override": None, + }, + } + row.update(overrides) + return row + + +def test_generate_summary_surfaces_lane_override_and_action_sections(tmp_path): + control_row = make_row() + review_row = make_row( + result_filename="isb1_qwen_500k_sglang", + artifact_stems={ + "processed": "isb1_isb1_qwen_500k_sglang", + "raw_replay": "replay_isb1_qwen_500k_sglang", + "server_logs": "server_logs_isb1_qwen_500k_sglang", + "gpu_metrics": "gpu_metrics_isb1_qwen_500k_sglang", + }, + infmax_model_prefix="qwen3.5", + hw="h200-cw-1", + framework="sglang", + support_status="reviewed_preview", + effective_max_context_depth=524288, + context_pressure_class="extended_500k", + context_pressure_signal={ + "status": "observability_gap", + "requires_log_review": True, + }, + runtime_overrides={ + "vllm_cpu_offload_gb": None, + "vllm_swap_space_gb": None, + "sglang_mem_fraction_override": "0.77", + "sglang_chunked_prefill_override": "65536", + }, + kv_offload_observed=False, + peak_gpu_cache_usage=0.88, + peak_cpu_cache_usage=0.0, + ) + non_isb1_row = {"benchmark_type": "throughput", "ignored": True} + + write_result(tmp_path / "results" / "control.json", control_row) + write_result(tmp_path / "results" / "review.json", review_row) + write_result(tmp_path / "results" / "non_isb1.json", non_isb1_row) + + summary = generate_summary(tmp_path / "results") + + assert "## ISB1 Operator Summary" in summary + assert "### Lane Summary" in summary + assert "### Runtime Overrides" in summary + assert "### Action Items" in 
summary + assert "isb1_qwen_500k_sglang" in summary + assert "observability_gap" in summary + assert "65536" in summary + assert "server_logs_isb1_qwen_500k_sglang" in summary + assert "non_isb1" not in summary + + +def test_generate_summary_handles_empty_results(tmp_path): + summary = generate_summary(tmp_path / "results") + assert "No ISB1 replay rows found." in summary + assert "Lane Summary" not in summary diff --git a/utils/test_verify_producer_sync.py b/utils/test_verify_producer_sync.py new file mode 100644 index 000000000..ba42c8586 --- /dev/null +++ b/utils/test_verify_producer_sync.py @@ -0,0 +1,64 @@ +import json +import subprocess +import sys +from pathlib import Path + +SCRIPT_PATH = Path(__file__).parent / "verify_producer_sync.py" + + +RELEVANT_FILES = { + "extension_131k/sglang/code_131k1k_qwen3.5.json": {"name": "e131k"}, + "preview/long_context_500k/manifest_qwen3.5.json": {"name": "500k"}, + "preview/long_context_1m/manifest.json": {"name": "1m"}, +} + + +def _write_tree(root: Path, files: dict[str, dict]) -> None: + for relative_path, payload in files.items(): + file_path = root / relative_path + file_path.parent.mkdir(parents=True, exist_ok=True) + file_path.write_text(json.dumps(payload, sort_keys=True)) + + +def _run_verify(producer_root: Path, consumer_root: Path) -> subprocess.CompletedProcess[str]: + return subprocess.run( + [ + sys.executable, + str(SCRIPT_PATH), + "--producer-root", + str(producer_root), + "--consumer-root", + str(consumer_root), + ], + capture_output=True, + text=True, + check=False, + ) + + +def test_verify_producer_sync_passes_for_identical_trees(tmp_path: Path) -> None: + producer_root = tmp_path / "producer" + consumer_root = tmp_path / "consumer" + _write_tree(producer_root, RELEVANT_FILES) + _write_tree(consumer_root, RELEVANT_FILES) + + result = _run_verify(producer_root, consumer_root) + + assert result.returncode == 0 + assert "sync check passed" in result.stdout + + +def test_verify_producer_sync_fails_on_content_mismatch(tmp_path: Path) -> None: + producer_root = tmp_path / "producer" + consumer_root = tmp_path / "consumer" + _write_tree(producer_root, RELEVANT_FILES) + _write_tree(consumer_root, RELEVANT_FILES) + + mismatched_path = consumer_root / "preview/long_context_500k/manifest_qwen3.5.json" + mismatched_path.write_text(json.dumps({"name": "changed"}, sort_keys=True)) + + result = _run_verify(producer_root, consumer_root) + + assert result.returncode == 1 + assert "content_mismatch" in result.stderr + assert "preview/long_context_500k/manifest_qwen3.5.json" in result.stderr diff --git a/utils/verify_producer_sync.py b/utils/verify_producer_sync.py new file mode 100644 index 000000000..48cdac077 --- /dev/null +++ b/utils/verify_producer_sync.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +"""Verify producer/consumer sync for ISB1 preview and extension exports.""" + +from __future__ import annotations + +import argparse +import sys +from dataclasses import dataclass +from pathlib import Path + + +RELEVANT_SUBTREES = ( + "extension_131k", + "preview/long_context_500k", + "preview/long_context_1m", +) + + +@dataclass +class SyncIssue: + kind: str + path: str + + +def _json_files(root: Path) -> set[str]: + if not root.exists(): + return set() + return { + str(path.relative_to(root)) + for path in root.rglob("*.json") + if path.is_file() + } + + +def _compare_subtree(producer_root: Path, consumer_root: Path, subtree: str) -> list[SyncIssue]: + issues: list[SyncIssue] = [] + + producer_subtree = producer_root / subtree + 
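+ # Resolve both subtrees first; the existence, membership, and byte-for-byte
+ # content checks below each map to a distinct SyncIssue kind.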
consumer_subtree = consumer_root / subtree + + producer_files = _json_files(producer_subtree) + consumer_files = _json_files(consumer_subtree) + + if not producer_subtree.exists(): + issues.append(SyncIssue("missing_producer_subtree", subtree)) + return issues + if not consumer_subtree.exists(): + issues.append(SyncIssue("missing_consumer_subtree", subtree)) + return issues + + for relative_path in sorted(producer_files - consumer_files): + issues.append(SyncIssue("missing_in_consumer", f"{subtree}/{relative_path}")) + + for relative_path in sorted(consumer_files - producer_files): + issues.append(SyncIssue("extra_in_consumer", f"{subtree}/{relative_path}")) + + for relative_path in sorted(producer_files & consumer_files): + producer_file = producer_subtree / relative_path + consumer_file = consumer_subtree / relative_path + if producer_file.read_bytes() != consumer_file.read_bytes(): + issues.append(SyncIssue("content_mismatch", f"{subtree}/{relative_path}")) + + return issues + + +def verify_sync(producer_root: Path, consumer_root: Path) -> list[SyncIssue]: + issues: list[SyncIssue] = [] + for subtree in RELEVANT_SUBTREES: + issues.extend(_compare_subtree(producer_root, consumer_root, subtree)) + return issues + + +def _default_consumer_root() -> Path: + return Path(__file__).resolve().parents[1] / "datasets" / "isb1" / "exports" + + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Verify that committed ISB1 consumer preview/extension exports are " + "synced with producer exports." + ) + ) + parser.add_argument( + "--producer-root", + required=True, + type=Path, + help="Path to ISB1 producer exports root (…/upstream/inferencex/exports)", + ) + parser.add_argument( + "--consumer-root", + default=_default_consumer_root(), + type=Path, + help="Path to InferenceX consumer exports root (default: datasets/isb1/exports)", + ) + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = parse_args(argv) + issues = verify_sync(args.producer_root.resolve(), args.consumer_root.resolve()) + + if not issues: + print( + "Producer/consumer export sync check passed for: " + + ", ".join(RELEVANT_SUBTREES) + ) + return 0 + + print("Producer/consumer export sync check failed:", file=sys.stderr) + for issue in issues: + print(f"- {issue.kind}: {issue.path}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) From cff850b1380def0f2b7f602399f8f973ec7c3b81 Mon Sep 17 00:00:00 2001 From: William Chen <57119977+OCWC22@users.noreply.github.com> Date: Wed, 15 Apr 2026 15:47:26 -0700 Subject: [PATCH 02/18] fix: validate KV stress configs against export metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Keep only configs whose (runtime, hardware, model) triples exist in the export files — eliminates sweep generator failures - Fix canonical-model-id to match export metadata (e.g., gpt_oss_120b not gptoss) - Fix support-status to match export tiers (reviewed_preview vs unsupported) - Remove configs for engines/GPUs not yet in exports (SGLang, Dynamo, TRT, Atom, AMD) — these need export metadata updates before they can be added back - Add workload-type field required by sweep generator schema - Remove disagg/multinode fields not in KV stress schema Sweep generator now passes: exit code 0, produces valid matrix rows. 
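For reviewers, a minimal sketch of the triple check this commit enforces, assuming
export files carry the runtime_stack_id / hardware_profile_id / canonical_model_id
fields exercised in utils/test_process_result_isb1.py. The helper names
(export_triples, keep_valid) and the repo_root argument are illustrative only,
not part of the sweep generator:

```python
#!/usr/bin/env python3
# Illustrative only: keep a KV stress config when its (runtime, hardware,
# model) triple appears in the export file the config references.
import json
from pathlib import Path

import yaml  # PyYAML, assumed available in the CI environment


def export_triples(export_path: Path) -> set[tuple[str, str, str]]:
    payload = json.loads(export_path.read_text())
    return {
        (e.get("runtime_stack_id"), e.get("hardware_profile_id"), e.get("canonical_model_id"))
        for e in payload.get("exports", [])
    }


def keep_valid(config_path: Path, repo_root: Path) -> dict:
    configs = yaml.safe_load(config_path.read_text())
    kept = {}
    for name, cfg in configs.items():
        triple = (
            cfg["runtime-stack-id"],
            cfg["hardware-profile-id"],
            cfg["canonical-model-id"],
        )
        export_file = repo_root / cfg["kv-stress-configs"][0]["export-file"]
        if export_file.exists() and triple in export_triples(export_file):
            kept[name] = cfg
    return kept
```

Any config dropped by a filter like this corresponds to a block removed in the
diff below; those entries can come back once the producer export metadata grows
matching triples.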
--- .github/configs/isb1-kv-stress-pr993.yaml | 4129 +-------------------- 1 file changed, 73 insertions(+), 4056 deletions(-) diff --git a/.github/configs/isb1-kv-stress-pr993.yaml b/.github/configs/isb1-kv-stress-pr993.yaml index 583d51302..dc7bae2f5 100644 --- a/.github/configs/isb1-kv-stress-pr993.yaml +++ b/.github/configs/isb1-kv-stress-pr993.yaml @@ -1,59 +1,9 @@ -dsr1-fp4-b200-dynamo-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-r1-fp4 - model-prefix: dsr1 - multinode: true - precision: fp4 - runner: b200-multinode - runtime-stack-id: standalone:dynamo-sglang -dsr1-fp4-b200-dynamo-sglang-mtp-isb1-kv-stress: +gptoss-fp4-b200-vllm-isb1-kv-stress: benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-sglang + canonical-model-id: gpt_oss_120b + framework: vllm hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.8.post1-cu130 + image: vllm/vllm-openai:v0.15.1 kv-cache-dtype: fp8 kv-stress-configs: - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json @@ -73,6 +23,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp-isb1-kv-stress: - 64 - 128 - 256 + support-status: reviewed_preview tp-configs: - duration-s: 1800 ep: 1 @@ -90,19 +41,18 @@ dsr1-fp4-b200-dynamo-sglang-mtp-isb1-kv-stress: - 64 - 128 - 256 - model: deepseek-r1-fp4 - model-prefix: dsr1 - multinode: true + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss precision: fp4 - runner: b200-multinode - runtime-stack-id: standalone:dynamo-sglang -dsr1-fp4-b200-dynamo-trt-isb1-kv-stress: + runner: b200 + runtime-stack-id: standalone:vllm +gptoss-fp4-h100-vllm-isb1-kv-stress: benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-trt - hardware-profile-id: nvidia:b200_sxm_180gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 + canonical-model-id: gpt_oss_120b + framework: vllm + hardware-profile-id: nvidia:h100_sxm_80gb + image: vllm/vllm-openai:v0.18.0 kv-cache-dtype: fp8 kv-stress-configs: - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json @@ -122,6 +72,7 @@ dsr1-fp4-b200-dynamo-trt-isb1-kv-stress: - 64 - 128 - 256 + support-status: reviewed_preview tp-configs: - duration-s: 1800 ep: 1 @@ -139,18 +90,18 @@ dsr1-fp4-b200-dynamo-trt-isb1-kv-stress: - 64 - 128 - 256 - model: deepseek-r1-fp4 - model-prefix: dsr1 - multinode: true + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss precision: fp4 - runner: b200-multinode - runtime-stack-id: standalone:dynamo-trt -dsr1-fp4-b200-sglang-isb1-kv-stress: + runner: h100 + runtime-stack-id: standalone:vllm +gptoss-fp4-h200-vllm-isb1-kv-stress: benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.9-cu130 + canonical-model-id: gpt_oss_120b + framework: vllm + hardware-profile-id: 
nvidia:h200_sxm_141gb + image: vllm/vllm-openai:v0.18.0 kv-cache-dtype: fp8 kv-stress-configs: - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json @@ -170,6 +121,7 @@ dsr1-fp4-b200-sglang-isb1-kv-stress: - 64 - 128 - 256 + support-status: reviewed_preview tp-configs: - duration-s: 1800 ep: 1 @@ -187,18 +139,18 @@ dsr1-fp4-b200-sglang-isb1-kv-stress: - 64 - 128 - 256 - model: nvidia/DeepSeek-R1-0528-FP4-V2 - model-prefix: dsr1 - multinode: false + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss precision: fp4 - runner: b200 - runtime-stack-id: standalone:sglang -dsr1-fp4-b200-trt-isb1-kv-stress: + runner: h200 + runtime-stack-id: standalone:vllm +minimaxm2.5-fp4-b200-vllm-isb1-kv-stress: benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: trt + canonical-model-id: minimax_m2_5 + framework: vllm hardware-profile-id: nvidia:b200_sxm_180gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 + image: vllm/vllm-openai:v0.19.0-cu130 kv-cache-dtype: fp8 kv-stress-configs: - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json @@ -218,6 +170,7 @@ dsr1-fp4-b200-trt-isb1-kv-stress: - 64 - 128 - 256 + support-status: unsupported tp-configs: - duration-s: 1800 ep: 1 @@ -235,18 +188,18 @@ dsr1-fp4-b200-trt-isb1-kv-stress: - 64 - 128 - 256 - model: nvidia/DeepSeek-R1-0528-FP4-V2 - model-prefix: dsr1 - multinode: false + workload-type: code + model: nvidia/MiniMax-M2.5-NVFP4 + model-prefix: minimaxm2.5 precision: fp4 runner: b200 - runtime-stack-id: standalone:trt -dsr1-fp4-b200-trt-mtp-isb1-kv-stress: + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-b200-vllm-isb1-kv-stress: benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: trt + canonical-model-id: minimax_m2_5 + framework: vllm hardware-profile-id: nvidia:b200_sxm_180gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post3 + image: vllm/vllm-openai:v0.19.0-cu130 kv-cache-dtype: fp8 kv-stress-configs: - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json @@ -266,6 +219,7 @@ dsr1-fp4-b200-trt-mtp-isb1-kv-stress: - 64 - 128 - 256 + support-status: unsupported tp-configs: - duration-s: 1800 ep: 1 @@ -283,215 +237,18 @@ dsr1-fp4-b200-trt-mtp-isb1-kv-stress: - 64 - 128 - 256 - model: nvidia/DeepSeek-R1-0528-FP4-V2 - model-prefix: dsr1 - multinode: false - precision: fp4 + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + precision: fp8 runner: b200 - runtime-stack-id: standalone:trt -dsr1-fp4-b300-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-trt - hardware-profile-id: nvidia:b300_sxm_288gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-r1-fp4 - model-prefix: dsr1 - multinode: true - precision: fp4 - runner: b300 - runtime-stack-id: standalone:dynamo-trt -dsr1-fp4-gb200-dynamo-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-sglang - hardware-profile-id: 
nvidia:gb200_grace_blackwell_192gb - image: lmsysorg/sglang:v0.5.8-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: nvidia/DeepSeek-R1-0528-NVFP4-v2 - model-prefix: dsr1 - multinode: true - precision: fp4 - runner: gb200 - runtime-stack-id: standalone:dynamo-sglang -dsr1-fp4-gb200-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-trt - hardware-profile-id: nvidia:gb200_grace_blackwell_192gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: nvidia/DeepSeek-R1-0528-NVFP4-v2 - model-prefix: dsr1 - multinode: true - precision: fp4 - runner: gb200 - runtime-stack-id: standalone:dynamo-trt -dsr1-fp4-gb300-dynamo-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-sglang - hardware-profile-id: nvidia:gb300_grace_blackwell_288gb - image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: nvidia/DeepSeek-R1-0528-NVFP4-v2 - model-prefix: dsr1 - multinode: true - precision: fp4 - runner: gb300 - runtime-stack-id: standalone:dynamo-sglang -dsr1-fp4-gb300-dynamo-trt-isb1-kv-stress: + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-h100-vllm-isb1-kv-stress: benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-trt - hardware-profile-id: nvidia:gb300_grace_blackwell_288gb - image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + canonical-model-id: minimax_m2_5 + framework: vllm + hardware-profile-id: nvidia:h100_sxm_80gb + image: vllm/vllm-openai:v0.18.0 kv-cache-dtype: fp8 kv-stress-configs: - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json @@ -511,6 +268,7 @@ dsr1-fp4-gb300-dynamo-trt-isb1-kv-stress: - 64 - 128 - 256 + support-status: unsupported tp-configs: - duration-s: 1800 ep: 1 @@ -528,18 +286,18 @@ dsr1-fp4-gb300-dynamo-trt-isb1-kv-stress: - 64 - 128 - 256 - model: nvidia/DeepSeek-R1-0528-NVFP4-v2 - model-prefix: dsr1 - multinode: true - precision: fp4 - runner: gb300 - runtime-stack-id: standalone:dynamo-trt -dsr1-fp4-mi355x-atom-isb1-kv-stress: + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + precision: fp8 + runner: h100 + 
runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-h200-vllm-isb1-kv-stress: benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x + canonical-model-id: minimax_m2_5 + framework: vllm + hardware-profile-id: nvidia:h200_sxm_141gb + image: vllm/vllm-openai:v0.18.0 kv-cache-dtype: fp8 kv-stress-configs: - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json @@ -559,7 +317,7 @@ dsr1-fp4-mi355x-atom-isb1-kv-stress: - 64 - 128 - 256 - - 512 + support-status: unsupported tp-configs: - duration-s: 1800 ep: 1 @@ -577,3750 +335,9 @@ dsr1-fp4-mi355x-atom-isb1-kv-stress: - 64 - 128 - 256 - - 512 - model: amd/DeepSeek-R1-0528-MXFP4-Preview - model-prefix: dsr1 - multinode: false - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:atom -dsr1-fp4-mi355x-atom-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.2.0-ubuntu24.04-pytorch2.9-atom0.1.1 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: amd/DeepSeek-R1-0528-MXFP4 - model-prefix: dsr1 - multinode: false - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:atom -dsr1-fp4-mi355x-sglang-disagg-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: sglang-disagg - hardware-profile-id: amd:mi355x_288gb - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: amd/DeepSeek-R1-0528-MXFP4 - model-prefix: dsr1 - multinode: true - precision: fp4 - runner: mi355x-disagg - runtime-stack-id: standalone:sglang-disagg -dsr1-fp4-mi355x-sglang-disagg-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: sglang-disagg - hardware-profile-id: amd:mi355x_288gb - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: amd/DeepSeek-R1-0528-MXFP4 - model-prefix: dsr1 - multinode: true - precision: fp4 - runner: mi355x-disagg - runtime-stack-id: standalone:sglang-disagg -dsr1-fp4-mi355x-sglang-isb1-kv-stress: - benchmark-type: 
isb1_kv_stress - canonical-model-id: dsr1 - framework: sglang - hardware-profile-id: amd:mi355x_288gb - image: lmsysorg/sglang:v0.5.9-rocm700-mi35x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: amd/DeepSeek-R1-0528-MXFP4-Preview - model-prefix: dsr1 - multinode: false - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:sglang -dsr1-fp8-b200-dynamo-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: true - precision: fp8 - runner: b200-multinode - runtime-stack-id: standalone:dynamo-sglang -dsr1-fp8-b200-dynamo-sglang-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: true - precision: fp8 - runner: b200-multinode - runtime-stack-id: standalone:dynamo-sglang -dsr1-fp8-b200-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-trt - hardware-profile-id: nvidia:b200_sxm_180gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: true - precision: fp8 - runner: b200-multinode - runtime-stack-id: standalone:dynamo-trt -dsr1-fp8-b200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: 
lmsysorg/sglang:v0.5.9-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: false - precision: fp8 - runner: b200 - runtime-stack-id: standalone:sglang -dsr1-fp8-b200-sglang-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.9-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: false - precision: fp8 - runner: b200 - runtime-stack-id: standalone:sglang -dsr1-fp8-b200-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: trt - hardware-profile-id: nvidia:b200_sxm_180gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: false - precision: fp8 - runner: b200 - runtime-stack-id: standalone:trt -dsr1-fp8-b200-trt-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: trt - hardware-profile-id: nvidia:b200_sxm_180gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post3 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: false - precision: fp8 - runner: b200 - runtime-stack-id: standalone:trt -dsr1-fp8-b300-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-trt - hardware-profile-id: nvidia:b300_sxm_288gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - 
noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: true - precision: fp8 - runner: b300 - runtime-stack-id: standalone:dynamo-trt -dsr1-fp8-gb200-dynamo-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-sglang - hardware-profile-id: nvidia:gb200_grace_blackwell_192gb - image: lmsysorg/sglang:v0.5.8.post1-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: true - precision: fp8 - runner: gb200 - runtime-stack-id: standalone:dynamo-sglang -dsr1-fp8-gb200-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-trt - hardware-profile-id: nvidia:gb200_grace_blackwell_192gb - image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: true - precision: fp8 - runner: gb200 - runtime-stack-id: standalone:dynamo-trt -dsr1-fp8-gb300-dynamo-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-sglang - hardware-profile-id: nvidia:gb300_grace_blackwell_288gb - image: lmsysorg/sglang:v0.5.8.post1-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: true - precision: fp8 - runner: gb300 - runtime-stack-id: standalone:dynamo-sglang -dsr1-fp8-gb300-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-trt - hardware-profile-id: nvidia:gb300_grace_blackwell_288gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - 
- duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: true - precision: fp8 - runner: gb300 - runtime-stack-id: standalone:dynamo-trt -dsr1-fp8-h100-dynamo-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-sglang - hardware-profile-id: nvidia:h100_sxm_80gb - image: lmsysorg/sglang:v0.5.8-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: true - precision: fp8 - runner: h100-multinode - runtime-stack-id: standalone:dynamo-sglang -dsr1-fp8-h100-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-trt - hardware-profile-id: nvidia:h100_sxm_80gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: true - precision: fp8 - runner: h100-multinode - runtime-stack-id: standalone:dynamo-trt -dsr1-fp8-h200-dynamo-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-sglang - hardware-profile-id: nvidia:h200_sxm_141gb - image: lmsysorg/sglang:v0.5.8.post1-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: true - precision: fp8 - runner: h200-multinode - runtime-stack-id: standalone:dynamo-sglang -dsr1-fp8-h200-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: dynamo-trt - hardware-profile-id: nvidia:h200_sxm_141gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 
- - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: true - precision: fp8 - runner: h200-multinode - runtime-stack-id: standalone:dynamo-trt -dsr1-fp8-h200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: sglang - hardware-profile-id: nvidia:h200_sxm_141gb - image: lmsysorg/sglang:v0.5.9-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: false - precision: fp8 - runner: h200 - runtime-stack-id: standalone:sglang -dsr1-fp8-h200-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: trt - hardware-profile-id: nvidia:h200_sxm_141gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: false - precision: fp8 - runner: h200 - runtime-stack-id: standalone:trt -dsr1-fp8-h200-trt-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: trt - hardware-profile-id: nvidia:h200_sxm_141gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: false - precision: fp8 - runner: h200 - runtime-stack-id: standalone:trt -dsr1-fp8-mi300x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: sglang - hardware-profile-id: amd:mi300x_192gb - image: lmsysorg/sglang:v0.5.9-rocm700-mi30x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: false - precision: fp8 - runner: mi300x - runtime-stack-id: standalone:sglang -dsr1-fp8-mi325x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: 
sglang - hardware-profile-id: amd:mi325x_288gb - image: lmsysorg/sglang:v0.5.9-rocm700-mi30x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: false - precision: fp8 - runner: mi325x - runtime-stack-id: standalone:sglang -dsr1-fp8-mi355x-atom-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: false - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:atom -dsr1-fp8-mi355x-atom-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: false - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:atom -dsr1-fp8-mi355x-sglang-disagg-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: sglang-disagg - hardware-profile-id: amd:mi355x_288gb - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: true - precision: fp8 - runner: mi355x-disagg - runtime-stack-id: standalone:sglang-disagg -dsr1-fp8-mi355x-sglang-disagg-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - disagg: true - framework: sglang-disagg - hardware-profile-id: amd:mi355x_288gb - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 - kv-cache-dtype: fp8 - 
kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: true - precision: fp8 - runner: mi355x-disagg - runtime-stack-id: standalone:sglang-disagg -dsr1-fp8-mi355x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: dsr1 - framework: sglang - hardware-profile-id: amd:mi355x_288gb - image: lmsysorg/sglang:v0.5.9-rocm700-mi35x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - multinode: false - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:sglang -glm5-fp4-b200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: glm5 - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: nvidia/GLM-5-NVFP4 - model-prefix: glm5 - multinode: false - precision: fp4 - runner: b200 - runtime-stack-id: standalone:sglang -glm5-fp8-b200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: glm5 - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - multinode: false - precision: fp8 - runner: b200 - runtime-stack-id: standalone:sglang -glm5-fp8-h200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: glm5 - framework: sglang - hardware-profile-id: nvidia:h200_sxm_141gb - image: lmsysorg/sglang:glm5-hopper - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 
256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - multinode: false - precision: fp8 - runner: h200 - runtime-stack-id: standalone:sglang -glm5-fp8-mi355x-atom-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: glm5 - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - multinode: false - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:atom -glm5-fp8-mi355x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: glm5 - framework: sglang - hardware-profile-id: amd:mi355x_288gb - image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - multinode: false - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:sglang -gptoss-fp4-b200-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gptoss - framework: trt - hardware-profile-id: nvidia:b200_sxm_180gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: openai/gpt-oss-120b - model-prefix: gptoss - multinode: false - precision: fp4 - runner: b200 - runtime-stack-id: standalone:trt -gptoss-fp4-b200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gptoss - framework: vllm - hardware-profile-id: nvidia:b200_sxm_180gb - image: vllm/vllm-openai:v0.15.1 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: openai/gpt-oss-120b - model-prefix: gptoss - multinode: false - precision: fp4 - runner: b200 - runtime-stack-id: 
standalone:vllm -gptoss-fp4-gb200-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gptoss - disagg: true - framework: dynamo-trt - hardware-profile-id: nvidia:gb200_grace_blackwell_192gb - image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.7.0.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: openai/gpt-oss-120b - model-prefix: gptoss - multinode: true - precision: fp4 - runner: gb200 - runtime-stack-id: standalone:dynamo-trt -gptoss-fp4-h100-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gptoss - framework: vllm - hardware-profile-id: nvidia:h100_sxm_80gb - image: vllm/vllm-openai:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: openai/gpt-oss-120b - model-prefix: gptoss - multinode: false - precision: fp4 - runner: h100 - runtime-stack-id: standalone:vllm -gptoss-fp4-h200-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gptoss - framework: trt - hardware-profile-id: nvidia:h200_sxm_141gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc11 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: openai/gpt-oss-120b - model-prefix: gptoss - multinode: false - precision: fp4 - runner: h200 - runtime-stack-id: standalone:trt -gptoss-fp4-h200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gptoss - framework: vllm - hardware-profile-id: nvidia:h200_sxm_141gb - image: vllm/vllm-openai:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: openai/gpt-oss-120b - model-prefix: gptoss - multinode: false - precision: fp4 - runner: h200 - runtime-stack-id: standalone:vllm -gptoss-fp4-mi300x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gptoss - framework: vllm - hardware-profile-id: amd:mi300x_192gb - image: vllm/vllm-openai-rocm:v0.17.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: 
datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: openai/gpt-oss-120b - model-prefix: gptoss - multinode: false - precision: fp4 - runner: mi300x - runtime-stack-id: standalone:vllm -gptoss-fp4-mi325x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gptoss - framework: vllm - hardware-profile-id: amd:mi325x_288gb - image: vllm/vllm-openai-rocm:v0.17.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: openai/gpt-oss-120b - model-prefix: gptoss - multinode: false - precision: fp4 - runner: mi325x - runtime-stack-id: standalone:vllm -gptoss-fp4-mi355x-atom-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gptoss - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: openai/gpt-oss-120b - model-prefix: gptoss - multinode: false - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:atom -gptoss-fp4-mi355x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gptoss - framework: vllm - hardware-profile-id: amd:mi355x_288gb - image: vllm/vllm-openai-rocm:v0.17.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: amd/gpt-oss-120b-w-mxfp4-a-fp8 - model-prefix: gptoss - multinode: false - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:vllm -kimik2.5-fp4-b200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimik2.5 - framework: vllm - hardware-profile-id: nvidia:b200_sxm_180gb - image: vllm/vllm-openai:v0.17.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' 
- - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: nvidia/Kimi-K2.5-NVFP4 - model-prefix: kimik2.5 - multinode: false - precision: fp4 - runner: b200 - runtime-stack-id: standalone:vllm -kimik2.5-fp4-gb200-dynamo-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimik2.5 - disagg: true - framework: dynamo-vllm - hardware-profile-id: nvidia:gb200_grace_blackwell_192gb - image: vllm/vllm-openai:v0.18.0-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: nvidia/Kimi-K2.5-NVFP4 - model-prefix: kimik2.5 - multinode: true - precision: fp4 - runner: gb200 - runtime-stack-id: standalone:dynamo-vllm -kimik2.5-fp4-mi355x-atom-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimik2.5 - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - multinode: false - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:atom -kimik2.5-fp4-mi355x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimik2.5 - framework: vllm - hardware-profile-id: amd:mi355x_288gb - image: vllm/vllm-openai-rocm:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - multinode: false - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:vllm -kimik2.5-int4-b200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimik2.5 - framework: vllm - hardware-profile-id: nvidia:b200_sxm_180gb - image: vllm/vllm-openai:v0.15.1 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - multinode: false - precision: int4 - runner: b200 - runtime-stack-id: standalone:vllm 
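Dense as this dump is, every stanza follows one schema: a `kv-stress-configs` entry pairs a `search-space` (a `users` grid crossed with `offload-modes` at a fixed `duration-s`) with a `tp-configs` entry that pins tp=8/ep=1, and a sweep generator expands the cross product into individual benchmark runs. The sketch below illustrates that expansion; the field names mirror the YAML above, but the code is an illustration, not the repo's actual generator.

```python
# Sketch: expand one kv-stress config into concrete benchmark runs.
# Field names mirror the YAML stanzas above; the expansion logic is
# illustrative, not the project's actual sweep generator.
from itertools import product

config = {
    "export-file": "datasets/isb1/exports/extension_131k/code_131k1k.json",
    "request-mode": "multi-turn",
    "search-space": [
        {
            "duration-s": 1800,
            "offload-modes": ["on", "off", "noprefix"],
            "users": [2, 4, 8, 16, 32, 64, 128, 256],
        }
    ],
}

runs = [
    {"users": u, "offload": m, "duration_s": space["duration-s"]}
    for space in config["search-space"]
    for u, m in product(space["users"], space["offload-modes"])
]
print(len(runs))  # 8 user levels x 3 offload modes = 24 runs
```

At the listed defaults that is 24 runs per (model, GPU, engine) cell before the `tp-configs` axis is counted, which is why this single sweep file runs to thousands of lines.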
-kimik2.5-int4-h200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimik2.5 - framework: vllm - hardware-profile-id: nvidia:h200_sxm_141gb - image: vllm/vllm-openai:v0.16.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - multinode: false - precision: int4 - runner: h200 - runtime-stack-id: standalone:vllm -kimik2.5-int4-mi300x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimik2.5 - framework: vllm - hardware-profile-id: amd:mi300x_192gb - image: vllm/vllm-openai-rocm:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - multinode: false - precision: int4 - runner: mi300x - runtime-stack-id: standalone:vllm -kimik2.5-int4-mi325x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimik2.5 - framework: vllm - hardware-profile-id: amd:mi325x_288gb - image: vllm/vllm-openai-rocm:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - multinode: false - precision: int4 - runner: mi325x - runtime-stack-id: standalone:vllm -kimik2.5-int4-mi355x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimik2.5 - framework: vllm - hardware-profile-id: amd:mi355x_288gb - image: vllm/vllm-openai-rocm:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - multinode: false - precision: int4 - runner: mi355x - runtime-stack-id: standalone:vllm -minimaxm2.5-fp4-b200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimaxm2.5 - framework: vllm - hardware-profile-id: nvidia:b200_sxm_180gb - image: vllm/vllm-openai:v0.19.0-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json 
- request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: nvidia/MiniMax-M2.5-NVFP4 - model-prefix: minimaxm2.5 - multinode: false - precision: fp4 - runner: b200 - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-b200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimaxm2.5 - framework: vllm - hardware-profile-id: nvidia:b200_sxm_180gb - image: vllm/vllm-openai:v0.19.0-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - multinode: false - precision: fp8 - runner: b200 - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-h100-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimaxm2.5 - framework: vllm - hardware-profile-id: nvidia:h100_sxm_80gb - image: vllm/vllm-openai:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - multinode: false - precision: fp8 - runner: h100 - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-h200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimaxm2.5 - framework: vllm - hardware-profile-id: nvidia:h200_sxm_141gb - image: vllm/vllm-openai:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - multinode: false - precision: fp8 - runner: h200 - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-mi300x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimaxm2.5 - framework: vllm - hardware-profile-id: amd:mi300x_192gb - image: vllm/vllm-openai-rocm:v0.16.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 
16 - - 32 - - 64 - - 128 - - 256 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - multinode: false - precision: fp8 - runner: mi300x - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-mi325x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimaxm2.5 - framework: vllm - hardware-profile-id: amd:mi325x_288gb - image: vllm/vllm-openai-rocm:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - multinode: false - precision: fp8 - runner: mi325x - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-mi355x-atom-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimaxm2.5 - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - multinode: false - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:atom -minimaxm2.5-fp8-mi355x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimaxm2.5 - framework: vllm - hardware-profile-id: amd:mi355x_288gb - image: vllm/vllm-openai-rocm:v0.19.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - multinode: false - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:vllm -qwen3.5-bf16-b200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3.5 - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: Qwen/Qwen3.5-397B-A17B - model-prefix: qwen3.5 - multinode: false - precision: bf16 - runner: b200 - runtime-stack-id: standalone:sglang 
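The MI355X stanzas are the only ones whose `users` axis extends to 512 (the commit messages credit the 288 GB of HBM), while the 500K long-context preview stanzas near the end of the file go the other way and cap the sweep at 8-32 users. A rough KV-cache sizing pass shows the intuition; the model shape and the 30% HBM budget below are illustrative assumptions, not any real model's dimensions.

```python
# Back-of-envelope KV sizing: why 288 GB parts can hold a deeper
# concurrency sweep than 141 GB parts. All shape numbers below are
# illustrative assumptions, not a specific model's real config.
def kv_bytes_per_seq(layers, kv_heads, head_dim, seq_len, bytes_per_elem=1):
    # 2x for K and V; fp8 KV cache => 1 byte per element
    return 2 * layers * kv_heads * head_dim * seq_len * bytes_per_elem

per_user = kv_bytes_per_seq(layers=60, kv_heads=8, head_dim=128, seq_len=131_072)
for hbm_gb in (141, 288):
    # assume ~30% of HBM left for KV after weights, tp=8 GPUs per node
    budget = hbm_gb * 0.30 * 2**30 * 8
    print(f"{hbm_gb} GB/GPU: ~{budget / per_user:.0f} resident 131K contexts")
```

The absolute counts matter less than the ratio: roughly twice as many fully resident 131K contexts per node, which is what justifies pushing the sweep one doubling further before hitting the offload cliff. Real traces also share prefix blocks and rarely sit at max context, so effective concurrency runs well above this naive bound.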
-qwen3.5-bf16-mi300x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3.5 - framework: sglang - hardware-profile-id: amd:mi300x_192gb - image: lmsysorg/sglang:v0.5.9-rocm720-mi30x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: Qwen/Qwen3.5-397B-A17B - model-prefix: qwen3.5 - multinode: false - precision: bf16 - runner: mi300x - runtime-stack-id: standalone:sglang -qwen3.5-bf16-mi325x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3.5 - framework: sglang - hardware-profile-id: amd:mi325x_288gb - image: lmsysorg/sglang:v0.5.9-rocm720-mi30x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: Qwen/Qwen3.5-397B-A17B - model-prefix: qwen3.5 - multinode: false - precision: bf16 - runner: mi325x - runtime-stack-id: standalone:sglang -qwen3.5-bf16-mi355x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3.5 - framework: sglang - hardware-profile-id: amd:mi355x_288gb - image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260215 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: Qwen/Qwen3.5-397B-A17B - model-prefix: qwen3.5 - multinode: false - precision: bf16 - runner: mi355x - runtime-stack-id: standalone:sglang -qwen3.5-fp4-b200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3.5 - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: nvidia/Qwen3.5-397B-A17B-NVFP4 - model-prefix: qwen3.5 - multinode: false - precision: fp4 - runner: b200 - runtime-stack-id: standalone:sglang -qwen3.5-fp4-mi355x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3.5 - framework: sglang - hardware-profile-id: amd:mi355x_288gb - image: lmsysorg/sglang:v0.5.10-rocm720-mi35x - 
kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: amd/Qwen3.5-397B-A17B-MXFP4 - model-prefix: qwen3.5 - multinode: false - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:sglang -qwen3.5-fp8-b200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3.5 - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.9-cu130-amd64 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - multinode: false - precision: fp8 - runner: b200 - runtime-stack-id: standalone:sglang -qwen3.5-fp8-b200-sglang-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3.5 - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.9-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - multinode: false - precision: fp8 - runner: b200 - runtime-stack-id: standalone:sglang -qwen3.5-fp8-h200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3.5 - framework: sglang - hardware-profile-id: nvidia:h200_sxm_141gb - image: lmsysorg/sglang:v0.5.9-cu129-amd64 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - multinode: false - precision: fp8 - runner: h200 - runtime-stack-id: standalone:sglang -qwen3.5-fp8-h200-sglang-isb1-kv-stress-500k: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3.5 - framework: sglang - hardware-profile-id: nvidia:h200_sxm_141gb - image: lmsysorg/sglang:v0.5.9-cu129-amd64 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json - request-mode: multi-turn - search-space: - - duration-s: 
1800 - offload-modes: - - 'on' - - 'off' - users: - - 1 - - 2 - - 4 - - 8 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - tp: 8 - users: - - 1 - - 2 - - 4 - - 8 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - multinode: false - precision: fp8 - runner: h200 - runtime-stack-id: standalone:sglang -qwen3.5-fp8-h200-sglang-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3.5 - framework: sglang - hardware-profile-id: nvidia:h200_sxm_141gb - image: lmsysorg/sglang:v0.5.10.post1 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - multinode: false + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 precision: fp8 runner: h200 - runtime-stack-id: standalone:sglang -qwen3.5-fp8-mi300x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3.5 - framework: sglang - hardware-profile-id: amd:mi300x_192gb - image: lmsysorg/sglang:v0.5.9-rocm720-mi30x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - multinode: false - precision: fp8 - runner: mi300x - runtime-stack-id: standalone:sglang -qwen3.5-fp8-mi325x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3.5 - framework: sglang - hardware-profile-id: amd:mi325x_288gb - image: lmsysorg/sglang:v0.5.9-rocm720-mi30x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - multinode: false - precision: fp8 - runner: mi325x - runtime-stack-id: standalone:sglang -qwen3.5-fp8-mi355x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3.5 - framework: sglang - hardware-profile-id: amd:mi355x_288gb - image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: - - 2 - - 4 - - 8 
- - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - multinode: false - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:sglang -qwen3.5-fp8-mi355x-sglang-isb1-kv-stress-500k: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3.5 - framework: sglang - hardware-profile-id: amd:mi355x_288gb - image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - users: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - tp: 8 - users: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - multinode: false - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:sglang + runtime-stack-id: standalone:vllm

From fec485571559f0f518f0fc1f8519c7532ed9fa96 Mon Sep 17 00:00:00 2001
From: William Chen <57119977+OCWC22@users.noreply.github.com>
Date: Wed, 15 Apr 2026 16:00:24 -0700
Subject: [PATCH 03/18] feat: expand export metadata + configs for all 87
 model×engine×GPU combos
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Export metadata now includes all valid (runtime, hardware, model) triples
from nvidia-master.yaml + amd-master.yaml:
- 8 runtimes: vllm, sglang, trt, atom, sglang-disagg, dynamo-*
- 9 GPU types: H100, H200, B200, B300, GB200, GB300, MI300X, MI325X, MI355X
- 6 models: DSR1, GPT-OSS, Qwen 3.5, GLM-5, Kimi K2.5, MiniMax M2.5

87 KV stress configs with correct canonical-model-id and support-status
matching export metadata. Sweep generator passes (exit code 0).

MI355X configs sweep to 512 concurrent users (288GB HBM advantage).
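One hedged note on the "sweep generator passes (exit code 0)" claim before the diff itself: a minimal check of that kind just loads the YAML and verifies every stanza carries the fields the replay harness needs. The sketch below assumes PyYAML and a required-key set inferred from the stanzas above, not the project's actual schema or validator.

```python
# Sketch of the kind of validation the commit message describes.
# REQUIRED is inferred from the stanzas above, not the real schema.
import yaml  # PyYAML

REQUIRED = {"benchmark-type", "canonical-model-id", "framework",
            "hardware-profile-id", "image", "kv-stress-configs",
            "model", "precision", "runner", "runtime-stack-id"}

with open(".github/configs/isb1-kv-stress-pr993.yaml") as f:
    configs = yaml.safe_load(f)

missing = {name: REQUIRED - set(body) for name, body in configs.items()
           if REQUIRED - set(body)}
assert not missing, f"incomplete stanzas: {missing}"
print(f"{len(configs)} configs OK")  # the commit message expects 87
```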
---
 .github/configs/isb1-kv-stress-pr993.yaml   | 3410 ++++++++++++++++-
 .../exports/extension_131k/code_131k1k.json |    4 +-
 .../extension_131k/code_131k1k_qwen3.5.json |    4 +-
 3 files changed, 3332 insertions(+), 86 deletions(-)

diff --git a/.github/configs/isb1-kv-stress-pr993.yaml b/.github/configs/isb1-kv-stress-pr993.yaml
index dc7bae2f5..544ecd9dd 100644
--- a/.github/configs/isb1-kv-stress-pr993.yaml
+++ b/.github/configs/isb1-kv-stress-pr993.yaml
@@ -1,9 +1,9 @@
-gptoss-fp4-b200-vllm-isb1-kv-stress:
+dsr1-fp4-b200-dynamo-sglang-isb1-kv-stress:
   benchmark-type: isb1_kv_stress
-  canonical-model-id: gpt_oss_120b
-  framework: vllm
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
   hardware-profile-id: nvidia:b200_sxm_180gb
-  image: vllm/vllm-openai:v0.15.1
+  image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime
   kv-cache-dtype: fp8
   kv-stress-configs:
   - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
@@ -14,7 +14,7 @@ gptoss-fp4-b200-vllm-isb1-kv-stress:
       - 'on'
       - 'off'
       - noprefix
-      users:
+      users: &id001
       - 2
       - 4
       - 8
@@ -23,7 +23,7 @@ gptoss-fp4-b200-vllm-isb1-kv-stress:
       - 64
       - 128
       - 256
-    support-status: reviewed_preview
+    support-status: unsupported
     tp-configs:
     - duration-s: 1800
       ep: 1
@@ -32,7 +32,30 @@ gptoss-fp4-b200-vllm-isb1-kv-stress:
       - 'off'
       - noprefix
       tp: 8
-      users:
+      users: *id001
+    workload-type: code
+  model: deepseek-r1-fp4
+  model-prefix: dsr1
+  precision: fp4
+  runner: b200-multinode
+  runtime-stack-id: dynamo:sglang
+dsr1-fp4-b200-dynamo-sglang-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id002
       - 2
       - 4
       - 8
@@ -41,18 +64,28 @@ gptoss-fp4-b200-vllm-isb1-kv-stress:
       - 64
       - 128
       - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id002
     workload-type: code
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
+  model: deepseek-r1-fp4
+  model-prefix: dsr1
   precision: fp4
-  runner: b200
-  runtime-stack-id: standalone:vllm
-gptoss-fp4-h100-vllm-isb1-kv-stress:
+  runner: b200-multinode
+  runtime-stack-id: dynamo:sglang
+dsr1-fp4-b200-dynamo-trt-isb1-kv-stress:
   benchmark-type: isb1_kv_stress
-  canonical-model-id: gpt_oss_120b
-  framework: vllm
-  hardware-profile-id: nvidia:h100_sxm_80gb
-  image: vllm/vllm-openai:v0.18.0
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
   kv-cache-dtype: fp8
   kv-stress-configs:
   - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
@@ -63,7 +96,7 @@ gptoss-fp4-h100-vllm-isb1-kv-stress:
       - 'on'
       - 'off'
       - noprefix
-      users:
+      users: &id003
       - 2
       - 4
       - 8
@@ -72,7 +105,7 @@ gptoss-fp4-h100-vllm-isb1-kv-stress:
       - 64
       - 128
       - 256
-    support-status: reviewed_preview
+    support-status: unsupported
     tp-configs:
     - duration-s: 1800
       ep: 1
@@ -81,7 +114,30 @@ gptoss-fp4-h100-vllm-isb1-kv-stress:
       - 'off'
       - noprefix
       tp: 8
-      users:
+      users: *id003
+    workload-type: code
+  model: deepseek-r1-fp4
+  model-prefix: dsr1
+  precision: fp4
+  runner: b200-multinode
+  runtime-stack-id: dynamo:trt
+dsr1-fp4-b200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.9-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id004
       - 2
       - 4
       - 8
@@ -90,18 +146,28 @@ gptoss-fp4-h100-vllm-isb1-kv-stress:
       - 64
       - 128
       - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id004
     workload-type: code
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
+  model: nvidia/DeepSeek-R1-0528-FP4-V2
+  model-prefix: dsr1
   precision: fp4
-  runner: h100
-  runtime-stack-id: standalone:vllm
-gptoss-fp4-h200-vllm-isb1-kv-stress:
+  runner: b200
+  runtime-stack-id: standalone:sglang
+dsr1-fp4-b200-trt-isb1-kv-stress:
   benchmark-type: isb1_kv_stress
-  canonical-model-id: gpt_oss_120b
-  framework: vllm
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  image: vllm/vllm-openai:v0.18.0
+  canonical-model-id: deepseek_r1_0528
+  framework: trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post2
   kv-cache-dtype: fp8
   kv-stress-configs:
   - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
@@ -112,7 +178,7 @@ gptoss-fp4-h200-vllm-isb1-kv-stress:
       - 'on'
       - 'off'
      - noprefix
-      users:
+      users: &id005
       - 2
       - 4
       - 8
@@ -121,7 +187,7 @@ gptoss-fp4-h200-vllm-isb1-kv-stress:
       - 64
       - 128
       - 256
-    support-status: reviewed_preview
+    support-status: unsupported
     tp-configs:
     - duration-s: 1800
       ep: 1
@@ -130,7 +196,30 @@ gptoss-fp4-h200-vllm-isb1-kv-stress:
       - 'off'
      - noprefix
       tp: 8
-      users:
+      users: *id005
+    workload-type: code
+  model: nvidia/DeepSeek-R1-0528-FP4-V2
+  model-prefix: dsr1
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:trt
+dsr1-fp4-b200-trt-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id006
       - 2
       - 4
       - 8
@@ -139,18 +228,28 @@ gptoss-fp4-h200-vllm-isb1-kv-stress:
       - 64
       - 128
       - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id006
     workload-type: code
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
+  model: nvidia/DeepSeek-R1-0528-FP4-V2
+  model-prefix: dsr1
   precision: fp4
-  runner: h200
-  runtime-stack-id: standalone:vllm
-minimaxm2.5-fp4-b200-vllm-isb1-kv-stress:
+  runner: b200
+  runtime-stack-id: standalone:trt
+dsr1-fp4-b300-dynamo-trt-isb1-kv-stress:
   benchmark-type: isb1_kv_stress
-  canonical-model-id: minimax_m2_5
-  framework: vllm
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: vllm/vllm-openai:v0.19.0-cu130
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:b300_sxm_288gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
   kv-cache-dtype: fp8
   kv-stress-configs:
   - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
@@ -161,7 +260,7 @@ minimaxm2.5-fp4-b200-vllm-isb1-kv-stress:
       - 'on'
       - 'off'
       - noprefix
-      users:
+
users: &id007 - 2 - 4 - 8 @@ -179,7 +278,30 @@ minimaxm2.5-fp4-b200-vllm-isb1-kv-stress: - 'off' - noprefix tp: 8 - users: + users: *id007 + workload-type: code + model: deepseek-r1-fp4 + model-prefix: dsr1 + precision: fp4 + runner: b300 + runtime-stack-id: dynamo:trt +dsr1-fp4-gb200-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: lmsysorg/sglang:v0.5.8-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id008 - 2 - 4 - 8 @@ -188,18 +310,28 @@ minimaxm2.5-fp4-b200-vllm-isb1-kv-stress: - 64 - 128 - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id008 workload-type: code - model: nvidia/MiniMax-M2.5-NVFP4 - model-prefix: minimaxm2.5 + model: nvidia/DeepSeek-R1-0528-NVFP4-v2 + model-prefix: dsr1 precision: fp4 - runner: b200 - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-b200-vllm-isb1-kv-stress: + runner: gb200 + runtime-stack-id: dynamo:sglang +dsr1-fp4-gb200-dynamo-trt-isb1-kv-stress: benchmark-type: isb1_kv_stress - canonical-model-id: minimax_m2_5 - framework: vllm - hardware-profile-id: nvidia:b200_sxm_180gb - image: vllm/vllm-openai:v0.19.0-cu130 + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 kv-cache-dtype: fp8 kv-stress-configs: - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json @@ -210,7 +342,7 @@ minimaxm2.5-fp8-b200-vllm-isb1-kv-stress: - 'on' - 'off' - noprefix - users: + users: &id009 - 2 - 4 - 8 @@ -228,7 +360,30 @@ minimaxm2.5-fp8-b200-vllm-isb1-kv-stress: - 'off' - noprefix tp: 8 - users: + users: *id009 + workload-type: code + model: nvidia/DeepSeek-R1-0528-NVFP4-v2 + model-prefix: dsr1 + precision: fp4 + runner: gb200 + runtime-stack-id: dynamo:trt +dsr1-fp4-gb300-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:gb300_grace_blackwell_288gb + image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id010 - 2 - 4 - 8 @@ -237,18 +392,28 @@ minimaxm2.5-fp8-b200-vllm-isb1-kv-stress: - 64 - 128 - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id010 workload-type: code - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - precision: fp8 - runner: b200 - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-h100-vllm-isb1-kv-stress: + model: nvidia/DeepSeek-R1-0528-NVFP4-v2 + model-prefix: dsr1 + precision: fp4 + runner: gb300 + runtime-stack-id: dynamo:sglang +dsr1-fp4-gb300-dynamo-trt-isb1-kv-stress: benchmark-type: isb1_kv_stress - canonical-model-id: minimax_m2_5 - framework: vllm - hardware-profile-id: nvidia:h100_sxm_80gb - image: vllm/vllm-openai:v0.18.0 + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: 
nvidia:gb300_grace_blackwell_288gb + image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 kv-cache-dtype: fp8 kv-stress-configs: - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json @@ -259,7 +424,7 @@ minimaxm2.5-fp8-h100-vllm-isb1-kv-stress: - 'on' - 'off' - noprefix - users: + users: &id011 - 2 - 4 - 8 @@ -277,7 +442,30 @@ minimaxm2.5-fp8-h100-vllm-isb1-kv-stress: - 'off' - noprefix tp: 8 - users: + users: *id011 + workload-type: code + model: nvidia/DeepSeek-R1-0528-NVFP4-v2 + model-prefix: dsr1 + precision: fp4 + runner: gb300 + runtime-stack-id: dynamo:trt +dsr1-fp4-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id012 - 2 - 4 - 8 @@ -286,18 +474,29 @@ minimaxm2.5-fp8-h100-vllm-isb1-kv-stress: - 64 - 128 - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id012 workload-type: code - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - precision: fp8 - runner: h100 - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-h200-vllm-isb1-kv-stress: + model: amd/DeepSeek-R1-0528-MXFP4-Preview + model-prefix: dsr1 + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:atom +dsr1-fp4-mi355x-atom-mtp-isb1-kv-stress: benchmark-type: isb1_kv_stress - canonical-model-id: minimax_m2_5 - framework: vllm - hardware-profile-id: nvidia:h200_sxm_141gb - image: vllm/vllm-openai:v0.18.0 + canonical-model-id: deepseek_r1_0528 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.2.0-ubuntu24.04-pytorch2.9-atom0.1.1 kv-cache-dtype: fp8 kv-stress-configs: - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json @@ -308,7 +507,7 @@ minimaxm2.5-fp8-h200-vllm-isb1-kv-stress: - 'on' - 'off' - noprefix - users: + users: &id013 - 2 - 4 - 8 @@ -317,6 +516,7 @@ minimaxm2.5-fp8-h200-vllm-isb1-kv-stress: - 64 - 128 - 256 + - 512 support-status: unsupported tp-configs: - duration-s: 1800 @@ -326,7 +526,30 @@ minimaxm2.5-fp8-h200-vllm-isb1-kv-stress: - 'off' - noprefix tp: 8 - users: + users: *id013 + workload-type: code + model: amd/DeepSeek-R1-0528-MXFP4 + model-prefix: dsr1 + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:atom +dsr1-fp4-mi355x-sglang-disagg-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang-disagg + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id014 - 2 - 4 - 8 @@ -335,9 +558,3032 @@ minimaxm2.5-fp8-h200-vllm-isb1-kv-stress: - 64 - 128 - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id014 workload-type: code - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - precision: fp8 - runner: h200 - runtime-stack-id: standalone:vllm + model: 
amd/DeepSeek-R1-0528-MXFP4 + model-prefix: dsr1 + precision: fp4 + runner: mi355x-disagg + runtime-stack-id: standalone:sglang-disagg +dsr1-fp4-mi355x-sglang-disagg-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang-disagg + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id015 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id015 + workload-type: code + model: amd/DeepSeek-R1-0528-MXFP4 + model-prefix: dsr1 + precision: fp4 + runner: mi355x-disagg + runtime-stack-id: standalone:sglang-disagg +dsr1-fp4-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: lmsysorg/sglang:v0.5.9-rocm700-mi35x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id016 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id016 + workload-type: code + model: amd/DeepSeek-R1-0528-MXFP4-Preview + model-prefix: dsr1 + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:sglang +dsr1-fp8-b200-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id017 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id017 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b200-multinode + runtime-stack-id: dynamo:sglang +dsr1-fp8-b200-dynamo-sglang-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id018 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id018 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b200-multinode + runtime-stack-id: dynamo:sglang 
+dsr1-fp8-b200-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id019 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id019 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b200-multinode + runtime-stack-id: dynamo:trt +dsr1-fp8-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id020 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id020 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +dsr1-fp8-b200-sglang-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id021 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id021 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +dsr1-fp8-b200-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id022 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id022 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:trt +dsr1-fp8-b200-trt-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post3 + kv-cache-dtype: fp8 + 
kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id023 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id023 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:trt +dsr1-fp8-b300-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:b300_sxm_288gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id024 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id024 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b300 + runtime-stack-id: dynamo:trt +dsr1-fp8-gb200-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: lmsysorg/sglang:v0.5.8.post1-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id025 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id025 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: gb200 + runtime-stack-id: dynamo:sglang +dsr1-fp8-gb200-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id026 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id026 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: gb200 + runtime-stack-id: dynamo:trt +dsr1-fp8-gb300-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:gb300_grace_blackwell_288gb + image: lmsysorg/sglang:v0.5.8.post1-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 
'off' + - noprefix + users: &id027 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id027 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: gb300 + runtime-stack-id: dynamo:sglang +dsr1-fp8-gb300-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:gb300_grace_blackwell_288gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id028 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id028 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: gb300 + runtime-stack-id: dynamo:trt +dsr1-fp8-h100-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:h100_sxm_80gb + image: lmsysorg/sglang:v0.5.8-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id029 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id029 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: h100-multinode + runtime-stack-id: dynamo:sglang +dsr1-fp8-h100-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:h100_sxm_80gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id030 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id030 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: h100-multinode + runtime-stack-id: dynamo:trt +dsr1-fp8-h200-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.8.post1-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id031 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - 
noprefix + tp: 8 + users: *id031 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: h200-multinode + runtime-stack-id: dynamo:sglang +dsr1-fp8-h200-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:h200_sxm_141gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id032 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id032 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: h200-multinode + runtime-stack-id: dynamo:trt +dsr1-fp8-h200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id033 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id033 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +dsr1-fp8-h200-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: trt + hardware-profile-id: nvidia:h200_sxm_141gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id034 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id034 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:trt +dsr1-fp8-h200-trt-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: trt + hardware-profile-id: nvidia:h200_sxm_141gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id035 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id035 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:trt +dsr1-fp8-mi300x-sglang-isb1-kv-stress: + benchmark-type: 
isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang + hardware-profile-id: amd:mi300x_192gb + image: lmsysorg/sglang:v0.5.9-rocm700-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id036 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id036 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: mi300x + runtime-stack-id: standalone:sglang +dsr1-fp8-mi325x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang + hardware-profile-id: amd:mi325x_288gb + image: lmsysorg/sglang:v0.5.9-rocm700-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id037 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id037 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: mi325x + runtime-stack-id: standalone:sglang +dsr1-fp8-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id038 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id038 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:atom +dsr1-fp8-mi355x-atom-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id039 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id039 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:atom +dsr1-fp8-mi355x-sglang-disagg-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang-disagg + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + kv-cache-dtype: fp8 + kv-stress-configs: + - 
export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id040 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id040 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: mi355x-disagg + runtime-stack-id: standalone:sglang-disagg +dsr1-fp8-mi355x-sglang-disagg-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang-disagg + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id041 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id041 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: mi355x-disagg + runtime-stack-id: standalone:sglang-disagg +dsr1-fp8-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: lmsysorg/sglang:v0.5.9-rocm700-mi35x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id042 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id042 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:sglang +glm5-fp4-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: glm_5 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id043 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id043 + workload-type: code + model: nvidia/GLM-5-NVFP4 + model-prefix: glm5 + precision: fp4 + runner: b200 + runtime-stack-id: standalone:sglang +glm5-fp8-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: glm_5 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id044 + - 2 + - 4 + 
- 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id044 + workload-type: code + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +glm5-fp8-h200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: glm_5 + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:glm5-hopper + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id045 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id045 + workload-type: code + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +glm5-fp8-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: glm_5 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id046 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id046 + workload-type: code + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:atom +glm5-fp8-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: glm_5 + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id047 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id047 + workload-type: code + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:sglang +gptoss-fp4-b200-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id048 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id048 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + runner: b200 + runtime-stack-id: standalone:trt 
+gptoss-fp4-b200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: vllm + hardware-profile-id: nvidia:b200_sxm_180gb + image: vllm/vllm-openai:v0.15.1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id049 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id049 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + runner: b200 + runtime-stack-id: standalone:vllm +gptoss-fp4-gb200-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: dynamo-trt + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.7.0.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id050 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id050 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + runner: gb200 + runtime-stack-id: dynamo:trt +gptoss-fp4-h100-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: vllm + hardware-profile-id: nvidia:h100_sxm_80gb + image: vllm/vllm-openai:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id051 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id051 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + runner: h100 + runtime-stack-id: standalone:vllm +gptoss-fp4-h200-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: trt + hardware-profile-id: nvidia:h200_sxm_141gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc11 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id052 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id052 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + runner: h200 + runtime-stack-id: standalone:trt +gptoss-fp4-h200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: vllm + hardware-profile-id: nvidia:h200_sxm_141gb + image: vllm/vllm-openai:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json 
+ request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id053 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id053 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + runner: h200 + runtime-stack-id: standalone:vllm +gptoss-fp4-mi300x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: vllm + hardware-profile-id: amd:mi300x_192gb + image: vllm/vllm-openai-rocm:v0.17.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id054 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id054 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + runner: mi300x + runtime-stack-id: standalone:vllm +gptoss-fp4-mi325x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: vllm + hardware-profile-id: amd:mi325x_288gb + image: vllm/vllm-openai-rocm:v0.17.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id055 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id055 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + runner: mi325x + runtime-stack-id: standalone:vllm +gptoss-fp4-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id056 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id056 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:atom +gptoss-fp4-mi355x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: vllm + hardware-profile-id: amd:mi355x_288gb + image: vllm/vllm-openai-rocm:v0.17.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id057 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id057 + 
workload-type: code + model: amd/gpt-oss-120b-w-mxfp4-a-fp8 + model-prefix: gptoss + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:vllm +kimik2.5-fp4-b200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: vllm + hardware-profile-id: nvidia:b200_sxm_180gb + image: vllm/vllm-openai:v0.17.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id058 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id058 + workload-type: code + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + precision: fp4 + runner: b200 + runtime-stack-id: standalone:vllm +kimik2.5-fp4-gb200-dynamo-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: dynamo-vllm + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: vllm/vllm-openai:v0.18.0-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id059 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id059 + workload-type: code + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + precision: fp4 + runner: gb200 + runtime-stack-id: dynamo:vllm +kimik2.5-fp4-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id060 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id060 + workload-type: code + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:atom +kimik2.5-fp4-mi355x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: vllm + hardware-profile-id: amd:mi355x_288gb + image: vllm/vllm-openai-rocm:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id061 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id061 + workload-type: code + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:vllm +kimik2.5-int4-b200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: vllm + hardware-profile-id: 
nvidia:b200_sxm_180gb + image: vllm/vllm-openai:v0.15.1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id062 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id062 + workload-type: code + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + precision: int4 + runner: b200 + runtime-stack-id: standalone:vllm +kimik2.5-int4-h200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: vllm + hardware-profile-id: nvidia:h200_sxm_141gb + image: vllm/vllm-openai:v0.16.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id063 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id063 + workload-type: code + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + precision: int4 + runner: h200 + runtime-stack-id: standalone:vllm +kimik2.5-int4-mi300x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: vllm + hardware-profile-id: amd:mi300x_192gb + image: vllm/vllm-openai-rocm:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id064 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id064 + workload-type: code + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + precision: int4 + runner: mi300x + runtime-stack-id: standalone:vllm +kimik2.5-int4-mi325x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: vllm + hardware-profile-id: amd:mi325x_288gb + image: vllm/vllm-openai-rocm:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id065 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id065 + workload-type: code + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + precision: int4 + runner: mi325x + runtime-stack-id: standalone:vllm +kimik2.5-int4-mi355x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: vllm + hardware-profile-id: amd:mi355x_288gb + image: vllm/vllm-openai-rocm:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id066 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + 
support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id066 + workload-type: code + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + precision: int4 + runner: mi355x + runtime-stack-id: standalone:vllm +minimaxm2.5-fp4-b200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 + framework: vllm + hardware-profile-id: nvidia:b200_sxm_180gb + image: vllm/vllm-openai:v0.19.0-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id067 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id067 + workload-type: code + model: nvidia/MiniMax-M2.5-NVFP4 + model-prefix: minimaxm2.5 + precision: fp4 + runner: b200 + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-b200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 + framework: vllm + hardware-profile-id: nvidia:b200_sxm_180gb + image: vllm/vllm-openai:v0.19.0-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id068 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id068 + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-h100-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 + framework: vllm + hardware-profile-id: nvidia:h100_sxm_80gb + image: vllm/vllm-openai:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id069 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id069 + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + precision: fp8 + runner: h100 + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-h200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 + framework: vllm + hardware-profile-id: nvidia:h200_sxm_141gb + image: vllm/vllm-openai:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id070 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id070 + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:vllm 
+minimaxm2.5-fp8-mi300x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 + framework: vllm + hardware-profile-id: amd:mi300x_192gb + image: vllm/vllm-openai-rocm:v0.16.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id071 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id071 + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + precision: fp8 + runner: mi300x + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-mi325x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 + framework: vllm + hardware-profile-id: amd:mi325x_288gb + image: vllm/vllm-openai-rocm:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id072 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id072 + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + precision: fp8 + runner: mi325x + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id073 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id073 + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:atom +minimaxm2.5-fp8-mi355x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 + framework: vllm + hardware-profile-id: amd:mi355x_288gb + image: vllm/vllm-openai-rocm:v0.19.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id074 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id074 + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:vllm +qwen3.5-bf16-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e + kv-cache-dtype: fp8 + kv-stress-configs: + - 
export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id075 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id075 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + precision: bf16 + runner: b200 + runtime-stack-id: standalone:sglang +qwen3.5-bf16-mi300x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi300x_192gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id076 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id076 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + precision: bf16 + runner: mi300x + runtime-stack-id: standalone:sglang +qwen3.5-bf16-mi325x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi325x_288gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id077 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id077 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + precision: bf16 + runner: mi325x + runtime-stack-id: standalone:sglang +qwen3.5-bf16-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260215 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id078 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id078 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + precision: bf16 + runner: mi355x + runtime-stack-id: standalone:sglang +qwen3.5-fp4-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: 
&id079 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id079 + workload-type: code + model: nvidia/Qwen3.5-397B-A17B-NVFP4 + model-prefix: qwen3.5 + precision: fp4 + runner: b200 + runtime-stack-id: standalone:sglang +qwen3.5-fp4-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: lmsysorg/sglang:v0.5.10-rocm720-mi35x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id080 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id080 + workload-type: code + model: amd/Qwen3.5-397B-A17B-MXFP4 + model-prefix: qwen3.5 + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:sglang +qwen3.5-fp8-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130-amd64 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id081 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id081 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-b200-sglang-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id082 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id082 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-h200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.9-cu129-amd64 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id083 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id083 
+ workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-h200-sglang-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.10.post1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id084 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id084 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi300x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi300x_192gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id085 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id085 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: mi300x + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi325x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi325x_288gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id086 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id086 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: mi325x + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id087 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id087 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:sglang diff --git 
a/datasets/isb1/exports/extension_131k/code_131k1k.json b/datasets/isb1/exports/extension_131k/code_131k1k.json
index 1b29e38f6..99915e4cd 100644
--- a/datasets/isb1/exports/extension_131k/code_131k1k.json
+++ b/datasets/isb1/exports/extension_131k/code_131k1k.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:082c4b75b81ca680adccf6f00fc8e4069098cfb25e20ebc5ca88ce0dd47c3cc3
-size 1802776
+oid sha256:66df69260749a22f4af2d2d25a6dce23b3b466533f75338da599db87ace6e833
+size 5461532
diff --git a/datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json b/datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
index 1b955eb08..0b041fb66 100644
--- a/datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+++ b/datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:385d33f9705f6c0227ea04b03d0ed2c47730a3ce408b1619445e50b67429a9d2
-size 398078
+oid sha256:dcd048663de0e325e601cdc44b0683a2dfbeecd53fe277937131250e1a86b3e4
+size 5027435

From fa132a75469ed293882496f1eff05cabcdfb9cd7 Mon Sep 17 00:00:00 2001
From: William Chen <57119977+OCWC22@users.noreply.github.com>
Date: Thu, 16 Apr 2026 22:04:05 -0700
Subject: [PATCH 04/18] fix(isb1): close PR#1032 merge-sweep — flat paths,
 v0.2.0 manifests, prefix-aware replay
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Final closure pass landing PR#1032 end-to-end for SLURM + InferenceX +
kv-cache-tester across every (runtime, hardware, canonical-model) triple
currently in the export metadata.

Sweep configs:
- Rename isb1-kv-stress-pr993.yaml -> isb1-kv-stress.yaml
- Rewrite isb1-master / isb1-triattn-preview / isb1-qwen-1m-preview:
  drop or demote dead stanzas, flatten paths (strip the /vllm/ and
  /sglang/ subdirs and the __vllm/__sglang suffixes; sketched after this
  message), and repoint qwen3.5 rows at the _qwen3.5 basename
- isb1-master shrinks 1723 -> 863 lines (50 -> 26 stanzas); the 1M
  preview drops its vllm stanza (sglang-only in reality)
- All produced rows resolve to real bundle cells at the declared tier

Manifests move to manifest_version 0.2.0 with single-bundle exports for
preview/long_context_500k (gptoss + qwen3.5) and preview/long_context_1m.

Consumer replay (utils/bench_serving/benchmark_export_replay.py):
hydrate v0.2.0 prefix-aware bundles — thin per-cell deltas join a shared
workload prefix via prefix_ref, LRU-cached (max 8) across cells in the
same bundle (see the hydration sketch below). Pre-0.2.0 bundles replay
unchanged.

Producer-sync verifier (utils/verify_producer_sync.py): extend coverage
to core + extension_32k + extension_64k; subtrees absent on both sides
are silently skipped, subtrees present on only one side are reported
(see the verifier sketch below).

Docs: COEXISTENCE_WITH_KV_CACHE_TESTER + both preview READMEs updated
with flat paths, the new config name, and the sglang-only preview
reality.

Tests: 262/262 pass across utils/ (107 sweep-config + new
test_benchmark_export_replay.py for the prefix-aware consumer +
test_verify_producer_sync.py for broadened verifier coverage).
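For reviewers, minimal sketches of the three mechanisms above follow. First,
the path-flattening rule from the sweep-config bullets; flatten_export_path
and its regexes are assumptions for illustration, not the generator's actual
code:

```python
# Illustrative flattening rule for export paths; the function name and
# regexes are assumptions for this sketch, not generator code.
import re

def flatten_export_path(path: str, model_prefix: str) -> str:
    # Strip engine subdirs: .../extension_131k/vllm/code_131k1k.json
    #                    -> .../extension_131k/code_131k1k.json
    path = re.sub(r"/(vllm|sglang)/", "/", path)
    # Strip engine suffixes: code_131k1k__vllm.json -> code_131k1k.json
    path = re.sub(r"__(vllm|sglang)(?=\.json$)", "", path)
    # Repoint qwen3.5 rows at the _qwen3.5 basename
    if model_prefix == "qwen3.5" and not path.endswith("_qwen3.5.json"):
        path = path[: -len(".json")] + "_qwen3.5.json"
    return path

assert (
    flatten_export_path(
        "datasets/isb1/exports/extension_131k/vllm/code_131k1k__vllm.json",
        "qwen3.5",
    )
    == "datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json"
)
```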
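Next, the shape of the consumer-side hydration; the bundle fields used here
(manifest_version, prefix_ref, turns) are assumed names for illustration, and
benchmark_export_replay.py defines the real schema:

```python
# Minimal sketch of prefix-aware hydration with an LRU-cached shared
# prefix; field names are assumptions, not the actual v0.2.0 schema.
import json
from functools import lru_cache
from pathlib import Path

def _version(v: str) -> tuple[int, ...]:
    return tuple(int(part) for part in v.split("."))

@lru_cache(maxsize=8)  # shared prefixes are reused by many cells in a bundle
def _load_prefix(prefix_path: str) -> tuple:
    # Key on the resolved path so every cell with the same prefix_ref
    # hits the cache instead of re-reading a multi-MB prefix file.
    return tuple(json.loads(Path(prefix_path).read_text())["turns"])

def hydrate_cell(bundle: dict, cell: dict, bundle_dir: Path) -> list:
    """Join a thin per-cell delta onto its shared workload prefix."""
    if _version(bundle.get("manifest_version", "0.1.0")) < (0, 2, 0):
        return list(cell["turns"])  # pre-0.2.0 bundles replay unchanged
    prefix = _load_prefix(str(bundle_dir / cell["prefix_ref"]))
    return list(prefix) + list(cell["turns"])
```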
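Finally, the verifier's skip/report policy, sketched under an assumed
directory layout; subtree_report is hypothetical, and the real comparison in
verify_producer_sync.py goes deeper than file names:

```python
# Sketch of the skip/report policy for producer vs. consumer subtrees;
# the function and layout are assumptions for illustration.
from pathlib import Path

SUBTREES = ("core", "extension_32k", "extension_64k", "extension_131k")

def subtree_report(producer: Path, consumer: Path, name: str) -> str | None:
    p, c = producer / name, consumer / name
    if not p.is_dir() and not c.is_dir():
        return None  # absent on both sides: silently skip
    if p.is_dir() != c.is_dir():
        return f"{name}: present on only one side"  # asymmetric: report
    # Present on both sides: compare the exported bundle file sets.
    diff = {f.name for f in p.glob("*.json")} ^ {f.name for f in c.glob("*.json")}
    return f"{name}: mismatched exports {sorted(diff)}" if diff else None
```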
--- .github/configs/isb1-kv-stress-pr993.yaml | 3589 ---------------- .github/configs/isb1-kv-stress.yaml | 3619 ++++++++++++++++- .github/configs/isb1-master.yaml | 1312 ++---- .github/configs/isb1-qwen-1m-preview.yaml | 23 +- .github/configs/isb1-triattn-preview.yaml | 36 +- .../isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md | 8 +- .../exports/preview/long_context_1m/README.md | 2 +- .../preview/long_context_1m/manifest.json | 4 +- .../preview/long_context_500k/README.md | 3 +- .../preview/long_context_500k/manifest.json | 4 +- .../long_context_500k/manifest_qwen3.5.json | 4 +- .../bench_serving/benchmark_export_replay.py | 150 +- .../test_generate_sweep_configs.py | 251 +- utils/test_benchmark_export_replay.py | 181 + utils/test_verify_producer_sync.py | 38 + utils/verify_producer_sync.py | 24 +- 16 files changed, 4330 insertions(+), 4918 deletions(-) delete mode 100644 .github/configs/isb1-kv-stress-pr993.yaml diff --git a/.github/configs/isb1-kv-stress-pr993.yaml b/.github/configs/isb1-kv-stress-pr993.yaml deleted file mode 100644 index 544ecd9dd..000000000 --- a/.github/configs/isb1-kv-stress-pr993.yaml +++ /dev/null @@ -1,3589 +0,0 @@ -dsr1-fp4-b200-dynamo-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id001 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id001 - workload-type: code - model: deepseek-r1-fp4 - model-prefix: dsr1 - precision: fp4 - runner: b200-multinode - runtime-stack-id: dynamo:sglang -dsr1-fp4-b200-dynamo-sglang-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.8.post1-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id002 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id002 - workload-type: code - model: deepseek-r1-fp4 - model-prefix: dsr1 - precision: fp4 - runner: b200-multinode - runtime-stack-id: dynamo:sglang -dsr1-fp4-b200-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-trt - hardware-profile-id: nvidia:b200_sxm_180gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id003 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id003 - workload-type: code - model: deepseek-r1-fp4 - 
model-prefix: dsr1 - precision: fp4 - runner: b200-multinode - runtime-stack-id: dynamo:trt -dsr1-fp4-b200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.9-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id004 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id004 - workload-type: code - model: nvidia/DeepSeek-R1-0528-FP4-V2 - model-prefix: dsr1 - precision: fp4 - runner: b200 - runtime-stack-id: standalone:sglang -dsr1-fp4-b200-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: trt - hardware-profile-id: nvidia:b200_sxm_180gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id005 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id005 - workload-type: code - model: nvidia/DeepSeek-R1-0528-FP4-V2 - model-prefix: dsr1 - precision: fp4 - runner: b200 - runtime-stack-id: standalone:trt -dsr1-fp4-b200-trt-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: trt - hardware-profile-id: nvidia:b200_sxm_180gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post3 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id006 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id006 - workload-type: code - model: nvidia/DeepSeek-R1-0528-FP4-V2 - model-prefix: dsr1 - precision: fp4 - runner: b200 - runtime-stack-id: standalone:trt -dsr1-fp4-b300-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-trt - hardware-profile-id: nvidia:b300_sxm_288gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id007 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id007 - workload-type: code - model: deepseek-r1-fp4 - model-prefix: dsr1 - precision: fp4 - runner: b300 - runtime-stack-id: dynamo:trt -dsr1-fp4-gb200-dynamo-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-sglang - hardware-profile-id: 
nvidia:gb200_grace_blackwell_192gb - image: lmsysorg/sglang:v0.5.8-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id008 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id008 - workload-type: code - model: nvidia/DeepSeek-R1-0528-NVFP4-v2 - model-prefix: dsr1 - precision: fp4 - runner: gb200 - runtime-stack-id: dynamo:sglang -dsr1-fp4-gb200-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-trt - hardware-profile-id: nvidia:gb200_grace_blackwell_192gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id009 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id009 - workload-type: code - model: nvidia/DeepSeek-R1-0528-NVFP4-v2 - model-prefix: dsr1 - precision: fp4 - runner: gb200 - runtime-stack-id: dynamo:trt -dsr1-fp4-gb300-dynamo-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-sglang - hardware-profile-id: nvidia:gb300_grace_blackwell_288gb - image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id010 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id010 - workload-type: code - model: nvidia/DeepSeek-R1-0528-NVFP4-v2 - model-prefix: dsr1 - precision: fp4 - runner: gb300 - runtime-stack-id: dynamo:sglang -dsr1-fp4-gb300-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-trt - hardware-profile-id: nvidia:gb300_grace_blackwell_288gb - image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id011 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id011 - workload-type: code - model: nvidia/DeepSeek-R1-0528-NVFP4-v2 - model-prefix: dsr1 - precision: fp4 - runner: gb300 - runtime-stack-id: dynamo:trt -dsr1-fp4-mi355x-atom-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: 
datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id012 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id012 - workload-type: code - model: amd/DeepSeek-R1-0528-MXFP4-Preview - model-prefix: dsr1 - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:atom -dsr1-fp4-mi355x-atom-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.2.0-ubuntu24.04-pytorch2.9-atom0.1.1 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id013 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id013 - workload-type: code - model: amd/DeepSeek-R1-0528-MXFP4 - model-prefix: dsr1 - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:atom -dsr1-fp4-mi355x-sglang-disagg-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: sglang-disagg - hardware-profile-id: amd:mi355x_288gb - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id014 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id014 - workload-type: code - model: amd/DeepSeek-R1-0528-MXFP4 - model-prefix: dsr1 - precision: fp4 - runner: mi355x-disagg - runtime-stack-id: standalone:sglang-disagg -dsr1-fp4-mi355x-sglang-disagg-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: sglang-disagg - hardware-profile-id: amd:mi355x_288gb - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id015 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id015 - workload-type: code - model: amd/DeepSeek-R1-0528-MXFP4 - model-prefix: dsr1 - precision: fp4 - runner: mi355x-disagg - runtime-stack-id: standalone:sglang-disagg -dsr1-fp4-mi355x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: sglang - hardware-profile-id: amd:mi355x_288gb - image: lmsysorg/sglang:v0.5.9-rocm700-mi35x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - 
noprefix - users: &id016 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id016 - workload-type: code - model: amd/DeepSeek-R1-0528-MXFP4-Preview - model-prefix: dsr1 - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:sglang -dsr1-fp8-b200-dynamo-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id017 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id017 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: b200-multinode - runtime-stack-id: dynamo:sglang -dsr1-fp8-b200-dynamo-sglang-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id018 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id018 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: b200-multinode - runtime-stack-id: dynamo:sglang -dsr1-fp8-b200-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-trt - hardware-profile-id: nvidia:b200_sxm_180gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id019 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id019 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: b200-multinode - runtime-stack-id: dynamo:trt -dsr1-fp8-b200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.9-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id020 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - 
noprefix - tp: 8 - users: *id020 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: b200 - runtime-stack-id: standalone:sglang -dsr1-fp8-b200-sglang-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.9-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id021 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id021 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: b200 - runtime-stack-id: standalone:sglang -dsr1-fp8-b200-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: trt - hardware-profile-id: nvidia:b200_sxm_180gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id022 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id022 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: b200 - runtime-stack-id: standalone:trt -dsr1-fp8-b200-trt-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: trt - hardware-profile-id: nvidia:b200_sxm_180gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post3 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id023 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id023 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: b200 - runtime-stack-id: standalone:trt -dsr1-fp8-b300-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-trt - hardware-profile-id: nvidia:b300_sxm_288gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id024 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id024 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: b300 - runtime-stack-id: dynamo:trt -dsr1-fp8-gb200-dynamo-sglang-isb1-kv-stress: - benchmark-type: 
isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-sglang - hardware-profile-id: nvidia:gb200_grace_blackwell_192gb - image: lmsysorg/sglang:v0.5.8.post1-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id025 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id025 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: gb200 - runtime-stack-id: dynamo:sglang -dsr1-fp8-gb200-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-trt - hardware-profile-id: nvidia:gb200_grace_blackwell_192gb - image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id026 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id026 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: gb200 - runtime-stack-id: dynamo:trt -dsr1-fp8-gb300-dynamo-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-sglang - hardware-profile-id: nvidia:gb300_grace_blackwell_288gb - image: lmsysorg/sglang:v0.5.8.post1-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id027 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id027 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: gb300 - runtime-stack-id: dynamo:sglang -dsr1-fp8-gb300-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-trt - hardware-profile-id: nvidia:gb300_grace_blackwell_288gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id028 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id028 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: gb300 - runtime-stack-id: dynamo:trt -dsr1-fp8-h100-dynamo-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-sglang - hardware-profile-id: nvidia:h100_sxm_80gb - image: lmsysorg/sglang:v0.5.8-cu130 - 
kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id029 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id029 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: h100-multinode - runtime-stack-id: dynamo:sglang -dsr1-fp8-h100-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-trt - hardware-profile-id: nvidia:h100_sxm_80gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id030 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id030 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: h100-multinode - runtime-stack-id: dynamo:trt -dsr1-fp8-h200-dynamo-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-sglang - hardware-profile-id: nvidia:h200_sxm_141gb - image: lmsysorg/sglang:v0.5.8.post1-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id031 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id031 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: h200-multinode - runtime-stack-id: dynamo:sglang -dsr1-fp8-h200-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: dynamo-trt - hardware-profile-id: nvidia:h200_sxm_141gb - image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id032 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id032 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: h200-multinode - runtime-stack-id: dynamo:trt -dsr1-fp8-h200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: sglang - hardware-profile-id: nvidia:h200_sxm_141gb - image: lmsysorg/sglang:v0.5.9-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - 
- noprefix - users: &id033 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id033 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: h200 - runtime-stack-id: standalone:sglang -dsr1-fp8-h200-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: trt - hardware-profile-id: nvidia:h200_sxm_141gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id034 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id034 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: h200 - runtime-stack-id: standalone:trt -dsr1-fp8-h200-trt-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: trt - hardware-profile-id: nvidia:h200_sxm_141gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id035 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id035 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: h200 - runtime-stack-id: standalone:trt -dsr1-fp8-mi300x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: sglang - hardware-profile-id: amd:mi300x_192gb - image: lmsysorg/sglang:v0.5.9-rocm700-mi30x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id036 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id036 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: mi300x - runtime-stack-id: standalone:sglang -dsr1-fp8-mi325x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: sglang - hardware-profile-id: amd:mi325x_288gb - image: lmsysorg/sglang:v0.5.9-rocm700-mi30x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id037 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id037 - workload-type: code - model: 
deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: mi325x - runtime-stack-id: standalone:sglang -dsr1-fp8-mi355x-atom-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id038 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id038 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:atom -dsr1-fp8-mi355x-atom-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id039 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id039 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:atom -dsr1-fp8-mi355x-sglang-disagg-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: sglang-disagg - hardware-profile-id: amd:mi355x_288gb - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id040 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id040 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: mi355x-disagg - runtime-stack-id: standalone:sglang-disagg -dsr1-fp8-mi355x-sglang-disagg-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: sglang-disagg - hardware-profile-id: amd:mi355x_288gb - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id041 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id041 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: mi355x-disagg - runtime-stack-id: standalone:sglang-disagg 
-dsr1-fp8-mi355x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: deepseek_r1_0528 - framework: sglang - hardware-profile-id: amd:mi355x_288gb - image: lmsysorg/sglang:v0.5.9-rocm700-mi35x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id042 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id042 - workload-type: code - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:sglang -glm5-fp4-b200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: glm_5 - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id043 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id043 - workload-type: code - model: nvidia/GLM-5-NVFP4 - model-prefix: glm5 - precision: fp4 - runner: b200 - runtime-stack-id: standalone:sglang -glm5-fp8-b200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: glm_5 - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id044 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id044 - workload-type: code - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - precision: fp8 - runner: b200 - runtime-stack-id: standalone:sglang -glm5-fp8-h200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: glm_5 - framework: sglang - hardware-profile-id: nvidia:h200_sxm_141gb - image: lmsysorg/sglang:glm5-hopper - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id045 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id045 - workload-type: code - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - precision: fp8 - runner: h200 - runtime-stack-id: standalone:sglang -glm5-fp8-mi355x-atom-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: glm_5 - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: 
datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id046 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id046 - workload-type: code - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:atom -glm5-fp8-mi355x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: glm_5 - framework: sglang - hardware-profile-id: amd:mi355x_288gb - image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id047 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id047 - workload-type: code - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:sglang -gptoss-fp4-b200-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gpt_oss_120b - framework: trt - hardware-profile-id: nvidia:b200_sxm_180gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id048 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id048 - workload-type: code - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - runner: b200 - runtime-stack-id: standalone:trt -gptoss-fp4-b200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gpt_oss_120b - framework: vllm - hardware-profile-id: nvidia:b200_sxm_180gb - image: vllm/vllm-openai:v0.15.1 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id049 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id049 - workload-type: code - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - runner: b200 - runtime-stack-id: standalone:vllm -gptoss-fp4-gb200-dynamo-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gpt_oss_120b - framework: dynamo-trt - hardware-profile-id: nvidia:gb200_grace_blackwell_192gb - image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.7.0.post2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id050 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: 
- - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id050 - workload-type: code - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - runner: gb200 - runtime-stack-id: dynamo:trt -gptoss-fp4-h100-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gpt_oss_120b - framework: vllm - hardware-profile-id: nvidia:h100_sxm_80gb - image: vllm/vllm-openai:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id051 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id051 - workload-type: code - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - runner: h100 - runtime-stack-id: standalone:vllm -gptoss-fp4-h200-trt-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gpt_oss_120b - framework: trt - hardware-profile-id: nvidia:h200_sxm_141gb - image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc11 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id052 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id052 - workload-type: code - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - runner: h200 - runtime-stack-id: standalone:trt -gptoss-fp4-h200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gpt_oss_120b - framework: vllm - hardware-profile-id: nvidia:h200_sxm_141gb - image: vllm/vllm-openai:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id053 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id053 - workload-type: code - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - runner: h200 - runtime-stack-id: standalone:vllm -gptoss-fp4-mi300x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gpt_oss_120b - framework: vllm - hardware-profile-id: amd:mi300x_192gb - image: vllm/vllm-openai-rocm:v0.17.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id054 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id054 - workload-type: code - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - runner: mi300x - runtime-stack-id: standalone:vllm -gptoss-fp4-mi325x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gpt_oss_120b - framework: vllm - 
hardware-profile-id: amd:mi325x_288gb - image: vllm/vllm-openai-rocm:v0.17.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id055 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id055 - workload-type: code - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - runner: mi325x - runtime-stack-id: standalone:vllm -gptoss-fp4-mi355x-atom-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gpt_oss_120b - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id056 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id056 - workload-type: code - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:atom -gptoss-fp4-mi355x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: gpt_oss_120b - framework: vllm - hardware-profile-id: amd:mi355x_288gb - image: vllm/vllm-openai-rocm:v0.17.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id057 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id057 - workload-type: code - model: amd/gpt-oss-120b-w-mxfp4-a-fp8 - model-prefix: gptoss - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:vllm -kimik2.5-fp4-b200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimi_k2_5 - framework: vllm - hardware-profile-id: nvidia:b200_sxm_180gb - image: vllm/vllm-openai:v0.17.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id058 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id058 - workload-type: code - model: nvidia/Kimi-K2.5-NVFP4 - model-prefix: kimik2.5 - precision: fp4 - runner: b200 - runtime-stack-id: standalone:vllm -kimik2.5-fp4-gb200-dynamo-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimi_k2_5 - framework: dynamo-vllm - hardware-profile-id: nvidia:gb200_grace_blackwell_192gb - image: vllm/vllm-openai:v0.18.0-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - 
noprefix - users: &id059 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id059 - workload-type: code - model: nvidia/Kimi-K2.5-NVFP4 - model-prefix: kimik2.5 - precision: fp4 - runner: gb200 - runtime-stack-id: dynamo:vllm -kimik2.5-fp4-mi355x-atom-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimi_k2_5 - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id060 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id060 - workload-type: code - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:atom -kimik2.5-fp4-mi355x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimi_k2_5 - framework: vllm - hardware-profile-id: amd:mi355x_288gb - image: vllm/vllm-openai-rocm:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id061 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id061 - workload-type: code - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:vllm -kimik2.5-int4-b200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimi_k2_5 - framework: vllm - hardware-profile-id: nvidia:b200_sxm_180gb - image: vllm/vllm-openai:v0.15.1 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id062 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id062 - workload-type: code - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - precision: int4 - runner: b200 - runtime-stack-id: standalone:vllm -kimik2.5-int4-h200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimi_k2_5 - framework: vllm - hardware-profile-id: nvidia:h200_sxm_141gb - image: vllm/vllm-openai:v0.16.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id063 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id063 - workload-type: code - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - precision: int4 - runner: h200 - 
runtime-stack-id: standalone:vllm -kimik2.5-int4-mi300x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimi_k2_5 - framework: vllm - hardware-profile-id: amd:mi300x_192gb - image: vllm/vllm-openai-rocm:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id064 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id064 - workload-type: code - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - precision: int4 - runner: mi300x - runtime-stack-id: standalone:vllm -kimik2.5-int4-mi325x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimi_k2_5 - framework: vllm - hardware-profile-id: amd:mi325x_288gb - image: vllm/vllm-openai-rocm:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id065 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id065 - workload-type: code - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - precision: int4 - runner: mi325x - runtime-stack-id: standalone:vllm -kimik2.5-int4-mi355x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: kimi_k2_5 - framework: vllm - hardware-profile-id: amd:mi355x_288gb - image: vllm/vllm-openai-rocm:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id066 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id066 - workload-type: code - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - precision: int4 - runner: mi355x - runtime-stack-id: standalone:vllm -minimaxm2.5-fp4-b200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimax_m2_5 - framework: vllm - hardware-profile-id: nvidia:b200_sxm_180gb - image: vllm/vllm-openai:v0.19.0-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id067 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id067 - workload-type: code - model: nvidia/MiniMax-M2.5-NVFP4 - model-prefix: minimaxm2.5 - precision: fp4 - runner: b200 - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-b200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimax_m2_5 - framework: vllm - hardware-profile-id: nvidia:b200_sxm_180gb - image: vllm/vllm-openai:v0.19.0-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: 
datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id068 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id068 - workload-type: code - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - precision: fp8 - runner: b200 - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-h100-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimax_m2_5 - framework: vllm - hardware-profile-id: nvidia:h100_sxm_80gb - image: vllm/vllm-openai:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id069 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id069 - workload-type: code - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - precision: fp8 - runner: h100 - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-h200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimax_m2_5 - framework: vllm - hardware-profile-id: nvidia:h200_sxm_141gb - image: vllm/vllm-openai:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id070 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id070 - workload-type: code - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - precision: fp8 - runner: h200 - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-mi300x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimax_m2_5 - framework: vllm - hardware-profile-id: amd:mi300x_192gb - image: vllm/vllm-openai-rocm:v0.16.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id071 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id071 - workload-type: code - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - precision: fp8 - runner: mi300x - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-mi325x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimax_m2_5 - framework: vllm - hardware-profile-id: amd:mi325x_288gb - image: vllm/vllm-openai-rocm:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id072 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 
'off' - - noprefix - tp: 8 - users: *id072 - workload-type: code - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - precision: fp8 - runner: mi325x - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-mi355x-atom-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimax_m2_5 - framework: atom - hardware-profile-id: amd:mi355x_288gb - image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id073 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id073 - workload-type: code - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:atom -minimaxm2.5-fp8-mi355x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimax_m2_5 - framework: vllm - hardware-profile-id: amd:mi355x_288gb - image: vllm/vllm-openai-rocm:v0.19.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id074 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id074 - workload-type: code - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:vllm -qwen3.5-bf16-b200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3_5_397b_a17b - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id075 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id075 - workload-type: code - model: Qwen/Qwen3.5-397B-A17B - model-prefix: qwen3.5 - precision: bf16 - runner: b200 - runtime-stack-id: standalone:sglang -qwen3.5-bf16-mi300x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3_5_397b_a17b - framework: sglang - hardware-profile-id: amd:mi300x_192gb - image: lmsysorg/sglang:v0.5.9-rocm720-mi30x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id076 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id076 - workload-type: code - model: Qwen/Qwen3.5-397B-A17B - model-prefix: qwen3.5 - precision: bf16 - runner: mi300x - runtime-stack-id: standalone:sglang 
-qwen3.5-bf16-mi325x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3_5_397b_a17b - framework: sglang - hardware-profile-id: amd:mi325x_288gb - image: lmsysorg/sglang:v0.5.9-rocm720-mi30x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id077 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id077 - workload-type: code - model: Qwen/Qwen3.5-397B-A17B - model-prefix: qwen3.5 - precision: bf16 - runner: mi325x - runtime-stack-id: standalone:sglang -qwen3.5-bf16-mi355x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3_5_397b_a17b - framework: sglang - hardware-profile-id: amd:mi355x_288gb - image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260215 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id078 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id078 - workload-type: code - model: Qwen/Qwen3.5-397B-A17B - model-prefix: qwen3.5 - precision: bf16 - runner: mi355x - runtime-stack-id: standalone:sglang -qwen3.5-fp4-b200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3_5_397b_a17b - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id079 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id079 - workload-type: code - model: nvidia/Qwen3.5-397B-A17B-NVFP4 - model-prefix: qwen3.5 - precision: fp4 - runner: b200 - runtime-stack-id: standalone:sglang -qwen3.5-fp4-mi355x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3_5_397b_a17b - framework: sglang - hardware-profile-id: amd:mi355x_288gb - image: lmsysorg/sglang:v0.5.10-rocm720-mi35x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id080 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id080 - workload-type: code - model: amd/Qwen3.5-397B-A17B-MXFP4 - model-prefix: qwen3.5 - precision: fp4 - runner: mi355x - runtime-stack-id: standalone:sglang -qwen3.5-fp8-b200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3_5_397b_a17b - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - 
image: lmsysorg/sglang:v0.5.9-cu130-amd64 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id081 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id081 - workload-type: code - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - runner: b200 - runtime-stack-id: standalone:sglang -qwen3.5-fp8-b200-sglang-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3_5_397b_a17b - framework: sglang - hardware-profile-id: nvidia:b200_sxm_180gb - image: lmsysorg/sglang:v0.5.9-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id082 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id082 - workload-type: code - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - runner: b200 - runtime-stack-id: standalone:sglang -qwen3.5-fp8-h200-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3_5_397b_a17b - framework: sglang - hardware-profile-id: nvidia:h200_sxm_141gb - image: lmsysorg/sglang:v0.5.9-cu129-amd64 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id083 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id083 - workload-type: code - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - runner: h200 - runtime-stack-id: standalone:sglang -qwen3.5-fp8-h200-sglang-mtp-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3_5_397b_a17b - framework: sglang - hardware-profile-id: nvidia:h200_sxm_141gb - image: lmsysorg/sglang:v0.5.10.post1 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id084 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id084 - workload-type: code - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - runner: h200 - runtime-stack-id: standalone:sglang -qwen3.5-fp8-mi300x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3_5_397b_a17b - framework: sglang - hardware-profile-id: amd:mi300x_192gb - image: lmsysorg/sglang:v0.5.9-rocm720-mi30x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - 
offload-modes: - - 'on' - - 'off' - - noprefix - users: &id085 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id085 - workload-type: code - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - runner: mi300x - runtime-stack-id: standalone:sglang -qwen3.5-fp8-mi325x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3_5_397b_a17b - framework: sglang - hardware-profile-id: amd:mi325x_288gb - image: lmsysorg/sglang:v0.5.9-rocm720-mi30x - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id086 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id086 - workload-type: code - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - runner: mi325x - runtime-stack-id: standalone:sglang -qwen3.5-fp8-mi355x-sglang-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: qwen3_5_397b_a17b - framework: sglang - hardware-profile-id: amd:mi355x_288gb - image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id087 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - support-status: reviewed_preview - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id087 - workload-type: code - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - runner: mi355x - runtime-stack-id: standalone:sglang diff --git a/.github/configs/isb1-kv-stress.yaml b/.github/configs/isb1-kv-stress.yaml index 9ee07ef5d..544ecd9dd 100644 --- a/.github/configs/isb1-kv-stress.yaml +++ b/.github/configs/isb1-kv-stress.yaml @@ -1,96 +1,3589 @@ -# Dedicated ISB1 KV cache stress sweeps (CTO-approved schema). -# -# This file is intentionally separate from isb1-master.yaml and uses -# benchmark-type: isb1_kv_stress with kv-stress-configs. 
- -gptoss-fp4-h200-isb1-kv-stress-vllm-code: +dsr1-fp4-b200-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id001 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id001 + workload-type: code + model: deepseek-r1-fp4 + model-prefix: dsr1 + precision: fp4 + runner: b200-multinode + runtime-stack-id: dynamo:sglang +dsr1-fp4-b200-dynamo-sglang-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.8.post1-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id002 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id002 + workload-type: code + model: deepseek-r1-fp4 + model-prefix: dsr1 + precision: fp4 + runner: b200-multinode + runtime-stack-id: dynamo:sglang +dsr1-fp4-b200-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id003 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id003 + workload-type: code + model: deepseek-r1-fp4 + model-prefix: dsr1 + precision: fp4 + runner: b200-multinode + runtime-stack-id: dynamo:trt +dsr1-fp4-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id004 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id004 + workload-type: code + model: nvidia/DeepSeek-R1-0528-FP4-V2 + model-prefix: dsr1 + precision: fp4 + runner: b200 + runtime-stack-id: standalone:sglang +dsr1-fp4-b200-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: 
nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id005 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id005 + workload-type: code + model: nvidia/DeepSeek-R1-0528-FP4-V2 + model-prefix: dsr1 + precision: fp4 + runner: b200 + runtime-stack-id: standalone:trt +dsr1-fp4-b200-trt-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post3 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id006 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id006 + workload-type: code + model: nvidia/DeepSeek-R1-0528-FP4-V2 + model-prefix: dsr1 + precision: fp4 + runner: b200 + runtime-stack-id: standalone:trt +dsr1-fp4-b300-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:b300_sxm_288gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id007 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id007 + workload-type: code + model: deepseek-r1-fp4 + model-prefix: dsr1 + precision: fp4 + runner: b300 + runtime-stack-id: dynamo:trt +dsr1-fp4-gb200-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: lmsysorg/sglang:v0.5.8-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id008 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id008 + workload-type: code + model: nvidia/DeepSeek-R1-0528-NVFP4-v2 + model-prefix: dsr1 + precision: fp4 + runner: gb200 + runtime-stack-id: dynamo:sglang +dsr1-fp4-gb200-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + 
- duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id009 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id009 + workload-type: code + model: nvidia/DeepSeek-R1-0528-NVFP4-v2 + model-prefix: dsr1 + precision: fp4 + runner: gb200 + runtime-stack-id: dynamo:trt +dsr1-fp4-gb300-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:gb300_grace_blackwell_288gb + image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id010 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id010 + workload-type: code + model: nvidia/DeepSeek-R1-0528-NVFP4-v2 + model-prefix: dsr1 + precision: fp4 + runner: gb300 + runtime-stack-id: dynamo:sglang +dsr1-fp4-gb300-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:gb300_grace_blackwell_288gb + image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id011 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id011 + workload-type: code + model: nvidia/DeepSeek-R1-0528-NVFP4-v2 + model-prefix: dsr1 + precision: fp4 + runner: gb300 + runtime-stack-id: dynamo:trt +dsr1-fp4-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id012 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id012 + workload-type: code + model: amd/DeepSeek-R1-0528-MXFP4-Preview + model-prefix: dsr1 + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:atom +dsr1-fp4-mi355x-atom-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.2.0-ubuntu24.04-pytorch2.9-atom0.1.1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id013 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: 
unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id013 + workload-type: code + model: amd/DeepSeek-R1-0528-MXFP4 + model-prefix: dsr1 + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:atom +dsr1-fp4-mi355x-sglang-disagg-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang-disagg + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id014 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id014 + workload-type: code + model: amd/DeepSeek-R1-0528-MXFP4 + model-prefix: dsr1 + precision: fp4 + runner: mi355x-disagg + runtime-stack-id: standalone:sglang-disagg +dsr1-fp4-mi355x-sglang-disagg-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang-disagg + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id015 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id015 + workload-type: code + model: amd/DeepSeek-R1-0528-MXFP4 + model-prefix: dsr1 + precision: fp4 + runner: mi355x-disagg + runtime-stack-id: standalone:sglang-disagg +dsr1-fp4-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: lmsysorg/sglang:v0.5.9-rocm700-mi35x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id016 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id016 + workload-type: code + model: amd/DeepSeek-R1-0528-MXFP4-Preview + model-prefix: dsr1 + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:sglang +dsr1-fp8-b200-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id017 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id017 + workload-type: code + model: 
deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b200-multinode + runtime-stack-id: dynamo:sglang +dsr1-fp8-b200-dynamo-sglang-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id018 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id018 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b200-multinode + runtime-stack-id: dynamo:sglang +dsr1-fp8-b200-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id019 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id019 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b200-multinode + runtime-stack-id: dynamo:trt +dsr1-fp8-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id020 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id020 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +dsr1-fp8-b200-sglang-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id021 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id021 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +dsr1-fp8-b200-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + 
framework: trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id022 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id022 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:trt +dsr1-fp8-b200-trt-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post3 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id023 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id023 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:trt +dsr1-fp8-b300-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:b300_sxm_288gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id024 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id024 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: b300 + runtime-stack-id: dynamo:trt +dsr1-fp8-gb200-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: lmsysorg/sglang:v0.5.8.post1-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id025 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id025 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: gb200 + runtime-stack-id: dynamo:sglang +dsr1-fp8-gb200-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: 
datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id026 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id026 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: gb200 + runtime-stack-id: dynamo:trt +dsr1-fp8-gb300-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:gb300_grace_blackwell_288gb + image: lmsysorg/sglang:v0.5.8.post1-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id027 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id027 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: gb300 + runtime-stack-id: dynamo:sglang +dsr1-fp8-gb300-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:gb300_grace_blackwell_288gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id028 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id028 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: gb300 + runtime-stack-id: dynamo:trt +dsr1-fp8-h100-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:h100_sxm_80gb + image: lmsysorg/sglang:v0.5.8-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id029 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id029 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: h100-multinode + runtime-stack-id: dynamo:sglang +dsr1-fp8-h100-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:h100_sxm_80gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id030 + - 2 + - 4 + - 
8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id030 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: h100-multinode + runtime-stack-id: dynamo:trt +dsr1-fp8-h200-dynamo-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.8.post1-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id031 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id031 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: h200-multinode + runtime-stack-id: dynamo:sglang +dsr1-fp8-h200-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: dynamo-trt + hardware-profile-id: nvidia:h200_sxm_141gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id032 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id032 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: h200-multinode + runtime-stack-id: dynamo:trt +dsr1-fp8-h200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id033 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id033 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +dsr1-fp8-h200-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: trt + hardware-profile-id: nvidia:h200_sxm_141gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id034 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id034 + workload-type: code + model: 
deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:trt +dsr1-fp8-h200-trt-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: trt + hardware-profile-id: nvidia:h200_sxm_141gb + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id035 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id035 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:trt +dsr1-fp8-mi300x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang + hardware-profile-id: amd:mi300x_192gb + image: lmsysorg/sglang:v0.5.9-rocm700-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id036 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id036 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: mi300x + runtime-stack-id: standalone:sglang +dsr1-fp8-mi325x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang + hardware-profile-id: amd:mi325x_288gb + image: lmsysorg/sglang:v0.5.9-rocm700-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id037 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id037 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: mi325x + runtime-stack-id: standalone:sglang +dsr1-fp8-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id038 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id038 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:atom +dsr1-fp8-mi355x-atom-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: atom + 
hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id039 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id039 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:atom +dsr1-fp8-mi355x-sglang-disagg-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang-disagg + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id040 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id040 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: mi355x-disagg + runtime-stack-id: standalone:sglang-disagg +dsr1-fp8-mi355x-sglang-disagg-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang-disagg + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id041 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id041 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: mi355x-disagg + runtime-stack-id: standalone:sglang-disagg +dsr1-fp8-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: deepseek_r1_0528 + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: lmsysorg/sglang:v0.5.9-rocm700-mi35x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id042 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id042 + workload-type: code + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:sglang +glm5-fp4-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: glm_5 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: 
datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id043 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id043 + workload-type: code + model: nvidia/GLM-5-NVFP4 + model-prefix: glm5 + precision: fp4 + runner: b200 + runtime-stack-id: standalone:sglang +glm5-fp8-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: glm_5 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id044 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id044 + workload-type: code + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +glm5-fp8-h200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: glm_5 + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:glm5-hopper + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id045 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id045 + workload-type: code + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +glm5-fp8-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: glm_5 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id046 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id046 + workload-type: code + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:atom +glm5-fp8-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: glm_5 + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id047 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 
'on' + - 'off' + - noprefix + tp: 8 + users: *id047 + workload-type: code + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:sglang +gptoss-fp4-b200-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: trt + hardware-profile-id: nvidia:b200_sxm_180gb + image: nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc2.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id048 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id048 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + runner: b200 + runtime-stack-id: standalone:trt +gptoss-fp4-b200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: vllm + hardware-profile-id: nvidia:b200_sxm_180gb + image: vllm/vllm-openai:v0.15.1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id049 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id049 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + runner: b200 + runtime-stack-id: standalone:vllm +gptoss-fp4-gb200-dynamo-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: dynamo-trt + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.7.0.post2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id050 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id050 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + runner: gb200 + runtime-stack-id: dynamo:trt +gptoss-fp4-h100-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: vllm + hardware-profile-id: nvidia:h100_sxm_80gb image: vllm/vllm-openai:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id051 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id051 + workload-type: code model: openai/gpt-oss-120b model-prefix: gptoss precision: fp4 + runner: h100 + runtime-stack-id: standalone:vllm +gptoss-fp4-h200-trt-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: trt +
hardware-profile-id: nvidia:h200_sxm_141gb + image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc11 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id052 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id052 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + runner: h200 + runtime-stack-id: standalone:trt +gptoss-fp4-h200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b framework: vllm + hardware-profile-id: nvidia:h200_sxm_141gb + image: vllm/vllm-openai:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id053 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id053 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 runner: h200 + runtime-stack-id: standalone:vllm +gptoss-fp4-mi300x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: vllm + hardware-profile-id: amd:mi300x_192gb + image: vllm/vllm-openai-rocm:v0.17.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id054 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id054 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + runner: mi300x + runtime-stack-id: standalone:vllm +gptoss-fp4-mi325x-vllm-isb1-kv-stress: benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b + framework: vllm + hardware-profile-id: amd:mi325x_288gb + image: vllm/vllm-openai-rocm:v0.17.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id055 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id055 + workload-type: code + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + runner: mi325x runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:h200_sxm_141gb +gptoss-fp4-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress canonical-model-id: gpt_oss_120b - max-model-len: 131272 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json - request-mode: multi-turn - support-status: reviewed_preview - workload-type: code - search-space: - - users: [2, 4, 8, 16, 32, 64, 128, 256] - offload-modes: ["on", "off", "noprefix"] - duration-s: 1800
- -gptoss-fp4-b200-isb1-kv-stress-vllm-code: - image: vllm/vllm-openai:v0.15.1 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id056 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id056 + workload-type: code model: openai/gpt-oss-120b model-prefix: gptoss precision: fp4 + runner: mi355x + runtime-stack-id: standalone:atom +gptoss-fp4-mi355x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: gpt_oss_120b framework: vllm + hardware-profile-id: amd:mi355x_288gb + image: vllm/vllm-openai-rocm:v0.17.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id057 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id057 + workload-type: code + model: amd/gpt-oss-120b-w-mxfp4-a-fp8 + model-prefix: gptoss + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:vllm +kimik2.5-fp4-b200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: vllm + hardware-profile-id: nvidia:b200_sxm_180gb + image: vllm/vllm-openai:v0.17.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id058 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id058 + workload-type: code + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + precision: fp4 runner: b200 + runtime-stack-id: standalone:vllm +kimik2.5-fp4-gb200-dynamo-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: dynamo-vllm + hardware-profile-id: nvidia:gb200_grace_blackwell_192gb + image: vllm/vllm-openai:v0.18.0-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id059 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id059 + workload-type: code + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + precision: fp4 + runner: gb200 + runtime-stack-id: dynamo:vllm +kimik2.5-fp4-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: 
datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id060 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id060 + workload-type: code + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + precision: fp4 + runner: mi355x + runtime-stack-id: standalone:atom +kimik2.5-fp4-mi355x-vllm-isb1-kv-stress: benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: vllm + hardware-profile-id: amd:mi355x_288gb + image: vllm/vllm-openai-rocm:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id061 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id061 + workload-type: code + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + precision: fp4 + runner: mi355x runtime-stack-id: standalone:vllm +kimik2.5-int4-b200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: vllm hardware-profile-id: nvidia:b200_sxm_180gb - canonical-model-id: gpt_oss_120b - max-model-len: 131272 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json - request-mode: multi-turn - support-status: reviewed_preview - workload-type: code - search-space: - - users: [2, 4, 8, 16, 32, 64, 128, 256] - offload-modes: ["on", "off", "noprefix"] - duration-s: 1800 - -qwen3.5-fp8-h200-isb1-kv-stress-vllm-code: + image: vllm/vllm-openai:v0.15.1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id062 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id062 + workload-type: code + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + precision: int4 + runner: b200 + runtime-stack-id: standalone:vllm +kimik2.5-int4-h200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: vllm + hardware-profile-id: nvidia:h200_sxm_141gb + image: vllm/vllm-openai:v0.16.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id063 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id063 + workload-type: code + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + precision: int4 + runner: h200 + runtime-stack-id: standalone:vllm +kimik2.5-int4-mi300x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: vllm + hardware-profile-id: amd:mi300x_192gb + image: vllm/vllm-openai-rocm:v0.18.0 
+ kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id064 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id064 + workload-type: code + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + precision: int4 + runner: mi300x + runtime-stack-id: standalone:vllm +kimik2.5-int4-mi325x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: vllm + hardware-profile-id: amd:mi325x_288gb + image: vllm/vllm-openai-rocm:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id065 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id065 + workload-type: code + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + precision: int4 + runner: mi325x + runtime-stack-id: standalone:vllm +kimik2.5-int4-mi355x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: kimi_k2_5 + framework: vllm + hardware-profile-id: amd:mi355x_288gb + image: vllm/vllm-openai-rocm:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id066 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id066 + workload-type: code + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + precision: int4 + runner: mi355x + runtime-stack-id: standalone:vllm +minimaxm2.5-fp4-b200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 + framework: vllm + hardware-profile-id: nvidia:b200_sxm_180gb + image: vllm/vllm-openai:v0.19.0-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id067 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id067 + workload-type: code + model: nvidia/MiniMax-M2.5-NVFP4 + model-prefix: minimaxm2.5 + precision: fp4 + runner: b200 + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-b200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 + framework: vllm + hardware-profile-id: nvidia:b200_sxm_180gb + image: vllm/vllm-openai:v0.19.0-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id068 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + 
tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id068 + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-h100-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 + framework: vllm + hardware-profile-id: nvidia:h100_sxm_80gb image: vllm/vllm-openai:v0.18.0 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id069 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id069 + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 precision: fp8 + runner: h100 + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-h200-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 framework: vllm + hardware-profile-id: nvidia:h200_sxm_141gb + image: vllm/vllm-openai:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id070 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id070 + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + precision: fp8 runner: h200 + runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-mi300x-vllm-isb1-kv-stress: benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 + framework: vllm + hardware-profile-id: amd:mi300x_192gb + image: vllm/vllm-openai-rocm:v0.16.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id071 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id071 + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + precision: fp8 + runner: mi300x runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:h200_sxm_141gb +minimaxm2.5-fp8-mi325x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 + framework: vllm + hardware-profile-id: amd:mi325x_288gb + image: vllm/vllm-openai-rocm:v0.18.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id072 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id072 + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + precision: fp8 + runner: mi325x + 
runtime-stack-id: standalone:vllm +minimaxm2.5-fp8-mi355x-atom-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 + framework: atom + hardware-profile-id: amd:mi355x_288gb + image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id073 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id073 + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:atom +minimaxm2.5-fp8-mi355x-vllm-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: minimax_m2_5 + framework: vllm + hardware-profile-id: amd:mi355x_288gb + image: vllm/vllm-openai-rocm:v0.19.0 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id074 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: unsupported + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id074 + workload-type: code + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:vllm +qwen3.5-bf16-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress canonical-model-id: qwen3_5_397b_a17b - max-model-len: 131272 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json - request-mode: multi-turn - support-status: reviewed_preview - workload-type: code - search-space: - - users: [2, 4, 8, 16, 32, 64, 128, 256] - offload-modes: ["on", "off", "noprefix"] - duration-s: 1800 - -qwen3.5-fp8-b200-isb1-kv-stress-vllm-code: - image: vllm/vllm-openai:v0.19.0-cu130 + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id075 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id075 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + precision: bf16 + runner: b200 + runtime-stack-id: standalone:sglang +qwen3.5-bf16-mi300x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi300x_192gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id076 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - 
duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id076 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + precision: bf16 + runner: mi300x + runtime-stack-id: standalone:sglang +qwen3.5-bf16-mi325x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi325x_288gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id077 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id077 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + precision: bf16 + runner: mi325x + runtime-stack-id: standalone:sglang +qwen3.5-bf16-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260215 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id078 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id078 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + precision: bf16 + runner: mi355x + runtime-stack-id: standalone:sglang +qwen3.5-fp4-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id079 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id079 + workload-type: code + model: nvidia/Qwen3.5-397B-A17B-NVFP4 + model-prefix: qwen3.5 + precision: fp4 + runner: b200 + runtime-stack-id: standalone:sglang +qwen3.5-fp4-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: lmsysorg/sglang:v0.5.10-rocm720-mi35x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id080 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id080 + workload-type: code + model: amd/Qwen3.5-397B-A17B-MXFP4 + model-prefix: qwen3.5 + 
precision: fp4 + runner: mi355x + runtime-stack-id: standalone:sglang +qwen3.5-fp8-b200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130-amd64 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id081 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id081 + workload-type: code model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 precision: fp8 - framework: vllm runner: b200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-b200-sglang-mtp-isb1-kv-stress: benchmark-type: isb1_kv_stress - runtime-stack-id: standalone:vllm + canonical-model-id: qwen3_5_397b_a17b + framework: sglang hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id082 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id082 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-h200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress canonical-model-id: qwen3_5_397b_a17b - max-model-len: 131272 + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.9-cu129-amd64 kv-cache-dtype: fp8 kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json - request-mode: multi-turn - support-status: reviewed_preview - workload-type: code - search-space: - - users: [2, 4, 8, 16, 32, 64, 128, 256] - offload-modes: ["on", "off", "noprefix"] - duration-s: 1800 + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id083 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id083 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-h200-sglang-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.10.post1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id084 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' 
+ - noprefix + tp: 8 + users: *id084 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi300x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi300x_192gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id085 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id085 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: mi300x + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi325x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi325x_288gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id086 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id086 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: mi325x + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id087 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id087 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:sglang diff --git a/.github/configs/isb1-master.yaml b/.github/configs/isb1-master.yaml index 99c111967..ff71182e5 100644 --- a/.github/configs/isb1-master.yaml +++ b/.github/configs/isb1-master.yaml @@ -1,79 +1,29 @@ -# PR2 packaged the core 8k1k replay bundles. -# PR4 adds truthful long-context extension replay lanes using only the materialized -# extension_32k / extension_64k / extension_131k code bundles. -# These extension lanes are served-shape replay artifacts derived from larger source -# workloads; they are not native 500k+/1M+ InferenceX served-lane claims. +# ISB1 master sweep config. # -# Core entries keep an explicit 8k1k max-model-len. Extension entries intentionally -# omit max-model-len so the ISB1 workflow derives the served-shape value from the -# export stem (32k1k / 64k1k / 131k1k) at execution time.
+# Each stanza pairs a runtime/hardware/model identity with one or more +# replay-config entries. Every (runtime-stack-id, hardware-profile-id, +# canonical-model-id) triple must exist in the referenced export bundle; +# row-level filtering in the hydrator selects the matching cells. # -# Official replay-configs pin support-status: supported so the workflow only replays -# the supported subset of mixed-status export bundles. -# All currently runnable rows also resolve to -# benchmark_certification_status=dataset_replay_verified. -# Phase 2 adds truthful chat-extension widening plus bounded preview/offload -# lanes. Preview rows stay explicit via support-status: reviewed_preview and the -# dedicated preview export paths. The current replay closure covers dsr1, -# gptoss, and qwen3.5 across core 8k1k plus extension bands, with bounded -# 500k code preview for gptoss and qwen3.5 on standalone sglang/vllm across -# b200/h100/h200. - -dsr1-fp8-b200-isb1-sglang: - image: lmsysorg/sglang:v0.5.9-cu130 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - framework: sglang - runner: b200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang - hardware-profile-id: nvidia:b200_sxm_180gb - canonical-model-id: deepseek_r1_0528 - max-model-len: 10240 - replay-configs: - - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - -dsr1-fp8-h200-isb1-sglang: - image: lmsysorg/sglang:v0.5.9-cu130 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - framework: sglang - runner: h200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang - hardware-profile-id: nvidia:h200_sxm_141gb - canonical-model-id: deepseek_r1_0528 - max-model-len: 10240 - replay-configs: - - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 +# Export-file paths are flat under datasets/isb1/exports/ (no per-engine +# subdirs). Qwen3.5 bundles are suffixed _qwen3.5.json to keep identity +# triples unambiguous. +# +# Core entries keep an explicit 8k1k max-model-len. Extension entries +# intentionally omit max-model-len so the ISB1 workflow derives the +# served-shape value from the export stem (32k1k / 64k1k / 131k1k) at +# execution time. +# +# support-status: +# supported — benchmark_certification_status=dataset_replay_verified +# reviewed_preview — preview rows, pinned explicitly for disclosure +# unsupported — rows retained for bundle coverage; not executed +# +# Current closure: dsr1, gptoss, qwen3.5 on standalone:vllm across core 8k1k +# and extension 32k/64k/131k bands, plus bounded 500k code preview on +# standalone:sglang for gptoss and qwen3.5. SGLang core/extension lanes are +# deferred until matching cells are materialized in the corresponding +# bundles.
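+#
+# Illustrative stanza shape (sketch only, distilled from the entries below;
+# angle-bracket values are placeholders, not a runnable lane):
+#
+#   <model-prefix>-<precision>-<runner>-isb1-<framework>:
+#     image: <pinned container image>
+#     model: <served model id>
+#     model-prefix: <prefix>
+#     precision: <precision>
+#     framework: <framework>
+#     runner: <runner>
+#     benchmark-type: isb1_replay
+#     runtime-stack-id: <stack>        # identity triple: these three fields
+#     hardware-profile-id: <profile>   # must resolve to cells present in the
+#     canonical-model-id: <model-id>   # referenced export bundle
+#     max-model-len: 10240             # core 8k1k lanes only; extension
+#                                      # lanes omit it (derived from stem)
+#     replay-configs:
+#       - export-file: datasets/isb1/exports/<band>/<workload>_<stem>.json
+#         request-mode: multi-turn
+#         support-status: supported    # or reviewed_preview / unsupported
+#         search-space:
+#           - max-concurrency: 4
+#             num-warmup-sessions: 1
+#           - max-concurrency: 8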
dsr1-fp8-b200-isb1-vllm: image: vllm/vllm-openai:v0.19.0-cu130 @@ -88,14 +38,14 @@ dsr1-fp8-b200-isb1-vllm: canonical-model-id: deepseek_r1_0528 max-model-len: 10240 replay-configs: - - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + - export-file: datasets/isb1/exports/core/chat_8k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + - export-file: datasets/isb1/exports/core/code_8k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -116,14 +66,14 @@ dsr1-fp8-h200-isb1-vllm: canonical-model-id: deepseek_r1_0528 max-model-len: 10240 replay-configs: - - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + - export-file: datasets/isb1/exports/core/chat_8k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + - export-file: datasets/isb1/exports/core/code_8k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -131,112 +81,112 @@ dsr1-fp8-h200-isb1-vllm: num-warmup-sessions: 1 - max-concurrency: 8 -gptoss-fp4-b200-isb1-sglang: - image: lmsysorg/sglang:v0.5.9-cu130 +gptoss-fp4-b200-isb1-vllm: + # Keep the existing B200 GPT-OSS vLLM pin from the official throughput lane. + image: vllm/vllm-openai:v0.15.1 model: openai/gpt-oss-120b model-prefix: gptoss precision: fp4 - framework: sglang + framework: vllm runner: b200 benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang + runtime-stack-id: standalone:vllm hardware-profile-id: nvidia:b200_sxm_180gb canonical-model-id: gpt_oss_120b max-model-len: 10240 replay-configs: - - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + - export-file: datasets/isb1/exports/core/chat_8k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + - export-file: datasets/isb1/exports/core/code_8k1k.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 -gptoss-fp4-h100-isb1-sglang: - image: lmsysorg/sglang:v0.5.9-cu130 +gptoss-fp4-h100-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 model: openai/gpt-oss-120b model-prefix: gptoss precision: fp4 - framework: sglang + framework: vllm runner: h100 benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang + runtime-stack-id: standalone:vllm hardware-profile-id: nvidia:h100_sxm_80gb canonical-model-id: gpt_oss_120b max-model-len: 10240 replay-configs: - - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + - export-file: datasets/isb1/exports/core/chat_8k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + - export-file: datasets/isb1/exports/core/code_8k1k.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 -gptoss-fp4-h200-isb1-sglang: - image: lmsysorg/sglang:v0.5.9-cu130 +gptoss-fp4-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 model: openai/gpt-oss-120b model-prefix: 
gptoss precision: fp4 - framework: sglang + framework: vllm runner: h200 benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang + runtime-stack-id: standalone:vllm hardware-profile-id: nvidia:h200_sxm_141gb canonical-model-id: gpt_oss_120b max-model-len: 10240 replay-configs: - - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + - export-file: datasets/isb1/exports/core/chat_8k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + - export-file: datasets/isb1/exports/core/code_8k1k.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 -gptoss-fp4-b200-isb1-vllm: - # Keep the existing B200 GPT-OSS vLLM pin from the official throughput lane. - image: vllm/vllm-openai:v0.15.1 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 +qwen3.5-fp8-b200-isb1-vllm: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 framework: vllm runner: b200 benchmark-type: isb1_replay runtime-stack-id: standalone:vllm hardware-profile-id: nvidia:b200_sxm_180gb - canonical-model-id: gpt_oss_120b + canonical-model-id: qwen3_5_397b_a17b max-model-len: 10240 replay-configs: - - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + - export-file: datasets/isb1/exports/core/chat_8k1k_qwen3.5.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + - export-file: datasets/isb1/exports/core/code_8k1k_qwen3.5.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -244,27 +194,27 @@ gptoss-fp4-b200-isb1-vllm: num-warmup-sessions: 1 - max-concurrency: 8 -gptoss-fp4-h100-isb1-vllm: +qwen3.5-fp8-h100-isb1-vllm: image: vllm/vllm-openai:v0.18.0 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 framework: vllm runner: h100 benchmark-type: isb1_replay runtime-stack-id: standalone:vllm hardware-profile-id: nvidia:h100_sxm_80gb - canonical-model-id: gpt_oss_120b + canonical-model-id: qwen3_5_397b_a17b max-model-len: 10240 replay-configs: - - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + - export-file: datasets/isb1/exports/core/chat_8k1k_qwen3.5.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + - export-file: datasets/isb1/exports/core/code_8k1k_qwen3.5.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -272,27 +222,27 @@ gptoss-fp4-h100-isb1-vllm: num-warmup-sessions: 1 - max-concurrency: 8 -gptoss-fp4-h200-isb1-vllm: +qwen3.5-fp8-h200-isb1-vllm: image: vllm/vllm-openai:v0.18.0 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 framework: vllm runner: h200 benchmark-type: isb1_replay runtime-stack-id: standalone:vllm hardware-profile-id: nvidia:h200_sxm_141gb - canonical-model-id: gpt_oss_120b + canonical-model-id: qwen3_5_397b_a17b 
max-model-len: 10240 replay-configs: - - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + - export-file: datasets/isb1/exports/core/chat_8k1k_qwen3.5.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + - export-file: datasets/isb1/exports/core/code_8k1k_qwen3.5.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -300,359 +250,190 @@ gptoss-fp4-h200-isb1-vllm: num-warmup-sessions: 1 - max-concurrency: 8 -qwen3.5-fp8-b200-isb1-sglang: +dsr1-fp8-b200-isb1-sglang-extension: image: lmsysorg/sglang:v0.5.9-cu130 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 precision: fp8 framework: sglang runner: b200 benchmark-type: isb1_replay runtime-stack-id: standalone:sglang hardware-profile-id: nvidia:b200_sxm_180gb - canonical-model-id: qwen3_5_397b_a17b - max-model-len: 10240 - replay-configs: - - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - -qwen3.5-fp8-h100-isb1-sglang: - image: lmsysorg/sglang:v0.5.9-cu130 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - framework: sglang - runner: h100 - benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang - hardware-profile-id: nvidia:h100_sxm_80gb - canonical-model-id: qwen3_5_397b_a17b - max-model-len: 10240 + canonical-model-id: deepseek_r1_0528 replay-configs: - - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json request-mode: multi-turn - support-status: supported + support-status: unsupported search-space: - - max-concurrency: 4 + - max-concurrency: 2 num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json - request-mode: multi-turn - support-status: supported - search-space: - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 -qwen3.5-fp8-h200-isb1-sglang: +dsr1-fp8-h200-isb1-sglang-extension: image: lmsysorg/sglang:v0.5.9-cu130 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 precision: fp8 framework: sglang runner: h200 benchmark-type: isb1_replay runtime-stack-id: standalone:sglang hardware-profile-id: nvidia:h200_sxm_141gb - canonical-model-id: qwen3_5_397b_a17b - max-model-len: 10240 + canonical-model-id: deepseek_r1_0528 replay-configs: - - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json request-mode: multi-turn - support-status: supported + support-status: unsupported search-space: - - max-concurrency: 4 + - max-concurrency: 2 num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json - request-mode: multi-turn - support-status: supported - search-space: - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 -qwen3.5-fp8-b200-isb1-vllm: +dsr1-fp8-b200-isb1-vllm-extension: image: 
vllm/vllm-openai:v0.19.0-cu130 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 precision: fp8 framework: vllm runner: b200 benchmark-type: isb1_replay runtime-stack-id: standalone:vllm hardware-profile-id: nvidia:b200_sxm_180gb - canonical-model-id: qwen3_5_397b_a17b - max-model-len: 10240 + canonical-model-id: deepseek_r1_0528 replay-configs: - - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - -qwen3.5-fp8-h100-isb1-vllm: - image: vllm/vllm-openai:v0.18.0 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - framework: vllm - runner: h100 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:h100_sxm_80gb - canonical-model-id: qwen3_5_397b_a17b - max-model-len: 10240 - replay-configs: - - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + - export-file: datasets/isb1/exports/extension_32k/chat_32k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json request-mode: multi-turn - support-status: reviewed_preview + support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - -qwen3.5-fp8-h200-isb1-vllm: - image: vllm/vllm-openai:v0.18.0 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - framework: vllm - runner: h200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:h200_sxm_141gb - canonical-model-id: qwen3_5_397b_a17b - max-model-len: 10240 - replay-configs: - - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + - export-file: datasets/isb1/exports/extension_64k/chat_64k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json request-mode: multi-turn - support-status: reviewed_preview + support-status: unsupported search-space: - - max-concurrency: 4 + - max-concurrency: 2 num-warmup-sessions: 1 - - max-concurrency: 8 + - max-concurrency: 4 -dsr1-fp8-b200-isb1-sglang-extension: - image: lmsysorg/sglang:v0.5.9-cu130 +dsr1-fp8-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 precision: fp8 - framework: sglang - runner: b200 + framework: vllm + runner: h200 benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang - hardware-profile-id: nvidia:b200_sxm_180gb + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb canonical-model-id: deepseek_r1_0528 replay-configs: - - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json request-mode: multi-turn - support-status: supported + support-status: 
reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/chat_32k1k.json request-mode: multi-turn - support-status: reviewed_preview + support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/chat_64k1k.json request-mode: multi-turn - support-status: reviewed_preview + support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json request-mode: multi-turn - support-status: reviewed_preview + support-status: unsupported search-space: - max-concurrency: 2 num-warmup-sessions: 1 - max-concurrency: 4 -dsr1-fp8-h200-isb1-sglang-extension: - image: lmsysorg/sglang:v0.5.9-cu130 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - framework: sglang - runner: h200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang - hardware-profile-id: nvidia:h200_sxm_141gb - canonical-model-id: deepseek_r1_0528 - replay-configs: - - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 2 - num-warmup-sessions: 1 - - max-concurrency: 4 - -dsr1-fp8-b200-isb1-vllm-extension: - image: vllm/vllm-openai:v0.19.0-cu130 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 +gptoss-fp4-b200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 framework: vllm runner: b200 benchmark-type: isb1_replay runtime-stack-id: standalone:vllm hardware-profile-id: nvidia:b200_sxm_180gb - canonical-model-id: deepseek_r1_0528 + canonical-model-id: gpt_oss_120b replay-configs: - - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: - max-concurrency: 4 
num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/chat_32k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/chat_64k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: - max-concurrency: 2 num-warmup-sessions: 1 - max-concurrency: 4 - -dsr1-fp8-h200-isb1-vllm-extension: - image: vllm/vllm-openai:v0.18.0 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - precision: fp8 - framework: vllm - runner: h200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:h200_sxm_141gb - canonical-model-id: deepseek_r1_0528 - replay-configs: - - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + - export-file: datasets/isb1/exports/extension_131k/chat_131k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -660,54 +441,54 @@ dsr1-fp8-h200-isb1-vllm-extension: num-warmup-sessions: 1 - max-concurrency: 4 -gptoss-fp4-b200-isb1-sglang-extension: - image: lmsysorg/sglang:v0.5.9-cu130 +gptoss-fp4-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 model: openai/gpt-oss-120b model-prefix: gptoss precision: fp4 - framework: sglang - runner: b200 + framework: vllm + runner: h100 benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang - hardware-profile-id: nvidia:b200_sxm_180gb + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb canonical-model-id: gpt_oss_120b replay-configs: - - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: 
datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/chat_32k1k.json request-mode: multi-turn - support-status: reviewed_preview + support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/chat_64k1k.json request-mode: multi-turn - support-status: reviewed_preview + support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: - max-concurrency: 2 num-warmup-sessions: 1 - max-concurrency: 4 - - export-file: datasets/isb1/exports/extension_131k/sglang/chat_131k1k.json + - export-file: datasets/isb1/exports/extension_131k/chat_131k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -715,54 +496,54 @@ gptoss-fp4-b200-isb1-sglang-extension: num-warmup-sessions: 1 - max-concurrency: 4 -gptoss-fp4-h100-isb1-sglang-extension: - image: lmsysorg/sglang:v0.5.9-cu130 +gptoss-fp4-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 model: openai/gpt-oss-120b model-prefix: gptoss precision: fp4 - framework: sglang - runner: h100 + framework: vllm + runner: h200 benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang - hardware-profile-id: nvidia:h100_sxm_80gb + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb canonical-model-id: gpt_oss_120b replay-configs: - - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/chat_32k1k.json request-mode: multi-turn - support-status: reviewed_preview + support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/chat_64k1k.json request-mode: multi-turn - support-status: reviewed_preview + support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: - max-concurrency: 2 num-warmup-sessions: 1 - max-concurrency: 4 - - export-file: 
datasets/isb1/exports/extension_131k/sglang/chat_131k1k.json + - export-file: datasets/isb1/exports/extension_131k/chat_131k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -770,54 +551,39 @@ gptoss-fp4-h100-isb1-sglang-extension: num-warmup-sessions: 1 - max-concurrency: 4 -gptoss-fp4-h200-isb1-sglang-extension: +qwen3.5-fp8-b200-isb1-sglang-extension: image: lmsysorg/sglang:v0.5.9-cu130 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 framework: sglang - runner: h200 + runner: b200 benchmark-type: isb1_replay runtime-stack-id: standalone:sglang - hardware-profile-id: nvidia:h200_sxm_141gb - canonical-model-id: gpt_oss_120b + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b replay-configs: - - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json request-mode: multi-turn support-status: reviewed_preview search-space: - max-concurrency: 2 num-warmup-sessions: 1 - max-concurrency: 4 - - export-file: datasets/isb1/exports/extension_131k/sglang/chat_131k1k.json + +qwen3.5-fp8-h200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -825,54 +591,47 @@ gptoss-fp4-h200-isb1-sglang-extension: num-warmup-sessions: 1 - max-concurrency: 4 -gptoss-fp4-b200-isb1-vllm-extension: - image: vllm/vllm-openai:v0.15.1 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 +qwen3.5-fp8-b200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 framework: vllm runner: b200 benchmark-type: isb1_replay runtime-stack-id: standalone:vllm hardware-profile-id: nvidia:b200_sxm_180gb - canonical-model-id: gpt_oss_120b + canonical-model-id: qwen3_5_397b_a17b replay-configs: - - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json request-mode: multi-turn support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - 
- export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 2 - num-warmup-sessions: 1 - - max-concurrency: 4 - - export-file: datasets/isb1/exports/extension_131k/vllm/chat_131k1k.json + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -880,54 +639,47 @@ gptoss-fp4-b200-isb1-vllm-extension: num-warmup-sessions: 1 - max-concurrency: 4 -gptoss-fp4-h100-isb1-vllm-extension: +qwen3.5-fp8-h100-isb1-vllm-extension: image: vllm/vllm-openai:v0.18.0 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 framework: vllm runner: h100 benchmark-type: isb1_replay runtime-stack-id: standalone:vllm hardware-profile-id: nvidia:h100_sxm_80gb - canonical-model-id: gpt_oss_120b + canonical-model-id: qwen3_5_397b_a17b replay-configs: - - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json request-mode: multi-turn support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 2 - num-warmup-sessions: 1 - - max-concurrency: 4 - - export-file: datasets/isb1/exports/extension_131k/vllm/chat_131k1k.json + - export-file: 
datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -935,54 +687,47 @@ gptoss-fp4-h100-isb1-vllm-extension: num-warmup-sessions: 1 - max-concurrency: 4 -gptoss-fp4-h200-isb1-vllm-extension: +qwen3.5-fp8-h200-isb1-vllm-extension: image: vllm/vllm-openai:v0.18.0 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 framework: vllm runner: h200 benchmark-type: isb1_replay runtime-stack-id: standalone:vllm hardware-profile-id: nvidia:h200_sxm_141gb - canonical-model-id: gpt_oss_120b + canonical-model-id: qwen3_5_397b_a17b replay-configs: - - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json request-mode: multi-turn support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json request-mode: multi-turn - support-status: supported + support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 2 - num-warmup-sessions: 1 - - max-concurrency: 4 - - export-file: datasets/isb1/exports/extension_131k/vllm/chat_131k1k.json + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -990,7 +735,7 @@ gptoss-fp4-h200-isb1-vllm-extension: num-warmup-sessions: 1 - max-concurrency: 4 -qwen3.5-fp8-b200-isb1-sglang-extension: +qwen3.5-fp8-b200-isb1-sglang-500k-preview-code: image: lmsysorg/sglang:v0.5.9-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 @@ -1001,297 +746,9 @@ qwen3.5-fp8-b200-isb1-sglang-extension: runtime-stack-id: standalone:sglang hardware-profile-id: nvidia:b200_sxm_180gb canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 replay-configs: - - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - 
- max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k_qwen3.5.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 2 - num-warmup-sessions: 1 - - max-concurrency: 4 - -qwen3.5-fp8-h100-isb1-sglang-extension: - image: lmsysorg/sglang:v0.5.9-cu130 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - framework: sglang - runner: h100 - benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang - hardware-profile-id: nvidia:h100_sxm_80gb - canonical-model-id: qwen3_5_397b_a17b - replay-configs: - - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k_qwen3.5.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 2 - num-warmup-sessions: 1 - - max-concurrency: 4 - -qwen3.5-fp8-h200-isb1-sglang-extension: - image: lmsysorg/sglang:v0.5.9-cu130 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - framework: sglang - runner: h200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang - hardware-profile-id: nvidia:h200_sxm_141gb - canonical-model-id: qwen3_5_397b_a17b - replay-configs: - - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k_qwen3.5.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 2 - num-warmup-sessions: 1 - - max-concurrency: 4 - -qwen3.5-fp8-b200-isb1-vllm-extension: - image: vllm/vllm-openai:v0.19.0-cu130 - model: 
Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - framework: vllm - runner: b200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:b200_sxm_180gb - canonical-model-id: qwen3_5_397b_a17b - replay-configs: - - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 2 - num-warmup-sessions: 1 - - max-concurrency: 4 - -qwen3.5-fp8-h100-isb1-vllm-extension: - image: vllm/vllm-openai:v0.18.0 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - framework: vllm - runner: h100 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:h100_sxm_80gb - canonical-model-id: qwen3_5_397b_a17b - replay-configs: - - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 2 - num-warmup-sessions: 1 - - max-concurrency: 4 - -qwen3.5-fp8-h200-isb1-vllm-extension: - image: vllm/vllm-openai:v0.18.0 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - framework: vllm - runner: h200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:h200_sxm_141gb - canonical-model-id: qwen3_5_397b_a17b - replay-configs: - - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json - request-mode: multi-turn - support-status: supported - search-space: - 
- max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json - request-mode: multi-turn - support-status: supported - search-space: - - max-concurrency: 4 - num-warmup-sessions: 1 - - max-concurrency: 8 - - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 2 - num-warmup-sessions: 1 - - max-concurrency: 4 - -qwen3.5-fp8-b200-isb1-sglang-500k-preview-code: - image: lmsysorg/sglang:v0.5.9-cu130 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - framework: sglang - runner: b200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang - hardware-profile-id: nvidia:b200_sxm_180gb - canonical-model-id: qwen3_5_397b_a17b - max-model-len: 524288 - replay-configs: - - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -1313,7 +770,7 @@ qwen3.5-fp8-h100-isb1-sglang-500k-preview-code: canonical-model-id: qwen3_5_397b_a17b max-model-len: 524288 replay-configs: - - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -1335,73 +792,7 @@ qwen3.5-fp8-h200-isb1-sglang-500k-preview-code: canonical-model-id: qwen3_5_397b_a17b max-model-len: 524288 replay-configs: - - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 2 - max-turns-per-session: 4 - num-warmup-sessions: 0 - -qwen3.5-fp8-b200-isb1-vllm-500k-preview-code: - image: vllm/vllm-openai:v0.19.0-cu130 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - framework: vllm - runner: b200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:b200_sxm_180gb - canonical-model-id: qwen3_5_397b_a17b - max-model-len: 524288 - replay-configs: - - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 2 - max-turns-per-session: 4 - num-warmup-sessions: 0 - -qwen3.5-fp8-h100-isb1-vllm-500k-preview-code: - image: vllm/vllm-openai:v0.18.0 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - framework: vllm - runner: h100 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:h100_sxm_80gb - canonical-model-id: qwen3_5_397b_a17b - max-model-len: 524288 - replay-configs: - - export-file: 
datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 2 - max-turns-per-session: 4 - num-warmup-sessions: 0 - -qwen3.5-fp8-h200-isb1-vllm-500k-preview-code: - image: vllm/vllm-openai:v0.18.0 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - framework: vllm - runner: h200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:h200_sxm_141gb - canonical-model-id: qwen3_5_397b_a17b - max-model-len: 524288 - replay-configs: - - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -1423,7 +814,7 @@ gptoss-fp4-b200-isb1-sglang-500k-preview-code: canonical-model-id: gpt_oss_120b max-model-len: 524288 replay-configs: - - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -1445,7 +836,7 @@ gptoss-fp4-h100-isb1-sglang-500k-preview-code: canonical-model-id: gpt_oss_120b max-model-len: 524288 replay-configs: - - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -1467,185 +858,7 @@ gptoss-fp4-h200-isb1-sglang-500k-preview-code: canonical-model-id: gpt_oss_120b max-model-len: 524288 replay-configs: - - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 2 - max-turns-per-session: 4 - num-warmup-sessions: 0 - -gptoss-fp4-b200-isb1-vllm-500k-preview-code: - image: vllm/vllm-openai:v0.15.1 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - framework: vllm - runner: b200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:b200_sxm_180gb - canonical-model-id: gpt_oss_120b - max-model-len: 524288 - replay-configs: - - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 2 - max-turns-per-session: 4 - num-warmup-sessions: 0 - -gptoss-fp4-h100-isb1-vllm-500k-preview-code: - image: vllm/vllm-openai:v0.18.0 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - framework: vllm - runner: h100 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:h100_sxm_80gb - canonical-model-id: gpt_oss_120b - max-model-len: 524288 - replay-configs: - - export-file: 
datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 2 - max-turns-per-session: 4 - num-warmup-sessions: 0 - -gptoss-fp4-h200-isb1-vllm-500k-preview-code: - image: vllm/vllm-openai:v0.18.0 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - framework: vllm - runner: h200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:h200_sxm_141gb - canonical-model-id: gpt_oss_120b - max-model-len: 524288 - replay-configs: - - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 2 - max-turns-per-session: 4 - num-warmup-sessions: 0 - -gptoss-fp4-b200-isb1-sglang-offload-core-preview-chat: - image: lmsysorg/sglang:v0.5.9-cu130 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - framework: sglang - runner: b200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang - hardware-profile-id: nvidia:b200_sxm_180gb - canonical-model-id: gpt_oss_120b - max-model-len: 131272 - replay-configs: - - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 2 - max-turns-per-session: 4 - num-warmup-sessions: 0 - - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__standard.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 4 - max-turns-per-session: 6 - num-warmup-sessions: 0 - -gptoss-fp4-h100-isb1-sglang-offload-core-preview-chat: - image: lmsysorg/sglang:v0.5.9-cu130 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - framework: sglang - runner: h100 - benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang - hardware-profile-id: nvidia:h100_sxm_80gb - canonical-model-id: gpt_oss_120b - max-model-len: 131272 - replay-configs: - - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 2 - max-turns-per-session: 4 - num-warmup-sessions: 0 - - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__standard.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 4 - max-turns-per-session: 6 - num-warmup-sessions: 0 - -gptoss-fp4-h200-isb1-sglang-offload-core-preview-chat: - image: lmsysorg/sglang:v0.5.9-cu130 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - framework: sglang - runner: h200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:sglang - hardware-profile-id: nvidia:h200_sxm_141gb - canonical-model-id: gpt_oss_120b - max-model-len: 131272 - replay-configs: - - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json - request-mode: multi-turn - support-status: reviewed_preview - 
search-space: - - max-concurrency: 1 - max-sessions: 2 - max-turns-per-session: 4 - num-warmup-sessions: 0 - - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__standard.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 4 - max-turns-per-session: 6 - num-warmup-sessions: 0 - -gptoss-fp4-b200-isb1-vllm-offload-core-preview-code: - image: vllm/vllm-openai:v0.15.1 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - framework: vllm - runner: b200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:b200_sxm_180gb - canonical-model-id: gpt_oss_120b - max-model-len: 131272 - replay-configs: - - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__smoke.json + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -1653,71 +866,4 @@ gptoss-fp4-b200-isb1-vllm-offload-core-preview-code: max-sessions: 2 max-turns-per-session: 4 num-warmup-sessions: 0 - - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__standard.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 4 - max-turns-per-session: 6 - num-warmup-sessions: 0 -gptoss-fp4-h100-isb1-vllm-offload-core-preview-code: - image: vllm/vllm-openai:v0.18.0 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - framework: vllm - runner: h100 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:h100_sxm_80gb - canonical-model-id: gpt_oss_120b - max-model-len: 131272 - replay-configs: - - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__smoke.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 2 - max-turns-per-session: 4 - num-warmup-sessions: 0 - - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__standard.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 4 - max-turns-per-session: 6 - num-warmup-sessions: 0 - -gptoss-fp4-h200-isb1-vllm-offload-core-preview-code: - image: vllm/vllm-openai:v0.18.0 - model: openai/gpt-oss-120b - model-prefix: gptoss - precision: fp4 - framework: vllm - runner: h200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:h200_sxm_141gb - canonical-model-id: gpt_oss_120b - max-model-len: 131272 - replay-configs: - - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__smoke.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 2 - max-turns-per-session: 4 - num-warmup-sessions: 0 - - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__standard.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 4 - max-turns-per-session: 6 - 
num-warmup-sessions: 0 diff --git a/.github/configs/isb1-qwen-1m-preview.yaml b/.github/configs/isb1-qwen-1m-preview.yaml index 1de9c7339..66ac28a67 100644 --- a/.github/configs/isb1-qwen-1m-preview.yaml +++ b/.github/configs/isb1-qwen-1m-preview.yaml @@ -21,7 +21,7 @@ qwen3.5-fp8-b200-isb1-sglang-1m-gated-preview-code: canonical-model-id: qwen3_5_397b_a17b max-model-len: 1048576 replay-configs: - - export-file: datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__sglang.json + - export-file: datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -30,24 +30,3 @@ qwen3.5-fp8-b200-isb1-sglang-1m-gated-preview-code: max-turns-per-session: 3 num-warmup-sessions: 0 -qwen3.5-fp8-b200-isb1-vllm-1m-gated-preview-code: - image: vllm/vllm-openai:v0.19.0-cu130 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - precision: fp8 - framework: vllm - runner: b200 - benchmark-type: isb1_replay - runtime-stack-id: standalone:vllm - hardware-profile-id: nvidia:b200_sxm_180gb - canonical-model-id: qwen3_5_397b_a17b - max-model-len: 1048576 - replay-configs: - - export-file: datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json - request-mode: multi-turn - support-status: reviewed_preview - search-space: - - max-concurrency: 1 - max-sessions: 1 - max-turns-per-session: 3 - num-warmup-sessions: 0 diff --git a/.github/configs/isb1-triattn-preview.yaml b/.github/configs/isb1-triattn-preview.yaml index 629cb8fe9..ee482c046 100644 --- a/.github/configs/isb1-triattn-preview.yaml +++ b/.github/configs/isb1-triattn-preview.yaml @@ -36,14 +36,14 @@ dsr1triattn-fp8-h100-isb1-vllm: canonical-model-id: deepseek_r1_0528 max-model-len: 10240 replay-configs: - - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + - export-file: datasets/isb1/exports/core/chat_8k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + - export-file: datasets/isb1/exports/core/code_8k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -64,14 +64,14 @@ dsr1triattn-fp8-h200-isb1-vllm: canonical-model-id: deepseek_r1_0528 max-model-len: 10240 replay-configs: - - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + - export-file: datasets/isb1/exports/core/chat_8k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + - export-file: datasets/isb1/exports/core/code_8k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -95,13 +95,13 @@ dsr1triattn-fp8-h100-isb1-vllm-extension: hardware-profile-id: nvidia:h100_sxm_80gb canonical-model-id: deepseek_r1_0528 replay-configs: - - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json request-mode: multi-turn support-status: supported search-space: @@ 
-120,13 +120,13 @@ dsr1triattn-fp8-h200-isb1-vllm-extension: hardware-profile-id: nvidia:h200_sxm_141gb canonical-model-id: deepseek_r1_0528 replay-configs: - - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json request-mode: multi-turn support-status: supported search-space: @@ -150,7 +150,7 @@ qwen3.5triattn-fp8-h100-isb1-vllm-extension: hardware-profile-id: nvidia:h100_sxm_80gb canonical-model-id: qwen3_5_397b_a17b replay-configs: - - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -169,7 +169,7 @@ qwen3.5triattn-fp8-h200-isb1-vllm-extension: hardware-profile-id: nvidia:h200_sxm_141gb canonical-model-id: qwen3_5_397b_a17b replay-configs: - - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -193,14 +193,14 @@ gptosstriattn-fp4-h100-isb1-vllm: canonical-model-id: gpt_oss_120b max-model-len: 10240 replay-configs: - - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + - export-file: datasets/isb1/exports/core/chat_8k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + - export-file: datasets/isb1/exports/core/code_8k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -221,14 +221,14 @@ gptosstriattn-fp4-h200-isb1-vllm: canonical-model-id: gpt_oss_120b max-model-len: 10240 replay-configs: - - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + - export-file: datasets/isb1/exports/core/chat_8k1k.json request-mode: multi-turn support-status: supported search-space: - max-concurrency: 4 num-warmup-sessions: 1 - max-concurrency: 8 - - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + - export-file: datasets/isb1/exports/core/code_8k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: @@ -252,13 +252,13 @@ gptosstriattn-fp4-h100-isb1-vllm-extension: hardware-profile-id: nvidia:h100_sxm_80gb canonical-model-id: gpt_oss_120b replay-configs: - - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json request-mode: multi-turn support-status: reviewed_preview search-space: - max-concurrency: 4 num-warmup-sessions: 1 - - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json request-mode: multi-turn support-status: supported search-space: @@ -277,13 +277,13 @@ gptosstriattn-fp4-h200-isb1-vllm-extension: hardware-profile-id: nvidia:h200_sxm_141gb canonical-model-id: gpt_oss_120b replay-configs: - - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json request-mode: multi-turn support-status: reviewed_preview 
search-space: - max-concurrency: 4 num-warmup-sessions: 1 - - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json request-mode: multi-turn support-status: supported search-space: diff --git a/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md b/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md index 175765ab1..8827bc226 100644 --- a/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md +++ b/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md @@ -15,7 +15,7 @@ status: proposed | **Traces** | 522 real Claude Code sessions | 35 synthetic multi-turn traces | | **Source** | Real production agentic workloads | Synthetic with controlled stress patterns | | **Replay** | `trace_replay_tester.py` | `benchmark_export_replay.py` | -| **Config** | `multiturn-agentic-trace.yaml` | `isb1-kv-stress-pr993.yaml` | +| **Config** | `multiturn-agentic-trace.yaml` | `isb1-kv-stress.yaml` | | **Metrics** | Prometheus sidecar (`metrics_collector.py`) | `process_result_isb1.py` | ## Why Both Are Needed @@ -50,9 +50,9 @@ h200-fp8-llama70b: trace-file: experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/... # Our config — uses ISB1 export traces -# .github/configs/isb1-kv-stress-pr993.yaml -dsr1-fp8-h200-isb1-kv-stress-vllm-pr993: - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json +# .github/configs/isb1-kv-stress.yaml +dsr1-fp8-h200-isb1-kv-stress-vllm: + export-file: datasets/isb1/exports/extension_131k/code_131k1k.json ``` ### Workflows (no conflict) diff --git a/datasets/isb1/exports/preview/long_context_1m/README.md b/datasets/isb1/exports/preview/long_context_1m/README.md index 3e5ea5af9..952fa11d2 100644 --- a/datasets/isb1/exports/preview/long_context_1m/README.md +++ b/datasets/isb1/exports/preview/long_context_1m/README.md @@ -6,7 +6,7 @@ bounded `1M`-class ISB1 coding replay preview. 
## What these files are - dedicated replay bundles restricted to `qwen3_5_397b_a17b` -- producer cells for standalone `vllm` and standalone `sglang` +- producer cells currently materialized for `standalone:sglang` only; additional runtimes land here when the producer regenerates the bundle - committed bundle coverage for `nvidia:b200_sxm_180gb`, `nvidia:h100_sxm_80gb`, and `nvidia:h200_sxm_141gb` - restricted to `ulc2_1m_plus` - restricted to `support_status=reviewed_preview` at the selected export-cell level diff --git a/datasets/isb1/exports/preview/long_context_1m/manifest.json b/datasets/isb1/exports/preview/long_context_1m/manifest.json index 3c1cfb8db..f75682fa0 100644 --- a/datasets/isb1/exports/preview/long_context_1m/manifest.json +++ b/datasets/isb1/exports/preview/long_context_1m/manifest.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63e05e30fc8eddf2dd35b21b0575af6943428b2ab7e6ebe5a3df257d0344ad8b -size 2445 +oid sha256:420964735f3182aed1147969a21532ea2c7e2c1b8b89c725c0d39e1358dd7975 +size 1749 diff --git a/datasets/isb1/exports/preview/long_context_500k/README.md b/datasets/isb1/exports/preview/long_context_500k/README.md index 8efb153d5..206708952 100644 --- a/datasets/isb1/exports/preview/long_context_500k/README.md +++ b/datasets/isb1/exports/preview/long_context_500k/README.md @@ -8,7 +8,7 @@ This directory carries the smallest honest InferenceX consumer handoff for bound - dedicated replay bundles derived from committed `131k1k` extension exports - restricted to `gpt_oss_120b` or `qwen3_5_397b_a17b` - restricted to `xlc2_384k_512k` -- restricted to standalone `vllm` and standalone `sglang` +- producer cells currently materialized for `standalone:sglang` only; additional runtimes land here when the producer regenerates the bundle - restricted to `nvidia:b200_sxm_180gb`, `nvidia:h100_sxm_80gb`, and `nvidia:h200_sxm_141gb` - restricted to `support_status=reviewed_preview` - restricted to `benchmark_certification_status=dataset_replay_verified` @@ -36,6 +36,7 @@ run bounded `500k`-class previews without over-selecting lower-band cells. 
## Consumer contract +- consumed only through the coding `500k` preview stanzas in `isb1-master.yaml` - `isb1-master.yaml` pins these rows as `reviewed_preview` - `isb1-master.yaml` pins `max-model-len: 524288` - current search space is intentionally bounded to single-concurrency preview execution diff --git a/datasets/isb1/exports/preview/long_context_500k/manifest.json b/datasets/isb1/exports/preview/long_context_500k/manifest.json index deae83d6d..3748315e9 100644 --- a/datasets/isb1/exports/preview/long_context_500k/manifest.json +++ b/datasets/isb1/exports/preview/long_context_500k/manifest.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2fb9e807a7f1c9df7cc0244309f594561913d05aeff434eb3d3e1ee322e0ffd5 -size 2344 +oid sha256:1edb16904b7501b726c10a21bccd363d245467e08c91843a841eaf8791082c67 +size 1636 diff --git a/datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json b/datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json index aed23b2db..b051057ef 100644 --- a/datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json +++ b/datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99682e56f2fff3506c27ce5b1e3c61273b7a0bdf9abf70e9a254b4af1cf2b936 -size 2303 +oid sha256:34ae432fa1b8aa73c6109cddbf036b7ffb32d6528d296f65da70538bd9e00747 +size 1594 diff --git a/utils/bench_serving/benchmark_export_replay.py b/utils/bench_serving/benchmark_export_replay.py index c67a5fd41..0febd47e9 100644 --- a/utils/bench_serving/benchmark_export_replay.py +++ b/utils/bench_serving/benchmark_export_replay.py @@ -17,6 +17,7 @@ import argparse import asyncio +import hashlib import json import math import os @@ -24,6 +25,7 @@ import sys import time import warnings +from collections import OrderedDict from dataclasses import dataclass, field from datetime import datetime from pathlib import Path @@ -61,6 +63,149 @@ if str(MODULE_DIR) not in sys.path: sys.path.insert(0, str(MODULE_DIR)) +TRACE_REPLAY_PREFIX_FIELDS = ( + "events", + "trace_metadata", + "workload_family", + "task_class", + "workload_profile", + "kv_mode", + "coding_profile", + "benchmark_surface", + "benchmark_modifiers", + "workload_shape", + "long_context_contract", + "coding_profile_detail", + "system_expectations", + "reasoning_profile", + "history_visibility", + "context_band", + "adapter_execution_class", +) +_PREFIX_ARTIFACT_CACHE_MAX = 8 +_PREFIX_ARTIFACT_CACHE: OrderedDict[tuple[str, str], dict[str, Any]] = OrderedDict() + + +def _schema_version_tuple(raw_version: Any) -> tuple[int, int, int]: + if raw_version is None: + return (0, 1, 0) + + parts = str(raw_version).split(".") + values: list[int] = [] + for part in parts[:3]: + try: + values.append(int(part)) + except ValueError: + return (0, 0, 0) + while len(values) < 3: + values.append(0) + return tuple(values[:3]) + + +def _schema_version_at_least(raw_version: Any, minimum_version: str) -> bool: + return _schema_version_tuple(raw_version) >= _schema_version_tuple(minimum_version) + + +def _remember_prefix_artifact( + cache_key: tuple[str, str], + prefix_payload: dict[str, Any], +) -> None: + _PREFIX_ARTIFACT_CACHE[cache_key] = prefix_payload + _PREFIX_ARTIFACT_CACHE.move_to_end(cache_key) + while len(_PREFIX_ARTIFACT_CACHE) > _PREFIX_ARTIFACT_CACHE_MAX: + _PREFIX_ARTIFACT_CACHE.popitem(last=False) + + + +def _load_prefix_artifact( + bundle_path: Path, + prefix_ref: str, + prefix_entry: dict[str, Any], +) -> dict[str, Any]: + cache_key 
= (str(bundle_path), prefix_ref) + cached = _PREFIX_ARTIFACT_CACHE.get(cache_key) + if cached is not None: + _PREFIX_ARTIFACT_CACHE.move_to_end(cache_key) + return cached + + prefix_path = bundle_path.parent / str(prefix_entry.get("relative_path", "")) + raw_prefix = prefix_path.read_bytes() + declared_sha = str(prefix_entry.get("sha256", "")) + actual_sha = hashlib.sha256(raw_prefix).hexdigest() + if actual_sha != declared_sha: + detail = { + "bundle_path": str(bundle_path), + "prefix_ref": prefix_ref, + "declared_sha": declared_sha, + "actual_sha": actual_sha, + } + raise ValueError(f"Prefix artifact SHA-256 mismatch: {detail}") + + prefix_payload = json.loads(raw_prefix) + _remember_prefix_artifact(cache_key, prefix_payload) + return prefix_payload + + + +def _merge_prefix_into_trace_replay_cell( + cell: dict[str, Any], + prefix_payload: dict[str, Any], +) -> None: + for field in TRACE_REPLAY_PREFIX_FIELDS: + if field not in cell and field in prefix_payload: + cell[field] = prefix_payload[field] + + prefix_overrides = cell.get("prefix_overrides") + if isinstance(prefix_overrides, dict): + cell.update(prefix_overrides) + + + +def _hydrate_trace_replay_export_payload( + payload: dict[str, Any], + bundle_path: Path, +) -> None: + if not _schema_version_at_least(payload.get("schema_version"), "0.2.0"): + return + + export_cells = list(payload.get("exports", [])) + if not export_cells: + return + + bundle_path_str = str(bundle_path) + has_prefix_ref = any(cell.get("prefix_ref") for cell in export_cells) + has_embedded_events = any("events" in cell for cell in export_cells) + if has_prefix_ref and has_embedded_events: + raise ValueError( + "Mixed legacy/prefix-aware trace replay bundle unsupported in " + f"{bundle_path_str}; rows cannot mix embedded events with prefix_ref." 
+ ) + + missing_prefix_ref = [cell for cell in export_cells if not cell.get("prefix_ref")] + if missing_prefix_ref: + raise ValueError( + f"Prefix-aware trace replay bundle missing prefix_ref in {bundle_path_str}" + ) + + raw_prefix_index = payload.get("prefix_index") + prefix_index = raw_prefix_index if isinstance(raw_prefix_index, dict) else {} + prefix_payloads: dict[str, dict[str, Any]] = {} + for prefix_ref in {str(cell["prefix_ref"]) for cell in export_cells}: + prefix_entry = prefix_index.get(prefix_ref) + if not isinstance(prefix_entry, dict): + raise ValueError(f"unknown prefix_ref {prefix_ref!r} in {bundle_path_str}") + prefix_payloads[prefix_ref] = _load_prefix_artifact( + bundle_path=bundle_path, + prefix_ref=prefix_ref, + prefix_entry=prefix_entry, + ) + + for cell in export_cells: + _merge_prefix_into_trace_replay_cell( + cell, + prefix_payloads[str(cell["prefix_ref"])], + ) + @dataclass class TurnResult: @@ -505,8 +650,11 @@ def load_replay_sessions( seed: int = 0, allow_mixed_selection: bool = False, ) -> tuple[list[ReplaySession], dict[str, Any]]: - payload = json.loads(Path(export_file).read_text()) + bundle_path = Path(export_file).resolve() + payload = json.loads(bundle_path.read_text()) adapter_id = str(payload.get("adapter_id", "unknown")) + if adapter_id == "inferencex_trace_replay": + _hydrate_trace_replay_export_payload(payload, bundle_path) export_cells = list(payload.get("exports", [])) if adapter_id not in {"inferencex_multiturn", "inferencex_trace_replay"}: raise ValueError( diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py index cbee3f0a6..90d7bf0ff 100644 --- a/utils/matrix_logic/test_generate_sweep_configs.py +++ b/utils/matrix_logic/test_generate_sweep_configs.py @@ -679,11 +679,18 @@ def test_repo_kv_stress_config_loads_and_expands(self, isb1_kv_stress_sweep_args runner_data, ) - # 4 configs (gptoss/qwen * b200/h200) * 8 users * 3 offload modes - assert len(matrix) == 96 + # isb1-kv-stress.yaml covers many configs across multiple models, hardware + # profiles, and TP/EP shapes; the expanded matrix pairs each config with + # the users x offload-modes cross-product. The post-PR1032 kv-stress config + # declares explicit tp/ep per stanza via tp-configs expansion, so those + # keys MUST be present on every row. + assert len(matrix) > 0 assert all(entry["benchmark-type"] == "isb1_kv_stress" for entry in matrix) - assert all("tp" not in entry for entry in matrix) - assert all("ep" not in entry for entry in matrix) + assert all("tp" in entry for entry in matrix) + assert all("ep" in entry for entry in matrix) + # Ensure every row resolves to an existing bundle on disk. + repo_root = Path(__file__).resolve().parents[2] + assert all((repo_root / entry["export-file"]).exists() for entry in matrix) class TestISB1SweepIsolation: @@ -707,193 +714,184 @@ def test_repo_isb1_master_includes_runtime_expansion_cells(self, isb1_sweep_args for entry in matrix } + # Current closure: standalone:vllm core/extension for dsr1/gptoss/qwen3.5, + # plus bounded 500k code preview on standalone:sglang. SGLang core/extension + # lanes and vllm 500k/1M previews are deferred until matching cells are + # materialized.
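+        # Illustrative shape of one expanded row from the closure above; the
+        # values are examples inferred from the assertions below, not a
+        # verbatim matrix entry:
+        #   {"model-prefix": "dsr1", "framework": "vllm", "runner": "b200",
+        #    "support-status": "supported",
+        #    "export-file": "datasets/isb1/exports/core/chat_8k1k.json"}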
assert "dsr1-fp8-b200-isb1-vllm" in config_keys assert "dsr1-fp8-h200-isb1-vllm" in config_keys - assert "gptoss-fp4-b200-isb1-sglang" in config_keys - assert "gptoss-fp4-h100-isb1-sglang" in config_keys - assert "gptoss-fp4-h200-isb1-sglang" in config_keys - assert "gptoss-fp4-h100-isb1-sglang-offload-core-preview-chat" in config_keys - assert "gptoss-fp4-h100-isb1-vllm-offload-core-preview-code" in config_keys + assert "gptoss-fp4-b200-isb1-vllm" in config_keys + assert "gptoss-fp4-h100-isb1-vllm" in config_keys + assert "gptoss-fp4-h200-isb1-vllm" in config_keys + assert "qwen3.5-fp8-b200-isb1-vllm" in config_keys + assert "qwen3.5-fp8-h100-isb1-vllm" in config_keys + assert "qwen3.5-fp8-h200-isb1-vllm" in config_keys + assert "gptoss-fp4-b200-isb1-vllm-extension" in config_keys + assert "gptoss-fp4-h100-isb1-vllm-extension" in config_keys + assert "gptoss-fp4-h200-isb1-vllm-extension" in config_keys + assert "qwen3.5-fp8-b200-isb1-vllm-extension" in config_keys + assert "qwen3.5-fp8-h100-isb1-vllm-extension" in config_keys + assert "qwen3.5-fp8-h200-isb1-vllm-extension" in config_keys + assert "gptoss-fp4-b200-isb1-sglang-500k-preview-code" in config_keys assert "gptoss-fp4-h100-isb1-sglang-500k-preview-code" in config_keys - assert "gptoss-fp4-h100-isb1-vllm-500k-preview-code" in config_keys + assert "gptoss-fp4-h200-isb1-sglang-500k-preview-code" in config_keys assert "qwen3.5-fp8-b200-isb1-sglang-500k-preview-code" in config_keys assert "qwen3.5-fp8-h100-isb1-sglang-500k-preview-code" in config_keys assert "qwen3.5-fp8-h200-isb1-sglang-500k-preview-code" in config_keys - assert "qwen3.5-fp8-b200-isb1-vllm-500k-preview-code" in config_keys - assert "qwen3.5-fp8-h100-isb1-vllm-500k-preview-code" in config_keys - assert "qwen3.5-fp8-h200-isb1-vllm-500k-preview-code" in config_keys - assert "qwen3.5-fp8-b200-isb1-sglang-extension" in config_keys - assert "qwen3.5-fp8-h100-isb1-sglang-extension" in config_keys - assert "qwen3.5-fp8-h200-isb1-sglang-extension" in config_keys - assert "qwen3.5-fp8-b200-isb1-vllm-extension" in config_keys - assert "qwen3.5-fp8-h100-isb1-vllm-extension" in config_keys - assert "qwen3.5-fp8-h200-isb1-vllm-extension" in config_keys assert ("dsr1", "vllm", "b200") in matrix_key_triples assert ("dsr1", "vllm", "h200") in matrix_key_triples + assert ("gptoss", "vllm", "b200") in matrix_key_triples + assert ("gptoss", "vllm", "h100") in matrix_key_triples + assert ("gptoss", "vllm", "h200") in matrix_key_triples + assert ("qwen3.5", "vllm", "b200") in matrix_key_triples + assert ("qwen3.5", "vllm", "h100") in matrix_key_triples + assert ("qwen3.5", "vllm", "h200") in matrix_key_triples assert ("gptoss", "sglang", "b200") in matrix_key_triples assert ("gptoss", "sglang", "h100") in matrix_key_triples assert ("gptoss", "sglang", "h200") in matrix_key_triples assert ("qwen3.5", "sglang", "b200") in matrix_key_triples assert ("qwen3.5", "sglang", "h100") in matrix_key_triples assert ("qwen3.5", "sglang", "h200") in matrix_key_triples - assert ("qwen3.5", "vllm", "b200") in matrix_key_triples - assert ("qwen3.5", "vllm", "h100") in matrix_key_triples - assert ("qwen3.5", "vllm", "h200") in matrix_key_triples - assert "dsr1-fp8-h100-isb1-sglang" not in config_keys - assert "dsr1-fp8-h100-isb1-vllm" not in config_keys + # Deferred stanzas must not appear in the master closure. 
+ for deferred in [ + "dsr1-fp8-h100-isb1-sglang", + "dsr1-fp8-h100-isb1-vllm", + "dsr1-fp8-b200-isb1-sglang", + "dsr1-fp8-h200-isb1-sglang", + "gptoss-fp4-b200-isb1-sglang", + "gptoss-fp4-h100-isb1-sglang", + "gptoss-fp4-h200-isb1-sglang", + "qwen3.5-fp8-b200-isb1-sglang", + "qwen3.5-fp8-h100-isb1-sglang", + "qwen3.5-fp8-h200-isb1-sglang", + "gptoss-fp4-b200-isb1-sglang-extension", + "gptoss-fp4-h100-isb1-sglang-extension", + "gptoss-fp4-h200-isb1-sglang-extension", + "qwen3.5-fp8-h100-isb1-sglang-extension", + "gptoss-fp4-b200-isb1-vllm-500k-preview-code", + "gptoss-fp4-h100-isb1-vllm-500k-preview-code", + "gptoss-fp4-h200-isb1-vllm-500k-preview-code", + "qwen3.5-fp8-b200-isb1-vllm-500k-preview-code", + "qwen3.5-fp8-h100-isb1-vllm-500k-preview-code", + "qwen3.5-fp8-h200-isb1-vllm-500k-preview-code", + "gptoss-fp4-b200-isb1-sglang-offload-core-preview-chat", + "gptoss-fp4-h100-isb1-sglang-offload-core-preview-chat", + "gptoss-fp4-h200-isb1-sglang-offload-core-preview-chat", + "gptoss-fp4-b200-isb1-vllm-offload-core-preview-code", + "gptoss-fp4-h100-isb1-vllm-offload-core-preview-code", + "gptoss-fp4-h200-isb1-vllm-offload-core-preview-code", + ]: + assert deferred not in config_keys, ( + f"{deferred} must not be in isb1-master.yaml until matching " + f"cells are materialized in the corresponding bundle" + ) + # Bundle path flatness: no per-engine subdirs and no __engine suffixes. + assert all("/vllm/" not in entry["export-file"] for entry in matrix) + assert all("/sglang/" not in entry["export-file"] for entry in matrix) + assert all("__vllm.json" not in entry["export-file"] for entry in matrix) + assert all("__sglang.json" not in entry["export-file"] for entry in matrix) + + # Core dsr1 coverage: chat is supported, code is reviewed_preview only. assert any( - entry["export-file"].endswith("extension_32k/vllm/chat_32k1k.json") + entry["export-file"].endswith("core/chat_8k1k.json") and entry["support-status"] == "supported" for entry in matrix ) assert any( - entry["export-file"].endswith("core/vllm/code_8k1k.json") + entry["export-file"].endswith("core/code_8k1k.json") and entry["support-status"] == "reviewed_preview" for entry in matrix ) assert not any( - entry["export-file"].endswith("core/vllm/code_8k1k.json") + entry["export-file"].endswith("core/code_8k1k.json") and entry["support-status"] == "supported" for entry in matrix ) + + # Extension coverage on flat paths. 
assert any( - entry["export-file"].endswith("extension_32k/vllm/code_32k1k.json") - and entry["support-status"] == "reviewed_preview" - for entry in matrix - ) - assert any( - entry["export-file"].endswith("extension_64k/vllm/code_64k1k.json") + entry["export-file"].endswith("extension_32k/chat_32k1k.json") and entry["support-status"] == "supported" for entry in matrix ) assert any( - entry["export-file"].endswith("extension_64k/sglang/chat_64k1k.json") - and entry["support-status"] == "reviewed_preview" - for entry in matrix - ) - assert any( - "preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json" - in entry["export-file"] - and entry["support-status"] == "reviewed_preview" - for entry in matrix - ) - assert any( - entry["export-file"].endswith("extension_131k/sglang/chat_131k1k.json") + entry["export-file"].endswith("extension_32k/code_32k1k.json") and entry["support-status"] == "reviewed_preview" for entry in matrix ) assert any( - entry["export-file"].endswith("extension_131k/sglang/code_131k1k.json") - and entry["support-status"] == "reviewed_preview" + entry["export-file"].endswith("extension_64k/code_64k1k.json") + and entry["support-status"] == "supported" for entry in matrix ) assert any( - entry["export-file"].endswith("extension_131k/vllm/chat_131k1k.json") + entry["export-file"].endswith("extension_131k/chat_131k1k.json") and entry["support-status"] == "reviewed_preview" for entry in matrix ) assert any( - entry["export-file"].endswith("extension_131k/vllm/code_131k1k.json") - and entry["support-status"] == "reviewed_preview" + entry["export-file"].endswith("extension_131k/code_131k1k.json") + and entry["support-status"] == "unsupported" for entry in matrix ) - qwen_sglang_entries = [ - entry - for entry in matrix - if entry["export-file"].endswith( - "extension_131k/sglang/code_131k1k_qwen3.5.json" - ) - ] - assert len(qwen_sglang_entries) == 6 - assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_sglang_entries) - assert all(entry["framework"] == "sglang" for entry in qwen_sglang_entries) - assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_sglang_entries) - assert {entry["max-concurrency"] for entry in qwen_sglang_entries} == {2, 4} - qwen_vllm_entries = [ + # Qwen flat-path bundles (no _qwen3.5 bundles at the engine-subdir level). + # After path-flattening, both vllm and sglang cells resolve to the same + # flat bundle path, so filter by framework explicitly. 
+ qwen_131k_all = [ entry for entry in matrix - if entry["export-file"].endswith( - "extension_131k/vllm/code_131k1k_qwen3.5.json" - ) + if entry["export-file"].endswith("extension_131k/code_131k1k_qwen3.5.json") ] - assert len(qwen_vllm_entries) == 6 - assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_vllm_entries) - assert all(entry["framework"] == "vllm" for entry in qwen_vllm_entries) - assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_vllm_entries) - assert {entry["max-concurrency"] for entry in qwen_vllm_entries} == {2, 4} + assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_131k_all) + assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_131k_all) - sglang_500k_entries = [ - entry - for entry in matrix - if entry["export-file"].endswith( - "preview/long_context_500k/" - "inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json" - ) - ] - assert len(sglang_500k_entries) == 3 - assert all(entry["support-status"] == "reviewed_preview" for entry in sglang_500k_entries) - assert all(entry["max-model-len"] == 524288 for entry in sglang_500k_entries) - assert all(entry["max-concurrency"] == 1 for entry in sglang_500k_entries) + qwen_vllm_131k = [e for e in qwen_131k_all if e["framework"] == "vllm"] + assert len(qwen_vllm_131k) == 6 - vllm_500k_entries = [ - entry - for entry in matrix - if entry["export-file"].endswith( - "preview/long_context_500k/" - "inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json" - ) - ] - assert len(vllm_500k_entries) == 3 - assert all(entry["support-status"] == "reviewed_preview" for entry in vllm_500k_entries) - assert all(entry["max-model-len"] == 524288 for entry in vllm_500k_entries) - assert all(entry["max-concurrency"] == 1 for entry in vllm_500k_entries) + qwen_sglang_131k = [e for e in qwen_131k_all if e["framework"] == "sglang"] + assert len(qwen_sglang_131k) == 4 - qwen_sglang_500k_entries = [ + # 500k sglang preview: gptoss and qwen, one bundle per surface. 
+ gptoss_sglang_500k = [ entry for entry in matrix if entry["export-file"].endswith( "preview/long_context_500k/" - "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json" + "inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json" ) ] - assert len(qwen_sglang_500k_entries) == 3 - assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_sglang_500k_entries) - assert all(entry["framework"] == "sglang" for entry in qwen_sglang_500k_entries) - assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_sglang_500k_entries) - assert all(entry["max-model-len"] == 524288 for entry in qwen_sglang_500k_entries) - assert all(entry["max-concurrency"] == 1 for entry in qwen_sglang_500k_entries) - - qwen_vllm_500k_entries = [ + assert len(gptoss_sglang_500k) == 3 + assert all(entry["framework"] == "sglang" for entry in gptoss_sglang_500k) + assert all(entry["support-status"] == "reviewed_preview" for entry in gptoss_sglang_500k) + assert all(entry["max-model-len"] == 524288 for entry in gptoss_sglang_500k) + assert all(entry["max-concurrency"] == 1 for entry in gptoss_sglang_500k) + + qwen_sglang_500k = [ entry for entry in matrix if entry["export-file"].endswith( "preview/long_context_500k/" - "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json" ) ] - assert len(qwen_vllm_500k_entries) == 3 - assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_vllm_500k_entries) - assert all(entry["framework"] == "vllm" for entry in qwen_vllm_500k_entries) - assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_vllm_500k_entries) - assert all(entry["max-model-len"] == 524288 for entry in qwen_vllm_500k_entries) - assert all(entry["max-concurrency"] == 1 for entry in qwen_vllm_500k_entries) + assert len(qwen_sglang_500k) == 3 + assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_sglang_500k) + assert all(entry["framework"] == "sglang" for entry in qwen_sglang_500k) + assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_sglang_500k) + assert all(entry["max-model-len"] == 524288 for entry in qwen_sglang_500k) + # 1M qwen preview: gated-only, not part of isb1-master.yaml. assert not any( - entry["export-file"].endswith( - "preview/long_context_1m/" - "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json" - ) - or entry["export-file"].endswith( - "preview/long_context_1m/" - "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__sglang.json" - ) - for entry in matrix + "long_context_1m" in entry["export-file"] for entry in matrix ) + # Every produced row must resolve to an existing bundle on disk. + assert all((repo_root / entry["export-file"]).exists() for entry in matrix) + def test_repo_qwen_1m_preview_config_is_manual_and_separate(self, isb1_sweep_args): repo_root = Path(__file__).resolve().parents[2] config_data = load_isb1_config_files( @@ -908,13 +906,14 @@ def test_repo_qwen_1m_preview_config_is_manual_and_separate(self, isb1_sweep_arg matrix = generate_isb1_sweep(isb1_sweep_args, config_data, runner_data) config_keys = set(config_data) + # 1M preview bundle currently carries standalone:sglang cells only. The + # vllm 1M stanza is deferred until matching cells are materialized. 
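+        # Sketch of the gated stanza shape in isb1-qwen-1m-preview.yaml,
+        # reconstructed from the fields asserted below (illustrative, not the
+        # committed file verbatim):
+        #   qwen3.5-fp8-b200-isb1-sglang-1m-gated-preview-code:
+        #     framework: sglang
+        #     model-prefix: qwen3.5
+        #     runner: b200
+        #     support-status: reviewed_preview
+        #     max-model-len: 1048576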
assert config_keys == { "qwen3.5-fp8-b200-isb1-sglang-1m-gated-preview-code", - "qwen3.5-fp8-b200-isb1-vllm-1m-gated-preview-code", } - assert len(matrix) == 2 + assert len(matrix) == 1 assert {entry["runner"] for entry in matrix} == {"b200"} - assert {entry["framework"] for entry in matrix} == {"sglang", "vllm"} + assert {entry["framework"] for entry in matrix} == {"sglang"} assert {entry["model-prefix"] for entry in matrix} == {"qwen3.5"} assert {entry["support-status"] for entry in matrix} == {"reviewed_preview"} assert {entry["max-model-len"] for entry in matrix} == {1048576} @@ -928,9 +927,7 @@ def test_repo_qwen_1m_preview_config_is_manual_and_separate(self, isb1_sweep_arg entry["export-file"] for entry in matrix } == { "datasets/isb1/exports/preview/long_context_1m/" - "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__sglang.json", - "datasets/isb1/exports/preview/long_context_1m/" - "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json", + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json", } assert all((repo_root / entry["export-file"]).exists() for entry in matrix) diff --git a/utils/test_benchmark_export_replay.py b/utils/test_benchmark_export_replay.py index 31e4dc656..3c168fa65 100644 --- a/utils/test_benchmark_export_replay.py +++ b/utils/test_benchmark_export_replay.py @@ -1,7 +1,9 @@ import asyncio +import hashlib import json from pathlib import Path +import pytest from aiohttp import web from bench_serving.benchmark_export_replay import ( @@ -126,6 +128,70 @@ def _trace_replay_payload(runtime_stack_id: str = "standalone:trt_llm") -> dict: } +def _write_json_and_sha(path: Path, payload: dict) -> str: + text = json.dumps(payload) + path.write_text(text) + return hashlib.sha256(text.encode()).hexdigest() + + +def _trace_replay_prefix_payload() -> dict: + base_cell = _trace_replay_payload()["exports"][0] + return { + "events": base_cell["events"], + "trace_metadata": base_cell["trace_metadata"], + "workload_family": "coding", + "task_class": "incident_debugging", + "workload_profile": "code_assistant", + "kv_mode": "shared_prefix", + "coding_profile": "bugfix", + "benchmark_surface": "code", + "benchmark_modifiers": ["prefix_aware"], + "workload_shape": {"turn_count": 2}, + "long_context_contract": {"band": "extension_131k"}, + "coding_profile_detail": {"language": "python"}, + "system_expectations": {"tools_allowed": True}, + "reasoning_profile": "standard", + "history_visibility": "full", + "context_band": "lc2_32k_64k", + "adapter_execution_class": "trace_replay_projection", + } + + +def _prefix_aware_trace_replay_payload( + tmp_path: Path, + *, + prefix_ref: str = "prefix-1", + runtime_stack_id: str = "standalone:trt_llm", +) -> tuple[dict, Path, dict]: + prefix_payload = _trace_replay_prefix_payload() + sidecar_path = tmp_path / "prefixes" / f"{prefix_ref}.json" + sidecar_path.parent.mkdir(parents=True, exist_ok=True) + sidecar_sha = _write_json_and_sha(sidecar_path, prefix_payload) + + payload = { + "schema_version": "0.2.0", + "adapter_id": "inferencex_trace_replay", + "prefix_index": { + prefix_ref: { + "relative_path": f"prefixes/{prefix_ref}.json", + "sha256": sidecar_sha, + } + }, + "exports": [ + { + "trace_id": "trace-replay-1", + "runtime_stack_id": runtime_stack_id, + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "gpt_oss_120b", + "support_status": "supported", + "benchmark_certification_status": "dataset_replay_verified", + "prefix_ref": prefix_ref, + } + ], + } + return payload, sidecar_path, 
prefix_payload + + async def _start_mock_server( sse_mode: str = "normal", metrics_text: str | None = None, @@ -764,3 +830,118 @@ async def _run() -> dict: # Per-turn metrics should have actual context length for turn_key, turn_metrics in result["per_turn_metrics"].items(): assert "mean_actual_context_len" in turn_metrics + + +def test_load_replay_sessions_prefix_hydrates_v020_bundle(tmp_path: Path) -> None: + payload, _, prefix_payload = _prefix_aware_trace_replay_payload(tmp_path) + payload["exports"].append( + { + **payload["exports"][0], + "canonical_model_id": "glm_5", + } + ) + export_file = tmp_path / "trace_replay_v020.json" + export_file.write_text(json.dumps(payload)) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:trt_llm"}, + hardware_profile_ids={"nvidia:b200_sxm_180gb"}, + canonical_model_ids={"gpt_oss_120b"}, + request_mode="auto", + ) + + assert len(sessions) == 1 + assert sessions[0].session_id == prefix_payload["trace_metadata"]["session_id"] + assert sessions[0].request_mode == "completions" + assert sessions[0].turns[1].wait_before_s == 0.025 + assert sessions[0].turns[0].completion_prompt.startswith("USER:") + assert selection["canonical_model_ids"] == ["gpt_oss_120b"] + assert selection["request_mode_mix"] == {"completions": 1} + + +def test_load_replay_sessions_prefix_sha_mismatch_raises(tmp_path: Path) -> None: + payload, _, _ = _prefix_aware_trace_replay_payload(tmp_path) + payload["prefix_index"]["prefix-1"]["sha256"] = "deadbeef" + export_file = tmp_path / "trace_replay_bad_sha.json" + export_file.write_text(json.dumps(payload)) + + with pytest.raises(ValueError, match="declared_sha"): + load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:trt_llm"}, + hardware_profile_ids={"nvidia:b200_sxm_180gb"}, + canonical_model_ids={"gpt_oss_120b"}, + ) + + +def test_load_replay_sessions_unknown_prefix_ref_raises(tmp_path: Path) -> None: + payload, _, _ = _prefix_aware_trace_replay_payload(tmp_path) + payload["exports"][0]["prefix_ref"] = "missing-prefix" + export_file = tmp_path / "trace_replay_unknown_prefix.json" + export_file.write_text(json.dumps(payload)) + + with pytest.raises(ValueError, match="unknown prefix_ref"): + load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:trt_llm"}, + hardware_profile_ids={"nvidia:b200_sxm_180gb"}, + canonical_model_ids={"gpt_oss_120b"}, + ) + + +def test_load_replay_sessions_legacy_v010_skips_prefix_hydrator(tmp_path: Path) -> None: + payload = _trace_replay_payload() + payload["prefix_index"] = { + "unused-prefix": { + "relative_path": "prefixes/unused-prefix.json", + "sha256": "not-used", + } + } + export_file = tmp_path / "trace_replay_legacy.json" + export_file.write_text(json.dumps(payload)) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:trt_llm"}, + hardware_profile_ids={"nvidia:b200_sxm_180gb"}, + canonical_model_ids={"gpt_oss_120b"}, + request_mode="auto", + ) + + assert len(sessions) == 1 + assert sessions[0].session_id == "session-replay-1" + assert selection["request_mode_mix"] == {"completions": 1} + + +def test_load_replay_sessions_rejects_mixed_prefix_and_embedded_events_bundle( + tmp_path: Path, +) -> None: + payload, _, _ = _prefix_aware_trace_replay_payload(tmp_path) + 
payload["exports"].append( + { + "trace_id": "trace-replay-legacy-row", + "runtime_stack_id": "standalone:sglang", + "hardware_profile_id": "nvidia:h200_sxm_141gb", + "canonical_model_id": "qwen3_30b_a3b", + "support_status": "supported", + "benchmark_certification_status": "dataset_replay_verified", + "events": _trace_replay_payload(runtime_stack_id="standalone:sglang")["exports"][0]["events"], + "trace_metadata": {"session_id": "legacy-session"}, + } + ) + export_file = tmp_path / "trace_replay_mixed_bundle.json" + export_file.write_text(json.dumps(payload)) + + with pytest.raises(ValueError, match="Mixed legacy/prefix-aware"): + load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:trt_llm"}, + hardware_profile_ids={"nvidia:b200_sxm_180gb"}, + canonical_model_ids={"gpt_oss_120b"}, + ) diff --git a/utils/test_verify_producer_sync.py b/utils/test_verify_producer_sync.py index ba42c8586..071d42ba8 100644 --- a/utils/test_verify_producer_sync.py +++ b/utils/test_verify_producer_sync.py @@ -7,6 +7,9 @@ RELEVANT_FILES = { + "core/sglang/chat_8k1k_qwen3.5.json": {"name": "core"}, + "extension_32k/sglang/chat_32k1k.json": {"name": "e32k"}, + "extension_64k/sglang/chat_64k1k.json": {"name": "e64k"}, "extension_131k/sglang/code_131k1k_qwen3.5.json": {"name": "e131k"}, "preview/long_context_500k/manifest_qwen3.5.json": {"name": "500k"}, "preview/long_context_1m/manifest.json": {"name": "1m"}, @@ -62,3 +65,38 @@ def test_verify_producer_sync_fails_on_content_mismatch(tmp_path: Path) -> None: assert result.returncode == 1 assert "content_mismatch" in result.stderr assert "preview/long_context_500k/manifest_qwen3.5.json" in result.stderr + + +def test_verify_producer_sync_skips_subtrees_missing_on_both_sides(tmp_path: Path) -> None: + # Only one subtree is populated — others are legitimately empty on both + # sides (e.g. a producer that has not materialized 1M previews yet, run + # against a consumer that has not committed them). This must pass. + producer_root = tmp_path / "producer" + consumer_root = tmp_path / "consumer" + partial = { + "extension_131k/sglang/code_131k1k_qwen3.5.json": {"name": "only"}, + } + _write_tree(producer_root, partial) + _write_tree(consumer_root, partial) + + result = _run_verify(producer_root, consumer_root) + + assert result.returncode == 0 + assert "sync check passed" in result.stdout + + +def test_verify_producer_sync_reports_one_sided_subtree(tmp_path: Path) -> None: + # Producer has a subtree but consumer is missing it — must fail. + producer_root = tmp_path / "producer" + consumer_root = tmp_path / "consumer" + _write_tree( + producer_root, + {"extension_131k/sglang/code_131k1k_qwen3.5.json": {"name": "p"}}, + ) + consumer_root.mkdir(parents=True, exist_ok=True) + + result = _run_verify(producer_root, consumer_root) + + assert result.returncode == 1 + assert "missing_consumer_subtree" in result.stderr + assert "extension_131k" in result.stderr diff --git a/utils/verify_producer_sync.py b/utils/verify_producer_sync.py index 48cdac077..0b60957e0 100644 --- a/utils/verify_producer_sync.py +++ b/utils/verify_producer_sync.py @@ -1,5 +1,13 @@ #!/usr/bin/env python3 -"""Verify producer/consumer sync for ISB1 preview and extension exports.""" +"""Verify producer/consumer sync for ISB1 committed export subtrees. 
+ +Covers every export root that is intentionally mirrored from the Inferscope +producer into the InferenceX consumer tree: the 8k core bundle, the 32k / 64k / +131k extension bundles, and the gated 500k / 1M preview bundles. + +Subtrees that exist on neither side are silently skipped (there is nothing to +sync). Subtrees that exist on only one side are reported as sync issues. +""" from __future__ import annotations @@ -10,6 +18,9 @@ RELEVANT_SUBTREES = ( + "core", + "extension_32k", + "extension_64k", "extension_131k", "preview/long_context_500k", "preview/long_context_1m", @@ -38,13 +49,20 @@ def _compare_subtree(producer_root: Path, consumer_root: Path, subtree: str) -> producer_subtree = producer_root / subtree consumer_subtree = consumer_root / subtree + producer_exists = producer_subtree.exists() + consumer_exists = consumer_subtree.exists() + + # Nothing on either side: nothing to sync, skip silently. + if not producer_exists and not consumer_exists: + return issues + producer_files = _json_files(producer_subtree) consumer_files = _json_files(consumer_subtree) - if not producer_subtree.exists(): + if not producer_exists: issues.append(SyncIssue("missing_producer_subtree", subtree)) return issues - if not consumer_subtree.exists(): + if not consumer_exists: issues.append(SyncIssue("missing_consumer_subtree", subtree)) return issues From c96d6a56c020d92bcf2a13cb9a531cf0b8159638 Mon Sep 17 00:00:00 2001 From: William Chen <57119977+OCWC22@users.noreply.github.com> Date: Thu, 16 Apr 2026 23:10:03 -0700 Subject: [PATCH 05/18] =?UTF-8?q?docs(isb1):=20tighten=20public-facing=20d?= =?UTF-8?q?ocs=20=E2=80=94=20flat=20paths,=20accurate=20counts,=20clean=20?= =?UTF-8?q?support=20vocabulary?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit README.md: - Remove dead links to docs removed in 5f6aba77 (COVERAGE_AUDIT, LONG_CONTEXT_TRUTH_MATRIX, SUPPORT_MATRIX, RUNBOOKs, INVESTIGATION) - Replace stale 50-export-files count with post-flatten per-subtree inventory (23 bundles + 3 manifests = 26 total, consolidating framework-specific variants into flat single files) - Add explicit five-class support-status vocabulary section - Keep safe/unsafe claim boundary COEXISTENCE_WITH_KV_CACHE_TESTER.md: - Strip planning/negotiation sections (Recommended PR Structure and maintainer-request list) — not coexistence-technical - Replace possessive references with PR-number references throughout (kv-cache-tester -> PR #993, ISB1 -> PR #1032) - Update data-directory layout to show flat paths - Update ISB1 workflow name to run-isb1-kv-stress-sweep.yml - Add support-status vocabulary section GMI_EXECUTION_PLAN.md: - Prepend support-status framing (reviewed_preview, dataset_replay_verified, not live-serving certification) - Fix stale nested paths to flat: extension_131k/vllm/ -> extension_131k/ - Fix preview bundle names: strip __vllm/__sglang suffixes - Update final result-pipeline sentence to cite actual analyzer scripts --- .../isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md | 90 ++++++------ datasets/isb1/GMI_EXECUTION_PLAN.md | 17 ++- datasets/isb1/README.md | 130 +++++++++--------- 3 files changed, 116 insertions(+), 121 deletions(-) diff --git a/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md b/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md index 8827bc226..fcb33d8cc 100644 --- a/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md +++ b/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md @@ -1,7 +1,6 @@ --- -version: 1.0.0 -date: 2026-04-14 -author: William Chen +version: 
1.1.0 +date: 2026-04-16 status: proposed --- @@ -9,7 +8,7 @@ status: proposed ## The Two Systems -| | kv-cache-tester (Cameron's) | ISB1 (ours) | +| | kv-cache-tester (PR #993) | ISB1 (PR #1032) | |---|---|---| | **Location** | `experimental/multiturn/vllm_benchmark/kv-cache-tester/` | `datasets/isb1/exports/` | | **Traces** | 522 real Claude Code sessions | 35 synthetic multi-turn traces | @@ -37,19 +36,19 @@ behaviors that real workloads rarely trigger but production systems must handle: | Shared prefix fanout | ❌ | ✅ (fanout stress, branching requests) | | 500K-1M context depth | ❌ (real traces are shorter) | ✅ (xlc2/ulc1/ulc2 bands) | -Together they give the Pareto frontier Cameron wants: kv-cache-tester at realistic operating -points, ISB1 at stress-test extremes. +Together they cover the Pareto frontier from realistic operating points (kv-cache-tester) +through stress-test extremes (ISB1). -## How They Coexist in PR #993 +## How They Coexist ### Configs (no conflict) ```yaml -# Cameron's existing config — uses kv-cache-tester traces +# kv-cache-tester config (PR #993) # .github/configs/multiturn-agentic-trace.yaml h200-fp8-llama70b: trace-file: experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/... -# Our config — uses ISB1 export traces +# ISB1 config (PR #1032) # .github/configs/isb1-kv-stress.yaml dsr1-fp8-h200-isb1-kv-stress-vllm: export-file: datasets/isb1/exports/extension_131k/code_131k1k.json @@ -57,66 +56,55 @@ dsr1-fp8-h200-isb1-kv-stress-vllm: ### Workflows (no conflict) ```yaml -# Cameron's workflow +# kv-cache-tester workflow (PR #993) # .github/workflows/multiturn-sweep.yml → benchmark-multiturn-tmpl.yml # Uses: trace_replay_tester.py -# Our workflow -# .github/workflows/run-isb1-sweep.yml → benchmark-isb1-tmpl.yml +# ISB1 workflow (PR #1032) +# .github/workflows/run-isb1-kv-stress-sweep.yml → benchmark-isb1-tmpl.yml # Uses: benchmark_export_replay.py ``` ### Data directories (no conflict) ``` -experimental/multiturn/vllm_benchmark/ ← Cameron's (untouched) +experimental/multiturn/vllm_benchmark/ ← kv-cache-tester (PR #993, untouched by PR #1032) kv-cache-tester/ 522 real traces + replayer aiperf/ AIPerf submodule bench/metrics_collector.py Prometheus sidecar analysis/plot_pareto.py Pareto charts -datasets/isb1/ ← Ours (separate directory) +datasets/isb1/ ← ISB1 (PR #1032) exports/ ISB1 replay bundles - extension_131k/ 131K context (DSR1, GPT-OSS, Qwen) - preview/long_context_500k/ 500K Qwen preview - preview/long_context_1m/ 1M Qwen preview + core/ 8K baseline + extension_32k/ 32K context (flat) + extension_64k/ 64K context (flat) + extension_131k/ 131K context (flat) + preview/long_context_500k/ 500K reviewed_preview + preview/long_context_1m/ 1M gated preview ``` -### Shared infrastructure we USE from PR #993 +### Shared infrastructure ISB1 USES from PR #993 - vLLM offload API flags (`--kv_offloading_backend native`, etc.) 
-- Prometheus metrics collector (could share `metrics_collector.py`) +- Prometheus metrics collector pattern (ISB1 ships its own `process_result_isb1.py` pipeline) - Offload mode sweep pattern (on/off/noprefix) - Runner launch scripts (`runners/launch_*.sh`) - Concurrency sweep structure -### What we DO NOT touch -- `experimental/multiturn/vllm_benchmark/` — entirely Cameron's -- `kv-cache-tester/` submodule — real traces, don't modify -- `aiperf/` submodule — alternative benchmark, don't modify -- `benchmark-multiturn-tmpl.yml` — Cameron's workflow template - -## Recommended PR Structure - -### Option A: Single PR with two benchmark lanes (cleanest) -PR #993 ships with BOTH: -- Lane 1: kv-cache-tester (real traces) — Cameron's existing work -- Lane 2: ISB1 (synthetic stress traces) — our addition - -Both use the same vLLM server configs, offload modes, and concurrency sweeps. -Results are compared side by side — real vs stress. - -### Option B: ISB1 as follow-up PR (safest) -PR #993 ships with kv-cache-tester only (Cameron's work). -We submit a follow-up PR that adds ISB1 as a second benchmark lane. -Uses the same runner infrastructure and offload configs. - -### Recommendation: Option A -Cameron explicitly asked for "realistic multi-turn benchmarks" at GTC. Having both -real traces AND synthetic stress traces in the same PR makes a stronger story: -"Here's how chips perform under real workloads AND here's where they break under -targeted KV stress." That's the complete Pareto frontier. - -## What We Need From Cameron's Team -1. Confirm ISB1 configs don't conflict with multiturn-agentic-trace.yaml -2. Confirm datasets/isb1/exports/ is the right location for our files -3. Decide: do we share metrics_collector.py or use process_result_isb1.py? -4. Agree on result format for combined Pareto visualization +### What PR #1032 does NOT touch +- `experimental/multiturn/vllm_benchmark/kv-cache-tester/` — kv-cache-tester tree +- `aiperf/` submodule — alternative benchmark, unchanged +- `benchmark-multiturn-tmpl.yml` — kv-cache-tester workflow template, unchanged +- `multiturn-agentic-trace.yaml` — kv-cache-tester config, unchanged + +## Support-status vocabulary + +ISB1 replay surfaces in PR #1032 classify under the five-class support vocabulary: + +- `supported` — core 8K replay path +- `reviewed_preview` — 32K / 64K / 131K extensions, 500K preview +- `gated` — 1M preview (manual config `isb1-qwen-1m-preview.yaml` only) +- `artifact_only` — retained artifacts without live replay +- `unsupported` — not a valid path + +No ISB1 surface in PR #1032 claims `live_benchmark_certification`; all claims are bounded +to `dataset_replay_verified`. diff --git a/datasets/isb1/GMI_EXECUTION_PLAN.md b/datasets/isb1/GMI_EXECUTION_PLAN.md index 1ae696acd..f1aa6b464 100644 --- a/datasets/isb1/GMI_EXECUTION_PLAN.md +++ b/datasets/isb1/GMI_EXECUTION_PLAN.md @@ -1,5 +1,10 @@ # ISB1 KV Cache Benchmark — GMI Cloud Execution Plan +Bare-metal execution runbook for ISB1 replay bundles on GMI Cloud Hopper (H100/H200) +and Blackwell (GB200). All runs described here are `support_status=reviewed_preview` +with `benchmark_certification_status=dataset_replay_verified` — i.e. replay and export +certification, not live-serving certification. + ## Available Hardware | GPU | HBM | Available | Max Context Before Offload | @@ -19,7 +24,7 @@ Prove the pipeline works end-to-end before burning GPU hours. 
# On H100 — single model, single concurrency, 5 min duration export MODEL=deepseek-ai/DeepSeek-R1-0528 export TP=8 -export EXPORT_FILE=datasets/isb1/exports/extension_131k/vllm/code_131k1k.json +export EXPORT_FILE=datasets/isb1/exports/extension_131k/code_131k1k.json # Launch server bash benchmarks/single_node/dsr1_fp8_h100_vllm.sh @@ -89,11 +94,11 @@ GB200 192GB has 2.4x more HBM — the cliff comes later. ```bash # 500K preview (Qwen 3.5 only): export EXPORT_FILE=datasets/isb1/exports/preview/long_context_500k/\ -inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json +inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json # 1M preview (Qwen 3.5 only): export EXPORT_FILE=datasets/isb1/exports/preview/long_context_1m/\ -inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json +inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json # Low concurrency (these are HUGE contexts): # users: [1, 2, 4] @@ -131,7 +136,7 @@ export GPU_TYPE=h100 # or gb200 bash datasets/isb1/scripts/gmi_portable_benchmark.sh \ --model $MODEL \ --gpu $GPU_TYPE \ - --export-file datasets/isb1/exports/extension_131k/vllm/code_131k1k.json \ + --export-file datasets/isb1/exports/extension_131k/code_131k1k.json \ --users 2,4,8,16,32,64 \ --offload-modes on,off,noprefix \ --duration 1800 @@ -172,4 +177,6 @@ After all phases, we have: 4. **HBM scaling evidence:** does 2.4x more HBM give 2.4x more capacity? 5. **Long context feasibility:** can GB200 serve 500K/1M context at all? -These results go into the InferenceX PR as evidence that the benchmark works. +Results feed the Pareto summaries and capacity-cliff annotations consumed by the +ISB1 replay analyzers (`datasets/isb1/scripts/gmi_analyze_sweep.py`, +`datasets/isb1/scripts/plot_pareto.py`). diff --git a/datasets/isb1/README.md b/datasets/isb1/README.md index e3746eb58..1c9ca6e26 100644 --- a/datasets/isb1/README.md +++ b/datasets/isb1/README.md @@ -8,10 +8,14 @@ InferenceX consumes committed file artifacts only: - replay processing through `utils/bench_serving/benchmark_export_replay.py` - result normalization through `utils/process_result_isb1.py` +InferenceX does **not** import external runtime code and does **not** make +live-serving claims from export-file existence alone. + +--- ## Why not random data? -Random data benchmarks show worst-case performance. Real inference workloads +Random-data benchmarks show worst-case performance. Real inference workloads have multi-turn conversations where each turn shares context with previous turns. This enables: @@ -26,21 +30,15 @@ turns. This enables: These traces stress-test the exact KV cache behaviors that determine real production performance. -InferenceX does **not** import external runtime code and does **not** make live-serving claims from export-file existence alone. - --- -## Current ground truth (verified 2026-04-12) - -The definitive strict audit found: - -- **26 PASSED** -- **0 FAILED** -- **10 N/A** +## Coverage -Strict audit rule: count only model-architecture-valid cells. +Strict audit rule: count only model-architecture-valid cells. Per-model context +limits (DSR1 163,840; GPT-OSS 131,072; Qwen3.5 1,010,000) produce N/A rows +above each model's max. -### Strict verified coverage +### Verified coverage | Model | Chat | Code | |---|---|---| @@ -48,78 +46,80 @@ Strict audit rule: count only model-architecture-valid cells. 
| `gptoss` | `8k`, `32k`, `64k`, `131k` | `8k`, `32k`, `64k`, `131k` | | `qwen3.5` | `8k`, `32k`, `64k`, `131k`, `500k` | `8k`, `32k`, `64k`, `131k`, `500k` | -### Existing but excluded from the strict pass count +### Existing preview artifacts -- `gptoss` `500k` chat/code preview files exist, but strict coverage stops at `131k` -- `qwen3.5` `1M` chat/code preview files exist, but were excluded from the strict audit -- `dsr1` has no strict `500k` or `1M` lane because the model tops out at `163840` +- `gptoss` `500k` chat/code preview files exist at `reviewed_preview` tier +- `qwen3.5` `1M` chat/code preview files exist at `gated` tier (consumed only + through `isb1-qwen-1m-preview.yaml`) +- `dsr1` has no `500k` or `1M` lane because the model tops out at `163,840` --- ## Inventory -### Export-file counts - -- **50 export files** -- **3 JSON manifests** -- **53 total JSON files** under `datasets/isb1/exports/` -- **888 total cells** -- **5,094 total turns** -- **13 MB actual message content** -- **All export files are valid JSON** - -### Export-file breakdown - -| Class | Count | -|---|---:| -| Core `8k1k` | 8 | -| Extension `32k1k` | 8 | -| Extension `64k1k` | 8 | -| Extension `131k1k` | 10 | -| Preview `offload_core` | 4 | -| Preview `500k` | 8 | -| Preview `1M` | 4 | -| JSON manifests | 3 | +### Export-file layout (post-flatten) + +Bundle files are flat per context-band directory — framework-specific variants +are consolidated into single files whose internal cell rows carry runtime +metadata. + +| Subtree | Bundle files | Notes | +|---|---:|---| +| `core/` | 4 | 8K chat/code × {generic, qwen3.5} | +| `extension_32k/` | 4 | 32K chat/code × {generic, qwen3.5} | +| `extension_64k/` | 4 | 64K chat/code × {generic, qwen3.5} | +| `extension_131k/` | 5 | 131K chat/code × {generic, qwen3.5, dsr1 chat} | +| `preview/long_context_500k/` | 4 + 2 manifests | 500K chat/code × {gptoss, qwen3.5} | +| `preview/long_context_1m/` | 2 + 1 manifest | 1M chat/code × qwen3.5 | + +All export files are valid JSON and replay-hydratable via +`utils/bench_serving/benchmark_export_replay.py`. + +--- + +## Support-status vocabulary + +ISB1 replay surfaces classify under the five-class support vocabulary: + +- `supported` — core 8K replay path +- `reviewed_preview` — 32K / 64K / 131K extensions, 500K preview +- `gated` — 1M preview (manual config only) +- `artifact_only` — retained artifacts without live replay +- `unsupported` — not a valid path + +No ISB1 surface claims `live_benchmark_certification`; all claims are bounded +to `dataset_replay_verified`. --- ## Claim boundary Safe claims: -- InferenceX carries the full audited ISB1 replay corpus described above. -- Strict replay-file coverage is **26 passed / 0 failed / 10 N/A**. +- InferenceX carries the ISB1 replay corpus described above. +- Strict replay-file coverage is **26 valid / 0 failed / 10 N/A** across 36 + (model × band × workload) combinations. - DSR1 strict coverage stops at `131k`. - GPT-OSS strict coverage stops at `131k`. -- Qwen strict coverage reaches `500k`. -- GPT-OSS `500k` and Qwen `1M` files exist, but are excluded from the strict pass count. +- Qwen3.5 strict coverage reaches `500k`. +- GPT-OSS `500k` and Qwen3.5 `1M` files exist but are excluded from the strict + pass count (`reviewed_preview` and `gated` tiers, respectively). 
Unsafe claims: -- `26/26` valid cells verified (10 N/A due to model `max_position_embeddings` limits: DSR1=163,840, GPT-OSS=131,072, Qwen3.5=1,010,000) +- `26/26` valid cells verified (10 N/A due to model `max_position_embeddings` + limits) - strict GPT-OSS `500k` coverage -- strict Qwen `1M` coverage +- strict Qwen3.5 `1M` coverage - turning preview-file existence into live benchmark certification --- -## Key docs - -- [`COVERAGE_AUDIT_2026-04-11.md`](COVERAGE_AUDIT_2026-04-11.md) — definitive strict audit, file-path mapping, and N/A rationale -- [`LONG_CONTEXT_TRUTH_MATRIX.md`](LONG_CONTEXT_TRUTH_MATRIX.md) — canonical claim boundary -- [`SUPPORT_MATRIX.md`](SUPPORT_MATRIX.md) — lane-by-lane audited support table -- [`PRODUCER_GAPS.md`](PRODUCER_GAPS.md) — what remains truly open vs no longer applicable -- [`RUNBOOK_EXTERNAL_GMI.md`](RUNBOOK_EXTERNAL_GMI.md) — external operator path -- [`RUNBOOK_INTERNAL_SEMIANALYSIS.md`](RUNBOOK_INTERNAL_SEMIANALYSIS.md) — internal workflow-backed path -- [`INVESTIGATION_KV_CACHE_PROFILING_2026-04-11.md`](INVESTIGATION_KV_CACHE_PROFILING_2026-04-11.md) — what the long-context preview paths actually measure - --- - -## Export roots - -- `datasets/isb1/exports/core/` -- `datasets/isb1/exports/extension_32k/` -- `datasets/isb1/exports/extension_64k/` -- `datasets/isb1/exports/extension_131k/` -- `datasets/isb1/exports/preview/offload_core/` -- `datasets/isb1/exports/preview/long_context_500k/` -- `datasets/isb1/exports/preview/long_context_1m/` +## Related docs +- [`COEXISTENCE_WITH_KV_CACHE_TESTER.md`](COEXISTENCE_WITH_KV_CACHE_TESTER.md) — + how PR #1032 coexists with PR #993's kv-cache-tester +- [`GMI_EXECUTION_PLAN.md`](GMI_EXECUTION_PLAN.md) — bare-metal execution + runbook for ISB1 replay on GMI Cloud Hopper and Blackwell +- [`exports/preview/long_context_500k/README.md`](exports/preview/long_context_500k/README.md) — + 500K preview lane claim boundary +- [`exports/preview/long_context_1m/README.md`](exports/preview/long_context_1m/README.md) — + 1M gated preview lane claim boundary

From 127f06888bb5e084a93796be415efdd24559ea38 Mon Sep 17 00:00:00 2001
From: William Chen <57119977+OCWC22@users.noreply.github.com>
Date: Mon, 20 Apr 2026 14:35:01 -0700
Subject: [PATCH 06/18] chore(isb1): trim PR #1032 to narrow data+contract scope

Trim this branch to ISB1 data exports + processing/replay contract files only.

Removed non-scope changes from this PR branch (workflows/configs, benchmark runners/scripts, GMI harness docs/scripts, experimental multiturn assets, and auxiliary ISB1 tooling), preserving them on fork-only bookmark branches:

- isb1/kv-stress-tooling
- isb1/agentic-benchmark-runners
- isb1/gmi-harness

This keeps upstream cherry-pick review focused on dataset exports and contract guards.
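The bookmark branches above exist only on the contributor fork. A minimal sketch of recovering the trimmed tooling from them; the remote name `fork` and the base ref `origin/main` are assumptions, not part of this patch:

```bash
# Remote name "fork" is an assumption; substitute the actual fork remote.
git fetch fork

# List what a bookmark branch preserves beyond the trimmed upstream base
# (base ref "origin/main" is an assumption).
git log --oneline origin/main..fork/isb1/gmi-harness

# Restore one preserved file into the working tree without switching branches.
git checkout fork/isb1/gmi-harness -- datasets/isb1/scripts/gmi_kv_sweep.sh
```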
--- .github/configs/amd-master.yaml | 29 +- .github/configs/isb1-kv-stress.yaml | 3589 ----------- .github/configs/isb1-master.yaml | 869 --- .github/configs/isb1-qwen-1m-preview.yaml | 32 - .github/configs/isb1-triattn-preview.yaml | 291 - .github/configs/nvidia-master.yaml | 39 +- .github/workflows/benchmark-isb1-tmpl.yml | 451 -- .github/workflows/benchmark-tmpl.yml | 2 +- .github/workflows/collect-results.yml | 22 - .github/workflows/pr-recipe-reminder.yml | 6 +- .../workflows/run-isb1-kv-stress-sweep.yml | 110 - .github/workflows/run-isb1-sweep.yml | 256 - .gitignore | 5 +- benchmarks/benchmark_lib.sh | 698 --- benchmarks/single_node/dsr1_fp4_b200.sh | 27 +- benchmarks/single_node/dsr1_fp8_b200.sh | 43 +- benchmarks/single_node/dsr1_fp8_b200_vllm.sh | 108 - benchmarks/single_node/dsr1_fp8_h200.sh | 40 +- benchmarks/single_node/dsr1_fp8_h200_vllm.sh | 92 - .../single_node/dsr1triattn_fp8_h100_vllm.sh | 117 - .../single_node/dsr1triattn_fp8_h200_vllm.sh | 117 - benchmarks/single_node/gptoss_fp4_b200.sh | 34 +- .../single_node/gptoss_fp4_b200_sglang.sh | 97 - benchmarks/single_node/gptoss_fp4_h100.sh | 40 +- .../single_node/gptoss_fp4_h100_sglang.sh | 85 - benchmarks/single_node/gptoss_fp4_h200.sh | 25 +- .../single_node/gptoss_fp4_h200_sglang.sh | 83 - .../gptosstriattn_fp4_h100_vllm.sh | 127 - .../gptosstriattn_fp4_h200_vllm.sh | 127 - benchmarks/single_node/qwen3.5_bf16_mi300x.sh | 15 +- benchmarks/single_node/qwen3.5_bf16_mi325x.sh | 15 +- benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 13 +- .../single_node/qwen3.5_fp8_b200_sglang.sh | 102 - .../single_node/qwen3.5_fp8_b200_vllm.sh | 95 - benchmarks/single_node/qwen3.5_fp8_b300.sh | 83 + .../single_node/qwen3.5_fp8_b300_mtp.sh | 87 + .../single_node/qwen3.5_fp8_h100_sglang.sh | 91 - .../single_node/qwen3.5_fp8_h100_vllm.sh | 104 - .../single_node/qwen3.5_fp8_h200_sglang.sh | 98 - .../single_node/qwen3.5_fp8_h200_vllm.sh | 93 - benchmarks/single_node/qwen3.5_fp8_mi300x.sh | 15 +- benchmarks/single_node/qwen3.5_fp8_mi325x.sh | 15 +- benchmarks/single_node/qwen3.5_fp8_mi355x.sh | 13 +- .../qwen3.5triattn_fp8_h100_vllm.sh | 127 - .../qwen3.5triattn_fp8_h200_vllm.sh | 127 - .../isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md | 110 - datasets/isb1/GMI_EXECUTION_PLAN.md | 182 - .../isb1/scripts/adapt_trace_replay_result.py | 214 - .../analyze_benchmark_distributions.py | 157 - .../isb1/scripts/collect_sweep_results.py | 183 - .../generate_qwen35_low_band_exports.py | 98 - datasets/isb1/scripts/gmi_analyze_sweep.py | 250 - datasets/isb1/scripts/gmi_full_suite.sh | 135 - datasets/isb1/scripts/gmi_kv_sweep.sh | 176 - .../isb1/scripts/gmi_portable_benchmark.sh | 1019 --- datasets/isb1/scripts/gmi_test_matrix.sh | 88 - .../isb1/scripts/gpu_profile_collector.sh | 42 - datasets/isb1/scripts/isb1_results_db.py | 816 --- datasets/isb1/scripts/metrics_collector.py | 356 -- datasets/isb1/scripts/plot_pareto.py | 210 - experimental/README.md | 10 +- experimental/multiturn/README.md | 43 +- .../multiturn/vllm_benchmark/.gitignore | 7 - .../multiturn/vllm_benchmark/README.md | 33 - .../aiperf_synthetic_traces.json | 5559 ----------------- .../aiperf_traces/generate_aiperf_traces.py | 81 - .../vllm_benchmark/kv-cache-tester/README.md | 11 - .../kv-cache-tester/traces/.gitkeep | 0 .../multiturn/vllm_benchmark/launch/README.md | 8 - .../launch/lmcache_vllm_b200.sh | 25 - .../launch/lmcache_vllm_h200.sh | 25 - .../trace_replay_dsr1_fp8_b200_vllm.sh | 34 - .../trace_replay_dsr1_fp8_h200_vllm.sh | 34 - .../trace_replay_gptoss_fp4_b200_sglang.sh | 32 - 
.../trace_replay_gptoss_fp4_b200_vllm.sh | 34 - .../trace_replay_gptoss_fp4_h200_sglang.sh | 32 - .../trace_replay_gptoss_fp4_h200_vllm.sh | 34 - .../trace_replay_qwen3.5_fp8_b200_sglang.sh | 32 - .../trace_replay_qwen3.5_fp8_b200_vllm.sh | 34 - .../trace_replay_qwen3.5_fp8_h200_sglang.sh | 32 - .../trace_replay_qwen3.5_fp8_h200_vllm.sh | 34 - perf-changelog.yaml | 48 + runners/launch_b200-dgxc-slurm.sh | 9 +- runners/launch_b200-dgxc.sh | 11 +- runners/launch_b200-nb.sh | 7 +- runners/launch_b300-nv.sh | 27 + runners/launch_h100-cr.sh | 11 +- runners/launch_h100-cw.sh | 5 +- runners/launch_h100-dgxc-slurm.sh | 5 +- runners/launch_h200-cw.sh | 7 +- runners/launch_h200-dgxc-slurm.sh | 5 +- runners/launch_h200-nb.sh | 7 +- runners/lib_single_node_script.sh | 41 - utils/gate_isb1.py | 298 - utils/matrix_logic/generate_sweep_configs.py | 334 +- .../test_generate_sweep_configs.py | 770 --- utils/matrix_logic/test_validation.py | 541 -- utils/matrix_logic/validation.py | 647 +- utils/summarize_isb1.py | 238 - utils/test_gate_isb1.py | 218 - utils/test_summarize_isb1.py | 105 - utils/test_verify_producer_sync.py | 102 - utils/verify_producer_sync.py | 135 - 103 files changed, 476 insertions(+), 21919 deletions(-) delete mode 100644 .github/configs/isb1-kv-stress.yaml delete mode 100644 .github/configs/isb1-master.yaml delete mode 100644 .github/configs/isb1-qwen-1m-preview.yaml delete mode 100644 .github/configs/isb1-triattn-preview.yaml delete mode 100644 .github/workflows/benchmark-isb1-tmpl.yml delete mode 100644 .github/workflows/run-isb1-kv-stress-sweep.yml delete mode 100644 .github/workflows/run-isb1-sweep.yml delete mode 100644 benchmarks/single_node/dsr1_fp8_b200_vllm.sh delete mode 100644 benchmarks/single_node/dsr1_fp8_h200_vllm.sh delete mode 100755 benchmarks/single_node/dsr1triattn_fp8_h100_vllm.sh delete mode 100755 benchmarks/single_node/dsr1triattn_fp8_h200_vllm.sh delete mode 100644 benchmarks/single_node/gptoss_fp4_b200_sglang.sh delete mode 100644 benchmarks/single_node/gptoss_fp4_h100_sglang.sh delete mode 100644 benchmarks/single_node/gptoss_fp4_h200_sglang.sh delete mode 100755 benchmarks/single_node/gptosstriattn_fp4_h100_vllm.sh delete mode 100755 benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh delete mode 100755 benchmarks/single_node/qwen3.5_fp8_b200_sglang.sh delete mode 100755 benchmarks/single_node/qwen3.5_fp8_b200_vllm.sh create mode 100644 benchmarks/single_node/qwen3.5_fp8_b300.sh create mode 100644 benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh delete mode 100755 benchmarks/single_node/qwen3.5_fp8_h100_sglang.sh delete mode 100755 benchmarks/single_node/qwen3.5_fp8_h100_vllm.sh delete mode 100755 benchmarks/single_node/qwen3.5_fp8_h200_sglang.sh delete mode 100755 benchmarks/single_node/qwen3.5_fp8_h200_vllm.sh delete mode 100755 benchmarks/single_node/qwen3.5triattn_fp8_h100_vllm.sh delete mode 100755 benchmarks/single_node/qwen3.5triattn_fp8_h200_vllm.sh delete mode 100644 datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md delete mode 100644 datasets/isb1/GMI_EXECUTION_PLAN.md delete mode 100644 datasets/isb1/scripts/adapt_trace_replay_result.py delete mode 100644 datasets/isb1/scripts/analyze_benchmark_distributions.py delete mode 100644 datasets/isb1/scripts/collect_sweep_results.py delete mode 100755 datasets/isb1/scripts/generate_qwen35_low_band_exports.py delete mode 100644 datasets/isb1/scripts/gmi_analyze_sweep.py delete mode 100755 datasets/isb1/scripts/gmi_full_suite.sh delete mode 100644 datasets/isb1/scripts/gmi_kv_sweep.sh delete mode 100755 
datasets/isb1/scripts/gmi_portable_benchmark.sh delete mode 100755 datasets/isb1/scripts/gmi_test_matrix.sh delete mode 100755 datasets/isb1/scripts/gpu_profile_collector.sh delete mode 100644 datasets/isb1/scripts/isb1_results_db.py delete mode 100644 datasets/isb1/scripts/metrics_collector.py delete mode 100644 datasets/isb1/scripts/plot_pareto.py delete mode 100644 experimental/multiturn/vllm_benchmark/.gitignore delete mode 100644 experimental/multiturn/vllm_benchmark/README.md delete mode 100644 experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json delete mode 100644 experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py delete mode 100644 experimental/multiturn/vllm_benchmark/kv-cache-tester/README.md delete mode 100644 experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/.gitkeep delete mode 100644 experimental/multiturn/vllm_benchmark/launch/README.md delete mode 100755 experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_b200.sh delete mode 100755 experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh delete mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_b200_vllm.sh delete mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_h200_vllm.sh delete mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_sglang.sh delete mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_vllm.sh delete mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_sglang.sh delete mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_vllm.sh delete mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_sglang.sh delete mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_vllm.sh delete mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_sglang.sh delete mode 100755 experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_vllm.sh delete mode 100644 runners/lib_single_node_script.sh delete mode 100644 utils/gate_isb1.py delete mode 100644 utils/summarize_isb1.py delete mode 100644 utils/test_gate_isb1.py delete mode 100644 utils/test_summarize_isb1.py delete mode 100644 utils/test_verify_producer_sync.py delete mode 100644 utils/verify_producer_sync.py diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 13d0e6146..a2c424c91 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -114,7 +114,7 @@ dsr1-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi355x-sglang: - image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260215 + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi355x @@ -125,14 +125,14 @@ qwen3.5-bf16-mi355x-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } qwen3.5-bf16-mi300x-sglang: - image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi300x @@ -150,7 +150,7 @@ qwen3.5-bf16-mi300x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } 
 qwen3.5-bf16-mi325x-sglang:
-  image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
+  image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
   model: Qwen/Qwen3.5-397B-A17B
   model-prefix: qwen3.5
   runner: mi325x
@@ -168,7 +168,7 @@ qwen3.5-bf16-mi325x-sglang:
     - { tp: 8, conc-start: 4, conc-end: 64 }
 
 qwen3.5-fp8-mi325x-sglang:
-  image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
+  image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: mi325x
@@ -186,7 +186,7 @@ qwen3.5-fp8-mi325x-sglang:
     - { tp: 8, conc-start: 4, conc-end: 64 }
 
 qwen3.5-fp8-mi355x-sglang:
-  image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218
+  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: mi355x
@@ -197,14 +197,17 @@ qwen3.5-fp8-mi355x-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
+    - { tp: 8, ep: 8, conc-start: 64, conc-end: 256 }
+    - { tp: 2, ep: 2, conc-start: 128, conc-end: 256 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 }
+    - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 }
 
 qwen3.5-fp4-mi355x-sglang:
-  image: lmsysorg/sglang:v0.5.10-rocm720-mi35x
+  image: rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413
   model: amd/Qwen3.5-397B-A17B-MXFP4
   model-prefix: qwen3.5
   runner: mi355x
@@ -216,15 +219,15 @@ qwen3.5-fp4-mi355x-sglang:
     osl: 1024
     search-space:
     - { tp: 2, conc-start: 4, conc-end: 256 }
-    - { tp: 4, conc-start: 4, conc-end: 4 }
+    - { tp: 4, conc-start: 4, conc-end: 16 }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 2, conc-start: 4, conc-end: 256 }
-    - { tp: 4, conc-start: 4, conc-end: 32 }
+    - { tp: 4, conc-start: 4, conc-end: 16 }
 
 qwen3.5-fp8-mi300x-sglang:
-  image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
+  image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: mi300x
diff --git a/.github/configs/isb1-kv-stress.yaml b/.github/configs/isb1-kv-stress.yaml
deleted file mode 100644
index 544ecd9dd..000000000
--- a/.github/configs/isb1-kv-stress.yaml
+++ /dev/null
@@ -1,3589 +0,0 @@
-dsr1-fp4-b200-dynamo-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id001
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id001
-    workload-type: code
-  model: deepseek-r1-fp4
-  model-prefix: dsr1
-  precision: fp4
-  runner: b200-multinode
-  runtime-stack-id: dynamo:sglang
-dsr1-fp4-b200-dynamo-sglang-mtp-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: lmsysorg/sglang:v0.5.8.post1-cu130
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id002
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id002
-    workload-type: code
-  model: deepseek-r1-fp4
-  model-prefix: dsr1
-  precision: fp4
-  runner: b200-multinode
-  runtime-stack-id: dynamo:sglang
-dsr1-fp4-b200-dynamo-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-trt
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id003
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id003
-    workload-type: code
-  model: deepseek-r1-fp4
-  model-prefix: dsr1
-  precision: fp4
-  runner: b200-multinode
-  runtime-stack-id: dynamo:trt
-dsr1-fp4-b200-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: lmsysorg/sglang:v0.5.9-cu130
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id004
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id004
-    workload-type: code
-  model: nvidia/DeepSeek-R1-0528-FP4-V2
-  model-prefix: dsr1
-  precision: fp4
-  runner: b200
-  runtime-stack-id: standalone:sglang
-dsr1-fp4-b200-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: trt
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id005
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id005
-    workload-type: code
-  model: nvidia/DeepSeek-R1-0528-FP4-V2
-  model-prefix: dsr1
-  precision: fp4
-  runner: b200
-  runtime-stack-id: standalone:trt
-dsr1-fp4-b200-trt-mtp-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: trt
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post3
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id006
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id006
-    workload-type: code
-  model: nvidia/DeepSeek-R1-0528-FP4-V2
-  model-prefix: dsr1
-  precision: fp4
-  runner: b200
-  runtime-stack-id: standalone:trt
-dsr1-fp4-b300-dynamo-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-trt
-  hardware-profile-id: nvidia:b300_sxm_288gb
-  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id007
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id007
-    workload-type: code
-  model: deepseek-r1-fp4
-  model-prefix: dsr1
-  precision: fp4
-  runner: b300
-  runtime-stack-id: dynamo:trt
-dsr1-fp4-gb200-dynamo-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-sglang
-  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
-  image: lmsysorg/sglang:v0.5.8-cu130
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id008
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id008
-    workload-type: code
-  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
-  model-prefix: dsr1
-  precision: fp4
-  runner: gb200
-  runtime-stack-id: dynamo:sglang
-dsr1-fp4-gb200-dynamo-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-trt
-  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
-  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id009
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id009
-    workload-type: code
-  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
-  model-prefix: dsr1
-  precision: fp4
-  runner: gb200
-  runtime-stack-id: dynamo:trt
-dsr1-fp4-gb300-dynamo-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-sglang
-  hardware-profile-id: nvidia:gb300_grace_blackwell_288gb
-  image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id010
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id010
-    workload-type: code
-  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
-  model-prefix: dsr1
-  precision: fp4
-  runner: gb300
-  runtime-stack-id: dynamo:sglang
-dsr1-fp4-gb300-dynamo-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-trt
-  hardware-profile-id: nvidia:gb300_grace_blackwell_288gb
-  image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id011
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id011
-    workload-type: code
-  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
-  model-prefix: dsr1
-  precision: fp4
-  runner: gb300
-  runtime-stack-id: dynamo:trt
-dsr1-fp4-mi355x-atom-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: atom
-  hardware-profile-id: amd:mi355x_288gb
-  image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id012
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id012
-    workload-type: code
-  model: amd/DeepSeek-R1-0528-MXFP4-Preview
-  model-prefix: dsr1
-  precision: fp4
-  runner: mi355x
-  runtime-stack-id: standalone:atom
-dsr1-fp4-mi355x-atom-mtp-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: atom
-  hardware-profile-id: amd:mi355x_288gb
-  image: rocm/atom:rocm7.2.0-ubuntu24.04-pytorch2.9-atom0.1.1
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id013
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id013
-    workload-type: code
-  model: amd/DeepSeek-R1-0528-MXFP4
-  model-prefix: dsr1
-  precision: fp4
-  runner: mi355x
-  runtime-stack-id: standalone:atom
-dsr1-fp4-mi355x-sglang-disagg-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: sglang-disagg
-  hardware-profile-id: amd:mi355x_288gb
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id014
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id014
-    workload-type: code
-  model: amd/DeepSeek-R1-0528-MXFP4
-  model-prefix: dsr1
-  precision: fp4
-  runner: mi355x-disagg
-  runtime-stack-id: standalone:sglang-disagg
-dsr1-fp4-mi355x-sglang-disagg-mtp-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: sglang-disagg
-  hardware-profile-id: amd:mi355x_288gb
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id015
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id015
-    workload-type: code
-  model: amd/DeepSeek-R1-0528-MXFP4
-  model-prefix: dsr1
-  precision: fp4
-  runner: mi355x-disagg
-  runtime-stack-id: standalone:sglang-disagg
-dsr1-fp4-mi355x-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: sglang
-  hardware-profile-id: amd:mi355x_288gb
-  image: lmsysorg/sglang:v0.5.9-rocm700-mi35x
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id016
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id016
-    workload-type: code
-  model: amd/DeepSeek-R1-0528-MXFP4-Preview
-  model-prefix: dsr1
-  precision: fp4
-  runner: mi355x
-  runtime-stack-id: standalone:sglang
-dsr1-fp8-b200-dynamo-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id017
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id017
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: b200-multinode
-  runtime-stack-id: dynamo:sglang
-dsr1-fp8-b200-dynamo-sglang-mtp-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id018
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id018
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: b200-multinode
-  runtime-stack-id: dynamo:sglang
-dsr1-fp8-b200-dynamo-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-trt
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id019
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id019
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: b200-multinode
-  runtime-stack-id: dynamo:trt
-dsr1-fp8-b200-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: lmsysorg/sglang:v0.5.9-cu130
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id020
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id020
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: b200
-  runtime-stack-id: standalone:sglang
-dsr1-fp8-b200-sglang-mtp-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: lmsysorg/sglang:v0.5.9-cu130
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id021
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id021
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: b200
-  runtime-stack-id: standalone:sglang
-dsr1-fp8-b200-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: trt
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id022
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id022
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: b200
-  runtime-stack-id: standalone:trt
-dsr1-fp8-b200-trt-mtp-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: trt
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post3
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id023
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id023
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: b200
-  runtime-stack-id: standalone:trt
-dsr1-fp8-b300-dynamo-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-trt
-  hardware-profile-id: nvidia:b300_sxm_288gb
-  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id024
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id024
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: b300
-  runtime-stack-id: dynamo:trt
-dsr1-fp8-gb200-dynamo-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-sglang
-  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
-  image: lmsysorg/sglang:v0.5.8.post1-cu130
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id025
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id025
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: gb200
-  runtime-stack-id: dynamo:sglang
-dsr1-fp8-gb200-dynamo-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-trt
-  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
-  image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id026
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id026
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: gb200
-  runtime-stack-id: dynamo:trt
-dsr1-fp8-gb300-dynamo-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-sglang
-  hardware-profile-id: nvidia:gb300_grace_blackwell_288gb
-  image: lmsysorg/sglang:v0.5.8.post1-cu130
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id027
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id027
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: gb300
-  runtime-stack-id: dynamo:sglang
-dsr1-fp8-gb300-dynamo-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-trt
-  hardware-profile-id: nvidia:gb300_grace_blackwell_288gb
-  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id028
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id028
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: gb300
-  runtime-stack-id: dynamo:trt
-dsr1-fp8-h100-dynamo-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-sglang
-  hardware-profile-id: nvidia:h100_sxm_80gb
-  image: lmsysorg/sglang:v0.5.8-cu130
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id029
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id029
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: h100-multinode
-  runtime-stack-id: dynamo:sglang
-dsr1-fp8-h100-dynamo-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-trt
-  hardware-profile-id: nvidia:h100_sxm_80gb
-  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id030
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id030
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: h100-multinode
-  runtime-stack-id: dynamo:trt
-dsr1-fp8-h200-dynamo-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-sglang
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  image: lmsysorg/sglang:v0.5.8.post1-cu130
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id031
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id031
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: h200-multinode
-  runtime-stack-id: dynamo:sglang
-dsr1-fp8-h200-dynamo-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: dynamo-trt
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id032
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id032
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: h200-multinode
-  runtime-stack-id: dynamo:trt
-dsr1-fp8-h200-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: sglang
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  image: lmsysorg/sglang:v0.5.9-cu130
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id033
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id033
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: h200
-  runtime-stack-id: standalone:sglang
-dsr1-fp8-h200-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: trt
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id034
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id034
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: h200
-  runtime-stack-id: standalone:trt
-dsr1-fp8-h200-trt-mtp-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: trt
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id035
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id035
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: h200
-  runtime-stack-id: standalone:trt
-dsr1-fp8-mi300x-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: sglang
-  hardware-profile-id: amd:mi300x_192gb
-  image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id036
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id036
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: mi300x
-  runtime-stack-id: standalone:sglang
-dsr1-fp8-mi325x-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: sglang
-  hardware-profile-id: amd:mi325x_288gb
-  image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id037
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id037
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: mi325x
-  runtime-stack-id: standalone:sglang
-dsr1-fp8-mi355x-atom-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: atom
-  hardware-profile-id: amd:mi355x_288gb
-  image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id038
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id038
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: mi355x
-  runtime-stack-id: standalone:atom
-dsr1-fp8-mi355x-atom-mtp-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: atom
-  hardware-profile-id: amd:mi355x_288gb
-  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id039
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id039
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: mi355x
-  runtime-stack-id: standalone:atom
-dsr1-fp8-mi355x-sglang-disagg-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: sglang-disagg
-  hardware-profile-id: amd:mi355x_288gb
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id040
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id040
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: mi355x-disagg
-  runtime-stack-id: standalone:sglang-disagg
-dsr1-fp8-mi355x-sglang-disagg-mtp-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: sglang-disagg
-  hardware-profile-id: amd:mi355x_288gb
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id041
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id041
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: mi355x-disagg
-  runtime-stack-id: standalone:sglang-disagg
-dsr1-fp8-mi355x-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: deepseek_r1_0528
-  framework: sglang
-  hardware-profile-id: amd:mi355x_288gb
-  image: lmsysorg/sglang:v0.5.9-rocm700-mi35x
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id042
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id042
-    workload-type: code
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  runner: mi355x
-  runtime-stack-id: standalone:sglang
-glm5-fp4-b200-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: glm_5
-  framework: sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id043
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id043
-    workload-type: code
-  model: nvidia/GLM-5-NVFP4
-  model-prefix: glm5
-  precision: fp4
-  runner: b200
-  runtime-stack-id: standalone:sglang
-glm5-fp8-b200-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: glm_5
-  framework: sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id044
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id044
-    workload-type: code
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  precision: fp8
-  runner: b200
-  runtime-stack-id: standalone:sglang
-glm5-fp8-h200-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: glm_5
-  framework: sglang
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  image: lmsysorg/sglang:glm5-hopper
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id045
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id045
-    workload-type: code
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  precision: fp8
-  runner: h200
-  runtime-stack-id: standalone:sglang
-glm5-fp8-mi355x-atom-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: glm_5
-  framework: atom
-  hardware-profile-id: amd:mi355x_288gb
-  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id046
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id046
-    workload-type: code
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  precision: fp8
-  runner: mi355x
-  runtime-stack-id: standalone:atom
-glm5-fp8-mi355x-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: glm_5
-  framework: sglang
-  hardware-profile-id: amd:mi355x_288gb
-  image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id047
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id047
-    workload-type: code
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  precision: fp8
-  runner: mi355x
-  runtime-stack-id: standalone:sglang
-gptoss-fp4-b200-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: gpt_oss_120b
-  framework: trt
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2.post2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id048
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id048
-    workload-type: code
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  runner: b200
-  runtime-stack-id: standalone:trt
-gptoss-fp4-b200-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: gpt_oss_120b
-  framework: vllm
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: vllm/vllm-openai:v0.15.1
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id049
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id049
-    workload-type: code
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  runner: b200
-  runtime-stack-id: standalone:vllm
-gptoss-fp4-gb200-dynamo-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: gpt_oss_120b
-  framework: dynamo-trt
-  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
-  image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.7.0.post2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id050
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id050
-    workload-type: code
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  runner: gb200
-  runtime-stack-id: dynamo:trt
-gptoss-fp4-h100-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: gpt_oss_120b
-  framework: vllm
-  hardware-profile-id: nvidia:h100_sxm_80gb
-  image: vllm/vllm-openai:v0.18.0
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id051
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id051
-    workload-type: code
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  runner: h100
-  runtime-stack-id: standalone:vllm
-gptoss-fp4-h200-trt-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: gpt_oss_120b
-  framework: trt
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc11
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id052
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id052
-    workload-type: code
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  runner: h200
-  runtime-stack-id: standalone:trt
-gptoss-fp4-h200-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: gpt_oss_120b
-  framework: vllm
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  image: vllm/vllm-openai:v0.18.0
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id053
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id053
-    workload-type: code
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  runner: h200
-  runtime-stack-id: standalone:vllm
-gptoss-fp4-mi300x-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: gpt_oss_120b
-  framework: vllm
-  hardware-profile-id: amd:mi300x_192gb
-  image: vllm/vllm-openai-rocm:v0.17.0
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id054
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id054
-    workload-type: code
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  runner: mi300x
-  runtime-stack-id: standalone:vllm
-gptoss-fp4-mi325x-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: gpt_oss_120b
-  framework: vllm
-  hardware-profile-id: amd:mi325x_288gb
-  image: vllm/vllm-openai-rocm:v0.17.0
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id055
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id055
-    workload-type: code
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  runner: mi325x
-  runtime-stack-id: standalone:vllm
-gptoss-fp4-mi355x-atom-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: gpt_oss_120b
-  framework: atom
-  hardware-profile-id: amd:mi355x_288gb
-  image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id056
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id056
-    workload-type: code
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  runner: mi355x
-  runtime-stack-id: standalone:atom
-gptoss-fp4-mi355x-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: gpt_oss_120b
-  framework: vllm
-  hardware-profile-id: amd:mi355x_288gb
-  image: vllm/vllm-openai-rocm:v0.17.0
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id057
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id057
-    workload-type: code
-  model: amd/gpt-oss-120b-w-mxfp4-a-fp8
-  model-prefix: gptoss
-  precision: fp4
-  runner: mi355x
-  runtime-stack-id: standalone:vllm
-kimik2.5-fp4-b200-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: kimi_k2_5
-  framework: vllm
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: vllm/vllm-openai:v0.17.0
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id058
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id058
-    workload-type: code
-  model: nvidia/Kimi-K2.5-NVFP4
-  model-prefix: kimik2.5
-  precision: fp4
-  runner: b200
-  runtime-stack-id: standalone:vllm
-kimik2.5-fp4-gb200-dynamo-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: kimi_k2_5
-  framework: dynamo-vllm
-  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
-  image: vllm/vllm-openai:v0.18.0-cu130
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id059
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id059
-    workload-type: code
-  model: nvidia/Kimi-K2.5-NVFP4
-  model-prefix: kimik2.5
-  precision: fp4
-  runner: gb200
-  runtime-stack-id: dynamo:vllm
-kimik2.5-fp4-mi355x-atom-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: kimi_k2_5
-  framework: atom
-  hardware-profile-id: amd:mi355x_288gb
-  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id060
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id060
-    workload-type: code
-  model: amd/Kimi-K2.5-MXFP4
-  model-prefix: kimik2.5
-  precision: fp4
-  runner: mi355x
-  runtime-stack-id: standalone:atom
-kimik2.5-fp4-mi355x-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: kimi_k2_5
-  framework: vllm
-  hardware-profile-id: amd:mi355x_288gb
-  image: vllm/vllm-openai-rocm:v0.18.0
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id061
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id061
-    workload-type: code
-  model: amd/Kimi-K2.5-MXFP4
-  model-prefix: kimik2.5
-  precision: fp4
-  runner: mi355x
-  runtime-stack-id: standalone:vllm
-kimik2.5-int4-b200-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: kimi_k2_5
-  framework: vllm
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: vllm/vllm-openai:v0.15.1
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id062
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id062
-    workload-type: code
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  precision: int4
-  runner: b200
-  runtime-stack-id: standalone:vllm
-kimik2.5-int4-h200-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: kimi_k2_5
-  framework: vllm
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  image: vllm/vllm-openai:v0.16.0
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id063
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id063
-    workload-type: code
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  precision: int4
-  runner: h200
-  runtime-stack-id: standalone:vllm
-kimik2.5-int4-mi300x-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: kimi_k2_5
-  framework: vllm
-  hardware-profile-id: amd:mi300x_192gb
-  image: vllm/vllm-openai-rocm:v0.18.0
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id064
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id064
-    workload-type: code
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  precision: int4
-  runner: mi300x
-  runtime-stack-id: standalone:vllm
-kimik2.5-int4-mi325x-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: kimi_k2_5
-  framework: vllm
-  hardware-profile-id: amd:mi325x_288gb
-  image: vllm/vllm-openai-rocm:v0.18.0
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id065
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id065
-    workload-type: code
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  precision: int4
-  runner: mi325x
-  runtime-stack-id: standalone:vllm
-kimik2.5-int4-mi355x-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: kimi_k2_5
-  framework: vllm
-  hardware-profile-id: amd:mi355x_288gb
-  image: vllm/vllm-openai-rocm:v0.18.0
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id066
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id066
-    workload-type: code
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  precision: int4
-  runner: mi355x
-  runtime-stack-id: standalone:vllm
-minimaxm2.5-fp4-b200-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: minimax_m2_5
-  framework: vllm
hardware-profile-id: nvidia:b200_sxm_180gb - image: vllm/vllm-openai:v0.19.0-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id067 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id067 - workload-type: code - model: nvidia/MiniMax-M2.5-NVFP4 - model-prefix: minimaxm2.5 - precision: fp4 - runner: b200 - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-b200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimax_m2_5 - framework: vllm - hardware-profile-id: nvidia:b200_sxm_180gb - image: vllm/vllm-openai:v0.19.0-cu130 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id068 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id068 - workload-type: code - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - precision: fp8 - runner: b200 - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-h100-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimax_m2_5 - framework: vllm - hardware-profile-id: nvidia:h100_sxm_80gb - image: vllm/vllm-openai:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id069 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id069 - workload-type: code - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - precision: fp8 - runner: h100 - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-h200-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimax_m2_5 - framework: vllm - hardware-profile-id: nvidia:h200_sxm_141gb - image: vllm/vllm-openai:v0.18.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id070 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - support-status: unsupported - tp-configs: - - duration-s: 1800 - ep: 1 - offload-modes: - - 'on' - - 'off' - - noprefix - tp: 8 - users: *id070 - workload-type: code - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - precision: fp8 - runner: h200 - runtime-stack-id: standalone:vllm -minimaxm2.5-fp8-mi300x-vllm-isb1-kv-stress: - benchmark-type: isb1_kv_stress - canonical-model-id: minimax_m2_5 - framework: vllm - hardware-profile-id: amd:mi300x_192gb - image: vllm/vllm-openai-rocm:v0.16.0 - kv-cache-dtype: fp8 - kv-stress-configs: - - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json - request-mode: multi-turn - search-space: - - duration-s: 1800 - offload-modes: - - 'on' - - 'off' - - noprefix - users: &id071 - - 2 - - 
-minimaxm2.5-fp8-mi300x-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: minimax_m2_5
-  framework: vllm
-  hardware-profile-id: amd:mi300x_192gb
-  image: vllm/vllm-openai-rocm:v0.16.0
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id071
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id071
-    workload-type: code
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  precision: fp8
-  runner: mi300x
-  runtime-stack-id: standalone:vllm
-minimaxm2.5-fp8-mi325x-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: minimax_m2_5
-  framework: vllm
-  hardware-profile-id: amd:mi325x_288gb
-  image: vllm/vllm-openai-rocm:v0.18.0
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id072
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id072
-    workload-type: code
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  precision: fp8
-  runner: mi325x
-  runtime-stack-id: standalone:vllm
-minimaxm2.5-fp8-mi355x-atom-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: minimax_m2_5
-  framework: atom
-  hardware-profile-id: amd:mi355x_288gb
-  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id073
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id073
-    workload-type: code
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  precision: fp8
-  runner: mi355x
-  runtime-stack-id: standalone:atom
-minimaxm2.5-fp8-mi355x-vllm-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: minimax_m2_5
-  framework: vllm
-  hardware-profile-id: amd:mi355x_288gb
-  image: vllm/vllm-openai-rocm:v0.19.0
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id074
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: unsupported
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id074
-    workload-type: code
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  precision: fp8
-  runner: mi355x
-  runtime-stack-id: standalone:vllm
-qwen3.5-bf16-b200-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: qwen3_5_397b_a17b
-  framework: sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id075
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id075
-    workload-type: code
-  model: Qwen/Qwen3.5-397B-A17B
-  model-prefix: qwen3.5
-  precision: bf16
-  runner: b200
-  runtime-stack-id: standalone:sglang
-qwen3.5-bf16-mi300x-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: qwen3_5_397b_a17b
-  framework: sglang
-  hardware-profile-id: amd:mi300x_192gb
-  image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id076
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id076
-    workload-type: code
-  model: Qwen/Qwen3.5-397B-A17B
-  model-prefix: qwen3.5
-  precision: bf16
-  runner: mi300x
-  runtime-stack-id: standalone:sglang
-qwen3.5-bf16-mi325x-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: qwen3_5_397b_a17b
-  framework: sglang
-  hardware-profile-id: amd:mi325x_288gb
-  image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id077
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id077
-    workload-type: code
-  model: Qwen/Qwen3.5-397B-A17B
-  model-prefix: qwen3.5
-  precision: bf16
-  runner: mi325x
-  runtime-stack-id: standalone:sglang
-qwen3.5-bf16-mi355x-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: qwen3_5_397b_a17b
-  framework: sglang
-  hardware-profile-id: amd:mi355x_288gb
-  image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260215
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id078
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id078
-    workload-type: code
-  model: Qwen/Qwen3.5-397B-A17B
-  model-prefix: qwen3.5
-  precision: bf16
-  runner: mi355x
-  runtime-stack-id: standalone:sglang
-qwen3.5-fp4-b200-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: qwen3_5_397b_a17b
-  framework: sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id079
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id079
-    workload-type: code
-  model: nvidia/Qwen3.5-397B-A17B-NVFP4
-  model-prefix: qwen3.5
-  precision: fp4
-  runner: b200
-  runtime-stack-id: standalone:sglang
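Two YAML details in these dumped stanzas are easy to misread: the offload modes quote 'on' and 'off' because YAML 1.1 loaders coerce the bare words to booleans, and the &idNNN / *idNNN pairs are ordinary anchors that let tp-configs reuse the exact users list from search-space. A small sanity check with PyYAML:

    # Why the offload modes are written 'on'/'off' while noprefix is bare:
    # YAML 1.1 loaders such as PyYAML read unquoted on/off as booleans.
    import yaml

    print(yaml.safe_load("[on, off, noprefix]"))      # [True, False, 'noprefix']
    print(yaml.safe_load("['on', 'off', noprefix]"))  # ['on', 'off', 'noprefix']

    # The &idNNN / *idNNN pairs are plain anchors: after loading, both keys
    # reference the same list object.
    doc = yaml.safe_load("a: &u [2, 4, 8]\nb: *u")
    print(doc["a"] is doc["b"])  # True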
-qwen3.5-fp4-mi355x-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: qwen3_5_397b_a17b
-  framework: sglang
-  hardware-profile-id: amd:mi355x_288gb
-  image: lmsysorg/sglang:v0.5.10-rocm720-mi35x
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id080
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id080
-    workload-type: code
-  model: amd/Qwen3.5-397B-A17B-MXFP4
-  model-prefix: qwen3.5
-  precision: fp4
-  runner: mi355x
-  runtime-stack-id: standalone:sglang
-qwen3.5-fp8-b200-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: qwen3_5_397b_a17b
-  framework: sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: lmsysorg/sglang:v0.5.9-cu130-amd64
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id081
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id081
-    workload-type: code
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  runner: b200
-  runtime-stack-id: standalone:sglang
-qwen3.5-fp8-b200-sglang-mtp-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: qwen3_5_397b_a17b
-  framework: sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  image: lmsysorg/sglang:v0.5.9-cu130
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id082
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id082
-    workload-type: code
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  runner: b200
-  runtime-stack-id: standalone:sglang
-qwen3.5-fp8-h200-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: qwen3_5_397b_a17b
-  framework: sglang
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  image: lmsysorg/sglang:v0.5.9-cu129-amd64
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id083
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id083
-    workload-type: code
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  runner: h200
-  runtime-stack-id: standalone:sglang
-qwen3.5-fp8-h200-sglang-mtp-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: qwen3_5_397b_a17b
-  framework: sglang
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  image: lmsysorg/sglang:v0.5.10.post1
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id084
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id084
-    workload-type: code
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  runner: h200
-  runtime-stack-id: standalone:sglang
-qwen3.5-fp8-mi300x-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: qwen3_5_397b_a17b
-  framework: sglang
-  hardware-profile-id: amd:mi300x_192gb
-  image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id085
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id085
-    workload-type: code
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  runner: mi300x
-  runtime-stack-id: standalone:sglang
-qwen3.5-fp8-mi325x-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: qwen3_5_397b_a17b
-  framework: sglang
-  hardware-profile-id: amd:mi325x_288gb
-  image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id086
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id086
-    workload-type: code
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  runner: mi325x
-  runtime-stack-id: standalone:sglang
-qwen3.5-fp8-mi355x-sglang-isb1-kv-stress:
-  benchmark-type: isb1_kv_stress
-  canonical-model-id: qwen3_5_397b_a17b
-  framework: sglang
-  hardware-profile-id: amd:mi355x_288gb
-  image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218
-  kv-cache-dtype: fp8
-  kv-stress-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    search-space:
-    - duration-s: 1800
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      users: &id087
-      - 2
-      - 4
-      - 8
-      - 16
-      - 32
-      - 64
-      - 128
-      - 256
-      - 512
-    support-status: reviewed_preview
-    tp-configs:
-    - duration-s: 1800
-      ep: 1
-      offload-modes:
-      - 'on'
-      - 'off'
-      - noprefix
-      tp: 8
-      users: *id087
-    workload-type: code
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  runner: mi355x
-  runtime-stack-id: standalone:sglang
diff --git a/.github/configs/isb1-master.yaml b/.github/configs/isb1-master.yaml
deleted file mode 100644
index ff71182e5..000000000
--- a/.github/configs/isb1-master.yaml
+++ /dev/null
@@ -1,869 +0,0 @@
-# ISB1 master sweep config.
-#
-# Each stanza pairs a runtime/hardware/model identity with one or more
-# replay-config entries. Every (runtime-stack-id, hardware-profile-id,
-# canonical-model-id) triple must exist in the referenced export bundle;
-# row-level filtering in the hydrator selects the matching cells.
-#
-# Export-file paths are flat under datasets/isb1/exports/ (no per-engine
-# subdirs). Qwen3.5 bundles are suffixed _qwen3.5.json to keep identity
-# triples unambiguous.
-#
-# Core entries keep an explicit 8k1k max-model-len. Extension entries
-# intentionally omit max-model-len so the ISB1 workflow derives the
-# served-shape value from the export stem (32k1k / 64k1k / 131k1k) at
-# execution time.
-#
-# support-status:
-#   supported — benchmark_certification_status=dataset_replay_verified
-#   reviewed_preview — preview rows, pinned explicitly for disclosure
-#   unsupported — rows retained for bundle coverage; not executed
-#
-# Current closure: dsr1, gptoss, qwen3.5 on standalone:vllm across core 8k1k
-# and extension 32k/64k/131k bands, plus bounded 500k code preview on
-# standalone:sglang for gptoss and qwen3.5. SGLang core/extension lanes are
-# deferred until matching cells are materialized in the corresponding
-# bundles.
-
-dsr1-fp8-b200-isb1-vllm:
-  image: vllm/vllm-openai:v0.19.0-cu130
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  framework: vllm
-  runner: b200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  canonical-model-id: deepseek_r1_0528
-  max-model-len: 10240
-  replay-configs:
-  - export-file: datasets/isb1/exports/core/chat_8k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/core/code_8k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-
-dsr1-fp8-h200-isb1-vllm:
-  image: vllm/vllm-openai:v0.18.0
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  framework: vllm
-  runner: h200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  canonical-model-id: deepseek_r1_0528
-  max-model-len: 10240
-  replay-configs:
-  - export-file: datasets/isb1/exports/core/chat_8k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/core/code_8k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
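The header above says extension entries omit max-model-len so the workflow can derive the served shape from export stems like 32k1k / 64k1k / 131k1k; the deleted workflow template later in this patch does exactly this with a named-group regex. A condensed sketch of that derivation:

    # Served-shape derivation per the header comment: a stem such as
    # code_131k1k encodes ISL/OSL in units of 1024 tokens.
    import re

    def derive_served_shape(stem):
        match = re.search(r"(?P<isl>\d+)k(?P<osl>\d+)k", stem)
        if match is None:
            raise ValueError(f"no <isl>k<osl>k shape in stem: {stem!r}")
        return int(match.group("isl")) * 1024, int(match.group("osl")) * 1024

    assert derive_served_shape("code_131k1k") == (134144, 1024)
    assert derive_served_shape("chat_8k1k_qwen3.5") == (8192, 1024)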
-
-gptoss-fp4-b200-isb1-vllm:
-  # Keep the existing B200 GPT-OSS vLLM pin from the official throughput lane.
-  image: vllm/vllm-openai:v0.15.1
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  framework: vllm
-  runner: b200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  canonical-model-id: gpt_oss_120b
-  max-model-len: 10240
-  replay-configs:
-  - export-file: datasets/isb1/exports/core/chat_8k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/core/code_8k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-
-gptoss-fp4-h100-isb1-vllm:
-  image: vllm/vllm-openai:v0.18.0
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  framework: vllm
-  runner: h100
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h100_sxm_80gb
-  canonical-model-id: gpt_oss_120b
-  max-model-len: 10240
-  replay-configs:
-  - export-file: datasets/isb1/exports/core/chat_8k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/core/code_8k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-
-gptoss-fp4-h200-isb1-vllm:
-  image: vllm/vllm-openai:v0.18.0
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  framework: vllm
-  runner: h200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  canonical-model-id: gpt_oss_120b
-  max-model-len: 10240
-  replay-configs:
-  - export-file: datasets/isb1/exports/core/chat_8k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/core/code_8k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-
-qwen3.5-fp8-b200-isb1-vllm:
-  image: vllm/vllm-openai:v0.19.0-cu130
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  framework: vllm
-  runner: b200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  canonical-model-id: qwen3_5_397b_a17b
-  max-model-len: 10240
-  replay-configs:
-  - export-file: datasets/isb1/exports/core/chat_8k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/core/code_8k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-
-qwen3.5-fp8-h100-isb1-vllm:
-  image: vllm/vllm-openai:v0.18.0
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  framework: vllm
-  runner: h100
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h100_sxm_80gb
-  canonical-model-id: qwen3_5_397b_a17b
-  max-model-len: 10240
-  replay-configs:
-  - export-file: datasets/isb1/exports/core/chat_8k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/core/code_8k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-
-qwen3.5-fp8-h200-isb1-vllm:
-  image: vllm/vllm-openai:v0.18.0
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  framework: vllm
-  runner: h200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  canonical-model-id: qwen3_5_397b_a17b
-  max-model-len: 10240
-  replay-configs:
-  - export-file: datasets/isb1/exports/core/chat_8k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/core/code_8k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-
-dsr1-fp8-b200-isb1-sglang-extension:
-  image: lmsysorg/sglang:v0.5.9-cu130
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  framework: sglang
-  runner: b200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  canonical-model-id: deepseek_r1_0528
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    support-status: unsupported
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-    - max-concurrency: 4
-
-dsr1-fp8-h200-isb1-sglang-extension:
-  image: lmsysorg/sglang:v0.5.9-cu130
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  framework: sglang
-  runner: h200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:sglang
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  canonical-model-id: deepseek_r1_0528
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    support-status: unsupported
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-    - max-concurrency: 4
-
-dsr1-fp8-b200-isb1-vllm-extension:
-  image: vllm/vllm-openai:v0.19.0-cu130
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  framework: vllm
-  runner: b200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  canonical-model-id: deepseek_r1_0528
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_32k/chat_32k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/chat_64k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    support-status: unsupported
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-    - max-concurrency: 4
-
-dsr1-fp8-h200-isb1-vllm-extension:
-  image: vllm/vllm-openai:v0.18.0
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  precision: fp8
-  framework: vllm
-  runner: h200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  canonical-model-id: deepseek_r1_0528
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_32k/chat_32k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/chat_64k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    support-status: unsupported
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-    - max-concurrency: 4
-
-gptoss-fp4-b200-isb1-vllm-extension:
-  image: vllm/vllm-openai:v0.15.1
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  framework: vllm
-  runner: b200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  canonical-model-id: gpt_oss_120b
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_32k/chat_32k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/chat_64k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-    - max-concurrency: 4
-  - export-file: datasets/isb1/exports/extension_131k/chat_131k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-    - max-concurrency: 4
-
-gptoss-fp4-h100-isb1-vllm-extension:
-  image: vllm/vllm-openai:v0.18.0
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  framework: vllm
-  runner: h100
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h100_sxm_80gb
-  canonical-model-id: gpt_oss_120b
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_32k/chat_32k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/chat_64k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-    - max-concurrency: 4
-  - export-file: datasets/isb1/exports/extension_131k/chat_131k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-    - max-concurrency: 4
-
-gptoss-fp4-h200-isb1-vllm-extension:
-  image: vllm/vllm-openai:v0.18.0
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  framework: vllm
-  runner: h200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  canonical-model-id: gpt_oss_120b
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_32k/chat_32k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/chat_64k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-    - max-concurrency: 4
-  - export-file: datasets/isb1/exports/extension_131k/chat_131k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-    - max-concurrency: 4
-
-qwen3.5-fp8-b200-isb1-sglang-extension:
-  image: lmsysorg/sglang:v0.5.9-cu130
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  framework: sglang
-  runner: b200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  canonical-model-id: qwen3_5_397b_a17b
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-    - max-concurrency: 4
-
-qwen3.5-fp8-h200-isb1-sglang-extension:
-  image: lmsysorg/sglang:v0.5.9-cu130
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  framework: sglang
-  runner: h200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:sglang
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  canonical-model-id: qwen3_5_397b_a17b
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-    - max-concurrency: 4
-
-qwen3.5-fp8-b200-isb1-vllm-extension:
-  image: vllm/vllm-openai:v0.19.0-cu130
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  framework: vllm
-  runner: b200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  canonical-model-id: qwen3_5_397b_a17b
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-    - max-concurrency: 4
-
-qwen3.5-fp8-h100-isb1-vllm-extension:
-  image: vllm/vllm-openai:v0.18.0
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  framework: vllm
-  runner: h100
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h100_sxm_80gb
-  canonical-model-id: qwen3_5_397b_a17b
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-    - max-concurrency: 4
-
-qwen3.5-fp8-h200-isb1-vllm-extension:
-  image: vllm/vllm-openai:v0.18.0
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  framework: vllm
-  runner: h200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  canonical-model-id: qwen3_5_397b_a17b
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-    - max-concurrency: 4
-
-qwen3.5-fp8-b200-isb1-sglang-500k-preview-code:
-  image: lmsysorg/sglang:v0.5.9-cu130
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  framework: sglang
-  runner: b200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  canonical-model-id: qwen3_5_397b_a17b
-  max-model-len: 524288
-  replay-configs:
-  - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 1
-      max-sessions: 2
-      max-turns-per-session: 4
-      num-warmup-sessions: 0
-
-qwen3.5-fp8-h100-isb1-sglang-500k-preview-code:
-  image: lmsysorg/sglang:v0.5.9-cu130
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  framework: sglang
-  runner: h100
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:sglang
-  hardware-profile-id: nvidia:h100_sxm_80gb
-  canonical-model-id: qwen3_5_397b_a17b
-  max-model-len: 524288
-  replay-configs:
-  - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 1
-      max-sessions: 2
-      max-turns-per-session: 4
-      num-warmup-sessions: 0
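The 500k preview lanes above are deliberately bounded: with max-concurrency 1, max-sessions 2, and max-turns-per-session 4, a lane replays at most eight turns, strictly serially. The arithmetic, spelled out:

    # Upper bound on replayed requests for one bounded 500k preview lane.
    space = {"max-concurrency": 1, "max-sessions": 2,
             "max-turns-per-session": 4, "num-warmup-sessions": 0}
    max_requests = space["max-sessions"] * space["max-turns-per-session"]
    print(max_requests)  # 8, served one at a time at concurrency 1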
-
-qwen3.5-fp8-h200-isb1-sglang-500k-preview-code:
-  image: lmsysorg/sglang:v0.5.9-cu130
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  framework: sglang
-  runner: h200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:sglang
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  canonical-model-id: qwen3_5_397b_a17b
-  max-model-len: 524288
-  replay-configs:
-  - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 1
-      max-sessions: 2
-      max-turns-per-session: 4
-      num-warmup-sessions: 0
-
-gptoss-fp4-b200-isb1-sglang-500k-preview-code:
-  image: lmsysorg/sglang:v0.5.9-cu130
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  framework: sglang
-  runner: b200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  canonical-model-id: gpt_oss_120b
-  max-model-len: 524288
-  replay-configs:
-  - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 1
-      max-sessions: 2
-      max-turns-per-session: 4
-      num-warmup-sessions: 0
-
-gptoss-fp4-h100-isb1-sglang-500k-preview-code:
-  image: lmsysorg/sglang:v0.5.9-cu130
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  framework: sglang
-  runner: h100
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:sglang
-  hardware-profile-id: nvidia:h100_sxm_80gb
-  canonical-model-id: gpt_oss_120b
-  max-model-len: 524288
-  replay-configs:
-  - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 1
-      max-sessions: 2
-      max-turns-per-session: 4
-      num-warmup-sessions: 0
-
-gptoss-fp4-h200-isb1-sglang-500k-preview-code:
-  image: lmsysorg/sglang:v0.5.9-cu130
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  precision: fp4
-  framework: sglang
-  runner: h200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:sglang
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  canonical-model-id: gpt_oss_120b
-  max-model-len: 524288
-  replay-configs:
-  - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 1
-      max-sessions: 2
-      max-turns-per-session: 4
-      num-warmup-sessions: 0
-
diff --git a/.github/configs/isb1-qwen-1m-preview.yaml b/.github/configs/isb1-qwen-1m-preview.yaml
deleted file mode 100644
index 66ac28a67..000000000
--- a/.github/configs/isb1-qwen-1m-preview.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-# Manual-only gated Qwen 1M preview surface.
-# The selected export cells remain support-status=reviewed_preview and
-# benchmark_certification_status=dataset_replay_verified, but this file is
-# intentionally separate from isb1-master.yaml so the lane stays out of the
-# ordinary runnable support statement.
-#
-# Use only for explicit validation dispatches while KV-offload observability and
-# correctness remain under review. Running this file does not imply native 1M
-# served-lane support or KV-offload certification.
-
-qwen3.5-fp8-b200-isb1-sglang-1m-gated-preview-code:
-  image: lmsysorg/sglang:v0.5.9-cu130
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  precision: fp8
-  framework: sglang
-  runner: b200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:sglang
-  hardware-profile-id: nvidia:b200_sxm_180gb
-  canonical-model-id: qwen3_5_397b_a17b
-  max-model-len: 1048576
-  replay-configs:
-  - export-file: datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 1
-      max-sessions: 1
-      max-turns-per-session: 3
-      num-warmup-sessions: 0
-
diff --git a/.github/configs/isb1-triattn-preview.yaml b/.github/configs/isb1-triattn-preview.yaml
deleted file mode 100644
index ee482c046..000000000
--- a/.github/configs/isb1-triattn-preview.yaml
+++ /dev/null
@@ -1,291 +0,0 @@
-# TriAttention KV-compression preview lanes for ISB1 replay benchmarks.
-#
-# These entries deploy vLLM with the TriAttention plugin enabled for runtime
-# KV-cache compression on H100/H200 Hopper-class GPUs. The plugin uses env
-# vars TRIATTN_RUNTIME_KV_BUDGET and TRIATTN_RUNTIME_SPARSE_STATS_PATH,
-# configured in the benchmark scripts.
-#
-# Key differences from baseline vLLM ISB1 entries:
-# - model-prefix includes "triattn" suffix to route to dedicated scripts
-# - Prefix caching disabled (incompatible with KV compression)
-# - max-num-batched-tokens lowered to 1024 (prevents OOM from large prefills)
-# - KV budget auto-detected: 2048 for code workloads, 12000 for chat workloads
-#
-# This file is intentionally separate from isb1-master.yaml — TriAttention
-# preview lanes stay out of the ordinary runnable support statement.
-# Use only for explicit validation dispatches.
-#
-# Prerequisites:
-# - triattention pip package installed in the container (or installed at runtime)
-# - Optional: pre-calibrated stats at /workspace/triattn_stats/_stats.pt
-
-# ---------------------------------------------------------------------------
-# DeepSeek-R1 FP8 — H100/H200 with TriAttention — core 8k1k
-# ---------------------------------------------------------------------------
-
-dsr1triattn-fp8-h100-isb1-vllm:
-  image: vllm/vllm-openai:v0.18.0
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1triattn
-  precision: fp8
-  framework: vllm
-  runner: h100
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h100_sxm_80gb
-  canonical-model-id: deepseek_r1_0528
-  max-model-len: 10240
-  replay-configs:
-  - export-file: datasets/isb1/exports/core/chat_8k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/core/code_8k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-
-dsr1triattn-fp8-h200-isb1-vllm:
-  image: vllm/vllm-openai:v0.18.0
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1triattn
-  precision: fp8
-  framework: vllm
-  runner: h200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  canonical-model-id: deepseek_r1_0528
-  max-model-len: 10240
-  replay-configs:
-  - export-file: datasets/isb1/exports/core/chat_8k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/core/code_8k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
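Per the TriAttention header above, the KV budget is auto-detected (2048 for code workloads, 12000 for chat) and passed through TRIATTN_RUNTIME_KV_BUDGET. A hedged sketch of that selection as the benchmark scripts might implement it; the helper name is illustrative, only the env var names and budget values come from the header:

    # Illustrative only: budgets per the header comment (2048 code, 12000 chat).
    import os

    def triattn_env(workload_type, stats_path=None):
        budget = 2048 if workload_type == "code" else 12000
        env = {"TRIATTN_RUNTIME_KV_BUDGET": str(budget)}
        if stats_path:  # optional pre-calibrated stats, per the prerequisites note
            env["TRIATTN_RUNTIME_SPARSE_STATS_PATH"] = stats_path
        return env

    print(triattn_env("code"))  # {'TRIATTN_RUNTIME_KV_BUDGET': '2048'}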
-
-# ---------------------------------------------------------------------------
-# DeepSeek-R1 FP8 — H100/H200 with TriAttention — long-context extensions
-# ---------------------------------------------------------------------------
-
-dsr1triattn-fp8-h100-isb1-vllm-extension:
-  image: vllm/vllm-openai:v0.18.0
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1triattn
-  precision: fp8
-  framework: vllm
-  runner: h100
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h100_sxm_80gb
-  canonical-model-id: deepseek_r1_0528
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-  - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-
-dsr1triattn-fp8-h200-isb1-vllm-extension:
-  image: vllm/vllm-openai:v0.18.0
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1triattn
-  precision: fp8
-  framework: vllm
-  runner: h200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  canonical-model-id: deepseek_r1_0528
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-  - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-
-# ---------------------------------------------------------------------------
-# Qwen 3.5 FP8 — H100/H200 with TriAttention — extension only
-# (Qwen 3.5 is not present in core 8k1k exports; only extension 131k)
-# ---------------------------------------------------------------------------
-
-qwen3.5triattn-fp8-h100-isb1-vllm-extension:
-  image: vllm/vllm-openai:v0.18.0
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5triattn
-  precision: fp8
-  framework: vllm
-  runner: h100
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h100_sxm_80gb
-  canonical-model-id: qwen3_5_397b_a17b
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-
-qwen3.5triattn-fp8-h200-isb1-vllm-extension:
-  image: vllm/vllm-openai:v0.18.0
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5triattn
-  precision: fp8
-  framework: vllm
-  runner: h200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  canonical-model-id: qwen3_5_397b_a17b
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 2
-      num-warmup-sessions: 1
-
-# ---------------------------------------------------------------------------
-# GPT-OSS-120B FP4 — H100/H200 with TriAttention — core 8k1k
-# ---------------------------------------------------------------------------
-
-gptosstriattn-fp4-h100-isb1-vllm:
-  image: vllm/vllm-openai:v0.18.0
-  model: openai/gpt-oss-120b
-  model-prefix: gptosstriattn
-  precision: fp4
-  framework: vllm
-  runner: h100
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h100_sxm_80gb
-  canonical-model-id: gpt_oss_120b
-  max-model-len: 10240
-  replay-configs:
-  - export-file: datasets/isb1/exports/core/chat_8k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/core/code_8k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-
-gptosstriattn-fp4-h200-isb1-vllm:
-  image: vllm/vllm-openai:v0.18.0
-  model: openai/gpt-oss-120b
-  model-prefix: gptosstriattn
-  precision: fp4
-  framework: vllm
-  runner: h200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  canonical-model-id: gpt_oss_120b
-  max-model-len: 10240
-  replay-configs:
-  - export-file: datasets/isb1/exports/core/chat_8k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-  - export-file: datasets/isb1/exports/core/code_8k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-    - max-concurrency: 8
-
-# ---------------------------------------------------------------------------
-# GPT-OSS-120B FP4 — H100/H200 with TriAttention — long-context extensions
-# ---------------------------------------------------------------------------
-
-gptosstriattn-fp4-h100-isb1-vllm-extension:
-  image: vllm/vllm-openai:v0.18.0
-  model: openai/gpt-oss-120b
-  model-prefix: gptosstriattn
-  precision: fp4
-  framework: vllm
-  runner: h100
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h100_sxm_80gb
-  canonical-model-id: gpt_oss_120b
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-  - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-
-gptosstriattn-fp4-h200-isb1-vllm-extension:
-  image: vllm/vllm-openai:v0.18.0
-  model: openai/gpt-oss-120b
-  model-prefix: gptosstriattn
-  precision: fp4
-  framework: vllm
-  runner: h200
-  benchmark-type: isb1_replay
-  runtime-stack-id: standalone:vllm
-  hardware-profile-id: nvidia:h200_sxm_141gb
-  canonical-model-id: gpt_oss_120b
-  replay-configs:
-  - export-file: datasets/isb1/exports/extension_32k/code_32k1k.json
-    request-mode: multi-turn
-    support-status: reviewed_preview
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
-  - export-file: datasets/isb1/exports/extension_64k/code_64k1k.json
-    request-mode: multi-turn
-    support-status: supported
-    search-space:
-    - max-concurrency: 4
-      num-warmup-sessions: 1
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 27ee51eef..5b550879c 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1826,7 +1826,7 @@ glm5-fp8-b200-sglang:
     - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
 
 glm5-fp4-b200-sglang:
-  image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5
+  image: lmsysorg/sglang:v0.5.10.post1-cu130
   model: nvidia/GLM-5-NVFP4
   model-prefix: glm5
   runner: b200
@@ -1863,6 +1863,43 @@ qwen3.5-fp8-b200-sglang-mtp:
     search-space:
     - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+
+qwen3.5-fp8-b300-sglang-mtp:
+  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: b300
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+
+qwen3.5-fp8-b300-sglang:
+  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: b300
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
+
 kimik2.5-int4-b200-vllm:
   image: vllm/vllm-openai:v0.15.1
   model: moonshotai/Kimi-K2.5
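The b300 entries added above sweep concurrency from conc-start 4 to conc-end 256. Assuming the sweep doubles at each step, matching the power-of-two users ladders used elsewhere in this patch (the real sweep driver may step differently), the ladder expands as:

    # Assumption: conc-start/conc-end expand by doubling; illustrative only.
    def concurrency_ladder(start, end):
        ladder, c = [], start
        while c <= end:
            ladder.append(c)
            c *= 2
        return ladder

    print(concurrency_ladder(4, 256))  # [4, 8, 16, 32, 64, 128, 256]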
required: true - type: string - precision: - required: true - type: string - framework: - required: true - type: string - exp-name: - required: true - type: string - benchmark-type: - required: true - type: string - export-file: - required: true - type: string - runtime-stack-id: - required: true - type: string - hardware-profile-id: - required: true - type: string - canonical-model-id: - required: true - type: string - support-status: - required: false - type: string - default: '' - request-mode: - required: true - type: string - max-concurrency: - required: true - type: string - max-sessions: - required: false - type: string - default: '' - max-turns-per-session: - required: false - type: string - default: '' - max-output-len: - required: false - type: string - default: '' - num-warmup-sessions: - required: false - type: string - default: '0' - ignore-waits: - required: false - type: boolean - default: false - ignore-eos: - required: false - type: boolean - default: false - max-model-len: - required: false - type: string - default: '' - tp-override: - required: false - type: string - default: '' - ep-override: - required: false - type: string - default: '' - trace-source: - required: false - type: string - default: '' - offload-mode: - required: false - type: string - default: '' - kv-cache-dtype: - required: false - type: string - default: '' - disable-prefix-caching: - required: false - type: boolean - default: false - benchmark-duration-s: - required: false - type: string - default: '' - workload-type: - required: false - type: string - default: '' - vllm-cpu-offload-gb: - required: false - type: string - default: '' - vllm-swap-space-gb: - required: false - type: string - default: '' - sglang-mem-fraction-override: - required: false - type: string - default: '' - sglang-chunked-prefill-override: - required: false - type: string - default: '' - ref: - description: Git ref (branch/sha) to checkout - required: false - type: string - -env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - HF_HUB_CACHE: '/mnt/hf_hub_cache/' - EXP_NAME: ${{ inputs.exp-name }} - MODEL: ${{ inputs.model }} - MODEL_PREFIX: ${{ inputs.model-prefix }} - IMAGE: ${{ inputs.image }} - FRAMEWORK: ${{ inputs.framework }} - PRECISION: ${{ inputs.precision }} - BENCHMARK_TYPE: ${{ inputs.benchmark-type }} - EXPORT_FILE: ${{ inputs.export-file }} - RUNTIME_STACK_ID: ${{ inputs.runtime-stack-id }} - HARDWARE_PROFILE_ID: ${{ inputs.hardware-profile-id }} - CANONICAL_MODEL_ID: ${{ inputs.canonical-model-id }} - SUPPORT_STATUS: ${{ inputs.support-status }} - REQUEST_MODE: ${{ inputs.request-mode }} - MAX_CONCURRENCY: ${{ inputs.max-concurrency }} - MAX_SESSIONS: ${{ inputs.max-sessions }} - MAX_TURNS_PER_SESSION: ${{ inputs.max-turns-per-session }} - MAX_OUTPUT_LEN: ${{ inputs.max-output-len }} - NUM_WARMUP_SESSIONS: ${{ inputs.num-warmup-sessions }} - IGNORE_WAITS: ${{ inputs.ignore-waits }} - IGNORE_EOS: ${{ inputs.ignore-eos }} - OFFLOAD_MODE: ${{ inputs.offload-mode }} - KV_CACHE_DTYPE: ${{ inputs.kv-cache-dtype }} - DISABLE_PREFIX_CACHING: ${{ inputs.disable-prefix-caching }} - BENCHMARK_DURATION_S: ${{ inputs.benchmark-duration-s }} - WORKLOAD_TYPE: ${{ inputs.workload-type }} - VLLM_CPU_OFFLOAD_GB: ${{ inputs.vllm-cpu-offload-gb }} - VLLM_SWAP_SPACE_GB: ${{ inputs.vllm-swap-space-gb }} - SGLANG_MEM_FRACTION_OVERRIDE: ${{ inputs.sglang-mem-fraction-override }} - SGLANG_CHUNKED_PREFILL_OVERRIDE: ${{ inputs.sglang-chunked-prefill-override }} - TP_OVERRIDE: ${{ inputs.tp-override }} - EP_OVERRIDE: ${{ inputs.ep-override }} - 
TRACE_SOURCE: ${{ inputs.trace-source }} - PYTHONDONTWRITEBYTECODE: '1' - PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache - -permissions: - contents: read - -jobs: - benchmark: - runs-on: ${{ inputs.runner }} - timeout-minutes: 300 - name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | ${{ inputs.benchmark-type }} conc-${{ inputs.max-concurrency }}" - steps: - - name: Resource cleanup (pre-run) - run: &resource-cleanup | - if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then - echo "[Docker] Cleaning up resources ..." - docker ps -aq | xargs -r docker rm -f - docker network prune -f - while [ -n "$(docker ps -aq)" ]; do - docker ps -a - sleep 5 - done - fi - - if command -v squeue >/dev/null 2>&1; then - if [[ "${{ runner.name }}" == h100-* || "${{ runner.name }}" == h200-* || "${{ runner.name }}" == b200-* ]]; then - echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." - scancel --name="${{ runner.name }}" || true - while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do - squeue --name="${{ runner.name }}" - sleep 5 - done - else - echo "[Slurm] Cleaning up jobs for user: $USER ..." - scancel -u "$USER" || true - while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do - squeue -u "$USER" - sleep 5 - done - fi - fi - - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - ref: ${{ inputs.ref || github.ref }} - clean: false - - - name: Certify ISB1 export contract - env: - INPUT_EXPORT_FILE: ${{ inputs.export-file }} - INPUT_RUNTIME_STACK_ID: ${{ inputs.runtime-stack-id }} - INPUT_HARDWARE_PROFILE_ID: ${{ inputs.hardware-profile-id }} - INPUT_CANONICAL_MODEL_ID: ${{ inputs.canonical-model-id }} - INPUT_SUPPORT_STATUS: ${{ inputs.support-status }} - INPUT_MAX_MODEL_LEN: ${{ inputs.max-model-len }} - run: | - python3 - <<'PY' - import json - import os - import re - from pathlib import Path - - export_path = Path(os.environ["INPUT_EXPORT_FILE"]) - if not export_path.exists(): - raise SystemExit(f"Missing ISB1 export file: {export_path}") - - payload = json.loads(export_path.read_text()) - exports = payload.get("exports") - if not isinstance(exports, list) or not exports: - raise SystemExit( - f"ISB1 export file must contain a non-empty 'exports' list: {export_path}" - ) - - support_status = os.environ.get("INPUT_SUPPORT_STATUS", "").strip() or None - explicit_max_model_len = os.environ.get("INPUT_MAX_MODEL_LEN", "").strip() - if not re.search(r"(?P<isl>\d+)k(?P<osl>\d+)k", export_path.stem) and not explicit_max_model_len: - raise SystemExit( - "Mixed-shape ISB1 exports require explicit max-model-len in the workflow input. " - f"Missing for '{export_path}'." - ) - - identity_cells = [ - cell - for cell in exports - if cell.get("runtime_stack_id") == os.environ["INPUT_RUNTIME_STACK_ID"] - and cell.get("hardware_profile_id") == os.environ["INPUT_HARDWARE_PROFILE_ID"] - and cell.get("canonical_model_id") == os.environ["INPUT_CANONICAL_MODEL_ID"] - ] - identity_statuses = sorted( - { - cell.get("support_status") - for cell in identity_cells - if cell.get("support_status") is not None - } - ) - matching_cells = [ - cell - for cell in identity_cells - if support_status is None or cell.get("support_status") == support_status - ] - - if support_status is None and len(identity_statuses) > 1: - raise SystemExit( - f"Ambiguous ISB1 support tier for {export_path}; identity spans {identity_statuses}. 
" - "Pin support-status explicitly." - ) - if not matching_cells: - raise SystemExit( - "No ISB1 export cell matches the requested workflow identity/tier for " - f"{export_path}. Available tiers for that identity: {identity_statuses or ['']}" - ) - - certification_statuses = sorted( - { - cell.get("benchmark_certification_status") - for cell in matching_cells - if cell.get("benchmark_certification_status") is not None - } - ) - if not certification_statuses: - raise SystemExit( - "Selected ISB1 export cells must declare benchmark_certification_status. " - f"Missing for '{export_path}'." - ) - if certification_statuses != ["dataset_replay_verified"]: - raise SystemExit( - "Current InferenceX ISB1 consumer lanes only accept " - "benchmark_certification_status=dataset_replay_verified. " - f"Selected cells for '{export_path}' resolved to {certification_statuses}." - ) - - print( - "Certified ISB1 export contract for " - f"{export_path} with support-status={support_status or ''} " - f"and benchmark_certification_status={certification_statuses[0]}" - ) - PY - - - name: Derive ISB1 runner env - env: - INPUT_RUNNER: ${{ inputs.runner }} - INPUT_EXPORT_FILE: ${{ inputs.export-file }} - INPUT_MAX_MODEL_LEN: ${{ inputs.max-model-len }} - INPUT_MAX_CONCURRENCY: ${{ inputs.max-concurrency }} - INPUT_TP_OVERRIDE: ${{ inputs.tp-override }} - run: | - python3 - <<'PY' >> "$GITHUB_ENV" - import json - import os - import re - from pathlib import Path - - runner = os.environ["INPUT_RUNNER"].lower() - export_file = os.environ["INPUT_EXPORT_FILE"] - explicit_max_model_len = os.environ.get("INPUT_MAX_MODEL_LEN", "").strip() - max_concurrency = os.environ["INPUT_MAX_CONCURRENCY"] - - if runner.startswith(("h100", "h200", "b200")): - tp = 8 - else: - raise SystemExit( - f"ISB1 replay lane is NVIDIA-first in PR1b; unsupported runner '{runner}'." - ) - - tp_override = os.environ.get("INPUT_TP_OVERRIDE", "").strip() - if tp_override: - tp = int(tp_override) - - if tp < 8: - raise SystemExit( - f"ISB1 replay requires TP=8 on NVIDIA runners; derived TP={tp} for runner '{runner}'." - ) - - export_path = Path(export_file) - match = re.search(r"(?P\d+)k(?P\d+)k", export_path.stem) - - if match: - isl = int(match.group("isl")) * 1024 - osl = int(match.group("osl")) * 1024 - else: - try: - payload = json.loads(export_path.read_text()) - except Exception as exc: - raise SystemExit( - f"Could not inspect preview export metadata from '{export_file}': {exc}" - ) - served_shape = payload.get("served_shape") or {} - isl = int(served_shape.get("isl", 0) or 0) - osl = int(served_shape.get("osl", 0) or 0) - if not explicit_max_model_len: - raise SystemExit( - "Mixed-shape preview exports require explicit max-model-len in the ISB1 config. " - f"Missing for '{export_file}'." 
- ) - - if explicit_max_model_len: - max_model_len = int(explicit_max_model_len) - else: - max_model_len = isl + osl + (200 if max(isl, osl) >= 8192 else 20) - - print(f"TP={tp}") - print("EP_SIZE=1") - print("DP_ATTENTION=false") - print("SPEC_DECODING=none") - print("DISAGG=false") - print(f"CONC={max_concurrency}") - print(f"ISL={isl}") - print(f"OSL={osl}") - print(f"MAX_MODEL_LEN={max_model_len}") - print("RANDOM_RANGE_RATIO=1.0") - print(f"EXPORT_STEM={Path(export_file).stem}") - PY - - - id: launch - name: Launch job script - env: - RUNNER_NAME: ${{ runner.name }} - RUNNER_TYPE: ${{ inputs.runner }} - run: | - RESULT_FILENAME="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_${BENCHMARK_TYPE}_${EXPORT_STEM}_conc${MAX_CONCURRENCY}_${RUNNER_NAME}" - echo "RESULT_FILENAME=${RESULT_FILENAME}" >> "$GITHUB_ENV" - echo "result_filename=${RESULT_FILENAME}" >> "$GITHUB_OUTPUT" - bash ./runners/launch_${RUNNER_NAME%%_*}.sh - - FOUND_RESULT_FILE= - for i in {1..10}; do - if [ -f "$RESULT_FILENAME.json" ]; then - FOUND_RESULT_FILE=true - break - fi - echo "Waiting for result file... (attempt $i)" - sleep 1 - done - - if [ -z "$FOUND_RESULT_FILE" ]; then - echo "Run failed: Replay result $RESULT_FILENAME.json not found." >&2 - exit 1 - fi - - - name: Process result - run: | - python3 utils/process_result_isb1.py - - - name: Upload result - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 - with: - name: isb1_${{ steps.launch.outputs.result_filename }} - path: agg_${{ steps.launch.outputs.result_filename }}.json - - - name: Upload raw replay result - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 - with: - name: replay_${{ steps.launch.outputs.result_filename }} - path: ${{ steps.launch.outputs.result_filename }}.json - if-no-files-found: ignore - - - name: Upload server logs - if: always() - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 - with: - name: server_logs_${{ steps.launch.outputs.result_filename }} - path: server.log - if-no-files-found: ignore - - - name: Upload GPU metrics - if: always() - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 - with: - name: gpu_metrics_${{ steps.launch.outputs.result_filename }} - path: gpu_metrics.csv - if-no-files-found: ignore - - - name: Upload KV metrics - if: always() - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 - with: - name: kv_metrics_${{ steps.launch.outputs.result_filename }} - path: kv_metrics.csv - if-no-files-found: ignore - - - name: Resource cleanup (post-run) - if: always() - run: *resource-cleanup diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 05ab23ef8..d5a6cc1f4 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -116,7 +116,7 @@ jobs: # Cleanup SLURM resources if command -v squeue >/dev/null 2>&1; then - if [[ "${{ runner.name }}" == mi355x-amds* || "${{ runner.name }}" == mi325x-amd* || "${{ runner.name }}" == mi300x-amds* || "${{ runner.name }}" == gb200-nv* || "${{ runner.name }}" == gb300-nv* || "${{ runner.name }}" == h100-cw* || "${{ runner.name }}" == h200-cw* || "${{ runner.name }}" == b200-nb* || "${{ runner.name }}" == h200-nb* || "${{ runner.name }}" == h100-dgxc-slurm* || "${{ runner.name }}" == h200-dgxc-slurm* || "${{ runner.name }}" == b200-dgxc-slurm* ]]; then + if [[ "${{ runner.name }}" == mi355x-amds* || "${{ runner.name }}" == mi325x-amd* || "${{ runner.name }}" == 
mi300x-amds* || "${{ runner.name }}" == gb200-nv* || "${{ runner.name }}" == gb300-nv* || "${{ runner.name }}" == h100-cw* || "${{ runner.name }}" == h200-cw* || "${{ runner.name }}" == b200-nb* || "${{ runner.name }}" == h200-nb* || "${{ runner.name }}" == h100-dgxc-slurm* || "${{ runner.name }}" == h200-dgxc-slurm* || "${{ runner.name }}" == b200-dgxc-slurm* || "${{ runner.name }}" == b300-nv* ]]; then echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." scancel --name="${{ runner.name }}" || true while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index 6582914ca..353918609 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -29,7 +29,6 @@ jobs: pattern: ${{ inputs.result-prefix && format('{0}_*', inputs.result-prefix) || '*' }} - name: Print summary - if: inputs.result-prefix != 'isb1' run: | pip install tabulate python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY @@ -39,29 +38,8 @@ jobs: pip install tabulate python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }} - - name: ISB1 operator summary - if: inputs.result-prefix == 'isb1' - run: | - pip install tabulate - python3 utils/summarize_isb1.py results/ >> $GITHUB_STEP_SUMMARY - - - name: ISB1 gate report - if: inputs.result-prefix == 'isb1' - run: | - AGGREGATE_PATH="agg_${{ inputs.result-prefix }}.json" - python3 utils/gate_isb1.py "$AGGREGATE_PATH" | tee isb1_gate_report.json - python3 utils/gate_isb1.py "$AGGREGATE_PATH" --format markdown >> $GITHUB_STEP_SUMMARY - - name: Upload aggregated results uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: results_${{ inputs.result-prefix || 'all' }} path: agg_${{ inputs.result-prefix || 'all' }}.json - - - name: Upload ISB1 gate report - if: inputs.result-prefix == 'isb1' - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 - with: - name: isb1_gate_report - path: isb1_gate_report.json - if-no-files-found: ignore diff --git a/.github/workflows/pr-recipe-reminder.yml b/.github/workflows/pr-recipe-reminder.yml index a8ca02743..a4d0a30a6 100644 --- a/.github/workflows/pr-recipe-reminder.yml +++ b/.github/workflows/pr-recipe-reminder.yml @@ -40,6 +40,10 @@ jobs: If it is not, please create a PR first before we can merge your PR into the master branch. Let's ensure that the documentation is first class such that the entire ML community can benefit from your hard work! Thank you - https://github.com/vllm-project/recipes - - https://github.com/sgl-project/sgl-cookbook`.replace(/^ /gm, ''); + - https://github.com/sgl-project/sgl-cookbook + + **PR authors are responsible for ensuring that after merging, all GitHub Action jobs fully pass.** A lot of the time, failures are just flakes and simply re-running the failed jobs will fix it. If re-running failed jobs is attempted, PR authors are responsible for ensuring it passes. 
See GitHub's docs on re-running failed jobs: https://docs.github.com/en/actions/how-tos/manage-workflow-runs/re-run-workflows-and-jobs#re-running-failed-jobs-in-a-workflow + + If additional help is needed, PR authors can reach out to core maintainers over Slack.`.replace(/^ /gm, ''); await github.rest.issues.createComment({ owner, repo, issue_number, body }); diff --git a/.github/workflows/run-isb1-kv-stress-sweep.yml b/.github/workflows/run-isb1-kv-stress-sweep.yml deleted file mode 100644 index f72ef3307..000000000 --- a/.github/workflows/run-isb1-kv-stress-sweep.yml +++ /dev/null @@ -1,110 +0,0 @@ -name: Run ISB1 KV Stress Sweep -run-name: ISB1 KV Stress - ${{ github.event.inputs.config-file || '.github/configs/isb1-kv-stress.yaml' }} - -on: - workflow_dispatch: - inputs: - config-file: - description: ISB1 KV stress config file path - required: true - default: .github/configs/isb1-kv-stress.yaml - runner-type: - description: Optional space-separated runner filters (e.g. h200 b200) - required: false - default: '' - runner-config: - description: Runner config YAML - required: false - default: .github/configs/runners.yaml - ref: - description: Git ref to checkout - required: false - default: '' - -jobs: - setup: - runs-on: ubuntu-latest - outputs: - kv-stress-matrix: ${{ steps.generate.outputs.kv-stress-matrix }} - has-matrix: ${{ steps.generate.outputs.has-matrix }} - steps: - - name: Checkout code - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - ref: ${{ inputs.ref || github.ref }} - - - name: Install dependencies - run: pip install pydantic pyyaml - - - id: generate - env: - CONFIG_FILE: ${{ inputs.config-file }} - RUNNER_CONFIG: ${{ inputs.runner-config }} - RUNNER_TYPE: ${{ inputs.runner-type }} - run: | - if [ ! 
-f "$CONFIG_FILE" ]; then - echo "Missing ISB1 KV stress config file: $CONFIG_FILE" >&2 - exit 1 - fi - - cmd=(python3 utils/matrix_logic/generate_sweep_configs.py isb1-kv-stress-sweep --config-files "$CONFIG_FILE" --runner-config "$RUNNER_CONFIG") - - if [ -n "$RUNNER_TYPE" ]; then - read -r -a runner_types <<< "$RUNNER_TYPE" - cmd+=(--runner-type "${runner_types[@]}") - fi - - matrix_json="$("${cmd[@]}")" - compact_matrix="$(printf '%s' "$matrix_json" | python3 -c 'import json,sys; print(json.dumps(json.load(sys.stdin)))')" - has_matrix="$(printf '%s' "$compact_matrix" | python3 -c 'import json,sys; print("true" if json.load(sys.stdin) else "false")')" - - { - echo "kv-stress-matrix=$compact_matrix" - echo "has-matrix=$has_matrix" - } >> "$GITHUB_OUTPUT" - - sweep: - needs: setup - if: ${{ needs.setup.outputs.has-matrix == 'true' }} - uses: ./.github/workflows/benchmark-isb1-tmpl.yml - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.setup.outputs.kv-stress-matrix) }} - secrets: inherit - with: - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - model-prefix: ${{ matrix.config.model-prefix }} - precision: ${{ matrix.config.precision }} - framework: ${{ matrix.config.framework }} - exp-name: ${{ matrix.config.exp-name }} - benchmark-type: ${{ matrix.config.benchmark-type }} - export-file: ${{ matrix.config.export-file }} - runtime-stack-id: ${{ matrix.config.runtime-stack-id }} - hardware-profile-id: ${{ matrix.config.hardware-profile-id }} - canonical-model-id: ${{ matrix.config.canonical-model-id }} - support-status: ${{ matrix.config.support-status || '' }} - request-mode: ${{ matrix.config.request-mode }} - max-concurrency: ${{ matrix.config.max-concurrency }} - max-model-len: ${{ matrix.config.max-model-len || '' }} - tp-override: ${{ matrix.config.tp || '' }} - ep-override: ${{ matrix.config.ep || '' }} - trace-source: ${{ matrix.config.trace-source || '' }} - offload-mode: ${{ matrix.config.offload-mode }} - kv-cache-dtype: ${{ matrix.config.kv-cache-dtype }} - disable-prefix-caching: ${{ matrix.config.disable-prefix-caching }} - benchmark-duration-s: ${{ matrix.config.benchmark-duration-s }} - workload-type: ${{ matrix.config.workload-type }} - ref: ${{ inputs.ref || github.ref }} - - collect-results: - needs: [setup, sweep] - if: ${{ always() && needs.setup.outputs.has-matrix == 'true' && needs.sweep.result != 'skipped' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - result-prefix: isb1 diff --git a/.github/workflows/run-isb1-sweep.yml b/.github/workflows/run-isb1-sweep.yml deleted file mode 100644 index a8f3177de..000000000 --- a/.github/workflows/run-isb1-sweep.yml +++ /dev/null @@ -1,256 +0,0 @@ -name: Run ISB1 Sweep -run-name: ISB1 Sweep - ${{ github.event.inputs.config-files || '.github/configs/isb1-master.yaml' }} - -on: - workflow_dispatch: - inputs: - config-files: - description: Space-separated ISB1 config file paths - required: true - default: .github/configs/isb1-master.yaml - runner-config: - description: Runner config YAML - required: false - default: .github/configs/runners.yaml - model-prefix: - description: Optional space-separated model-prefix filters - required: false - default: '' - precision: - description: Optional space-separated precision filters - required: false - default: '' - framework: - description: Optional space-separated framework filters - required: false - default: '' - runner-type: - description: Optional space-separated runner filters 
- required: false - default: '' - runner-node-filter: - description: Optional runner-node substring filter - required: false - default: '' - max-concurrency: - description: Optional cap applied to replay max-concurrency - required: false - default: '' - vllm-cpu-offload-gb: - description: Optional vLLM CPU offload budget in GB for long-context runs - required: false - default: '' - vllm-swap-space-gb: - description: Optional vLLM swap-space budget in GB for long-context runs - required: false - default: '' - sglang-mem-fraction-override: - description: Optional SGLang mem-fraction-static override for long-context runs - required: false - default: '' - sglang-chunked-prefill-override: - description: Optional SGLang chunked-prefill-size override for long-context runs - required: false - default: '' - ref: - description: Git ref to checkout - required: false - default: '' - -jobs: - setup: - runs-on: ubuntu-latest - outputs: - replay-matrix: ${{ steps.generate.outputs.replay-matrix }} - has-matrix: ${{ steps.generate.outputs.has-matrix }} - steps: - - name: Checkout code - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - ref: ${{ inputs.ref || github.ref }} - - - name: Install dependencies - run: pip install pydantic pyyaml - - - id: generate - env: - CONFIG_FILES: ${{ inputs.config-files }} - RUNNER_CONFIG: ${{ inputs.runner-config }} - MODEL_PREFIX: ${{ inputs.model-prefix }} - PRECISION: ${{ inputs.precision }} - FRAMEWORK: ${{ inputs.framework }} - RUNNER_TYPE: ${{ inputs.runner-type }} - RUNNER_NODE_FILTER: ${{ inputs.runner-node-filter }} - MAX_CONCURRENCY: ${{ inputs.max-concurrency }} - run: | - read -r -a config_files <<< "$CONFIG_FILES" - - for config_file in "${config_files[@]}"; do - if [ ! -f "$config_file" ]; then - echo "Missing ISB1 config file: $config_file" >&2 - echo "PR1b adds the workflow lane only; the committed config arrives in PR2." 
>&2 - exit 1 - fi - done - - cmd=(python3 utils/matrix_logic/generate_sweep_configs.py isb1-sweep --config-files "${config_files[@]}" --runner-config "$RUNNER_CONFIG") - - if [ -n "$MODEL_PREFIX" ]; then - read -r -a model_prefixes <<< "$MODEL_PREFIX" - cmd+=(--model-prefix "${model_prefixes[@]}") - fi - if [ -n "$PRECISION" ]; then - read -r -a precisions <<< "$PRECISION" - cmd+=(--precision "${precisions[@]}") - fi - if [ -n "$FRAMEWORK" ]; then - read -r -a frameworks <<< "$FRAMEWORK" - cmd+=(--framework "${frameworks[@]}") - fi - if [ -n "$RUNNER_TYPE" ]; then - read -r -a runner_types <<< "$RUNNER_TYPE" - cmd+=(--runner-type "${runner_types[@]}") - fi - if [ -n "$RUNNER_NODE_FILTER" ]; then - cmd+=(--runner-node-filter "$RUNNER_NODE_FILTER") - fi - if [ -n "$MAX_CONCURRENCY" ]; then - cmd+=(--max-concurrency "$MAX_CONCURRENCY") - fi - - matrix_json="$("${cmd[@]}")" - compact_matrix="$(printf '%s' "$matrix_json" | python3 -c 'import json,sys; print(json.dumps(json.load(sys.stdin)))')" - has_matrix="$(printf '%s' "$compact_matrix" | python3 -c 'import json,sys; print("true" if json.load(sys.stdin) else "false")')" - - { - echo "replay-matrix=$compact_matrix" - echo "has-matrix=$has_matrix" - } >> "$GITHUB_OUTPUT" - - - name: Write ISB1 preflight run manifest - env: - REPLAY_MATRIX: ${{ steps.generate.outputs.replay-matrix }} - HAS_MATRIX: ${{ steps.generate.outputs.has-matrix }} - INPUT_CONFIG_FILES: ${{ inputs.config-files }} - INPUT_RUNNER_CONFIG: ${{ inputs.runner-config }} - INPUT_MODEL_PREFIX: ${{ inputs.model-prefix }} - INPUT_PRECISION: ${{ inputs.precision }} - INPUT_FRAMEWORK: ${{ inputs.framework }} - INPUT_RUNNER_TYPE: ${{ inputs.runner-type }} - INPUT_RUNNER_NODE_FILTER: ${{ inputs.runner-node-filter }} - INPUT_MAX_CONCURRENCY: ${{ inputs.max-concurrency }} - INPUT_VLLM_CPU_OFFLOAD_GB: ${{ inputs.vllm-cpu-offload-gb }} - INPUT_VLLM_SWAP_SPACE_GB: ${{ inputs.vllm-swap-space-gb }} - INPUT_SGLANG_MEM_FRACTION_OVERRIDE: ${{ inputs.sglang-mem-fraction-override }} - INPUT_SGLANG_CHUNKED_PREFILL_OVERRIDE: ${{ inputs.sglang-chunked-prefill-override }} - INPUT_REF: ${{ inputs.ref || github.ref }} - WORKFLOW_RUN_ID: ${{ github.run_id }} - WORKFLOW_RUN_ATTEMPT: ${{ github.run_attempt }} - WORKFLOW_SHA: ${{ github.sha }} - run: | - python3 - <<'PY' - import json - import os - from collections import Counter - - matrix_rows = json.loads(os.environ.get("REPLAY_MATRIX") or "[]") - - def count_by(field: str) -> dict[str, int]: - values = [row.get(field) for row in matrix_rows] - normalized = ["" if value is None else str(value) for value in values] - return dict(sorted(Counter(normalized).items())) - - manifest = { - "dispatch_inputs": { - "config-files": os.environ.get("INPUT_CONFIG_FILES", ""), - "runner-config": os.environ.get("INPUT_RUNNER_CONFIG", ""), - "model-prefix": os.environ.get("INPUT_MODEL_PREFIX", ""), - "precision": os.environ.get("INPUT_PRECISION", ""), - "framework": os.environ.get("INPUT_FRAMEWORK", ""), - "runner-type": os.environ.get("INPUT_RUNNER_TYPE", ""), - "runner-node-filter": os.environ.get("INPUT_RUNNER_NODE_FILTER", ""), - "max-concurrency": os.environ.get("INPUT_MAX_CONCURRENCY", ""), - "vllm-cpu-offload-gb": os.environ.get("INPUT_VLLM_CPU_OFFLOAD_GB", ""), - "vllm-swap-space-gb": os.environ.get("INPUT_VLLM_SWAP_SPACE_GB", ""), - "sglang-mem-fraction-override": os.environ.get("INPUT_SGLANG_MEM_FRACTION_OVERRIDE", ""), - "sglang-chunked-prefill-override": os.environ.get("INPUT_SGLANG_CHUNKED_PREFILL_OVERRIDE", ""), - "ref": os.environ.get("INPUT_REF", ""), - }, 
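# Illustrative note (annotation, not part of the deleted workflow): given
# hypothetical matrix rows such as
#   [{"framework": "vllm"}, {"framework": "sglang"}, {"framework": "vllm"}],
# count_by("framework") returns {"sglang": 1, "vllm": 2}; rows missing a field
# are normalized to "" so no cell drops out of the summary below.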
- "matrix_summary": { - "has_matrix": os.environ.get("HAS_MATRIX", "false"), - "total_cells": len(matrix_rows), - "by_model_prefix": count_by("model-prefix"), - "by_framework": count_by("framework"), - "by_runner": count_by("runner"), - "by_support_status": count_by("support-status"), - }, - "workflow_context": { - "run_id": os.environ.get("WORKFLOW_RUN_ID", ""), - "run_attempt": os.environ.get("WORKFLOW_RUN_ATTEMPT", ""), - "sha": os.environ.get("WORKFLOW_SHA", ""), - }, - "matrix_rows": matrix_rows, - } - - with open("isb1_run_manifest.json", "w", encoding="utf-8") as fh: - json.dump(manifest, fh, indent=2, sort_keys=True) - PY - - - name: Upload ISB1 run manifest - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 - with: - name: isb1_run_manifest - path: isb1_run_manifest.json - if-no-files-found: error - - sweep: - needs: setup - if: ${{ needs.setup.outputs.has-matrix == 'true' }} - uses: ./.github/workflows/benchmark-isb1-tmpl.yml - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.setup.outputs.replay-matrix) }} - secrets: inherit - with: - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - model-prefix: ${{ matrix.config.model-prefix }} - precision: ${{ matrix.config.precision }} - framework: ${{ matrix.config.framework }} - exp-name: ${{ matrix.config.exp-name }} - benchmark-type: ${{ matrix.config.benchmark-type }} - export-file: ${{ matrix.config.export-file }} - runtime-stack-id: ${{ matrix.config.runtime-stack-id }} - hardware-profile-id: ${{ matrix.config.hardware-profile-id }} - canonical-model-id: ${{ matrix.config.canonical-model-id }} - support-status: ${{ matrix.config.support-status || '' }} - request-mode: ${{ matrix.config.request-mode }} - max-concurrency: ${{ matrix.config.max-concurrency }} - max-sessions: ${{ matrix.config.max-sessions || '' }} - max-turns-per-session: ${{ matrix.config.max-turns-per-session || '' }} - max-output-len: ${{ matrix.config.max-output-len || '' }} - num-warmup-sessions: ${{ matrix.config.num-warmup-sessions || '0' }} - ignore-waits: ${{ matrix.config.ignore-waits || false }} - ignore-eos: ${{ matrix.config.ignore-eos || false }} - max-model-len: ${{ matrix.config.max-model-len || '' }} - offload-mode: ${{ matrix.config.offload-mode || '' }} - kv-cache-dtype: ${{ matrix.config.kv-cache-dtype || '' }} - disable-prefix-caching: ${{ matrix.config.disable-prefix-caching || false }} - benchmark-duration-s: ${{ matrix.config.benchmark-duration-s || '' }} - vllm-cpu-offload-gb: ${{ inputs.vllm-cpu-offload-gb || '' }} - vllm-swap-space-gb: ${{ inputs.vllm-swap-space-gb || '' }} - sglang-mem-fraction-override: ${{ inputs.sglang-mem-fraction-override || '' }} - sglang-chunked-prefill-override: ${{ inputs.sglang-chunked-prefill-override || '' }} - ref: ${{ inputs.ref || github.ref }} - - collect-results: - needs: [setup, sweep] - if: ${{ always() && needs.setup.outputs.has-matrix == 'true' && needs.sweep.result != 'skipped' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - result-prefix: isb1 diff --git a/.gitignore b/.gitignore index 1b87019c5..03d36472a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,2 @@ **/__pycache__/** -**/.coverage -**/.DS_Store -prompt-exports/ -.claude \ No newline at end of file +**/.coverage \ No newline at end of file diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index ea35df323..535313252 100644 --- a/benchmarks/benchmark_lib.sh +++ 
b/benchmarks/benchmark_lib.sh @@ -66,304 +66,6 @@ stop_gpu_monitor() { GPU_MONITOR_PID="" } -KV_METRICS_PID="" -KV_METRICS_CSV="/workspace/kv_metrics.csv" -VLLM_OFFLOAD_EXTRA_ARGS="" -VLLM_EXTRA_ARGS="" -SGLANG_EXTRA_ARGS="" - -build_yarn_override_json() { - local max_model_len="${1:?}" - local factor="2.0" - if (( max_model_len > 600000 )); then - factor="4.0" - fi - echo "{\"text_config\":{\"rope_parameters\":{\"mrope_interleaved\":true,\"mrope_section\":[11,11,10],\"rope_type\":\"yarn\",\"rope_theta\":10000000,\"partial_rotary_factor\":0.25,\"factor\":${factor},\"original_max_position_embeddings\":262144}}}" -} - -apply_yarn_config_if_needed() { - local model="${1:?}" - local max_model_len="${2:?}" - if [[ "$model" == *"Qwen3.5"* || "$model" == *"qwen3.5"* || "$model" == *"Qwen3_5"* ]] && (( max_model_len > 262144 )); then - YARN_OVERRIDE_JSON=$(build_yarn_override_json "$max_model_len") - export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 - export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 - echo "YaRN enabled: factor=$(echo "$YARN_OVERRIDE_JSON" | grep -o '"factor":[0-9.]*' | cut -d: -f2) for max-model-len=$max_model_len" - fi -} - -_append_config_kv_once() { - local key="$1" - local value="$2" - - if [[ ! -f config.yaml ]]; then - return 0 - fi - - if ! grep -Eq "^${key}:" config.yaml; then - echo "${key}: ${value}" >> config.yaml - fi -} - -_remove_config_kv() { - local key="$1" - - if [[ ! -f config.yaml ]]; then - return 0 - fi - - local tmp_file - tmp_file=$(mktemp) - grep -Ev "^${key}:" config.yaml > "$tmp_file" - mv "$tmp_file" config.yaml -} - -_detect_total_cpu_dram_gb() { - if [[ -n "${TOTAL_CPU_DRAM_GB:-}" ]]; then - echo "${TOTAL_CPU_DRAM_GB}" - return 0 - fi - - if [[ -f /proc/meminfo ]]; then - awk '/MemTotal/{printf "%.0f", $2/1048576}' /proc/meminfo - return 0 - fi - - if command -v sysctl >/dev/null 2>&1; then - local mem_bytes - mem_bytes=$(sysctl -n hw.memsize 2>/dev/null || echo "") - if [[ -n "$mem_bytes" ]]; then - awk -v bytes="$mem_bytes" 'BEGIN {printf "%.0f", bytes/1073741824}' - return 0 - fi - fi - - echo "64" -} - -apply_vllm_offload_config() { - local mode="${OFFLOAD_MODE:-legacy}" - local detected_dram_gb="" - - VLLM_OFFLOAD_EXTRA_ARGS="" - VLLM_EXTRA_ARGS="" - - case "$mode" in - on) - PREFIX_CACHING_CONFIG="" - _remove_config_kv "no-enable-prefix-caching" - _remove_config_kv "cpu-offload-gb" - _remove_config_kv "swap-space" - detected_dram_gb="$(_detect_total_cpu_dram_gb)" - VLLM_OFFLOAD_EXTRA_ARGS="--kv_offloading_backend native --kv_offloading_size ${detected_dram_gb} --disable-hybrid-kv-cache-manager" - ;; - off) - PREFIX_CACHING_CONFIG="" - _remove_config_kv "no-enable-prefix-caching" - _remove_config_kv "cpu-offload-gb" - _remove_config_kv "swap-space" - ;; - noprefix) - PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" - _remove_config_kv "cpu-offload-gb" - _remove_config_kv "swap-space" - _append_config_kv_once "no-enable-prefix-caching" "true" - ;; - legacy|"") - if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then - _append_config_kv_once "cpu-offload-gb" "${VLLM_CPU_OFFLOAD_GB}" - fi - if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then - _append_config_kv_once "swap-space" "${VLLM_SWAP_SPACE_GB}" - fi - ;; - *) - echo "WARN: Unknown OFFLOAD_MODE='${mode}', falling back to legacy behavior" >&2 - if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then - _append_config_kv_once "cpu-offload-gb" "${VLLM_CPU_OFFLOAD_GB}" - fi - if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then - _append_config_kv_once "swap-space" "${VLLM_SWAP_SPACE_GB}" - fi - ;; - esac - - if [[ 
"${DISABLE_PREFIX_CACHING:-false}" == "true" ]]; then - PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" - _append_config_kv_once "no-enable-prefix-caching" "true" - fi - - if [[ "${KV_CACHE_DTYPE:-}" == "fp8" ]]; then - _append_config_kv_once "kv-cache-dtype" "fp8" - fi - - if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then - VLLM_EXTRA_ARGS="${VLLM_EXTRA_ARGS:-} --hf-overrides '${YARN_OVERRIDE_JSON}'" - fi -} - -apply_sglang_offload_config() { - local mode="${OFFLOAD_MODE:-legacy}" - - SGLANG_EXTRA_ARGS="" - - case "$mode" in - on) - echo "WARN: OFFLOAD_MODE=on requested for SGLang, but native KV offload is not supported. Leaving cache mode unchanged." >&2 - ;; - off) - RADIX_CACHE_ARGS="" - ;; - noprefix) - RADIX_CACHE_ARGS="--disable-radix-cache" - ;; - legacy|"") - ;; - *) - echo "WARN: Unknown OFFLOAD_MODE='${mode}' for SGLang; leaving radix cache args unchanged." >&2 - ;; - esac - - if [[ "${DISABLE_PREFIX_CACHING:-false}" == "true" ]]; then - RADIX_CACHE_ARGS="--disable-radix-cache" - fi - - if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then - SGLANG_EXTRA_ARGS="${SGLANG_EXTRA_ARGS:-} --json-model-override-args '${YARN_OVERRIDE_JSON}'" - fi -} - -# launch_vllm_server [extra args...] -# Sets: SERVER_PID, SERVER_LOG -launch_vllm_server() { - local model="$1" - local port="$2" - local config_yaml_path="$3" - shift 3 || true - local extra_args=("$@") - - if [[ -z "$model" || -z "$port" || -z "$config_yaml_path" ]]; then - echo "launch_vllm_server requires: model port config_yaml_path" >&2 - return 1 - fi - - hf download "$model" - apply_vllm_offload_config - - SERVER_LOG="${SERVER_LOG:-/workspace/server.log}" - - local vllm_max_num_seqs="${VLLM_MAX_NUM_SEQS:-}" - if [[ -z "$vllm_max_num_seqs" ]]; then - local conc_value="${CONC:-256}" - if [[ "$conc_value" =~ ^[0-9]+$ ]] && (( conc_value > 256 )); then - vllm_max_num_seqs="$conc_value" - else - vllm_max_num_seqs="256" - fi - fi - - local vllm_tp="${TP:-1}" - local vllm_gpu_mem_util="${VLLM_GPU_MEMORY_UTILIZATION:-0.9}" - - local offload_args=() - if [[ -n "$VLLM_OFFLOAD_EXTRA_ARGS" ]]; then - # shellcheck disable=SC2206 - offload_args=($VLLM_OFFLOAD_EXTRA_ARGS) - fi - - PYTHONNOUSERSITE=1 vllm serve "$model" --host 0.0.0.0 --port "$port" \ - --config "$config_yaml_path" \ - --gpu-memory-utilization "$vllm_gpu_mem_util" \ - --tensor-parallel-size "$vllm_tp" \ - --max-num-seqs "$vllm_max_num_seqs" \ - "${extra_args[@]}" \ - "${offload_args[@]}" \ - > "$SERVER_LOG" 2>&1 & - - SERVER_PID=$! - export SERVER_PID - export SERVER_LOG -} - -# launch_sglang_server [extra args...] -# Sets: SERVER_PID, SERVER_LOG -launch_sglang_server() { - local model="$1" - local port="$2" - shift 2 || true - local extra_args=("$@") - - if [[ -z "$model" || -z "$port" ]]; then - echo "launch_sglang_server requires: model port" >&2 - return 1 - fi - - hf download "$model" - if [[ -n "${OFFLOAD_MODE:-}" || "${DISABLE_PREFIX_CACHING:-false}" == "true" ]]; then - apply_sglang_offload_config - fi - - SERVER_LOG="${SERVER_LOG:-/workspace/server.log}" - - local sglang_tp="${TP:-1}" - local sglang_dp="${DP_SIZE:-1}" - - PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ - --model-path "$model" \ - --host 0.0.0.0 \ - --port "$port" \ - --tensor-parallel-size "$sglang_tp" \ - --data-parallel-size "$sglang_dp" \ - "${extra_args[@]}" \ - > "$SERVER_LOG" 2>&1 & - - SERVER_PID=$! 
- export SERVER_PID - export SERVER_LOG -} - -start_kv_metrics_collector() { - local port="${1:-8888}" - local output="${2:-$KV_METRICS_CSV}" - local interval="${3:-2.0}" - local collector_script - - collector_script="$(cd "$(dirname "${BASH_SOURCE[0]}")/../datasets/isb1/scripts" && pwd)/metrics_collector.py" - - if [[ ! -f "$collector_script" ]]; then - echo "[KV Metrics] Collector script not found at $collector_script, skipping" - return 0 - fi - - if [[ -n "$KV_METRICS_PID" ]] && kill -0 "$KV_METRICS_PID" 2>/dev/null; then - echo "[KV Metrics] Collector already running (PID=$KV_METRICS_PID)" - return 0 - fi - - KV_METRICS_CSV="$output" - python3 "$collector_script" \ - --metrics-url "http://0.0.0.0:${port}/metrics" \ - --output "$output" \ - --interval "$interval" >/tmp/kv_metrics_collector.log 2>&1 & - KV_METRICS_PID=$! - - echo "[KV Metrics] Started (PID=$KV_METRICS_PID, interval=${interval}s, output=$output)" -} - -stop_kv_metrics_collector() { - if [[ -n "$KV_METRICS_PID" ]] && kill -0 "$KV_METRICS_PID" 2>/dev/null; then - kill "$KV_METRICS_PID" 2>/dev/null || true - wait "$KV_METRICS_PID" 2>/dev/null || true - echo "[KV Metrics] Stopped (PID=$KV_METRICS_PID)" - if [[ -f "$KV_METRICS_CSV" ]]; then - local lines - lines=$(wc -l < "$KV_METRICS_CSV") - echo "[KV Metrics] Collected $lines rows -> $KV_METRICS_CSV" - fi - fi - KV_METRICS_PID="" -} - # Check if required environment variables are set # Usage: check_env_vars VAR1 VAR2 VAR3 ... # Exits with code 1 if any variable is not set @@ -693,194 +395,6 @@ run_benchmark_serving() { return $benchmark_exit_code } -is_isb1_replay_benchmark() { - [[ "${BENCHMARK_TYPE:-}" == "isb1_replay" ]] -} - -is_isb1_kv_stress_benchmark() { - [[ "${BENCHMARK_TYPE:-}" == "isb1_kv_stress" ]] -} - -resolve_replay_request_mode_for_harness() { - local requested_mode="${1:-auto}" - - case "$requested_mode" in - ""|auto|chat|completions) - printf '%s' "${requested_mode:-auto}" - ;; - multi-turn|multi_turn|multiturn) - printf 'auto' - ;; - *) - echo "WARN: Unsupported replay request mode '$requested_mode'; using 'auto' for the harness boundary" >&2 - printf 'auto' - ;; - esac -} - -run_isb1_kv_stress_campaign_cell() { - check_env_vars \ - BENCHMARK_TYPE \ - EXPORT_FILE \ - MAX_CONCURRENCY \ - OFFLOAD_MODE \ - BENCHMARK_DURATION_S \ - KV_CACHE_DTYPE \ - WORKLOAD_TYPE - - if ! is_isb1_kv_stress_benchmark; then - echo "Error: run_isb1_kv_stress_campaign_cell called with BENCHMARK_TYPE='${BENCHMARK_TYPE:-}'" >&2 - return 1 - fi - - local port="${PORT:-8888}" - local kv_metrics_output="/workspace/kv_metrics.csv" - local metadata_path="/workspace/kv_stress_campaign_metadata.json" - local replay_exit_code=0 - - start_gpu_monitor - start_kv_metrics_collector "$port" "$kv_metrics_output" 2.0 - - run_benchmark_export_replay "$@" || replay_exit_code=$? 
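# Note on the ordering above (annotation): the replay exit code is captured
# with `|| replay_exit_code=$?` rather than failing fast, so the GPU and KV
# collectors are always stopped and the campaign metadata below is always
# written; the failure only propagates at the end of the cell. That keeps
# partial gpu_metrics.csv/kv_metrics.csv usable even when a stress cell
# crashes the server mid-replay.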
- - stop_kv_metrics_collector - stop_gpu_monitor - - python3 - <<'PY' -import json -import os -import time - -metadata = { - "benchmark_type": os.getenv("BENCHMARK_TYPE", ""), - "export_file": os.getenv("EXPORT_FILE", ""), - "runtime_stack_id": os.getenv("RUNTIME_STACK_ID", ""), - "hardware_profile_id": os.getenv("HARDWARE_PROFILE_ID", ""), - "canonical_model_id": os.getenv("CANONICAL_MODEL_ID", ""), - "request_mode": os.getenv("REQUEST_MODE", ""), - "max_concurrency": os.getenv("MAX_CONCURRENCY", ""), - "offload_mode": os.getenv("OFFLOAD_MODE", ""), - "disable_prefix_caching": os.getenv("DISABLE_PREFIX_CACHING", ""), - "kv_cache_dtype": os.getenv("KV_CACHE_DTYPE", ""), - "benchmark_duration_s": os.getenv("BENCHMARK_DURATION_S", ""), - "workload_type": os.getenv("WORKLOAD_TYPE", ""), - "metrics_files": { - "gpu": "/workspace/gpu_metrics.csv", - "kv": "/workspace/kv_metrics.csv", - }, - "captured_at_epoch_s": int(time.time()), -} -with open("/workspace/kv_stress_campaign_metadata.json", "w", encoding="utf-8") as f: - json.dump(metadata, f, indent=2, sort_keys=True) -PY - - echo "[KV Stress] Campaign metadata written to $metadata_path" - return "$replay_exit_code" -} - -run_single_node_benchmark() { - if ! is_isb1_replay_benchmark && ! is_isb1_kv_stress_benchmark; then - run_benchmark_serving "$@" - return $? - fi - - set +x - local model="" - local port="" - local result_filename="" - local result_dir="" - local workspace_dir="" - local trust_remote_code=false - local server_pid="" - - while [[ $# -gt 0 ]]; do - case $1 in - --model) model="$2"; shift 2 ;; - --port) port="$2"; shift 2 ;; - --result-filename) result_filename="$2"; shift 2 ;; - --result-dir) result_dir="$2"; shift 2 ;; - --bench-serving-dir) workspace_dir="$2"; shift 2 ;; - --trust-remote-code) trust_remote_code=true; shift ;; - --server-pid) server_pid="$2"; shift 2 ;; - --backend|--input-len|--output-len|--random-range-ratio|--num-prompts|--max-concurrency) - shift 2 - ;; - --use-chat-template) - shift - ;; - *) - echo "Unknown parameter: $1" - return 1 - ;; - esac - done - - if [[ -z "$model" ]]; then - echo "Error: --model is required" - return 1 - fi - if [[ -z "$port" ]]; then - echo "Error: --port is required" - return 1 - fi - if [[ -z "$result_filename" ]]; then - echo "Error: --result-filename is required" - return 1 - fi - if [[ -z "$result_dir" ]]; then - echo "Error: --result-dir is required" - return 1 - fi - - local replay_args=( - --model "$model" - --port "$port" - --export-file "${EXPORT_FILE}" - --runtime-stack-id "${RUNTIME_STACK_ID}" - --hardware-profile-id "${HARDWARE_PROFILE_ID}" - --canonical-model-id "${CANONICAL_MODEL_ID}" - --request-mode "${REQUEST_MODE:-auto}" - --max-concurrency "${MAX_CONCURRENCY}" - --num-warmup-sessions "${NUM_WARMUP_SESSIONS:-0}" - --result-filename "$result_filename" - --result-dir "$result_dir" - ) - - if [[ -n "$workspace_dir" ]]; then - replay_args+=(--bench-serving-dir "$workspace_dir") - fi - if [[ -n "${MAX_SESSIONS:-}" ]]; then - replay_args+=(--max-sessions "${MAX_SESSIONS}") - fi - if [[ -n "${SUPPORT_STATUS:-}" ]]; then - replay_args+=(--support-status "${SUPPORT_STATUS}") - fi - if [[ -n "${MAX_TURNS_PER_SESSION:-}" ]]; then - replay_args+=(--max-turns-per-session "${MAX_TURNS_PER_SESSION}") - fi - if [[ -n "${MAX_OUTPUT_LEN:-}" ]]; then - replay_args+=(--max-output-len "${MAX_OUTPUT_LEN}") - fi - if [[ "${IGNORE_WAITS:-false}" == "true" ]]; then - replay_args+=(--ignore-waits) - fi - if [[ "${IGNORE_EOS:-false}" == "true" ]]; then - replay_args+=(--ignore-eos) - 
fi - if [[ "$trust_remote_code" == true ]]; then - replay_args+=(--trust-remote-code) - fi - if [[ -n "$server_pid" ]]; then - replay_args+=(--server-pid "$server_pid") - fi - - if is_isb1_kv_stress_benchmark; then - run_isb1_kv_stress_campaign_cell "${replay_args[@]}" - else - run_benchmark_export_replay "${replay_args[@]}" - fi -} - # -------------------------------- # Profiling trace helpers @@ -1291,215 +805,3 @@ run_eval() { fi return $eval_rc } - - -# --------------------------------------------------------------------------- -# Multi-turn benchmark wrapper -# --------------------------------------------------------------------------- - -# Run multi-turn chat benchmark with standardized parameters. -# Exercises growing KV cache across conversation turns via /v1/chat/completions. -# -# IMPORTANT: The server MUST be started with prefix/radix caching ENABLED -# for meaningful multi-turn results. Do NOT use --disable-radix-cache or -# --no-enable-prefix-caching with multi-turn benchmarks. -# Replay ISB1 export sessions/events against a running server. -# -# Supports: -# - inferencex_multiturn exports via /v1/chat/completions (standalone vLLM/SGLang) -# - inferencex_trace_replay exports via either chat or projected completions -# mode (useful for TRT / Dynamo-style cells) -# -# Parameters: -# --model: Model name sent to the target server -# --port: Server port -# --export-file: Path to export JSON -# --runtime-stack-id: Filter selected export cells to one runtime stack -# --hardware-profile-id: Filter selected export cells to one hardware row -# --canonical-model-id: Filter selected export cells to one canonical model row -# --request-mode: auto|chat|completions (default: auto) -# --max-concurrency: Max concurrent replay sessions -# --num-warmup-sessions: Warmup sessions before measurement -# --result-filename: Result filename without extension -# --result-dir: Result directory -# --max-sessions: Optional session limit for smoke runs -# --max-turns-per-session: Optional turn cap for smoke runs -# --max-output-len: Optional per-turn output cap -# --ignore-waits: Ignore inter-turn wait gaps from export metadata -# --trust-remote-code: Optional flag -# --server-pid: Optional server process ID to monitor -run_benchmark_export_replay() { - set +x - local model="" - local port="" - local export_file="" - local runtime_stack_id="" - local hardware_profile_id="" - local canonical_model_id="" - local trace_id="" - local support_status="" - local request_mode="auto" - local max_concurrency="8" - local num_warmup_sessions="1" - local result_filename="" - local result_dir="" - local workspace_dir="" - local max_sessions="" - local max_turns_per_session="" - local max_output_len="" - local ignore_waits=false - local trust_remote_code=false - local ignore_eos=false - local server_pid="" - - while [[ $# -gt 0 ]]; do - case $1 in - --model) model="$2"; shift 2 ;; - --port) port="$2"; shift 2 ;; - --export-file) export_file="$2"; shift 2 ;; - --runtime-stack-id) runtime_stack_id="$2"; shift 2 ;; - --hardware-profile-id) hardware_profile_id="$2"; shift 2 ;; - --canonical-model-id) canonical_model_id="$2"; shift 2 ;; - --trace-id) trace_id="$2"; shift 2 ;; - --support-status) support_status="$2"; shift 2 ;; - --request-mode) request_mode="$2"; shift 2 ;; - --max-concurrency) max_concurrency="$2"; shift 2 ;; - --num-warmup-sessions) num_warmup_sessions="$2"; shift 2 ;; - --result-filename) result_filename="$2"; shift 2 ;; - --result-dir) result_dir="$2"; shift 2 ;; - --bench-serving-dir) workspace_dir="$2"; 
shift 2 ;; - --max-sessions) max_sessions="$2"; shift 2 ;; - --max-turns-per-session) max_turns_per_session="$2"; shift 2 ;; - --max-output-len) max_output_len="$2"; shift 2 ;; - --ignore-waits) ignore_waits=true; shift ;; - --trust-remote-code) trust_remote_code=true; shift ;; - --ignore-eos) ignore_eos=true; shift ;; - --server-pid) server_pid="$2"; shift 2 ;; - *) echo "Unknown parameter: $1"; return 1 ;; - esac - done - - if [[ -z "$model" ]]; then echo "Error: --model is required"; return 1; fi - if [[ -z "$port" ]]; then echo "Error: --port is required"; return 1; fi - if [[ -z "$export_file" ]]; then echo "Error: --export-file is required"; return 1; fi - if [[ -z "$result_filename" ]]; then echo "Error: --result-filename is required"; return 1; fi - if [[ -z "$result_dir" ]]; then echo "Error: --result-dir is required"; return 1; fi - - if [[ -z "$workspace_dir" ]]; then - workspace_dir=$(pwd) - fi - - local requested_request_mode="$request_mode" - local harness_request_mode - harness_request_mode=$(resolve_replay_request_mode_for_harness "$request_mode") - - local benchmark_cmd=( - python3 "$workspace_dir/utils/bench_serving/benchmark_export_replay.py" - --model "$model" - --base-url "http://0.0.0.0:$port" - --export-file "$export_file" - --request-mode "$harness_request_mode" - --max-concurrency "$max_concurrency" - --num-warmup-sessions "$num_warmup_sessions" - --save-result - --result-dir "$result_dir" - --result-filename "$result_filename.json" - --metadata - "benchmark_type=${BENCHMARK_TYPE:-isb1_replay}" - "export_file=$export_file" - "runtime_stack_id=$runtime_stack_id" - "hardware_profile_id=$hardware_profile_id" - "canonical_model_id=$canonical_model_id" - "request_mode=$requested_request_mode" - "harness_request_mode=$harness_request_mode" - ) - - if [[ -n "${WORKLOAD_TYPE:-}" ]]; then - benchmark_cmd+=(--metadata "workload_type=${WORKLOAD_TYPE}") - fi - if [[ -n "${BENCHMARK_DURATION_S:-}" ]]; then - benchmark_cmd+=(--metadata "benchmark_duration_s=${BENCHMARK_DURATION_S}") - fi - if [[ -n "${OFFLOAD_MODE:-}" ]]; then - benchmark_cmd+=(--metadata "offload_mode=${OFFLOAD_MODE}") - fi - if [[ -n "${KV_CACHE_DTYPE:-}" ]]; then - benchmark_cmd+=(--metadata "kv_cache_dtype=${KV_CACHE_DTYPE}") - fi - if [[ -n "${DISABLE_PREFIX_CACHING:-}" ]]; then - benchmark_cmd+=(--metadata "disable_prefix_caching=${DISABLE_PREFIX_CACHING}") - fi - - if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then - benchmark_cmd+=(--metadata "vllm_cpu_offload_gb=${VLLM_CPU_OFFLOAD_GB}") - fi - if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then - benchmark_cmd+=(--metadata "vllm_swap_space_gb=${VLLM_SWAP_SPACE_GB}") - fi - if [[ -n "${SGLANG_MEM_FRACTION_OVERRIDE:-}" ]]; then - benchmark_cmd+=(--metadata "sglang_mem_fraction_override=${SGLANG_MEM_FRACTION_OVERRIDE}") - fi - if [[ -n "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-}" ]]; then - benchmark_cmd+=(--metadata "sglang_chunked_prefill_override=${SGLANG_CHUNKED_PREFILL_OVERRIDE}") - fi - - if [[ -n "$runtime_stack_id" ]]; then - benchmark_cmd+=(--runtime-stack-id "$runtime_stack_id") - fi - if [[ -n "$hardware_profile_id" ]]; then - benchmark_cmd+=(--hardware-profile-id "$hardware_profile_id") - fi - if [[ -n "$canonical_model_id" ]]; then - benchmark_cmd+=(--canonical-model-id "$canonical_model_id") - fi - if [[ -n "$trace_id" ]]; then - benchmark_cmd+=(--trace-id "$trace_id") - fi - if [[ -n "$support_status" ]]; then - benchmark_cmd+=(--support-status "$support_status") - fi - if [[ -n "$max_sessions" ]]; then - benchmark_cmd+=(--max-sessions "$max_sessions") - fi - 
if [[ -n "$max_turns_per_session" ]]; then - benchmark_cmd+=(--max-turns-per-session "$max_turns_per_session") - fi - if [[ -n "$max_output_len" ]]; then - benchmark_cmd+=(--max-output-len "$max_output_len") - fi - if [[ "$ignore_waits" == true ]]; then - benchmark_cmd+=(--ignore-waits) - fi - if [[ "$trust_remote_code" == true ]]; then - benchmark_cmd+=(--trust-remote-code) - fi - if [[ "$ignore_eos" == true ]]; then - benchmark_cmd+=(--ignore-eos) - fi - - set -x - if [[ -n "$server_pid" ]]; then - "${benchmark_cmd[@]}" & - local benchmark_pid=$! - - while kill -0 "$benchmark_pid" 2>/dev/null; do - if ! kill -0 "$server_pid" 2>/dev/null; then - echo "ERROR: Server process $server_pid died during export replay benchmark" - kill "$benchmark_pid" 2>/dev/null - wait "$benchmark_pid" 2>/dev/null - set +x - return 1 - fi - sleep 2 - done - - wait "$benchmark_pid" - local benchmark_exit_code=$? - else - "${benchmark_cmd[@]}" - local benchmark_exit_code=$? - fi - set +x - - return $benchmark_exit_code -} diff --git a/benchmarks/single_node/dsr1_fp4_b200.sh b/benchmarks/single_node/dsr1_fp4_b200.sh index e11290b95..d88941628 100644 --- a/benchmarks/single_node/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/dsr1_fp4_b200.sh @@ -31,26 +31,13 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" -RUNTIME_CONTEXT_ARGS="" -if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then - RUNTIME_CONTEXT_ARGS="--context-length $MAX_MODEL_LEN" -fi +EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context - RUNTIME_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -fi -RADIX_CACHE_ARGS="--disable-radix-cache" -if is_isb1_replay_benchmark; then - RADIX_CACHE_ARGS="" -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_sglang_offload_config + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi set -x PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ @@ -58,7 +45,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0. --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 16384 \ --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ ---enable-symm-mem $RADIX_CACHE_ARGS --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -67,7 +54,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_single_node_benchmark \ +run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -77,8 +64,7 @@ run_single_node_benchmark \ --num-prompts $((CONC * 10)) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" + --result-dir /workspace/ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -87,8 +73,5 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/dsr1_fp8_b200.sh b/benchmarks/single_node/dsr1_fp8_b200.sh index 0fbe9bd6c..e6d8a0e9c 100644 --- a/benchmarks/single_node/dsr1_fp8_b200.sh +++ b/benchmarks/single_node/dsr1_fp8_b200.sh @@ -38,9 +38,9 @@ if [[ $TP -eq 8 ]]; then MAX_RUNNING_REQUESTS=128 CUDA_GRAPH_MAX_BATCH_SIZE=128 - MEM_FRAC_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.82}" - CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" - MAX_PREFILL_TOKENS="$CHUNKED_PREFILL_SIZE" + MEM_FRAC_STATIC=0.82 + CHUNKED_PREFILL_SIZE=32768 + MAX_PREFILL_TOKENS=32768 elif [[ $TP -eq 4 ]]; then if [[ $ISL -ne 8192 ]] || [[ $OSL -ne 1024 ]]; then echo "TP=4 not yet supported for ISL=$ISL OSL=$OSL!" @@ -52,9 +52,9 @@ elif [[ $TP -eq 4 ]]; then MAX_RUNNING_REQUESTS=32 CUDA_GRAPH_MAX_BATCH_SIZE=32 - MEM_FRAC_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.95}" - CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-8192}" - MAX_PREFILL_TOKENS="$CHUNKED_PREFILL_SIZE" + MEM_FRAC_STATIC=0.95 + CHUNKED_PREFILL_SIZE=8192 + MAX_PREFILL_TOKENS=8192 SCHEDULER_RECV_INTERVAL=10 else @@ -63,34 +63,21 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" -RUNTIME_CONTEXT_ARGS="" -if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then - RUNTIME_CONTEXT_ARGS="--context-length $MAX_MODEL_LEN" -fi +EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context - RUNTIME_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -fi -RADIX_CACHE_ARGS="--disable-radix-cache" -if is_isb1_replay_benchmark; then - RADIX_CACHE_ARGS="" -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_sglang_offload_config + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi set -x PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ --tensor-parallel-size=$TP --data-parallel-size=1 \ --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC --kv-cache-dtype fp8_e4m3 --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ ---enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL $RADIX_CACHE_ARGS \ ---attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +--enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ +--attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE 
--moe-runner-backend flashinfer_trtllm --quantization fp8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -99,7 +86,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_single_node_benchmark \ +run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -109,8 +96,7 @@ run_single_node_benchmark \ --num-prompts "$((CONC * 10))" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" + --result-dir /workspace/ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -119,8 +105,5 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi stop_gpu_monitor -set +x +set +x \ No newline at end of file diff --git a/benchmarks/single_node/dsr1_fp8_b200_vllm.sh b/benchmarks/single_node/dsr1_fp8_b200_vllm.sh deleted file mode 100644 index 5c3639fa9..000000000 --- a/benchmarks/single_node/dsr1_fp8_b200_vllm.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" -fi - -PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" -if is_isb1_replay_benchmark; then - PREFIX_CACHING_CONFIG="" -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_vllm_offload_config -fi - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' -$PREFIX_CACHING_CONFIG -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then - echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml -fi -if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then - echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_vllm_offload_config -fi - -export TORCH_CUDA_ARCH_LIST="10.0" -export PYTHONNOUSERSITE=1 -export VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl - -# Start GPU monitoring (power, temperature, clocks every second) -start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi - -set -x -vllm serve $MODEL --host 0.0.0.0 --port $PORT \ ---config config.yaml \ ---gpu-memory-utilization 0.9 \ ---tensor-parallel-size $TP \ ---max-num-seqs 256 \ ---disable-log-requests \ ---trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ -> $SERVER_LOG 2>&1 & - -SERVER_PID=$! 
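# Worked example for CALCULATED_MAX_MODEL_LEN above (illustrative values):
# with ISL=8192, OSL=1024 and no MAX_MODEL_LEN override, the server gets
#   max-model-len = 8192 + 1024 + 200 = 9416
# i.e. the full prompt plus the generation budget plus a small headroom for
# chat-template and special tokens.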
- -# Wait for server to be ready -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" \ - --trust-remote-code - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/dsr1_fp8_h200.sh b/benchmarks/single_node/dsr1_fp8_h200.sh index a9730917a..c820d180b 100644 --- a/benchmarks/single_node/dsr1_fp8_h200.sh +++ b/benchmarks/single_node/dsr1_fp8_h200.sh @@ -23,50 +23,34 @@ PORT=${PORT:-8888} # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi export TORCH_CUDA_ARCH_LIST="9.0" -RUNTIME_CONTEXT_ARGS="" -if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then - RUNTIME_CONTEXT_ARGS="--context-length $MAX_MODEL_LEN" -fi +EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context - RUNTIME_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -fi -RADIX_CACHE_ARGS="--disable-radix-cache" -if is_isb1_replay_benchmark; then - RADIX_CACHE_ARGS="" + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_sglang_offload_config -fi - -MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.82}" -CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" set -x if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ - $RADIX_CACHE_ARGS --max-running-requests 512 --cuda-graph-max-bs 512 \ - --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens "$CHUNKED_PREFILL_SIZE" --mem-fraction-static "$MEM_FRACTION_STATIC" \ + --disable-radix-cache --max-running-requests 512 --cuda-graph-max-bs 512 \ + --chunked-prefill-size 32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & else PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ - $RADIX_CACHE_ARGS --max-running-requests 256 --cuda-graph-max-bs 256 \ - --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens "$CHUNKED_PREFILL_SIZE" --mem-fraction-static "$MEM_FRACTION_STATIC" \ + --disable-radix-cache --max-running-requests 256 --cuda-graph-max-bs 256 \ + --chunked-prefill-size 32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & fi SERVER_PID=$! @@ -74,7 +58,7 @@ SERVER_PID=$! 
# Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -run_single_node_benchmark \ +run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -84,8 +68,7 @@ run_single_node_benchmark \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" + --result-dir /workspace/ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -94,8 +77,5 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/dsr1_fp8_h200_vllm.sh b/benchmarks/single_node/dsr1_fp8_h200_vllm.sh deleted file mode 100644 index 65348e831..000000000 --- a/benchmarks/single_node/dsr1_fp8_h200_vllm.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" -fi - -PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" -if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then - PREFIX_CACHING_CONFIG="" -fi - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' -$PREFIX_CACHING_CONFIG -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -apply_vllm_offload_config - -export TORCH_CUDA_ARCH_LIST="9.0" -export PYTHONNOUSERSITE=1 - -if ! is_isb1_kv_stress_benchmark; then - start_gpu_monitor -fi - -set -x -vllm serve $MODEL --host 0.0.0.0 --port $PORT \ ---config config.yaml \ ---gpu-memory-utilization 0.9 \ ---tensor-parallel-size $TP \ ---max-num-seqs 256 \ ---disable-log-requests \ ---trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ -> $SERVER_LOG 2>&1 & - -SERVER_PID=$! - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" \ - --trust-remote-code - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if ! is_isb1_kv_stress_benchmark; then - stop_gpu_monitor -fi -set +x diff --git a/benchmarks/single_node/dsr1triattn_fp8_h100_vllm.sh b/benchmarks/single_node/dsr1triattn_fp8_h100_vllm.sh deleted file mode 100755 index 60f06b13e..000000000 --- a/benchmarks/single_node/dsr1triattn_fp8_h100_vllm.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env bash -# TriAttention-enabled vLLM benchmark for DeepSeek-R1 FP8 on H100. 
-# -# Differences from baseline dsr1_fp8_h200_vllm.sh: -# - Installs triattention vLLM plugin -# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) -# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available -# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) -# - Explicitly disables prefix caching (incompatible with KV compression) - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -# --- TriAttention plugin setup --- -pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." - -# Auto-detect KV budget from export filename: chat workloads get larger budget. -TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" -if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then - TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" -fi -export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" - -# Use pre-calibrated sparse stats if available on the runner. -TRIATTN_STATS="/workspace/triattn_stats/deepseek_r1_0528_stats.pt" -if [[ -f "$TRIATTN_STATS" ]]; then - export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" - echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" -else - echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." -fi - -export ENABLE_TRIATTENTION=1 -echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" -# --- End TriAttention setup --- - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" -fi - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -enable-prefix-caching: false -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 1024 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -export TORCH_CUDA_ARCH_LIST="9.0" -export PYTHONNOUSERSITE=1 - -start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi - -set -x -vllm serve $MODEL --host 0.0.0.0 --port $PORT \ ---config config.yaml \ ---gpu-memory-utilization 0.9 \ ---tensor-parallel-size $TP \ ---max-num-seqs 256 \ ---disable-log-requests \ ---trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ -> $SERVER_LOG 2>&1 & - -SERVER_PID=$! 
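# The TriAttention budget selection above reduces to "an explicit env override
# wins, otherwise default by the workload family in the export filename". A
# condensed, self-contained sketch of that precedence (the function name and
# sample filename are illustrative, not part of this patch):
select_triattn_kv_budget() {
  local export_file="${1:-}"
  if [[ -n "${TRIATTN_RUNTIME_KV_BUDGET:-}" ]]; then
    echo "$TRIATTN_RUNTIME_KV_BUDGET"   # explicit override always wins
  elif [[ "$export_file" == *chat_* ]]; then
    echo 12000                          # chat workloads get the larger budget
  else
    echo 2048                           # code/default workloads run tighter
  fi
}
# e.g. select_triattn_kv_budget chat_example_64k.json   -> 12000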
- -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" \ - --trust-remote-code - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/dsr1triattn_fp8_h200_vllm.sh b/benchmarks/single_node/dsr1triattn_fp8_h200_vllm.sh deleted file mode 100755 index 1c4722964..000000000 --- a/benchmarks/single_node/dsr1triattn_fp8_h200_vllm.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env bash -# TriAttention-enabled vLLM benchmark for DeepSeek-R1 FP8 on H200. -# -# Differences from baseline dsr1_fp8_h200_vllm.sh: -# - Installs triattention vLLM plugin -# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) -# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available -# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) -# - Explicitly disables prefix caching (incompatible with KV compression) - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -# --- TriAttention plugin setup --- -pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." - -# Auto-detect KV budget from export filename: chat workloads get larger budget. -TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" -if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then - TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" -fi -export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" - -# Use pre-calibrated sparse stats if available on the runner. -TRIATTN_STATS="/workspace/triattn_stats/deepseek_r1_0528_stats.pt" -if [[ -f "$TRIATTN_STATS" ]]; then - export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" - echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" -else - echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." 
-fi - -export ENABLE_TRIATTENTION=1 -echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" -# --- End TriAttention setup --- - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" -fi - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -enable-prefix-caching: false -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 1024 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -export TORCH_CUDA_ARCH_LIST="9.0" -export PYTHONNOUSERSITE=1 - -start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi - -set -x -vllm serve $MODEL --host 0.0.0.0 --port $PORT \ ---config config.yaml \ ---gpu-memory-utilization 0.9 \ ---tensor-parallel-size $TP \ ---max-num-seqs 256 \ ---disable-log-requests \ ---trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ -> $SERVER_LOG 2>&1 & - -SERVER_PID=$! - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" \ - --trust-remote-code - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/gptoss_fp4_b200.sh b/benchmarks/single_node/gptoss_fp4_b200.sh index 95240230e..f6a6f72e9 100644 --- a/benchmarks/single_node/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/gptoss_fp4_b200.sh @@ -34,33 +34,15 @@ if [ "${EVAL_ONLY}" = "true" ]; then CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi -PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" -if is_isb1_replay_benchmark; then - PREFIX_CACHING_CONFIG="" -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_vllm_offload_config -fi - cat > config.yaml << EOF kv-cache-dtype: fp8 compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' -$PREFIX_CACHING_CONFIG +no-enable-prefix-caching: true max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 max-model-len: $CALCULATED_MAX_MODEL_LEN EOF -if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then - echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml -fi -if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then - echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_vllm_offload_config -fi - export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 @@ -70,9 +52,6 @@ PORT=${PORT:-8888} # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT \ @@ -80,8 +59,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --gpu-memory-utilization 0.9 \ --tensor-parallel-size $TP \ --max-num-seqs 512 \ ---disable-log-requests $VLLM_OFFLOAD_EXTRA_ARGS \ -> $SERVER_LOG 2>&1 & +--disable-log-requests > 
$SERVER_LOG 2>&1 & SERVER_PID=$! @@ -90,7 +68,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_single_node_benchmark \ +run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -100,8 +78,7 @@ run_single_node_benchmark \ --num-prompts $(( CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" + --result-dir /workspace/ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -110,8 +87,5 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/gptoss_fp4_b200_sglang.sh b/benchmarks/single_node/gptoss_fp4_b200_sglang.sh deleted file mode 100644 index f3d9ad82c..000000000 --- a/benchmarks/single_node/gptoss_fp4_b200_sglang.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -export NCCL_NVLS_ENABLE=1 -export SGL_ENABLE_JIT_DEEPGEMM=false -export SGLANG_ENABLE_FLASHINFER_GEMM=true -export PYTHONUNBUFFERED=1 - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -if [[ $CONC -ge 16 ]]; then - SCHEDULER_RECV_INTERVAL=30 -else - SCHEDULER_RECV_INTERVAL=10 -fi - -CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" -fi - -RADIX_CACHE_ARGS="--disable-radix-cache" -if is_isb1_replay_benchmark; then - RADIX_CACHE_ARGS="" -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_sglang_offload_config -fi - -MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" -CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" - -start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi - -set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path="$MODEL" --host=0.0.0.0 --port="$PORT" \ ---trust-remote-code \ ---tensor-parallel-size="$TP" --data-parallel-size=1 \ ---cuda-graph-max-bs 128 --max-running-requests 128 \ ---mem-fraction-static "$MEM_FRACTION_STATIC" --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens 32768 \ ---context-length "$CONTEXT_LENGTH" --kv-cache-dtype fp8_e4m3 \ -$RADIX_CACHE_ARGS --enable-flashinfer-allreduce-fusion \ ---scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" \ ---reasoning-parser gpt-oss --tokenizer-worker-num 6 --stream-interval 30 > "$SERVER_LOG" 2>&1 & - -SERVER_PID=$! 
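# Every script here gates the benchmark on wait_for_server_ready from
# benchmarks/benchmark_lib.sh, whose implementation is outside this patch. A
# minimal stand-in for the contract it is used under: poll until the server
# answers, bail out early if the process dies or a deadline passes (the
# /health endpoint and 1800s timeout below are assumptions, not library code):
wait_for_server_ready_sketch() {
  local port="$1" server_log="$2" server_pid="$3"
  local deadline=$((SECONDS + 1800))
  until curl -sf "http://localhost:${port}/health" > /dev/null 2>&1; do
    if ! kill -0 "$server_pid" 2>/dev/null; then
      echo "server exited before becoming ready; last log lines:" >&2
      tail -n 50 "$server_log" >&2
      return 1
    fi
    if (( SECONDS > deadline )); then
      echo "timed out waiting for the server on port ${port}" >&2
      return 1
    fi
    sleep 5
  done
}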
- -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh index dc5baf287..8d0e773a2 100644 --- a/benchmarks/single_node/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/gptoss_fp4_h100.sh @@ -17,42 +17,20 @@ fi hf download "$MODEL" -if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then - MAX_MODEL_LEN="${MAX_MODEL_LEN}" -else - MAX_MODEL_LEN=10240 -fi +MAX_MODEL_LEN=10240 if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi -PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" -if is_isb1_replay_benchmark; then - PREFIX_CACHING_CONFIG="" -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_vllm_offload_config -fi - cat > config.yaml << EOF -$PREFIX_CACHING_CONFIG +no-enable-prefix-caching: true max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 max-model-len: $MAX_MODEL_LEN EOF -if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then - echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml -fi -if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then - echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_vllm_offload_config -fi - export PYTHONNOUSERSITE=1 export VLLM_MXFP4_USE_MARLIN=1 SERVER_LOG=/workspace/server.log @@ -60,17 +38,13 @@ PORT=${PORT:-8888} # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --config config.yaml \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ ---max-num-seqs=$CONC $VLLM_OFFLOAD_EXTRA_ARGS \ -> $SERVER_LOG 2>&1 & +--max-num-seqs=$CONC > $SERVER_LOG 2>&1 & SERVER_PID=$! 
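# Sizing rule shared by every benchmark invocation in this patch: --num-prompts
# is pinned at ten requests per concurrency slot. For example, at CONC=64:
echo $((64 * 10))   # -> 640 prompts issued at --max-concurrency 64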
@@ -79,7 +53,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_single_node_benchmark \ +run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -89,8 +63,7 @@ run_single_node_benchmark \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" + --result-dir /workspace/ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -99,8 +72,5 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/gptoss_fp4_h100_sglang.sh b/benchmarks/single_node/gptoss_fp4_h100_sglang.sh deleted file mode 100644 index a045cd99c..000000000 --- a/benchmarks/single_node/gptoss_fp4_h100_sglang.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -export TORCH_CUDA_ARCH_LIST="9.0" - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" -fi - -RADIX_CACHE_ARGS="--disable-radix-cache" -if is_isb1_replay_benchmark; then - RADIX_CACHE_ARGS="" -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_sglang_offload_config -fi - -MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" -CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" - -start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi - -set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ ---host 0.0.0.0 --port "$PORT" --trust-remote-code \ ---tensor-parallel-size="$TP" --data-parallel-size=1 \ -$RADIX_CACHE_ARGS --max-running-requests 256 --cuda-graph-max-bs 256 \ ---chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens 32768 --mem-fraction-static "$MEM_FRACTION_STATIC" \ ---context-length "$CONTEXT_LENGTH" --reasoning-parser gpt-oss --stream-interval 10 > "$SERVER_LOG" 2>&1 & - -SERVER_PID=$! - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/gptoss_fp4_h200.sh b/benchmarks/single_node/gptoss_fp4_h200.sh index 9be9959bf..2a9359b96 100644 --- a/benchmarks/single_node/gptoss_fp4_h200.sh +++ b/benchmarks/single_node/gptoss_fp4_h200.sh @@ -18,9 +18,7 @@ fi hf download "$MODEL" # Start GPU monitoring (power, temperature, clocks every second) -if ! 
is_isb1_kv_stress_benchmark; then - start_gpu_monitor -fi +start_gpu_monitor set -x pip install datasets pandas @@ -39,21 +37,14 @@ if [ "${EVAL_ONLY}" = "true" ]; then CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi -PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" -if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then - PREFIX_CACHING_CONFIG="" -fi - # Create config.yaml cat > config.yaml << EOF -$PREFIX_CACHING_CONFIG +no-enable-prefix-caching: true max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 max-model-len: $CALCULATED_MAX_MODEL_LEN EOF -apply_vllm_offload_config - SERVER_LOG=/workspace/server.log export TORCH_CUDA_ARCH_LIST="9.0" PORT=${PORT:-8888} @@ -64,15 +55,14 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --config config.yaml \ --gpu-memory-utilization 0.9 \ --tensor-parallel-size $TP \ - --max-num-seqs $CONC $VLLM_OFFLOAD_EXTRA_ARGS \ -> $SERVER_LOG 2>&1 & + --max-num-seqs $CONC > $SERVER_LOG 2>&1 & SERVER_PID=$! # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -run_single_node_benchmark \ +run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -82,8 +72,7 @@ run_single_node_benchmark \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" + --result-dir /workspace/ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -92,7 +81,5 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring -if ! is_isb1_kv_stress_benchmark; then - stop_gpu_monitor -fi +stop_gpu_monitor set +x diff --git a/benchmarks/single_node/gptoss_fp4_h200_sglang.sh b/benchmarks/single_node/gptoss_fp4_h200_sglang.sh deleted file mode 100644 index 069b1a452..000000000 --- a/benchmarks/single_node/gptoss_fp4_h200_sglang.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -export TORCH_CUDA_ARCH_LIST="9.0" - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" -fi - -RADIX_CACHE_ARGS="--disable-radix-cache" -if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then - RADIX_CACHE_ARGS="" -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_sglang_offload_config -fi - -MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" -CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" - -if ! is_isb1_kv_stress_benchmark; then - start_gpu_monitor -fi - -set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ ---host 0.0.0.0 --port "$PORT" --trust-remote-code \ ---tensor-parallel-size="$TP" --data-parallel-size=1 \ -$RADIX_CACHE_ARGS --max-running-requests 256 --cuda-graph-max-bs 256 \ ---chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens 32768 --mem-fraction-static "$MEM_FRACTION_STATIC" \ ---context-length "$CONTEXT_LENGTH" --reasoning-parser gpt-oss --stream-interval 10 > "$SERVER_LOG" 2>&1 & - -SERVER_PID=$! 
- -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if ! is_isb1_kv_stress_benchmark; then - stop_gpu_monitor -fi -set +x diff --git a/benchmarks/single_node/gptosstriattn_fp4_h100_vllm.sh b/benchmarks/single_node/gptosstriattn_fp4_h100_vllm.sh deleted file mode 100755 index cfff2a12d..000000000 --- a/benchmarks/single_node/gptosstriattn_fp4_h100_vllm.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env bash -# TriAttention-enabled vLLM benchmark for GPT-OSS-120B FP4 on H100. -# -# Differences from baseline gptoss_fp4_h100.sh: -# - Installs triattention vLLM plugin -# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) -# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available -# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) -# - Explicitly disables prefix caching (incompatible with KV compression) - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -# --- TriAttention plugin setup --- -pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." - -# Auto-detect KV budget from export filename: chat workloads get larger budget. -TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" -if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then - TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" -fi -export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" - -# Use pre-calibrated sparse stats if available on the runner. -TRIATTN_STATS="/workspace/triattn_stats/gpt_oss_120b_stats.pt" -if [[ -f "$TRIATTN_STATS" ]]; then - export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" - echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" -else - echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." 
-fi - -export ENABLE_TRIATTENTION=1 -echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" -# --- End TriAttention setup --- - -if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then - MAX_MODEL_LEN="${MAX_MODEL_LEN}" -else - MAX_MODEL_LEN=10240 -fi - -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" -fi - -cat > config.yaml << EOF -enable-prefix-caching: false -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 1024 -max-model-len: $MAX_MODEL_LEN -EOF - -if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then - echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml -fi -if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then - echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_vllm_offload_config -fi - -export PYTHONNOUSERSITE=1 -export VLLM_MXFP4_USE_MARLIN=1 -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi - -set -x -vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---config config.yaml \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=$CONC $VLLM_OFFLOAD_EXTRA_ARGS \ -> $SERVER_LOG 2>&1 & - -SERVER_PID=$! - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts $(( $CONC * 10 )) \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh b/benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh deleted file mode 100755 index fc6f465bc..000000000 --- a/benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env bash -# TriAttention-enabled vLLM benchmark for GPT-OSS-120B FP4 on H200. -# -# Differences from baseline gptoss_fp4_h100.sh: -# - Installs triattention vLLM plugin -# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) -# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available -# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) -# - Explicitly disables prefix caching (incompatible with KV compression) - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -# --- TriAttention plugin setup --- -pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." - -# Auto-detect KV budget from export filename: chat workloads get larger budget. 
-TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" -if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then - TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" -fi -export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" - -# Use pre-calibrated sparse stats if available on the runner. -TRIATTN_STATS="/workspace/triattn_stats/gpt_oss_120b_stats.pt" -if [[ -f "$TRIATTN_STATS" ]]; then - export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" - echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" -else - echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." -fi - -export ENABLE_TRIATTENTION=1 -echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" -# --- End TriAttention setup --- - -if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then - MAX_MODEL_LEN="${MAX_MODEL_LEN}" -else - MAX_MODEL_LEN=10240 -fi - -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" -fi - -cat > config.yaml << EOF -enable-prefix-caching: false -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 1024 -max-model-len: $MAX_MODEL_LEN -EOF - -if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then - echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml -fi -if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then - echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_vllm_offload_config -fi - -export PYTHONNOUSERSITE=1 -export VLLM_MXFP4_USE_MARLIN=1 -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi - -set -x -vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---config config.yaml \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=$CONC $VLLM_OFFLOAD_EXTRA_ARGS \ -> $SERVER_LOG 2>&1 & - -SERVER_PID=$! 
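# Knob summary for the TriAttention variants deleted above, relative to their
# non-TriAttention baselines (both sides are visible in the config.yaml
# heredocs in this patch):
#   enable-prefix-caching: false   # baselines: no-enable-prefix-caching: true;
#                                  # prefix caching is incompatible with KV compression
#   max-num-batched-tokens: 1024   # baselines: 8192; smaller prefill chunks
#                                  # prevent OOM while the KV cache is compressed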
- -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts $(( $CONC * 10 )) \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh b/benchmarks/single_node/qwen3.5_bf16_mi300x.sh index 8aca9860a..f7c71963d 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi300x.sh @@ -19,25 +19,34 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +CONTEXT_LENGTH=$((ISL + OSL + 20)) +MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor # following Andy Luo linkedin's recipe https://www.linkedin.com/feed/update/urn:li:activity:7429203734389280768/ python3 -m sglang.launch_server \ - --attention-backend triton \ + --attention-backend aiter \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ + --data-parallel-size 1 \ --trust-remote-code \ - --mem-fraction-static 0.8 \ - --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --tokenizer-worker-num 6 \ + --enable-aiter-allreduce-fusion \ + --cuda-graph-max-bs $CONC \ + --disable-radix-cache \ + --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --scheduler-recv-interval 30 \ + --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh b/benchmarks/single_node/qwen3.5_bf16_mi325x.sh index 8aca9860a..f7c71963d 100644 --- a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi325x.sh @@ -19,25 +19,34 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +CONTEXT_LENGTH=$((ISL + OSL + 20)) +MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor # following Andy Luo linkedin's recipe https://www.linkedin.com/feed/update/urn:li:activity:7429203734389280768/ python3 -m sglang.launch_server \ - --attention-backend triton \ + --attention-backend aiter \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ + --data-parallel-size 1 \ --trust-remote-code \ - --mem-fraction-static 0.8 \ - --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --tokenizer-worker-num 6 \ + --enable-aiter-allreduce-fusion \ + --cuda-graph-max-bs $CONC \ + --disable-radix-cache \ + --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --scheduler-recv-interval 30 \ + --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
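# Context sizing on the MI300X/MI325X recipes above uses 20 tokens of headroom
# where the NVIDIA scripts compute MAX_MODEL_LEN with 200. For the standard
# 8K/1K shape:
echo $((8192 + 1024 + 20))    # -> 9236 token context on the AMD sglang path
echo $((8192 + 1024 + 200))   # -> 9416 via the ISL + OSL + 200 default elsewhere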
diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh index 701695def..6d40e3e3f 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh @@ -9,7 +9,8 @@ check_env_vars \ ISL \ OSL \ RANDOM_RANGE_RATIO \ - RESULT_FILENAME + RESULT_FILENAME \ + EP_SIZE if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -19,11 +20,14 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +CONTEXT_LENGTH=$((ISL + OSL + 20)) +MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -34,7 +38,14 @@ python3 -m sglang.launch_server \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ + --ep-size $EP_SIZE \ --trust-remote-code \ + --tokenizer-worker-num 6 \ + --enable-aiter-allreduce-fusion \ + --cuda-graph-max-bs $CONC \ + --disable-radix-cache \ + --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --scheduler-recv-interval 30 \ --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_fp8_b200_sglang.sh b/benchmarks/single_node/qwen3.5_fp8_b200_sglang.sh deleted file mode 100755 index 97fb5127c..000000000 --- a/benchmarks/single_node/qwen3.5_fp8_b200_sglang.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -export NCCL_NVLS_ENABLE=1 -export SGL_ENABLE_JIT_DEEPGEMM=false -export SGLANG_ENABLE_FLASHINFER_GEMM=true -export PYTHONUNBUFFERED=1 -export TORCH_CUDA_ARCH_LIST="10.0" - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -if [[ $CONC -ge 16 ]]; then - SCHEDULER_RECV_INTERVAL=30 -else - SCHEDULER_RECV_INTERVAL=10 -fi - -CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" -fi - -MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.8}" -CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" - -RADIX_CACHE_ARGS="--disable-radix-cache" -if is_isb1_replay_benchmark; then - RADIX_CACHE_ARGS="" -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_sglang_offload_config -fi - -start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi - -set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path="$MODEL" --host=0.0.0.0 --port="$PORT" \ ---trust-remote-code \ ---tensor-parallel-size="$TP" --data-parallel-size=1 --ep-size 1 \ ---quantization fp8 --kv-cache-dtype fp8_e4m3 \ ---mamba-ssm-dtype bfloat16 \ ---cuda-graph-max-bs "$CONC" --max-running-requests 128 \ ---mem-fraction-static "$MEM_FRACTION_STATIC" --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens "$CHUNKED_PREFILL_SIZE" \ ---context-length "$CONTEXT_LENGTH" \ ---attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \ -$RADIX_CACHE_ARGS --enable-flashinfer-allreduce-fusion \ ---scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" \ ---reasoning-parser qwen3 
--tool-call-parser qwen3_coder \ ---tokenizer-worker-num 6 --stream-interval 30 > "$SERVER_LOG" 2>&1 & - -SERVER_PID=$! - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_b200_vllm.sh b/benchmarks/single_node/qwen3.5_fp8_b200_vllm.sh deleted file mode 100755 index e48c56700..000000000 --- a/benchmarks/single_node/qwen3.5_fp8_b200_vllm.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" -fi - -apply_yarn_config_if_needed "$MODEL" "$CALCULATED_MAX_MODEL_LEN" - -PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" -if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then - PREFIX_CACHING_CONFIG="" -fi - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' -$PREFIX_CACHING_CONFIG -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -apply_vllm_offload_config - -export TORCH_CUDA_ARCH_LIST="10.0" -export PYTHONNOUSERSITE=1 -export VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl - -if ! is_isb1_kv_stress_benchmark; then - start_gpu_monitor -fi - -set -x -vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ ---config config.yaml \ ---gpu-memory-utilization 0.9 \ ---tensor-parallel-size "$TP" \ ---max-num-seqs 256 \ ---disable-log-requests \ ---trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS $VLLM_EXTRA_ARGS \ -> "$SERVER_LOG" 2>&1 & - -SERVER_PID=$! - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" \ - --trust-remote-code - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if ! 
is_isb1_kv_stress_benchmark; then - stop_gpu_monitor -fi -set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_b300.sh b/benchmarks/single_node/qwen3.5_fp8_b300.sh new file mode 100644 index 000000000..b87d25e91 --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_b300.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + EP_SIZE + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CONTEXT_LENGTH=$((ISL + OSL + 20)) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ +--trust-remote-code \ +--tensor-parallel-size=$TP --data-parallel-size=1 --expert-parallel-size=$EP_SIZE \ +--enable-symm-mem \ +--disable-radix-cache \ +--quantization fp8 \ +--kv-cache-dtype fp8_e4m3 \ +--mamba-ssm-dtype bfloat16 \ +--attention-backend trtllm_mha \ +--moe-runner-backend flashinfer_trtllm \ +--cuda-graph-max-bs $CONC \ +--max-running-requests $CONC \ +--max-prefill-tokens 16384 \ +--chunked-prefill-size 16384 \ +--mem-fraction-static 0.8 \ +--stream-interval 50 \ +--scheduler-recv-interval 10 \ +--tokenizer-worker-num 6 \ +--tokenizer-path $MODEL \ +--context-length $CONTEXT_LENGTH > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --use-chat-template + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh new file mode 100644 index 000000000..a0c5f4828 --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + EP_SIZE + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CONTEXT_LENGTH=$((ISL + OSL + 20)) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +set -x +SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ +--trust-remote-code \ +--tensor-parallel-size=$TP --data-parallel-size=1 --expert-parallel-size=$EP_SIZE \ +--enable-symm-mem \ +--disable-radix-cache \ +--quantization fp8 \ +--kv-cache-dtype fp8_e4m3 \ +--mamba-ssm-dtype bfloat16 \ 
+--attention-backend trtllm_mha \ +--moe-runner-backend flashinfer_trtllm \ +--cuda-graph-max-bs $CONC \ +--max-running-requests $CONC \ +--max-prefill-tokens 16384 \ +--chunked-prefill-size 16384 \ +--mem-fraction-static 0.8 \ +--stream-interval 50 \ +--scheduler-recv-interval 10 \ +--tokenizer-worker-num 6 \ +--tokenizer-path $MODEL \ +--speculative-algorithm EAGLE \ +--speculative-num-steps 3 \ +--speculative-eagle-topk 1 \ +--speculative-num-draft-tokens 4 \ +--context-length $CONTEXT_LENGTH > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --use-chat-template + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_h100_sglang.sh b/benchmarks/single_node/qwen3.5_fp8_h100_sglang.sh deleted file mode 100755 index 61df75cff..000000000 --- a/benchmarks/single_node/qwen3.5_fp8_h100_sglang.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -export TORCH_CUDA_ARCH_LIST="9.0" - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" -fi - -MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.8}" -CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-16384}" - -RADIX_CACHE_ARGS="--disable-radix-cache" -if is_isb1_replay_benchmark; then - RADIX_CACHE_ARGS="" -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_sglang_offload_config -fi - -start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi - -set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ ---host 0.0.0.0 --port "$PORT" --trust-remote-code \ ---tensor-parallel-size="$TP" --data-parallel-size=1 --ep-size 1 \ ---quantization fp8 --kv-cache-dtype fp8_e4m3 \ ---mamba-ssm-dtype bfloat16 \ -$RADIX_CACHE_ARGS --enable-flashinfer-allreduce-fusion \ ---max-running-requests 128 --cuda-graph-max-bs 128 \ ---chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --mem-fraction-static "$MEM_FRACTION_STATIC" \ ---context-length "$CONTEXT_LENGTH" \ ---reasoning-parser qwen3 --tool-call-parser qwen3_coder \ ---attention-backend flashinfer \ ---stream-interval 30 --tokenizer-worker-num 6 > "$SERVER_LOG" 2>&1 & - -SERVER_PID=$! 
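# Rough intuition for the EAGLE settings in qwen3.5_fp8_b300_mtp.sh above (an
# illustrative model, not a measured result): each verification step proposes
# up to --speculative-num-draft-tokens=4 tokens, so if the draft head averages
# k accepted tokens per step, one target forward pass emits k+1 tokens instead
# of 1. For an assumed k of 2.4:
awk 'BEGIN { k = 2.4; printf "tokens per target pass: %.1f\n", k + 1 }'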
- -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_h100_vllm.sh b/benchmarks/single_node/qwen3.5_fp8_h100_vllm.sh deleted file mode 100755 index 6f576ea0f..000000000 --- a/benchmarks/single_node/qwen3.5_fp8_h100_vllm.sh +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" -fi - -PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" -if is_isb1_replay_benchmark; then - PREFIX_CACHING_CONFIG="" -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_vllm_offload_config -fi - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -$PREFIX_CACHING_CONFIG -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then - echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml -fi -if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then - echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_vllm_offload_config -fi - -export TORCH_CUDA_ARCH_LIST="9.0" -export PYTHONNOUSERSITE=1 - -start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi - -set -x -vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ ---config config.yaml \ ---gpu-memory-utilization 0.9 \ ---tensor-parallel-size "$TP" \ ---max-num-seqs 256 \ ---disable-log-requests \ ---trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ -> "$SERVER_LOG" 2>&1 & - -SERVER_PID=$! 
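# start_kv_metrics_collector / stop_kv_metrics_collector (removed above) come
# from benchmark_lib.sh and take (port, csv_path, poll_interval). A minimal
# stand-in that scrapes an OpenAI-compatible server's Prometheus endpoint; the
# metric-name filter is an assumption, not the library's actual behavior:
kv_metrics_poll() {
  local port="$1" out="$2" interval="${3:-2.0}"
  echo "ts,metric,value" > "$out"
  while sleep "$interval"; do
    curl -s "http://localhost:${port}/metrics" \
      | awk -v ts="$(date +%s)" '$1 !~ /^#/ && /kv_cache|prefix_cache/ { print ts "," $1 "," $2 }' >> "$out"
  done
}
# Usage sketch: kv_metrics_poll "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 &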
- -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" \ - --trust-remote-code - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_sglang.sh b/benchmarks/single_node/qwen3.5_fp8_h200_sglang.sh deleted file mode 100755 index b3d5ea50b..000000000 --- a/benchmarks/single_node/qwen3.5_fp8_h200_sglang.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -export TORCH_CUDA_ARCH_LIST="9.0" - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" -fi - -apply_yarn_config_if_needed "$MODEL" "$CONTEXT_LENGTH" - -MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.8}" -CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-16384}" - -RADIX_CACHE_ARGS="--disable-radix-cache" -if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then - RADIX_CACHE_ARGS="" -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_sglang_offload_config -fi - -if ! is_isb1_kv_stress_benchmark; then - start_gpu_monitor -fi - -set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ ---host 0.0.0.0 --port "$PORT" --trust-remote-code \ ---tensor-parallel-size="$TP" --data-parallel-size=1 --ep-size 1 \ ---reasoning-parser qwen3 --tool-call-parser qwen3_coder \ ---enable-flashinfer-allreduce-fusion \ ---max-running-requests 128 \ ---chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \ ---mem-fraction-static "$MEM_FRACTION_STATIC" \ ---cuda-graph-max-bs 128 \ ---context-length "$CONTEXT_LENGTH" \ ---kv-cache-dtype fp8_e4m3 \ ---quantization fp8 \ ---attention-backend flashinfer \ ---stream-interval 30 \ ---tokenizer-worker-num 6 \ ---mamba-ssm-dtype bfloat16 \ -$RADIX_CACHE_ARGS \ -$SGLANG_EXTRA_ARGS \ -> "$SERVER_LOG" 2>&1 & - -SERVER_PID=$! - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if ! 
is_isb1_kv_stress_benchmark; then - stop_gpu_monitor -fi -set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_vllm.sh b/benchmarks/single_node/qwen3.5_fp8_h200_vllm.sh deleted file mode 100755 index de5c66c44..000000000 --- a/benchmarks/single_node/qwen3.5_fp8_h200_vllm.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" -fi - -apply_yarn_config_if_needed "$MODEL" "$CALCULATED_MAX_MODEL_LEN" - -PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" -if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then - PREFIX_CACHING_CONFIG="" -fi - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -$PREFIX_CACHING_CONFIG -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -apply_vllm_offload_config - -export TORCH_CUDA_ARCH_LIST="9.0" -export PYTHONNOUSERSITE=1 - -if ! is_isb1_kv_stress_benchmark; then - start_gpu_monitor -fi - -set -x -vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ ---config config.yaml \ ---gpu-memory-utilization 0.9 \ ---tensor-parallel-size "$TP" \ ---max-num-seqs 256 \ ---disable-log-requests \ ---trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS $VLLM_EXTRA_ARGS \ -> "$SERVER_LOG" 2>&1 & - -SERVER_PID=$! - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" \ - --trust-remote-code - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if ! 
is_isb1_kv_stress_benchmark; then - stop_gpu_monitor -fi -set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh b/benchmarks/single_node/qwen3.5_fp8_mi300x.sh index 00cc9cf91..fe761d88d 100755 --- a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi300x.sh @@ -19,11 +19,14 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +CONTEXT_LENGTH=$((ISL + OSL + 20)) +MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -31,14 +34,20 @@ start_gpu_monitor # following AMD Andy linkedin's recipe # https://www.linkedin.com/feed/update/urn:li:activity:7429203734389280768/ python3 -m sglang.launch_server \ - --attention-backend triton \ + --attention-backend aiter \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ + --data-parallel-size 1 \ --trust-remote-code \ - --mem-fraction-static 0.8 \ - --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --tokenizer-worker-num 6 \ + --enable-aiter-allreduce-fusion \ + --cuda-graph-max-bs $CONC \ + --disable-radix-cache \ + --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --scheduler-recv-interval 30 \ + --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh b/benchmarks/single_node/qwen3.5_fp8_mi325x.sh index 00cc9cf91..fe761d88d 100755 --- a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi325x.sh @@ -19,11 +19,14 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +CONTEXT_LENGTH=$((ISL + OSL + 20)) +MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -31,14 +34,20 @@ start_gpu_monitor # following AMD Andy linkedin's recipe # https://www.linkedin.com/feed/update/urn:li:activity:7429203734389280768/ python3 -m sglang.launch_server \ - --attention-backend triton \ + --attention-backend aiter \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ + --data-parallel-size 1 \ --trust-remote-code \ - --mem-fraction-static 0.8 \ - --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --tokenizer-worker-num 6 \ + --enable-aiter-allreduce-fusion \ + --cuda-graph-max-bs $CONC \ + --disable-radix-cache \ + --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --scheduler-recv-interval 30 \ + --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh index 701695def..6d40e3e3f 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh @@ -9,7 +9,8 @@ check_env_vars \ ISL \ OSL \ RANDOM_RANGE_RATIO \ - RESULT_FILENAME + RESULT_FILENAME \ + EP_SIZE if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -19,11 +20,14 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +CONTEXT_LENGTH=$((ISL + OSL + 20)) +MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -34,7 +38,14 @@ python3 -m sglang.launch_server \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ + --ep-size $EP_SIZE \ --trust-remote-code \ + --tokenizer-worker-num 6 \ + --enable-aiter-allreduce-fusion \ + --cuda-graph-max-bs $CONC \ + --disable-radix-cache \ + --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --scheduler-recv-interval 30 \ --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5triattn_fp8_h100_vllm.sh b/benchmarks/single_node/qwen3.5triattn_fp8_h100_vllm.sh deleted file mode 100755 index 87e81ab22..000000000 --- a/benchmarks/single_node/qwen3.5triattn_fp8_h100_vllm.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env bash -# TriAttention-enabled vLLM benchmark for Qwen 3.5 FP8 on H100. -# -# Differences from baseline qwen3.5_fp8_h100_vllm.sh: -# - Installs triattention vLLM plugin -# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) -# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available -# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) -# - Explicitly disables prefix caching (incompatible with KV compression) - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -# --- TriAttention plugin setup --- -pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." - -# Auto-detect KV budget from export filename: chat workloads get larger budget. -TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" -if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then - TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" -fi -export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" - -# Use pre-calibrated sparse stats if available on the runner. -TRIATTN_STATS="/workspace/triattn_stats/qwen3_5_397b_a17b_stats.pt" -if [[ -f "$TRIATTN_STATS" ]]; then - export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" - echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" -else - echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." 
-fi - -export ENABLE_TRIATTENTION=1 -echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" -# --- End TriAttention setup --- - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" -fi - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -enable-prefix-caching: false -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 1024 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then - echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml -fi -if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then - echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_vllm_offload_config -fi - -export TORCH_CUDA_ARCH_LIST="9.0" -export PYTHONNOUSERSITE=1 - -start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi - -set -x -vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ ---config config.yaml \ ---gpu-memory-utilization 0.9 \ ---tensor-parallel-size "$TP" \ ---max-num-seqs 256 \ ---disable-log-requests \ ---trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ -> "$SERVER_LOG" 2>&1 & - -SERVER_PID=$! - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" \ - --trust-remote-code - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/qwen3.5triattn_fp8_h200_vllm.sh b/benchmarks/single_node/qwen3.5triattn_fp8_h200_vllm.sh deleted file mode 100755 index 83fb3b8c6..000000000 --- a/benchmarks/single_node/qwen3.5triattn_fp8_h200_vllm.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env bash -# TriAttention-enabled vLLM benchmark for Qwen 3.5 FP8 on H200. -# -# Differences from baseline qwen3.5_fp8_h200_vllm.sh: -# - Installs triattention vLLM plugin -# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) -# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available -# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) -# - Explicitly disables prefix caching (incompatible with KV compression) - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -hf download "$MODEL" - -# --- TriAttention plugin setup --- -pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." - -# Auto-detect KV budget from export filename: chat workloads get larger budget. 
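-# An explicit TRIATTN_RUNTIME_KV_BUDGET always wins; otherwise code-style exports
-# default to the tight 2048-token budget and chat exports to 12000, matching the
-# header notes above.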
-TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" -if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then - TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" -fi -export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" - -# Use pre-calibrated sparse stats if available on the runner. -TRIATTN_STATS="/workspace/triattn_stats/qwen3_5_397b_a17b_stats.pt" -if [[ -f "$TRIATTN_STATS" ]]; then - export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" - echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" -else - echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." -fi - -export ENABLE_TRIATTENTION=1 -echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" -# --- End TriAttention setup --- - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" -fi - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -enable-prefix-caching: false -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 1024 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then - echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml -fi -if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then - echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml -fi -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_vllm_offload_config -fi - -export TORCH_CUDA_ARCH_LIST="9.0" -export PYTHONNOUSERSITE=1 - -start_gpu_monitor -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 -fi - -set -x -vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ ---config config.yaml \ ---gpu-memory-utilization 0.9 \ ---tensor-parallel-size "$TP" \ ---max-num-seqs 256 \ ---disable-log-requests \ ---trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ -> "$SERVER_LOG" 2>&1 & - -SERVER_PID=$! 
- -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_single_node_benchmark \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --server-pid "$SERVER_PID" \ - --trust-remote-code - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - stop_kv_metrics_collector -fi -stop_gpu_monitor -set +x diff --git a/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md b/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md deleted file mode 100644 index fcb33d8cc..000000000 --- a/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md +++ /dev/null @@ -1,110 +0,0 @@ ---- -version: 1.1.0 -date: 2026-04-16 -status: proposed ---- - -# ISB1 ↔ kv-cache-tester Coexistence Plan - -## The Two Systems - -| | kv-cache-tester (PR #993) | ISB1 (PR #1032) | -|---|---|---| -| **Location** | `experimental/multiturn/vllm_benchmark/kv-cache-tester/` | `datasets/isb1/exports/` | -| **Traces** | 522 real Claude Code sessions | 35 synthetic multi-turn traces | -| **Source** | Real production agentic workloads | Synthetic with controlled stress patterns | -| **Replay** | `trace_replay_tester.py` | `benchmark_export_replay.py` | -| **Config** | `multiturn-agentic-trace.yaml` | `isb1-kv-stress.yaml` | -| **Metrics** | Prometheus sidecar (`metrics_collector.py`) | `process_result_isb1.py` | - -## Why Both Are Needed - -**kv-cache-tester** shows how chips perform under **real workloads** — actual Claude Code -sessions with natural token distributions. This is the ground truth for "how does inference -actually work in production?" - -**ISB1** shows how chips perform under **controlled stress conditions** — specific KV cache -behaviors that real workloads rarely trigger but production systems must handle: - -| Stress Pattern | kv-cache-tester | ISB1 | -|---|---|---| -| Natural agentic workload distribution | ✅ (522 real traces) | ❌ | -| Targeted prefix reuse testing | ❌ | ✅ (high_prefix stress class) | -| Forced KV offload cliff | ❌ (depends on trace) | ✅ (offload_cliff stress, 128K-1M context) | -| Session reactivation after idle | ❌ | ✅ (reactivation stress, idle windows) | -| KV compaction under long sessions | ❌ | ✅ (compaction_heavy stress, 25+ turns) | -| Shared prefix fanout | ❌ | ✅ (fanout stress, branching requests) | -| 500K-1M context depth | ❌ (real traces are shorter) | ✅ (xlc2/ulc1/ulc2 bands) | - -Together they cover the Pareto frontier from realistic operating points (kv-cache-tester) -through stress-test extremes (ISB1). - -## How They Coexist - -### Configs (no conflict) -```yaml -# kv-cache-tester config (PR #993) -# .github/configs/multiturn-agentic-trace.yaml -h200-fp8-llama70b: - trace-file: experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/... 
-
-# ISB1 config (PR #1032)
-# .github/configs/isb1-kv-stress.yaml
-dsr1-fp8-h200-isb1-kv-stress-vllm:
-  export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
-```
-
-### Workflows (no conflict)
-```yaml
-# kv-cache-tester workflow (PR #993)
-# .github/workflows/multiturn-sweep.yml → benchmark-multiturn-tmpl.yml
-# Uses: trace_replay_tester.py
-
-# ISB1 workflow (PR #1032)
-# .github/workflows/run-isb1-kv-stress-sweep.yml → benchmark-isb1-tmpl.yml
-# Uses: benchmark_export_replay.py
-```
-
-### Data directories (no conflict)
-```
-experimental/multiturn/vllm_benchmark/   ← kv-cache-tester (PR #993, untouched by PR #1032)
-  kv-cache-tester/              522 real traces + replayer
-  aiperf/                       AIPerf submodule
-  bench/metrics_collector.py    Prometheus sidecar
-  analysis/plot_pareto.py       Pareto charts
-
-datasets/isb1/                           ← ISB1 (PR #1032)
-  exports/                      ISB1 replay bundles
-    core/                       8K baseline
-    extension_32k/              32K context (flat)
-    extension_64k/              64K context (flat)
-    extension_131k/             131K context (flat)
-    preview/long_context_500k/  500K reviewed_preview
-    preview/long_context_1m/    1M gated preview
-```
-
-### Shared infrastructure ISB1 USES from PR #993
-- vLLM offload API flags (`--kv_offloading_backend native`, etc.)
-- Prometheus metrics collector pattern (ISB1 ships its own `process_result_isb1.py` pipeline)
-- Offload mode sweep pattern (on/off/noprefix)
-- Runner launch scripts (`runners/launch_*.sh`)
-- Concurrency sweep structure
-
-### What PR #1032 does NOT touch
-- `experimental/multiturn/vllm_benchmark/kv-cache-tester/` — kv-cache-tester tree
-- `aiperf/` submodule — alternative benchmark, unchanged
-- `benchmark-multiturn-tmpl.yml` — kv-cache-tester workflow template, unchanged
-- `multiturn-agentic-trace.yaml` — kv-cache-tester config, unchanged
-
-## Support-status vocabulary
-
-ISB1 replay surfaces in PR #1032 classify under the five-class support vocabulary:
-
-- `supported` — core 8K replay path
-- `reviewed_preview` — 32K / 64K / 131K extensions, 500K preview
-- `gated` — 1M preview (manual config `isb1-qwen-1m-preview.yaml` only)
-- `artifact_only` — retained artifacts without live replay
-- `unsupported` — not a valid path
-
-No ISB1 surface in PR #1032 claims `live_benchmark_certification`; all claims are bounded
-to `dataset_replay_verified`.
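-
-For concreteness, the `selection` stanza that `adapt_trace_replay_result.py` stamps into
-each adapted replay JSON carries exactly these two vocabularies (a minimal sketch of the
-shape, using that script's default values):
-
-```json
-"selection": {
-  "support_statuses": ["reviewed_preview"],
-  "benchmark_certification_statuses": ["dataset_replay_verified"]
-}
-```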
diff --git a/datasets/isb1/GMI_EXECUTION_PLAN.md b/datasets/isb1/GMI_EXECUTION_PLAN.md
deleted file mode 100644
index f1aa6b464..000000000
--- a/datasets/isb1/GMI_EXECUTION_PLAN.md
+++ /dev/null
@@ -1,182 +0,0 @@
-# ISB1 KV Cache Benchmark — GMI Cloud Execution Plan
-
-Bare-metal execution runbook for ISB1 replay bundles on GMI Cloud Hopper (H100/H200)
-and Blackwell (GB200). All runs described here are `support_status=reviewed_preview`
-with `benchmark_certification_status=dataset_replay_verified` — i.e. replay and export
-certification, not live-serving certification.
-
-## Available Hardware
-
-| GPU | HBM | Available | Max Context Before Offload |
-|-----|-----|-----------|---------------------------|
-| **GB200** | 192GB HBM3e | ✅ | ~384K tokens (FP8 KV) |
-| **H100** | 80GB HBM3 | ✅ | ~128K tokens (FP8 KV) |
-
-## Execution Order
-
-Run benchmarks in this order — cheapest/fastest first to validate the setup works.
-
-### Phase 1: Validation Run (1 hour)
-
-Prove the pipeline works end-to-end before burning GPU hours.
-
-```bash
-# On H100 — single model, single concurrency, 5 min duration
-export MODEL=deepseek-ai/DeepSeek-R1-0528
-export TP=8
-export EXPORT_FILE=datasets/isb1/exports/extension_131k/code_131k1k.json
-
-# Launch server
-bash benchmarks/single_node/dsr1_fp8_h100_vllm.sh
-
-# Run ONE cell: 2 users, offload=off, 300s
-python utils/bench_serving/benchmark_export_replay.py \
-    --export-file $EXPORT_FILE \
-    --max-concurrency 2 \
-    --duration 300 \
-    --request-mode multi-turn
-
-# Verify result has actual_context_len > 0
-python utils/process_result_isb1.py --result-file results/*.json
-```
-
-**Pass criteria:** TTFT and throughput numbers appear. `actual_context_len` > 100K.
-
-### Phase 2: H100 KV Stress Sweep (9 hours)
-
-H100 80GB is the interesting GPU — its KV cache fills up first.
-
-```bash
-# Models to test:
-# 1. DeepSeek-R1 FP8 (TP8)
-# 2. GPT-OSS FP4 (TP8)
-
-# Sweep per model:
-# users: [2, 4, 8, 16, 32, 64]   # H100 can't do 128+ at 131K
-# offload-modes: [on, off, noprefix]
-# duration: 1800s (30 min)
-# export: extension_131k/vllm/code_131k1k.json
-
-# Total cells: 2 models × 6 concurrency × 3 offload = 36 cells
-# Time: 36 × 30min = 18 hours sequential → with the 2 models run in
-# parallel on separate H100 nodes = ~9 hours wall clock
-```
-
-**What to look for:**
-- Offload cliff: at what concurrency does offload=on start helping?
-- Prefix cache hit rate: does it stay >50% under load?
-- Preemption count: how many requests get evicted?
-- TTFT degradation: when does p99 TTFT exceed 10s?
-
-### Phase 3: GB200 KV Stress Sweep (18 hours)
-
-GB200 192GB has 2.4x more HBM — the cliff comes later.
-
-```bash
-# Same sweep but higher concurrency (more HBM room):
-# users: [2, 4, 8, 16, 32, 64, 128, 256]
-# offload-modes: [on, off, noprefix]
-# duration: 1800s
-
-# Add Qwen 3.5 (needs more memory for MoE):
-# 3 models × 8 concurrency × 3 offload = 72 cells
-# Time: 72 × 30min = 36 hours → cutting duration to 900s halves this to ~18 hours
-```
-
-**What to look for:**
-- At what concurrency does GB200 hit its offload cliff?
-- Is the cliff at ~3x H100's cliff (proportional to HBM)?
-- Does 192GB allow prefix caching to stay effective longer?
-
-### Phase 4: Long Context Preview (4 hours, GB200 only)
-
-500K and 1M token traces — only GB200 has enough memory.
-
-```bash
-# 500K preview (Qwen 3.5 only):
-export EXPORT_FILE=datasets/isb1/exports/preview/long_context_500k/\
-inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json
-
-# 1M preview (Qwen 3.5 only):
-export EXPORT_FILE=datasets/isb1/exports/preview/long_context_1m/\
-inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json
-
-# Low concurrency (these are HUGE contexts):
-# users: [1, 2, 4]
-# offload-modes: [on, off]
-# duration: 900s
-```
-
-**What to look for:**
-- Can GB200 serve 1M context at all?
-- What's the TTFT for a 1M token prefill?
-- Does KV offload work at this scale?
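-
-As a quick feasibility check for Phase 4, a back-of-envelope KV sizing sketch (the
-model shape below is an illustrative placeholder, not the real Qwen 3.5 config):
-
-```python
-# KV bytes/token = 2 (K and V) * layers * kv_heads * head_dim * bytes per element.
-layers, kv_heads, head_dim, fp8_bytes = 60, 8, 128, 1  # placeholder shape
-bytes_per_token = 2 * layers * kv_heads * head_dim * fp8_bytes  # ~120 KiB/token
-for ctx in (131_072, 500_000, 1_000_000):
-    print(f"{ctx:>9} tokens -> {ctx * bytes_per_token / 2**30:6.1f} GiB KV")
-# 1M tokens -> ~114 GiB at this shape: plausibly fits GB200's 192GB HBM only if
-# weights and activations leave that much free; hopeless on H100 80GB.
-```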
-## Estimated GPU Time
-
-| Phase | GPU | Duration | Cost (est) |
-|-------|-----|----------|------------|
-| 1. Validation | H100 | 1 hour | ~$3 |
-| 2. H100 sweep | H100 | 9 hours | ~$27 |
-| 3. GB200 sweep | GB200 | 18 hours | ~$90 |
-| 4. Long context | GB200 | 4 hours | ~$20 |
-| **Total** | | **32 hours** | **~$140** |
-
-## Portable Run Script
-
-Use `gmi_portable_benchmark.sh` for manual single-cell runs without GitHub Actions;
-`gmi_kv_sweep.sh` wraps it to sweep users × offload modes:
-
-```bash
-# Set GMI-specific env vars
-export GMI_API_KEY="..."
-export HF_TOKEN="..."
-export GPU_TYPE=h100   # or b200
-
-# Run the KV sweep (invokes gmi_portable_benchmark.sh once per cell)
-bash datasets/isb1/scripts/gmi_kv_sweep.sh \
-    --gpu-type $GPU_TYPE \
-    --model dsr1 \
-    --engine vllm \
-    --context-band 131k \
-    --workload code \
-    --users 2,4,8,16,32,64 \
-    --offload-modes on,off,noprefix \
-    --benchmark-duration-s 1800
-```
-
-## Result Collection
-
-After each phase, results go to:
-```
-results/
-├── h100_dsr1_fp8_kv_stress/
-│   ├── users_2_offload_on.json
-│   ├── users_2_offload_off.json
-│   └── ...
-└── gb200_dsr1_fp8_kv_stress/
-    └── ...
-```
-
-Process and visualize:
-```bash
-# Aggregate results (writes sweep_aggregate.json + sweep_aggregate.csv)
-python datasets/isb1/scripts/collect_sweep_results.py \
-    --json-dir results/ \
-    --output-dir results/
-
-# Generate Pareto frontier chart from the SQLite results DB
-python datasets/isb1/scripts/plot_pareto.py \
-    --db-path isb1_results.db \
-    --output-dir results/
-```
-
-## What Success Looks Like
-
-After all phases, we have:
-1. **Pareto frontier chart:** throughput vs p99 TTFT for H100 and GB200
-2. **Offload cliff identification:** exact concurrency where offload starts helping
-3. **Prefix cache benefit:** measured hit rate under realistic multi-turn load
-4. **HBM scaling evidence:** does 2.4x more HBM give 2.4x more capacity?
-5. **Long context feasibility:** can GB200 serve 500K/1M context at all?
-
-Results feed the Pareto summaries and capacity-cliff annotations consumed by the
-ISB1 replay analyzers (`datasets/isb1/scripts/gmi_analyze_sweep.py`,
-`datasets/isb1/scripts/plot_pareto.py`).
diff --git a/datasets/isb1/scripts/adapt_trace_replay_result.py b/datasets/isb1/scripts/adapt_trace_replay_result.py
deleted file mode 100644
index 445ab7d9c..000000000
--- a/datasets/isb1/scripts/adapt_trace_replay_result.py
+++ /dev/null
@@ -1,214 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import argparse
-import csv
-import json
-from pathlib import Path
-from statistics import mean
-from typing import Any
-
-
-def _to_float(value: Any) -> float | None:
-    if value in (None, ""):
-        return None
-    try:
-        return float(value)
-    except (TypeError, ValueError):
-        return None
-
-
-def _percentile(values: list[float], p: float) -> float:
-    if not values:
-        return 0.0
-    if len(values) == 1:
-        return values[0]
-    ordered = sorted(values)
-    idx = (len(ordered) - 1) * p
-    lo = int(idx)
-    hi = min(lo + 1, len(ordered) - 1)
-    frac = idx - lo
-    return ordered[lo] * (1 - frac) + ordered[hi] * frac
-
-
-def _read_csv_rows(path: Path) -> list[dict[str, str]]:
-    with path.open("r", encoding="utf-8", newline="") as handle:
-        return list(csv.DictReader(handle))
-
-
-def _pick(row: dict[str, str], *keys: str) -> float | None:
-    for key in keys:
-        if key in row:
-            value = _to_float(row.get(key))
-            if value is not None:
-                return value
-    return None
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Adapt kv-cache trace replay CSV output into ISB1 replay JSON schema"
-    )
-    parser.add_argument("--input-dir", default="/workspace", help="Directory containing trace replay outputs")
-    parser.add_argument(
-        "--detailed-csv",
-        default="detailed_results.csv",
-        help="Detailed replay CSV filename (inside --input-dir)",
-    )
-    parser.add_argument(
-        "--summary-json",
-        default=None,
-        help="Optional summary JSON path (used as supplemental source if present)",
-    )
-    parser.add_argument("--output-json", required=True, help="Output adapted replay JSON path")
-    parser.add_argument("--model-id", default="",
help="Model ID for output metadata") - parser.add_argument("--max-concurrency", type=int, default=1, help="Max concurrency used") - parser.add_argument("--request-mode", default="multi-turn", help="Request mode metadata") - parser.add_argument( - "--benchmark-certification-status", - default="dataset_replay_verified", - help="Benchmark certification status to stamp in selection", - ) - parser.add_argument( - "--support-status", - default="reviewed_preview", - help="Support status to stamp in selection", - ) - parser.add_argument( - "--result-stem", - default="", - help="Optional result stem to infer total wall time from /workspace/.json", - ) - return parser.parse_args() - - -def main() -> int: - args = parse_args() - input_dir = Path(args.input_dir) - detailed_csv_path = input_dir / args.detailed_csv - output_path = Path(args.output_json) - - if not detailed_csv_path.exists(): - raise SystemExit(f"Missing detailed CSV: {detailed_csv_path}") - - rows = _read_csv_rows(detailed_csv_path) - ttft_ms: list[float] = [] - tpot_ms: list[float] = [] - output_tokens: list[float] = [] - prompt_tokens: list[float] = [] - session_ids: set[str] = set() - - for row in rows: - ttft = _pick(row, "ttft_ms", "ttft", "time_to_first_token_ms") - if ttft is not None: - ttft_ms.append(ttft) - - tpot = _pick(row, "tpot_ms", "tpot", "time_per_output_token_ms") - if tpot is not None: - tpot_ms.append(tpot) - - out_tok = _pick(row, "output_tokens", "generated_tokens", "completion_tokens") - if out_tok is not None: - output_tokens.append(out_tok) - - in_tok = _pick(row, "input_tokens", "prompt_tokens", "content_token_count") - if in_tok is not None: - prompt_tokens.append(in_tok) - - for key in ("session_id", "session", "conversation_id"): - sid = row.get(key) - if sid: - session_ids.add(str(sid)) - break - - completed_sessions = len(session_ids) if session_ids else len(rows) - total_sessions = completed_sessions - - total_output_tokens = sum(output_tokens) - total_prompt_tokens = sum(prompt_tokens) - total_token_count = total_output_tokens + total_prompt_tokens - - total_wall_time_s = 0.0 - if args.result_stem: - maybe_summary = input_dir / f"{args.result_stem}.json" - if maybe_summary.exists(): - try: - summary = json.loads(maybe_summary.read_text(encoding="utf-8")) - total_wall_time_s = float( - _to_float(summary.get("test_duration_seconds")) - or _to_float(summary.get("duration_s")) - or _to_float(summary.get("total_duration_s")) - or 0.0 - ) - except Exception: - total_wall_time_s = 0.0 - - if total_wall_time_s <= 0 and args.summary_json: - summary_path = Path(args.summary_json) - if summary_path.exists(): - try: - summary = json.loads(summary_path.read_text(encoding="utf-8")) - total_wall_time_s = float( - _to_float(summary.get("test_duration_seconds")) - or _to_float(summary.get("duration_s")) - or _to_float(summary.get("total_duration_s")) - or 0.0 - ) - except Exception: - total_wall_time_s = 0.0 - - if total_wall_time_s <= 0: - total_wall_time_s = 1.0 - - aggregate_metrics = { - "total_token_throughput_tps": total_token_count / total_wall_time_s, - "output_throughput_tps": total_output_tokens / total_wall_time_s, - "mean_ttft_ms": mean(ttft_ms) if ttft_ms else 0.0, - "median_ttft_ms": _percentile(ttft_ms, 0.50), - "p99_ttft_ms": _percentile(ttft_ms, 0.99), - "mean_tpot_ms": mean(tpot_ms) if tpot_ms else 0.0, - "median_tpot_ms": _percentile(tpot_ms, 0.50), - "p99_tpot_ms": _percentile(tpot_ms, 0.99), - "completed_sessions": completed_sessions, - "total_sessions": total_sessions, - "session_throughput_sps": 
completed_sessions / total_wall_time_s, - "total_wall_time_s": total_wall_time_s, - } - - adapted = { - "model_id": args.model_id, - "max_concurrency": args.max_concurrency, - "request_mode": args.request_mode, - "harness_request_mode": "auto", - "aggregate_metrics": aggregate_metrics, - "selection": { - "support_statuses": [args.support_status], - "benchmark_certification_statuses": [args.benchmark_certification_status], - }, - "server_metrics_summary": { - "observability_status": "unavailable", - "gpu_cache_metric_name": None, - "cpu_cache_metric_name": None, - "gpu_cache_usage_peak": 0.0, - "cpu_cache_usage_peak": 0.0, - "preemption_count": 0, - "kv_offload_observed": False, - "cpu_cache_metric_available": False, - }, - "depth_telemetry": { - "total_actual_input_tokens": int(total_prompt_tokens), - "max_actual_context_len_per_turn": int(max(prompt_tokens) if prompt_tokens else 0), - }, - "num_sessions": total_sessions, - "max_turns": None, - "per_turn_metrics": {}, - } - - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_text(json.dumps(adapted, indent=2, sort_keys=True), encoding="utf-8") - print(f"Wrote adapted replay JSON: {output_path}") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/datasets/isb1/scripts/analyze_benchmark_distributions.py b/datasets/isb1/scripts/analyze_benchmark_distributions.py deleted file mode 100644 index 06c5a65f1..000000000 --- a/datasets/isb1/scripts/analyze_benchmark_distributions.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import json -from pathlib import Path -from typing import Any - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Analyze ISL/OSL/turn distributions for ISB1 exports or kv-cache traces") - parser.add_argument("--export-file", default=None, help="ISB1 export JSON file") - parser.add_argument("--trace-dir", default=None, help="kv-cache-tester trace directory") - parser.add_argument("--output-dir", required=True, help="Output directory") - return parser.parse_args() - - -def _percentile(values: list[float], p: float) -> float: - if not values: - return 0.0 - if len(values) == 1: - return values[0] - ordered = sorted(values) - idx = (len(ordered) - 1) * p - lo = int(idx) - hi = min(lo + 1, len(ordered) - 1) - frac = idx - lo - return ordered[lo] * (1 - frac) + ordered[hi] * frac - - -def _histogram(values: list[int], bins: list[int]) -> dict[str, int]: - counts: dict[str, int] = {} - for value in values: - placed = False - prev = 0 - for bound in bins: - if value <= bound: - key = f"{prev + 1}-{bound}" - counts[key] = counts.get(key, 0) + 1 - placed = True - break - prev = bound - if not placed: - key = f">{bins[-1]}" - counts[key] = counts.get(key, 0) + 1 - return counts - - -def _extract_isb1(export_payload: dict[str, Any]) -> tuple[list[int], list[int], list[int]]: - isl: list[int] = [] - osl: list[int] = [] - turns_per_session: list[int] = [] - - for cell in export_payload.get("exports", []): - session = cell.get("session") or {} - turns = session.get("turns") or [] - turns_per_session.append(len(turns)) - for turn in turns: - input_tokens = ( - turn.get("actual_input_tokens") - or turn.get("content_token_count") - or turn.get("prompt_tokens") - or turn.get("input_tokens") - or 0 - ) - output_tokens = ( - turn.get("expected_output_tokens") - or turn.get("target_output_tokens") - or turn.get("output_tokens") - or 0 - ) - try: - isl.append(int(input_tokens)) - except 
Exception: - isl.append(0) - try: - osl.append(int(output_tokens)) - except Exception: - osl.append(0) - - return isl, osl, turns_per_session - - -def _extract_trace_dir(trace_dir: Path) -> tuple[list[int], list[int], list[int]]: - isl: list[int] = [] - osl: list[int] = [] - turns_per_session: list[int] = [] - - files = list(sorted(trace_dir.glob("*.json"))) - if not files: - raise SystemExit(f"No JSON traces found in {trace_dir}") - - for path in files: - payload = json.loads(path.read_text(encoding="utf-8")) - sessions = payload.get("sessions") or [] - for session in sessions: - turns = session.get("turns") or [] - turns_per_session.append(len(turns)) - for turn in turns: - isl.append(int(turn.get("content_token_count", 0) or 0)) - osl.append(int(turn.get("target_output_tokens", 0) or 0)) - - return isl, osl, turns_per_session - - -def build_report(isl: list[int], osl: list[int], turns_per_session: list[int], source: str) -> dict[str, Any]: - return { - "source": source, - "num_sessions": len(turns_per_session), - "num_turns": len(isl), - "isl": { - "p50": _percentile([float(x) for x in isl], 0.50), - "p95": _percentile([float(x) for x in isl], 0.95), - "max": max(isl) if isl else 0, - "histogram": _histogram(isl, [1024, 4096, 8192, 16384, 32768, 65536]), - }, - "osl": { - "p50": _percentile([float(x) for x in osl], 0.50), - "p95": _percentile([float(x) for x in osl], 0.95), - "max": max(osl) if osl else 0, - "histogram": _histogram(osl, [64, 128, 256, 512, 1024, 2048, 4096]), - }, - "turns_per_session": { - "p50": _percentile([float(x) for x in turns_per_session], 0.50), - "p95": _percentile([float(x) for x in turns_per_session], 0.95), - "max": max(turns_per_session) if turns_per_session else 0, - "histogram": _histogram(turns_per_session, [2, 4, 8, 16, 32]), - }, - } - - -def main() -> int: - args = parse_args() - if bool(args.export_file) == bool(args.trace_dir): - raise SystemExit("Provide exactly one of --export-file or --trace-dir") - - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - if args.export_file: - export_path = Path(args.export_file) - payload = json.loads(export_path.read_text(encoding="utf-8")) - isl, osl, turns_per_session = _extract_isb1(payload) - report = build_report(isl, osl, turns_per_session, source=str(export_path)) - else: - trace_dir = Path(args.trace_dir) - isl, osl, turns_per_session = _extract_trace_dir(trace_dir) - report = build_report(isl, osl, turns_per_session, source=str(trace_dir)) - - output_path = output_dir / "distribution_report.json" - output_path.write_text(json.dumps(report, indent=2, sort_keys=True), encoding="utf-8") - print(f"Wrote: {output_path}") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/datasets/isb1/scripts/collect_sweep_results.py b/datasets/isb1/scripts/collect_sweep_results.py deleted file mode 100644 index 0d7155428..000000000 --- a/datasets/isb1/scripts/collect_sweep_results.py +++ /dev/null @@ -1,183 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import csv -import json -import sqlite3 -from pathlib import Path -from typing import Any - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Aggregate sweep results from DB or agg_*.json directory") - parser.add_argument("--db-path", default=None, help="SQLite DB path") - parser.add_argument("--json-dir", default=None, help="Directory containing agg_*.json files") - parser.add_argument("--output-dir", required=True, 
help="Output directory") - parser.add_argument("--cliff-ttft-ms", type=float, default=5000.0, help="TTFT p99 threshold for capacity cliff") - return parser.parse_args() - - -def _to_float(value: Any) -> float | None: - if value in (None, ""): - return None - try: - return float(value) - except (TypeError, ValueError): - return None - - -def _to_int(value: Any) -> int | None: - if value in (None, ""): - return None - try: - return int(float(value)) - except (TypeError, ValueError): - return None - - -def collect_from_db(db_path: Path) -> list[dict[str, Any]]: - conn = sqlite3.connect(db_path) - conn.row_factory = sqlite3.Row - rows = conn.execute( - """ - SELECT offload_mode, throughput_tok_s, ttft_p99_ms, max_concurrency, raw_result_json - FROM benchmark_runs - WHERE offload_mode IS NOT NULL - ORDER BY id ASC - """ - ).fetchall() - conn.close() - - out: list[dict[str, Any]] = [] - for row in rows: - concurrency = row["max_concurrency"] - if concurrency in (None, "") and row["raw_result_json"]: - try: - payload = json.loads(row["raw_result_json"]) - concurrency = payload.get("conc") or payload.get("max_concurrency") - except Exception: - pass - out.append( - { - "offload_mode": row["offload_mode"], - "concurrency": _to_int(concurrency), - "throughput_tok_s": _to_float(row["throughput_tok_s"]), - "ttft_p99_ms": _to_float(row["ttft_p99_ms"]), - "source": "db", - } - ) - return out - - -def collect_from_json_dir(json_dir: Path) -> list[dict[str, Any]]: - rows: list[dict[str, Any]] = [] - for path in sorted(json_dir.glob("agg_*.json")): - try: - payload = json.loads(path.read_text(encoding="utf-8")) - except Exception: - continue - rows.append( - { - "offload_mode": payload.get("offload_mode"), - "concurrency": _to_int(payload.get("conc") or payload.get("max_concurrency")), - "throughput_tok_s": _to_float(payload.get("throughput_tok_s") or payload.get("tput_per_gpu")), - "ttft_p99_ms": _to_float(payload.get("ttft_p99_ms") or payload.get("p99_ttft_ms")), - "source": str(path.name), - } - ) - return rows - - -def compute_capacity_cliff(rows: list[dict[str, Any]], threshold_ms: float) -> dict[str, Any]: - cliff: dict[str, Any] = {} - for mode in sorted({row.get("offload_mode") for row in rows if row.get("offload_mode")}): - mode_rows = sorted( - [r for r in rows if r.get("offload_mode") == mode and r.get("concurrency") is not None], - key=lambda r: r["concurrency"], - ) - cliff_row = None - for row in mode_rows: - if (row.get("ttft_p99_ms") or 0.0) > threshold_ms: - cliff_row = row - break - cliff[str(mode)] = cliff_row - return cliff - - -def compute_offload_benefit(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: - by_conc: dict[int, dict[str, dict[str, Any]]] = {} - for row in rows: - conc = row.get("concurrency") - mode = row.get("offload_mode") - if conc is None or mode is None: - continue - by_conc.setdefault(int(conc), {})[str(mode)] = row - - deltas: list[dict[str, Any]] = [] - for conc in sorted(by_conc): - modes = by_conc[conc] - on = modes.get("on") - off = modes.get("off") - if not on or not off: - continue - on_tput = on.get("throughput_tok_s") or 0.0 - off_tput = off.get("throughput_tok_s") or 0.0 - deltas.append( - { - "concurrency": conc, - "throughput_on": on_tput, - "throughput_off": off_tput, - "offload_benefit_delta_tps": on_tput - off_tput, - } - ) - return deltas - - -def write_csv(path: Path, rows: list[dict[str, Any]]) -> None: - with path.open("w", newline="", encoding="utf-8") as handle: - writer = csv.writer(handle) - writer.writerow(["offload_mode", "concurrency", 
"throughput_tok_s", "ttft_p99_ms", "source"]) - for row in rows: - writer.writerow([ - row.get("offload_mode"), - row.get("concurrency"), - row.get("throughput_tok_s"), - row.get("ttft_p99_ms"), - row.get("source"), - ]) - - -def main() -> int: - args = parse_args() - if not args.db_path and not args.json_dir: - raise SystemExit("Provide --db-path or --json-dir") - - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - rows: list[dict[str, Any]] = [] - if args.db_path: - rows.extend(collect_from_db(Path(args.db_path))) - if args.json_dir: - rows.extend(collect_from_json_dir(Path(args.json_dir))) - - summary = { - "num_rows": len(rows), - "capacity_cliff": compute_capacity_cliff(rows, args.cliff_ttft_ms), - "offload_benefit": compute_offload_benefit(rows), - "rows": rows, - } - - json_path = output_dir / "sweep_aggregate.json" - csv_path = output_dir / "sweep_aggregate.csv" - json_path.write_text(json.dumps(summary, indent=2, sort_keys=True), encoding="utf-8") - write_csv(csv_path, rows) - - print(f"Wrote: {json_path}") - print(f"Wrote: {csv_path}") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/datasets/isb1/scripts/generate_qwen35_low_band_exports.py b/datasets/isb1/scripts/generate_qwen35_low_band_exports.py deleted file mode 100755 index 51be8b531..000000000 --- a/datasets/isb1/scripts/generate_qwen35_low_band_exports.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env python3 -"""Generate dedicated Qwen 3.5 ISB1 export bundles for 8k/32k/64k lanes. - -These files are derived from the committed generic export bundles by selecting only -GPT-OSS cells that are actually runnable (`supported` or `reviewed_preview`), then -rewriting model identity fields to the Qwen 3.5 replay identity while keeping trace -payloads unchanged. 
-""" - -from __future__ import annotations - -import json -from copy import deepcopy -from pathlib import Path - -ROOT = Path(__file__).resolve().parents[3] -EXPORT_ROOT = ROOT / "datasets" / "isb1" / "exports" - -QWEN_MODEL_ID = "qwen3_5_397b_a17b" -GPTOSS_MODEL_ID = "gpt_oss_120b" -ALLOWED_SUPPORT_STATUSES = {"supported", "reviewed_preview"} - -TARGETS = [ - ("core", "8k1k", "chat", "vllm"), - ("core", "8k1k", "chat", "sglang"), - ("core", "8k1k", "code", "vllm"), - ("core", "8k1k", "code", "sglang"), - ("extension_32k", "32k1k", "chat", "vllm"), - ("extension_32k", "32k1k", "chat", "sglang"), - ("extension_32k", "32k1k", "code", "vllm"), - ("extension_32k", "32k1k", "code", "sglang"), - ("extension_64k", "64k1k", "chat", "vllm"), - ("extension_64k", "64k1k", "chat", "sglang"), - ("extension_64k", "64k1k", "code", "vllm"), - ("extension_64k", "64k1k", "code", "sglang"), -] - - -def _source_path(lane: str, shape: str, surface: str, engine: str) -> Path: - return EXPORT_ROOT / lane / engine / f"{surface}_{shape}.json" - - -def _target_path(lane: str, shape: str, surface: str, engine: str) -> Path: - return EXPORT_ROOT / lane / engine / f"{surface}_{shape}_qwen3.5.json" - - -def _rewrite_bundle_id(bundle_id: str, lane: str, engine: str, surface: str, shape: str) -> str: - expected_prefix = f"isb1_{lane}_{engine}_{surface}_{shape}" - if bundle_id != expected_prefix: - raise ValueError( - f"Unexpected bundle_id {bundle_id!r}; expected {expected_prefix!r} for {lane}/{engine}/{surface}_{shape}" - ) - return f"{bundle_id}_qwen3_5" - - -def _rewrite_cell(cell: dict) -> dict: - rewritten = deepcopy(cell) - rewritten["canonical_model_id"] = QWEN_MODEL_ID - rewritten["thinking_history_policy"] = "strip_reasoning" - rewritten["history_projection_mode"] = "strip_reasoning_history" - rewritten["support_status"] = "reviewed_preview" - return rewritten - - -def build_export(lane: str, shape: str, surface: str, engine: str) -> tuple[Path, int]: - source_path = _source_path(lane, shape, surface, engine) - target_path = _target_path(lane, shape, surface, engine) - - payload = json.loads(source_path.read_text()) - exports = payload.get("exports") - if not isinstance(exports, list): - raise ValueError(f"Missing exports list in {source_path}") - - filtered = [ - _rewrite_cell(cell) - for cell in exports - if cell.get("canonical_model_id") == GPTOSS_MODEL_ID - and cell.get("support_status") in ALLOWED_SUPPORT_STATUSES - ] - if not filtered: - raise ValueError(f"No runnable GPT-OSS cells found in {source_path}") - - payload["bundle_id"] = _rewrite_bundle_id(payload.get("bundle_id"), lane, engine, surface, shape) - payload["exports"] = filtered - - target_path.write_text(json.dumps(payload, indent=2) + "\n") - return target_path, len(filtered) - - -def main() -> int: - for lane, shape, surface, engine in TARGETS: - target_path, count = build_export(lane, shape, surface, engine) - print(f"wrote {target_path.relative_to(ROOT)} ({count} cells)") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/datasets/isb1/scripts/gmi_analyze_sweep.py b/datasets/isb1/scripts/gmi_analyze_sweep.py deleted file mode 100644 index d0c3465b2..000000000 --- a/datasets/isb1/scripts/gmi_analyze_sweep.py +++ /dev/null @@ -1,250 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import csv -import json -import sqlite3 -import subprocess -import sys -from pathlib import Path -from statistics import median -from typing import Any - -from isb1_results_db import render_table - 
- -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Analyze KV sweep runs from ISB1 SQLite results.") - parser.add_argument("--db-path", required=True, help="Path to SQLite DB (isb1_results.db)") - parser.add_argument("--output-dir", default=".", help="Directory to write summary outputs") - parser.add_argument("--pareto", action="store_true", help="Also run plot_pareto.py") - parser.add_argument( - "--distributions", - action="store_true", - help="Also run analyze_benchmark_distributions.py", - ) - parser.add_argument("--export-file", default=None, help="Export JSON for --distributions") - parser.add_argument("--trace-dir", default=None, help="Trace directory for --distributions") - return parser.parse_args() - - -def _to_float(value: Any) -> float | None: - if value in (None, ""): - return None - try: - return float(value) - except (TypeError, ValueError): - return None - - -def _to_int(value: Any) -> int | None: - if value in (None, ""): - return None - try: - return int(float(value)) - except (TypeError, ValueError): - return None - - -def _extract_concurrency(raw_result_json: str | None) -> int | None: - if not raw_result_json: - return None - try: - payload = json.loads(raw_result_json) - except json.JSONDecodeError: - return None - return _to_int(payload.get("conc") or payload.get("max_concurrency")) - - -def percentile(values: list[float], p: float) -> float | None: - if not values: - return None - ordered = sorted(values) - if len(ordered) == 1: - return ordered[0] - idx = (len(ordered) - 1) * p - lo = int(idx) - hi = min(lo + 1, len(ordered) - 1) - frac = idx - lo - return ordered[lo] * (1 - frac) + ordered[hi] * frac - - -def load_rows(db_path: Path) -> list[dict[str, Any]]: - conn = sqlite3.connect(db_path) - conn.row_factory = sqlite3.Row - rows = conn.execute( - """ - SELECT - id, - offload_mode, - ttft_p50_ms, - ttft_p99_ms, - throughput_tok_s, - preemption_count, - status, - raw_result_json - FROM benchmark_runs - WHERE offload_mode IS NOT NULL - ORDER BY id ASC - """ - ).fetchall() - conn.close() - - normalized: list[dict[str, Any]] = [] - for row in rows: - concurrency = _extract_concurrency(row["raw_result_json"]) - normalized.append( - { - "offload_mode": row["offload_mode"], - "concurrency": concurrency, - "ttft_p50_ms": _to_float(row["ttft_p50_ms"]), - "ttft_p99_ms": _to_float(row["ttft_p99_ms"]), - "throughput_tok_s": _to_float(row["throughput_tok_s"]), - "preemption_count": _to_int(row["preemption_count"]) or 0, - "status": row["status"], - } - ) - return normalized - - -def summarize(rows: list[dict[str, Any]]) -> dict[str, Any]: - grouped: dict[tuple[str, int], list[dict[str, Any]]] = {} - for row in rows: - if row["concurrency"] is None: - continue - key = (row["offload_mode"], row["concurrency"]) - grouped.setdefault(key, []).append(row) - - summary_rows: list[dict[str, Any]] = [] - for (offload_mode, concurrency), items in sorted(grouped.items(), key=lambda x: (x[0][0], x[0][1])): - ttft_p50_values = [x["ttft_p50_ms"] for x in items if x["ttft_p50_ms"] is not None] - ttft_p99_values = [x["ttft_p99_ms"] for x in items if x["ttft_p99_ms"] is not None] - throughput_values = [x["throughput_tok_s"] for x in items if x["throughput_tok_s"] is not None] - preemptions = [x["preemption_count"] for x in items] - success_count = sum(1 for x in items if x["status"] == "success") - - summary_rows.append( - { - "offload_mode": offload_mode, - "concurrency": concurrency, - "runs": len(items), - "success_runs": success_count, - 
"ttft_p50_ms": median(ttft_p50_values) if ttft_p50_values else None, - "ttft_p99_ms": percentile(ttft_p99_values, 0.99), - "throughput_tok_s": median(throughput_values) if throughput_values else None, - "preemptions": int(median(preemptions)) if preemptions else 0, - } - ) - - return { - "total_rows": len(rows), - "grouped_rows": len(summary_rows), - "summary": summary_rows, - } - - -def write_summary_json(output_dir: Path, summary: dict[str, Any]) -> Path: - output_path = output_dir / "sweep_summary.json" - output_path.write_text(json.dumps(summary, indent=2)) - return output_path - - -def write_pareto_csv(output_dir: Path, summary: dict[str, Any]) -> Path: - output_path = output_dir / "pareto_data.csv" - with output_path.open("w", newline="") as handle: - writer = csv.writer(handle) - writer.writerow(["offload_mode", "concurrency", "throughput_tok_s", "ttft_p99_ms"]) - for row in summary["summary"]: - writer.writerow( - [ - row["offload_mode"], - row["concurrency"], - row["throughput_tok_s"], - row["ttft_p99_ms"], - ] - ) - return output_path - - -def print_console_summary(summary: dict[str, Any]) -> None: - headers = [ - "offload_mode", - "concurrency", - "runs", - "success_runs", - "ttft_p50_ms", - "ttft_p99_ms", - "throughput_tok_s", - "preemptions", - ] - rows = [ - [ - row["offload_mode"], - row["concurrency"], - row["runs"], - row["success_runs"], - row["ttft_p50_ms"], - row["ttft_p99_ms"], - row["throughput_tok_s"], - row["preemptions"], - ] - for row in summary["summary"] - ] - - print(f"Total rows: {summary['total_rows']}") - print(f"Grouped rows: {summary['grouped_rows']}") - if rows: - print(render_table(headers, rows)) - else: - print("No sweep rows with offload_mode + concurrency found.") - - -def main() -> int: - args = parse_args() - db_path = Path(args.db_path) - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - rows = load_rows(db_path) - summary = summarize(rows) - summary_path = write_summary_json(output_dir, summary) - pareto_path = write_pareto_csv(output_dir, summary) - - print_console_summary(summary) - print(f"Wrote: {summary_path}") - print(f"Wrote: {pareto_path}") - - script_dir = Path(__file__).resolve().parent - - if args.pareto: - pareto_cmd = [ - sys.executable, - str(script_dir / "plot_pareto.py"), - "--db-path", - str(db_path), - "--output-dir", - str(output_dir), - ] - subprocess.run(pareto_cmd, check=True) - - if args.distributions: - dist_cmd = [ - sys.executable, - str(script_dir / "analyze_benchmark_distributions.py"), - "--output-dir", - str(output_dir), - ] - if args.export_file: - dist_cmd.extend(["--export-file", args.export_file]) - elif args.trace_dir: - dist_cmd.extend(["--trace-dir", args.trace_dir]) - else: - raise SystemExit("--distributions requires --export-file or --trace-dir") - subprocess.run(dist_cmd, check=True) - - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/datasets/isb1/scripts/gmi_full_suite.sh b/datasets/isb1/scripts/gmi_full_suite.sh deleted file mode 100755 index fad23efc1..000000000 --- a/datasets/isb1/scripts/gmi_full_suite.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env bash -set -Eeuo pipefail - -SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -PORTABLE_SCRIPT="$SCRIPT_DIR/gmi_portable_benchmark.sh" - -usage() { - echo "Usage: gmi_full_suite.sh --gpu-type [--db-path ]" -} - -GPU_TYPE="" -DB_PATH="" - -while [[ $# -gt 0 ]]; do - case "$1" in - --gpu-type) - GPU_TYPE="$2" - shift 2 - ;; - --db-path) - DB_PATH="$2" - shift 2 - ;; - --help|-h) 
-        usage
-        exit 0
-        ;;
-    *)
-        echo "Unknown: $1" >&2
-        exit 1
-        ;;
-    esac
-done
-
-[[ -n "$GPU_TYPE" ]] || {
-    usage >&2
-    exit 1
-}
-
-case "$GPU_TYPE" in
-    h100|h200|b200) ;;
-    *)
-        echo "Unsupported --gpu-type: $GPU_TYPE" >&2
-        exit 1
-        ;;
-esac
-
-[[ -x "$PORTABLE_SCRIPT" ]] || {
-    echo "Expected executable helper at $PORTABLE_SCRIPT" >&2
-    exit 1
-}
-
-if [[ -n "$DB_PATH" ]]; then
-    export ISB1_RESULTS_DB_PATH="$DB_PATH"
-fi
-
-PASSED=0
-FAILED=0
-SKIPPED=0
-
-run_combo() {
-    local model="$1"
-    local engine="$2"
-    local band="$3"
-    local workload="${4:-code}"
-
-    echo "========================================="
-    echo ">>> $model × $engine × $band × $workload on $GPU_TYPE"
-    echo "========================================="
-
-    if "$PORTABLE_SCRIPT" \
-        --gpu-type "$GPU_TYPE" \
-        --model "$model" \
-        --engine "$engine" \
-        --context-band "$band" \
-        --workload "$workload"; then
-        ((PASSED++)) || true
-    else
-        echo "FAILED: $model × $engine × $band × $workload" >&2
-        ((FAILED++)) || true
-    fi
-}
-
-# Core 8k — all models × all engines × chat + code
-for model in qwen3.5 gptoss dsr1; do
-    for engine in vllm sglang; do
-        for workload in chat code; do
-            run_combo "$model" "$engine" 8k "$workload"
-        done
-    done
-done
-
-# 131k — all models × all engines × chat + code
-for model in qwen3.5 gptoss dsr1; do
-    for engine in vllm sglang; do
-        for workload in chat code; do
-            run_combo "$model" "$engine" 131k "$workload"
-        done
-    done
-done
-
-# 500k — qwen3.5 + gptoss only (the 500k band exceeds DSR1's 164k max context)
-for model in qwen3.5 gptoss; do
-    for engine in vllm sglang; do
-        for workload in chat code; do
-            run_combo "$model" "$engine" 500k "$workload"
-        done
-    done
-done
-
-# 1m — qwen3.5 only (only model supporting 1M context), b200 only
-if [[ "$GPU_TYPE" == "b200" ]]; then
-    for engine in vllm sglang; do
-        for workload in chat code; do
-            run_combo qwen3.5 "$engine" 1m "$workload"
-        done
-    done
-else
-    SKIPPED=4
-fi
-
-echo
-echo "========================================="
-echo "SUITE COMPLETE: passed=$PASSED failed=$FAILED skipped=$SKIPPED"
-echo "========================================="
-
-if command -v python3 >/dev/null 2>&1; then
-    summary_cmd=(python3 "$SCRIPT_DIR/isb1_results_db.py" summary)
-    if [[ -n "$DB_PATH" ]]; then
-        summary_cmd+=(--db-path "$DB_PATH")
-    fi
-    "${summary_cmd[@]}" 2>/dev/null || true
-fi
-
-[[ "$FAILED" -eq 0 ]]
diff --git a/datasets/isb1/scripts/gmi_kv_sweep.sh b/datasets/isb1/scripts/gmi_kv_sweep.sh
deleted file mode 100644
index e953aba1a..000000000
--- a/datasets/isb1/scripts/gmi_kv_sweep.sh
+++ /dev/null
@@ -1,176 +0,0 @@
-#!/usr/bin/env bash
-set -Eeuo pipefail
-
-SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
-PORTABLE_SCRIPT="$SCRIPT_DIR/gmi_portable_benchmark.sh"
-
-usage() {
-    cat <<'EOF'
-Usage:
-  gmi_kv_sweep.sh \
-    --gpu-type <h100|h200|b200> \
-    --model <qwen3.5|gptoss|dsr1> \
-    --engine <vllm|sglang> \
-    --context-band <8k|32k|64k|131k|500k|1m> \
-    --workload <chat|code> \
-    [--users "2,4,8,16,32,64"] \
-    [--offload-modes "on,off,noprefix"] \
-    [--kv-cache-dtype <dtype>] \
-    [--benchmark-duration-s <seconds>] \
-    [--disable-prefix-caching] \
-    [--total-cpu-dram-gb <gb>] \
-    [--trace-source <isb1|kv_cache_tester|aiperf>] \
-    [--db-path <path>]
-EOF
-}
-
-die() {
-    echo "ERROR: $*" >&2
-    exit 1
-}
-
-trim() {
-    local x="$1"
-    x="${x#${x%%[![:space:]]*}}"
-    x="${x%${x##*[![:space:]]}}"
-    printf '%s' "$x"
-}
-
-GPU_TYPE=""
-MODEL=""
-ENGINE=""
-CONTEXT_BAND=""
-WORKLOAD=""
-USERS="2,4,8,16,32,64"
-OFFLOAD_MODES="on,off,noprefix"
-KV_CACHE_DTYPE=""
-BENCHMARK_DURATION_S="1800"
-DISABLE_PREFIX_CACHING=false
-TOTAL_CPU_DRAM_GB=""
-TRACE_SOURCE="isb1"
-DB_PATH="" - -while [[ $# -gt 0 ]]; do - case "$1" in - --gpu-type) GPU_TYPE="$2"; shift 2 ;; - --model) MODEL="$2"; shift 2 ;; - --engine) ENGINE="$2"; shift 2 ;; - --context-band) CONTEXT_BAND="$2"; shift 2 ;; - --workload) WORKLOAD="$2"; shift 2 ;; - --users) USERS="$2"; shift 2 ;; - --offload-modes) OFFLOAD_MODES="$2"; shift 2 ;; - --kv-cache-dtype) KV_CACHE_DTYPE="$2"; shift 2 ;; - --benchmark-duration-s) BENCHMARK_DURATION_S="$2"; shift 2 ;; - --disable-prefix-caching) DISABLE_PREFIX_CACHING=true; shift ;; - --total-cpu-dram-gb) TOTAL_CPU_DRAM_GB="$2"; shift 2 ;; - --trace-source) TRACE_SOURCE="$2"; shift 2 ;; - --db-path) DB_PATH="$2"; shift 2 ;; - -h|--help) usage; exit 0 ;; - *) die "Unknown argument: $1" ;; - esac -done - -[[ -n "$GPU_TYPE" ]] || die "--gpu-type is required" -[[ -n "$MODEL" ]] || die "--model is required" -[[ -n "$ENGINE" ]] || die "--engine is required" -[[ -n "$CONTEXT_BAND" ]] || die "--context-band is required" -[[ -n "$WORKLOAD" ]] || die "--workload is required" -[[ -x "$PORTABLE_SCRIPT" ]] || die "Expected executable script: $PORTABLE_SCRIPT" - -case "$ENGINE" in - vllm|sglang) ;; - *) die "Unsupported --engine: $ENGINE" ;; -esac - -case "$TRACE_SOURCE" in - isb1|kv_cache_tester|aiperf) ;; - *) die "Unsupported --trace-source: $TRACE_SOURCE" ;; -esac - -IFS=',' read -r -a user_list <<< "$USERS" -IFS=',' read -r -a mode_list <<< "$OFFLOAD_MODES" - -[[ "${#user_list[@]}" -gt 0 ]] || die "--users cannot be empty" -[[ "${#mode_list[@]}" -gt 0 ]] || die "--offload-modes cannot be empty" - -TOTAL=0 -PASSED=0 -FAILED=0 - -for raw_mode in "${mode_list[@]}"; do - mode=$(trim "$raw_mode") - [[ -n "$mode" ]] || continue - - case "$mode" in - on|off|noprefix|legacy) ;; - *) die "Unsupported offload mode in --offload-modes: $mode" ;; - esac - - if [[ "$ENGINE" == "sglang" && "$mode" == "on" ]]; then - echo "Skipping mode=on for SGLang (no native offload support)" - continue - fi - - for raw_users in "${user_list[@]}"; do - users=$(trim "$raw_users") - [[ "$users" =~ ^[0-9]+$ ]] || die "Invalid user concurrency: $users" - - TOTAL=$((TOTAL + 1)) - echo "========================================================" - echo "Run $TOTAL: model=$MODEL engine=$ENGINE users=$users mode=$mode" - echo "========================================================" - - cmd=( - "$PORTABLE_SCRIPT" - --gpu-type "$GPU_TYPE" - --model "$MODEL" - --engine "$ENGINE" - --context-band "$CONTEXT_BAND" - --workload "$WORKLOAD" - --benchmark-type isb1_kv_stress - --benchmark-duration-s "$BENCHMARK_DURATION_S" - --max-concurrency "$users" - --trace-source "$TRACE_SOURCE" - --offload-mode "$mode" - ) - - if [[ -n "$KV_CACHE_DTYPE" ]]; then - cmd+=(--kv-cache-dtype "$KV_CACHE_DTYPE") - fi - if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then - cmd+=(--disable-prefix-caching) - fi - if [[ -n "$TOTAL_CPU_DRAM_GB" ]]; then - cmd+=(--total-cpu-dram-gb "$TOTAL_CPU_DRAM_GB") - fi - if [[ -n "$DB_PATH" ]]; then - if ISB1_RESULTS_DB_PATH="$DB_PATH" "${cmd[@]}"; then - PASSED=$((PASSED + 1)) - echo "PASS users=$users mode=$mode" - else - FAILED=$((FAILED + 1)) - echo "FAIL users=$users mode=$mode" >&2 - fi - else - if "${cmd[@]}"; then - PASSED=$((PASSED + 1)) - echo "PASS users=$users mode=$mode" - else - FAILED=$((FAILED + 1)) - echo "FAIL users=$users mode=$mode" >&2 - fi - fi - done -done - -echo -echo "KV sweep complete" -echo " total: $TOTAL" -echo " passed: $PASSED" -echo " failed: $FAILED" - -if [[ -n "$DB_PATH" && -f "$DB_PATH" ]]; then - echo " db: $DB_PATH" -fi - -[[ "$FAILED" -eq 0 ]] diff --git 
a/datasets/isb1/scripts/gmi_portable_benchmark.sh b/datasets/isb1/scripts/gmi_portable_benchmark.sh
deleted file mode 100755
index f41722e36..000000000
--- a/datasets/isb1/scripts/gmi_portable_benchmark.sh
+++ /dev/null
@@ -1,1019 +0,0 @@
-#!/usr/bin/env bash
-set -Eeuo pipefail
-
-usage() {
-    cat <<'EOF'
-Usage:
-  gmi_portable_benchmark.sh \
-    --gpu-type <h100|h200|b200> \
-    --model <qwen3.5|gptoss|dsr1> \
-    --engine <vllm|sglang> \
-    --context-band <8k|32k|64k|131k|500k|1m> \
-    --workload <chat|code> \
-    [--benchmark-type <isb1_replay|isb1_kv_stress>] \
-    [--offload-mode <on|off|noprefix|legacy>] \
-    [--kv-cache-dtype <auto|fp8>] \
-    [--disable-prefix-caching] \
-    [--total-cpu-dram-gb <gb>] \
-    [--benchmark-duration-s <seconds>] \
-    [--max-concurrency <n>] \
-    [--trace-source <isb1|kv_cache_tester|aiperf>]
-
-Required environment:
-  HF_TOKEN or HUGGING_FACE_HUB_TOKEN   Hugging Face token for model access
-
-Optional environment:
-  PORT                    API port (default: 8000)
-  TP                      Tensor parallelism (default: 8)
-  HEALTH_TIMEOUT_S        Readiness timeout in seconds (default: 1800)
-  HEALTH_POLL_INTERVAL_S  Readiness poll interval (default: 10)
-  BENCHMARK_OUTPUT_ROOT   Output root (default: <repo>/datasets/isb1/results/gmi)
-  GMI_RUN_LABEL           Optional suffix added to result names
-EOF
-}
-
-die() {
-    echo "ERROR: $*" >&2
-    exit 1
-}
-
-require_cmd() {
-    command -v "$1" >/dev/null 2>&1 || die "Missing required command: $1"
-}
-
-SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
-REPO_ROOT=$(cd "$SCRIPT_DIR/../../.." && pwd)
-source "$REPO_ROOT/benchmarks/benchmark_lib.sh"
-PORT=${PORT:-8000}
-TP=${TP:-8}
-HEALTH_TIMEOUT_S=${HEALTH_TIMEOUT_S:-1800}
-HEALTH_POLL_INTERVAL_S=${HEALTH_POLL_INTERVAL_S:-10}
-BENCHMARK_OUTPUT_ROOT=${BENCHMARK_OUTPUT_ROOT:-"$REPO_ROOT/datasets/isb1/results/gmi"}
-REQUEST_MODE=multi-turn
-HARNESS_REQUEST_MODE=auto
-IGNORE_WAITS=true
-
-GPU_TYPE=""
-MODEL_KEY=""
-ENGINE=""
-CONTEXT_BAND=""
-WORKLOAD=""
-BENCHMARK_TYPE="isb1_replay"
-OFFLOAD_MODE=""
-KV_CACHE_DTYPE=""
-DISABLE_PREFIX_CACHING=false
-TOTAL_CPU_DRAM_GB=""
-BENCHMARK_DURATION_S=""
-MAX_CONCURRENCY_OVERRIDE=""
-TRACE_SOURCE="isb1"
-
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --gpu-type)
-            GPU_TYPE="$2"
-            shift 2
-            ;;
-        --model)
-            MODEL_KEY="$2"
-            shift 2
-            ;;
-        --engine)
-            ENGINE="$2"
-            shift 2
-            ;;
-        --context-band)
-            CONTEXT_BAND="$2"
-            shift 2
-            ;;
-        --workload)
-            WORKLOAD="$2"
-            shift 2
-            ;;
-        --benchmark-type)
-            BENCHMARK_TYPE="$2"
-            shift 2
-            ;;
-        --offload-mode)
-            OFFLOAD_MODE="$2"
-            shift 2
-            ;;
-        --kv-cache-dtype)
-            KV_CACHE_DTYPE="$2"
-            shift 2
-            ;;
-        --disable-prefix-caching)
-            DISABLE_PREFIX_CACHING=true
-            shift
-            ;;
-        --total-cpu-dram-gb)
-            TOTAL_CPU_DRAM_GB="$2"
-            shift 2
-            ;;
-        --benchmark-duration-s)
-            BENCHMARK_DURATION_S="$2"
-            shift 2
-            ;;
-        --max-concurrency)
-            MAX_CONCURRENCY_OVERRIDE="$2"
-            shift 2
-            ;;
-        --trace-source)
-            TRACE_SOURCE="$2"
-            shift 2
-            ;;
-        -h|--help)
-            usage
-            exit 0
-            ;;
-        *)
-            die "Unknown argument: $1"
-            ;;
-    esac
-done
-
-[[ -n "$GPU_TYPE" ]] || die "--gpu-type is required"
-[[ -n "$MODEL_KEY" ]] || die "--model is required"
-[[ -n "$ENGINE" ]] || die "--engine is required"
-[[ -n "$CONTEXT_BAND" ]] || die "--context-band is required"
-[[ -n "$WORKLOAD" ]] || die "--workload is required"
-
-case "$GPU_TYPE" in
-    h100|h200|b200) ;;
-    *) die "Unsupported --gpu-type: $GPU_TYPE" ;;
-esac
-
-case "$ENGINE" in
-    vllm|sglang) ;;
-    *) die "Unsupported --engine: $ENGINE" ;;
-esac
-
-case "$CONTEXT_BAND" in
-    8k|32k|64k|131k|500k|1m) ;;
-    *) die "Unsupported --context-band: $CONTEXT_BAND" ;;
-esac
-
-case "$WORKLOAD" in
-    chat|code) ;;
-    *) die "Unsupported --workload: $WORKLOAD (must be chat or code)" ;;
-esac
-
-case "$BENCHMARK_TYPE" in
-    isb1_replay|isb1_kv_stress)
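-        # Both types run through the same replay harness; isb1_kv_stress
-        # additionally tags results with campaign_class=kv_stress so the sweep
-        # tooling can separate stress campaigns from plain replays.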
;; - *) die "Unsupported --benchmark-type: $BENCHMARK_TYPE" ;; -esac - -case "$TRACE_SOURCE" in - isb1|kv_cache_tester|aiperf) ;; - *) die "Unsupported --trace-source: $TRACE_SOURCE" ;; -esac - -case "${OFFLOAD_MODE:-}" in - ""|on|off|noprefix|legacy) ;; - *) die "Unsupported --offload-mode: $OFFLOAD_MODE" ;; -esac - -case "${KV_CACHE_DTYPE:-}" in - ""|auto|fp8) ;; - *) die "Unsupported --kv-cache-dtype: $KV_CACHE_DTYPE" ;; -esac - -if [[ -n "$TOTAL_CPU_DRAM_GB" ]] && ! [[ "$TOTAL_CPU_DRAM_GB" =~ ^[0-9]+([.][0-9]+)?$ ]]; then - die "--total-cpu-dram-gb must be numeric" -fi -if [[ -n "$MAX_CONCURRENCY_OVERRIDE" ]] && ! [[ "$MAX_CONCURRENCY_OVERRIDE" =~ ^[0-9]+$ ]]; then - die "--max-concurrency must be a positive integer" -fi -if [[ -n "$BENCHMARK_DURATION_S" ]] && ! [[ "$BENCHMARK_DURATION_S" =~ ^[0-9]+([.][0-9]+)?$ ]]; then - die "--benchmark-duration-s must be numeric" -fi - -require_cmd docker -require_cmd curl -require_cmd python3 -require_cmd nvidia-smi - -HF_TOKEN_VALUE=${HF_TOKEN:-${HUGGING_FACE_HUB_TOKEN:-}} -[[ -n "$HF_TOKEN_VALUE" ]] || die "Set HF_TOKEN or HUGGING_FACE_HUB_TOKEN before running" - -if [[ -z "$TOTAL_CPU_DRAM_GB" ]]; then - if [[ -r /proc/meminfo ]]; then - TOTAL_CPU_DRAM_GB=$(awk '/MemTotal:/ {printf "%.0f", $2/1048576}' /proc/meminfo) - else - TOTAL_CPU_DRAM_GB=0 - fi -fi - -case "$MODEL_KEY" in - qwen3.5) - MODEL_HF_ID="Qwen/Qwen3.5-397B-A17B-FP8" - MODEL_PREFIX="qwen3.5" - CANONICAL_MODEL_ID="qwen3_5_397b_a17b" - PRECISION="fp8" - ;; - gptoss) - MODEL_HF_ID="openai/gpt-oss-120b" - MODEL_PREFIX="gptoss" - CANONICAL_MODEL_ID="gpt_oss_120b" - PRECISION="fp4" - ;; - dsr1) - MODEL_HF_ID="deepseek-ai/DeepSeek-R1-0528" - MODEL_PREFIX="dsr1" - CANONICAL_MODEL_ID="deepseek_r1_0528" - PRECISION="fp8" - ;; - *) - die "Unsupported --model: $MODEL_KEY" - ;; -esac - -case "$GPU_TYPE" in - b200) - HARDWARE_PROFILE_ID="nvidia:b200_sxm_180gb" - RUNNER_TYPE="b200-gmi-baremetal" - ;; - h100) - HARDWARE_PROFILE_ID="nvidia:h100_sxm_80gb" - RUNNER_TYPE="h100-gmi-baremetal" - ;; - h200) - HARDWARE_PROFILE_ID="nvidia:h200_sxm_141gb" - RUNNER_TYPE="h200-gmi-baremetal" - ;; -esac - -case "$ENGINE" in - vllm) - RUNTIME_STACK_ID="standalone:vllm" - if [[ "$GPU_TYPE" == "b200" ]]; then - IMAGE="vllm/vllm-openai:v0.19.0-cu130" - else - IMAGE="vllm/vllm-openai:v0.18.0" - fi - ;; - sglang) - RUNTIME_STACK_ID="standalone:sglang" - IMAGE="lmsysorg/sglang:v0.5.9-cu130" - ;; -esac - -case "$CONTEXT_BAND" in - 8k) - MAX_MODEL_LEN=10240 - MAX_CONCURRENCY=4 - NUM_WARMUP_SESSIONS=1 - MAX_SESSIONS="" - MAX_TURNS_PER_SESSION="" - MAX_NUM_BATCHED_TOKENS=8192 - MAX_ACTIVE_REQUESTS=128 - ;; - 32k) - MAX_MODEL_LEN=33792 - MAX_CONCURRENCY=4 - NUM_WARMUP_SESSIONS=1 - MAX_SESSIONS="" - MAX_TURNS_PER_SESSION="" - MAX_NUM_BATCHED_TOKENS=8192 - MAX_ACTIVE_REQUESTS=64 - ;; - 64k) - MAX_MODEL_LEN=66560 - MAX_CONCURRENCY=4 - NUM_WARMUP_SESSIONS=1 - MAX_SESSIONS="" - MAX_TURNS_PER_SESSION="" - MAX_NUM_BATCHED_TOKENS=4096 - MAX_ACTIVE_REQUESTS=64 - ;; - 131k) - MAX_MODEL_LEN=132296 - MAX_CONCURRENCY=2 - NUM_WARMUP_SESSIONS=1 - MAX_SESSIONS="" - MAX_TURNS_PER_SESSION="" - MAX_NUM_BATCHED_TOKENS=2048 - MAX_ACTIVE_REQUESTS=32 - ;; - 500k) - MAX_MODEL_LEN=524288 - MAX_CONCURRENCY=1 - NUM_WARMUP_SESSIONS=0 - MAX_SESSIONS=2 - MAX_TURNS_PER_SESSION=4 - MAX_NUM_BATCHED_TOKENS=1024 - MAX_ACTIVE_REQUESTS=8 - ;; - 1m) - MAX_MODEL_LEN=1048576 - MAX_CONCURRENCY=1 - NUM_WARMUP_SESSIONS=0 - MAX_SESSIONS=1 - MAX_TURNS_PER_SESSION=3 - MAX_NUM_BATCHED_TOKENS=1024 - MAX_ACTIVE_REQUESTS=4 - ;; -esac - -if [[ -n 
"$MAX_CONCURRENCY_OVERRIDE" ]]; then - MAX_CONCURRENCY="$MAX_CONCURRENCY_OVERRIDE" -fi - -select_export_file() { - case "$MODEL_KEY:$CONTEXT_BAND:$ENGINE:$WORKLOAD" in - # ── Chat exports (committed at 8k–131k) ────────────────────── - qwen3.5:8k:*:chat) - printf 'datasets/isb1/exports/core/%s/chat_8k1k_qwen3.5.json\n' "$ENGINE" - ;; - qwen3.5:32k:*:chat) - printf 'datasets/isb1/exports/extension_32k/%s/chat_32k1k_qwen3.5.json\n' "$ENGINE" - ;; - qwen3.5:64k:*:chat) - printf 'datasets/isb1/exports/extension_64k/%s/chat_64k1k_qwen3.5.json\n' "$ENGINE" - ;; - *:8k:*:chat) - printf 'datasets/isb1/exports/core/%s/chat_8k1k.json\n' "$ENGINE" - ;; - *:32k:*:chat) - printf 'datasets/isb1/exports/extension_32k/%s/chat_32k1k.json\n' "$ENGINE" - ;; - *:64k:*:chat) - printf 'datasets/isb1/exports/extension_64k/%s/chat_64k1k.json\n' "$ENGINE" - ;; - gptoss:131k:*:chat) - printf 'datasets/isb1/exports/extension_131k/%s/chat_131k1k.json\n' "$ENGINE" - ;; - qwen3.5:131k:*:chat) - printf 'datasets/isb1/exports/extension_131k/%s/chat_131k1k_qwen3.5.json\n' "$ENGINE" - ;; - dsr1:131k:*:chat) - printf 'datasets/isb1/exports/extension_131k/%s/chat_131k1k_dsr1.json\n' "$ENGINE" - ;; - gptoss:500k:*:chat) - printf 'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" - ;; - qwen3.5:500k:*:chat) - printf 'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" - ;; - # dsr1:500k:chat — model max 164k, exceeds capability - qwen3.5:1m:*:chat) - printf 'datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1__%s.json\n' "$ENGINE" - ;; - # dsr1:1m:chat, gptoss:1m:chat — models don't support 1M context - - # ── Code exports ────────────────────────────────────────────── - qwen3.5:8k:*:code) - printf 'datasets/isb1/exports/core/%s/code_8k1k_qwen3.5.json\n' "$ENGINE" - ;; - qwen3.5:32k:*:code) - printf 'datasets/isb1/exports/extension_32k/%s/code_32k1k_qwen3.5.json\n' "$ENGINE" - ;; - qwen3.5:64k:*:code) - printf 'datasets/isb1/exports/extension_64k/%s/code_64k1k_qwen3.5.json\n' "$ENGINE" - ;; - qwen3.5:131k:*:code) - printf 'datasets/isb1/exports/extension_131k/%s/code_131k1k_qwen3.5.json\n' "$ENGINE" - ;; - qwen3.5:500k:*:code) - printf 'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" - ;; - qwen3.5:1m:*:code) - printf 'datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__%s.json\n' "$ENGINE" - ;; - gptoss:8k:*:code) - printf 'datasets/isb1/exports/core/%s/code_8k1k.json\n' "$ENGINE" - ;; - gptoss:32k:*:code) - printf 'datasets/isb1/exports/extension_32k/%s/code_32k1k.json\n' "$ENGINE" - ;; - gptoss:64k:*:code) - printf 'datasets/isb1/exports/extension_64k/%s/code_64k1k.json\n' "$ENGINE" - ;; - gptoss:131k:*:code) - printf 'datasets/isb1/exports/extension_131k/%s/code_131k1k.json\n' "$ENGINE" - ;; - gptoss:500k:*:code) - printf 'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" - ;; - # gptoss:1m — GPT-OSS max_position_embeddings=131072; 1M exceeds model capability - dsr1:8k:*:code) - printf 'datasets/isb1/exports/core/%s/code_8k1k.json\n' "$ENGINE" - ;; - dsr1:32k:*:code) - printf 'datasets/isb1/exports/extension_32k/%s/code_32k1k.json\n' "$ENGINE" - ;; - dsr1:64k:*:code) - printf 
'datasets/isb1/exports/extension_64k/%s/code_64k1k.json\n' "$ENGINE" - ;; - dsr1:131k:*:code) - printf 'datasets/isb1/exports/extension_131k/%s/code_131k1k.json\n' "$ENGINE" - ;; - # dsr1:500k/1m — DeepSeek R1 max_position_embeddings=163840; 500k/1M exceed model capability - *) - return 1 - ;; - esac -} - -TRACE_DIR="" -TRACE_REPLAY_SUMMARY_JSON="" -if [[ "$TRACE_SOURCE" == "isb1" ]]; then - EXPORT_FILE=$(select_export_file) || die "No committed ISB1 export for model=$MODEL_KEY engine=$ENGINE context=$CONTEXT_BAND workload=$WORKLOAD" - EXPORT_PATH="$REPO_ROOT/$EXPORT_FILE" - [[ -f "$EXPORT_PATH" ]] || die "Export file not found: $EXPORT_FILE" - - readarray -t EXPORT_METADATA < <( - python3 - "$EXPORT_PATH" "$RUNTIME_STACK_ID" "$HARDWARE_PROFILE_ID" "$CANONICAL_MODEL_ID" <<'PY' -import json -import sys -from pathlib import Path - -export_path = Path(sys.argv[1]) -runtime_stack_id = sys.argv[2] -hardware_profile_id = sys.argv[3] -canonical_model_id = sys.argv[4] -payload = json.loads(export_path.read_text()) -matches = [ - cell - for cell in payload.get("exports", []) - if cell.get("runtime_stack_id") == runtime_stack_id - and cell.get("hardware_profile_id") == hardware_profile_id - and cell.get("canonical_model_id") == canonical_model_id -] -if not matches: - raise SystemExit( - f"No matching export cells for runtime={runtime_stack_id} hardware={hardware_profile_id} model={canonical_model_id}" - ) -support_statuses = sorted({cell.get("support_status") for cell in matches if cell.get("support_status")}) -cert_statuses = sorted( - {cell.get("benchmark_certification_status") for cell in matches if cell.get("benchmark_certification_status")} -) -trace_ids = sorted({cell.get("trace_id") for cell in matches if cell.get("trace_id")}) -if len(support_statuses) > 1: - raise SystemExit(f"Ambiguous support statuses: {support_statuses}") -if len(cert_statuses) > 1: - raise SystemExit(f"Ambiguous certification statuses: {cert_statuses}") -print(support_statuses[0] if support_statuses else "") -print(cert_statuses[0] if cert_statuses else "") -print(",".join(trace_ids)) -print(len(matches)) -PY - ) - - SUPPORT_STATUS=${EXPORT_METADATA[0]} - BENCHMARK_CERTIFICATION_STATUS=${EXPORT_METADATA[1]} - TRACE_IDS=${EXPORT_METADATA[2]} - MATCHED_CELL_COUNT=${EXPORT_METADATA[3]} -else - SUPPORT_STATUS=${SUPPORT_STATUS:-reviewed_preview} - BENCHMARK_CERTIFICATION_STATUS=${BENCHMARK_CERTIFICATION_STATUS:-dataset_replay_verified} - TRACE_IDS="$TRACE_SOURCE" - MATCHED_CELL_COUNT="n/a" - if [[ "$TRACE_SOURCE" == "kv_cache_tester" ]]; then - TRACE_DIR=${TRACE_DIR:-"$REPO_ROOT/experimental/multiturn/vllm_benchmark/kv-cache-tester/traces"} - EXPORT_FILE="experimental/multiturn/vllm_benchmark/trace_source_kv_cache_tester.json" - else - TRACE_DIR=${TRACE_DIR:-"$REPO_ROOT/experimental/multiturn/vllm_benchmark/aiperf_traces"} - EXPORT_FILE="experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json" - fi - EXPORT_PATH="$REPO_ROOT/$EXPORT_FILE" -fi - -case "$ENGINE" in - vllm) - VLLM_CPU_OFFLOAD_GB="" - VLLM_SWAP_SPACE_GB="" - if [[ "$CONTEXT_BAND" == "500k" ]]; then - VLLM_CPU_OFFLOAD_GB=40 - VLLM_SWAP_SPACE_GB=32 - elif [[ "$CONTEXT_BAND" == "1m" ]]; then - VLLM_CPU_OFFLOAD_GB=80 - VLLM_SWAP_SPACE_GB=64 - fi - case "$CONTEXT_BAND" in - 8k|32k) VLLM_MAX_NUM_SEQS=128 ;; - 64k) VLLM_MAX_NUM_SEQS=64 ;; - 131k) VLLM_MAX_NUM_SEQS=32 ;; - 500k) VLLM_MAX_NUM_SEQS=8 ;; - 1m) VLLM_MAX_NUM_SEQS=4 ;; - esac - ;; - sglang) - case "$GPU_TYPE" in - h100) - SGLANG_MEM_FRACTION_STATIC=0.80 - 
SGLANG_CHUNKED_PREFILL_SIZE=8192 - ;; - h200) - SGLANG_MEM_FRACTION_STATIC=0.82 - SGLANG_CHUNKED_PREFILL_SIZE=16384 - ;; - b200) - SGLANG_MEM_FRACTION_STATIC=0.85 - SGLANG_CHUNKED_PREFILL_SIZE=32768 - ;; - esac - if [[ "$CONTEXT_BAND" == "500k" || "$CONTEXT_BAND" == "1m" ]]; then - SGLANG_MEM_FRACTION_STATIC=0.85 - SGLANG_CHUNKED_PREFILL_SIZE=8192 - fi - ;; -esac - -DATE_STAMP=$(date +%Y%m%d-%H%M%S) -SAFE_CONTEXT=${CONTEXT_BAND//[^[:alnum:]]/_} -SAFE_MODEL=${MODEL_KEY//[^[:alnum:]._-]/_} -SAFE_ENGINE=${ENGINE//[^[:alnum:]._-]/_} -SAFE_GPU=${GPU_TYPE//[^[:alnum:]._-]/_} -SAFE_WORKLOAD=${WORKLOAD//[^[:alnum:]._-]/_} -RUN_LABEL=${GMI_RUN_LABEL:-} -if [[ -n "$RUN_LABEL" ]]; then - RUN_LABEL="-${RUN_LABEL//[^[:alnum:]._-]/_}" -fi -RESULT_STEM="gmi-${SAFE_GPU}-${SAFE_MODEL}-${SAFE_ENGINE}-${SAFE_WORKLOAD}-${SAFE_CONTEXT}-${DATE_STAMP}${RUN_LABEL}" -RUN_DIR="$BENCHMARK_OUTPUT_ROOT/$RESULT_STEM" -SERVER_LOG="$RUN_DIR/server.log" -SUMMARY_JSON="$RUN_DIR/agg_${RESULT_STEM}.json" -TRACE_REPLAY_SUMMARY_JSON="$RUN_DIR/trace_replay_summary.json" -GPU_PROFILE_CSV="$RUN_DIR/${RESULT_STEM}_gpu_profile.csv" -GPU_PROFILER_PID="" -GPU_MEM_PEAK=0 -GPU_MEM_AVG=0 -GPU_UTIL_AVG=0 -mkdir -p "$RUN_DIR" -mkdir -p "$HOME/.cache/huggingface" - -CONTAINER_NAME="isb1-${RESULT_STEM}" -LOG_TAIL_PID="" -CONTAINER_ID="" -ISB1_RESULTS_DB_PATH=${ISB1_RESULTS_DB_PATH:-} - -stop_gpu_profiler() { - if [[ -n "$GPU_PROFILER_PID" ]]; then - kill "$GPU_PROFILER_PID" >/dev/null 2>&1 || true - wait "$GPU_PROFILER_PID" >/dev/null 2>&1 || true - GPU_PROFILER_PID="" - fi -} - -cleanup() { - local exit_code=$? - set +e - stop_gpu_profiler - if [[ -n "$LOG_TAIL_PID" ]]; then - kill "$LOG_TAIL_PID" >/dev/null 2>&1 || true - fi - if [[ -n "$CONTAINER_NAME" ]]; then - docker rm -f "$CONTAINER_NAME" >/dev/null 2>&1 || true - fi - exit $exit_code -} -trap cleanup EXIT - -launch_server() { - # Apply YaRN for Qwen long-context - apply_yarn_config_if_needed "$MODEL_HF_ID" "$MAX_MODEL_LEN" 2>/dev/null || true - - local docker_cmd=() - docker_cmd=( - docker run -d --rm - --name "$CONTAINER_NAME" - --gpus all - --ipc host - --network host - --shm-size 16g - -e HF_TOKEN="$HF_TOKEN_VALUE" - -e HUGGING_FACE_HUB_TOKEN="$HF_TOKEN_VALUE" - -e NVIDIA_VISIBLE_DEVICES=all - -e PYTHONUNBUFFERED=1 - -v "$HOME/.cache/huggingface:/root/.cache/huggingface" - -v "$REPO_ROOT:/workspace" - -w /workspace - ) - - if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then - docker_cmd+=(-e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1) - docker_cmd+=(-e SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1) - fi - - if [[ "$ENGINE" == "vllm" ]]; then - local cmd=( - vllm serve "$MODEL_HF_ID" - --host 0.0.0.0 - --port "$PORT" - --tensor-parallel-size "$TP" - --gpu-memory-utilization 0.90 - --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" - --max-model-len "$MAX_MODEL_LEN" - --max-num-seqs "$VLLM_MAX_NUM_SEQS" - --disable-log-requests - --trust-remote-code - ) - - case "${OFFLOAD_MODE:-}" in - on) - cmd+=( - --kv_offloading_backend native - --kv_offloading_size "$TOTAL_CPU_DRAM_GB" - --disable-hybrid-kv-cache-manager - ) - ;; - off) - ;; - noprefix) - cmd+=(--no-enable-prefix-caching) - ;; - legacy|"") - if [[ -n "$VLLM_CPU_OFFLOAD_GB" ]]; then - cmd+=(--cpu-offload-gb "$VLLM_CPU_OFFLOAD_GB") - fi - if [[ -n "$VLLM_SWAP_SPACE_GB" ]]; then - cmd+=(--swap-space "$VLLM_SWAP_SPACE_GB") - fi - ;; - esac - - if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then - cmd+=(--no-enable-prefix-caching) - fi - - if [[ "${KV_CACHE_DTYPE:-}" == "fp8" ]]; then - cmd+=(--kv-cache-dtype fp8) - fi - - if [[ -n 
"${YARN_OVERRIDE_JSON:-}" ]]; then - cmd+=(--hf-overrides "$YARN_OVERRIDE_JSON") - fi - - CONTAINER_ID=$("${docker_cmd[@]}" "$IMAGE" bash -lc "$(printf '%q ' "${cmd[@]}")") - else - local cmd=( - python3 -m sglang.launch_server - --model-path "$MODEL_HF_ID" - --host 0.0.0.0 - --port "$PORT" - --trust-remote-code - --tensor-parallel-size "$TP" - --data-parallel-size 1 - --context-length "$MAX_MODEL_LEN" - --max-running-requests "$MAX_ACTIVE_REQUESTS" - --cuda-graph-max-bs "$MAX_ACTIVE_REQUESTS" - --chunked-prefill-size "$SGLANG_CHUNKED_PREFILL_SIZE" - --max-prefill-tokens "$SGLANG_CHUNKED_PREFILL_SIZE" - --mem-fraction-static "$SGLANG_MEM_FRACTION_STATIC" - --attention-backend flashinfer - --stream-interval 10 - --decode-log-interval 1 - ) - - case "${OFFLOAD_MODE:-}" in - on) - echo "WARNING: OFFLOAD_MODE=on is not supported for SGLang; continuing without native offload" >&2 - ;; - noprefix) - cmd+=(--disable-radix-cache) - ;; - off|legacy|"") - ;; - esac - - if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then - cmd+=(--disable-radix-cache) - fi - - if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then - cmd+=(--json-model-override-args "$YARN_OVERRIDE_JSON") - fi - - CONTAINER_ID=$("${docker_cmd[@]}" "$IMAGE" bash -lc "$(printf '%q ' "${cmd[@]}")") - fi - - [[ -n "$CONTAINER_ID" ]] || die "Failed to start Docker container" - docker logs -f "$CONTAINER_NAME" > "$SERVER_LOG" 2>&1 & - LOG_TAIL_PID=$! -} - -wait_for_server_ready() { - local deadline=$((SECONDS + HEALTH_TIMEOUT_S)) - until curl --output /dev/null --silent --fail "http://127.0.0.1:${PORT}/health"; do - if ! docker ps --format '{{.Names}}' | grep -Fxq "$CONTAINER_NAME"; then - echo "Container exited before becoming healthy. Recent logs:" >&2 - docker logs "$CONTAINER_NAME" >&2 || true - return 1 - fi - if (( SECONDS >= deadline )); then - echo "Timed out waiting for http://127.0.0.1:${PORT}/health" >&2 - docker logs "$CONTAINER_NAME" | tail -n 200 >&2 || true - return 1 - fi - sleep "$HEALTH_POLL_INTERVAL_S" - done -} - -echo "==> GMI portable benchmark" -echo "repo: $REPO_ROOT" -echo "gpu-type: $GPU_TYPE" -echo "model: $MODEL_KEY ($MODEL_HF_ID)" -echo "engine: $ENGINE" -echo "context-band: $CONTEXT_BAND" -echo "workload: $WORKLOAD" -echo "benchmark-type: $BENCHMARK_TYPE" -echo "trace-source: $TRACE_SOURCE" -echo "max-concurrency: $MAX_CONCURRENCY" -echo "max-model-len: $MAX_MODEL_LEN" -echo "docker image: $IMAGE" -echo "export-file: $EXPORT_FILE" -if [[ "$TRACE_SOURCE" != "isb1" ]]; then - echo "trace-dir: $TRACE_DIR" -fi -echo "runtime-stack-id: $RUNTIME_STACK_ID" -echo "hardware-profile-id: $HARDWARE_PROFILE_ID" -echo "canonical-model-id: $CANONICAL_MODEL_ID" -echo "support-status: ${SUPPORT_STATUS:-}" -echo "certification: ${BENCHMARK_CERTIFICATION_STATUS:-}" -echo "matched export cells: $MATCHED_CELL_COUNT" -echo "trace-ids: ${TRACE_IDS:-}" -echo "output dir: $RUN_DIR" -echo "offload-mode: ${OFFLOAD_MODE:-legacy}" -echo "kv-cache-dtype: ${KV_CACHE_DTYPE:-auto}" -echo "disable-prefix-cache: $DISABLE_PREFIX_CACHING" -echo "total-cpu-dram-gb: $TOTAL_CPU_DRAM_GB" -if [[ "$ENGINE" == "vllm" ]]; then - echo "vllm cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB:-0}" - echo "vllm swap-space-gb: ${VLLM_SWAP_SPACE_GB:-0}" -else - echo "sglang mem fraction: $SGLANG_MEM_FRACTION_STATIC" - echo "sglang chunked pf: $SGLANG_CHUNKED_PREFILL_SIZE" -fi - -"$SCRIPT_DIR/gpu_profile_collector.sh" --output "$GPU_PROFILE_CSV" --interval 2 & -GPU_PROFILER_PID=$! 
- -launch_server -wait_for_server_ready - -if [[ "$TRACE_SOURCE" == "isb1" ]]; then - echo "==> Server is healthy; starting export replay" - - benchmark_cmd=( - python3 "$REPO_ROOT/utils/bench_serving/benchmark_export_replay.py" - --model "$MODEL_HF_ID" - --base-url "http://127.0.0.1:${PORT}" - --export-file "$EXPORT_PATH" - --request-mode "$HARNESS_REQUEST_MODE" - --max-concurrency "$MAX_CONCURRENCY" - --num-warmup-sessions "$NUM_WARMUP_SESSIONS" - --save-result - --result-dir "$RUN_DIR" - --result-filename "$RESULT_STEM.json" - --runtime-stack-id "$RUNTIME_STACK_ID" - --hardware-profile-id "$HARDWARE_PROFILE_ID" - --canonical-model-id "$CANONICAL_MODEL_ID" - --metadata "benchmark_type=$BENCHMARK_TYPE" - --metadata "export_file=$EXPORT_FILE" - --metadata "runtime_stack_id=$RUNTIME_STACK_ID" - --metadata "hardware_profile_id=$HARDWARE_PROFILE_ID" - --metadata "canonical_model_id=$CANONICAL_MODEL_ID" - --metadata "request_mode=$REQUEST_MODE" - --metadata "gmi_gpu_type=$GPU_TYPE" - --metadata "gmi_engine=$ENGINE" - --metadata "gmi_context_band=$CONTEXT_BAND" - --metadata "gmi_workload=$WORKLOAD" - --trust-remote-code - ) - if [[ -n "$BENCHMARK_DURATION_S" ]]; then - benchmark_cmd+=(--metadata "benchmark_duration_s=$BENCHMARK_DURATION_S") - fi - if [[ "$BENCHMARK_TYPE" == "isb1_kv_stress" ]]; then - benchmark_cmd+=(--metadata "campaign_class=kv_stress") - fi - if [[ -n "$SUPPORT_STATUS" ]]; then - benchmark_cmd+=(--support-status "$SUPPORT_STATUS") - fi - if [[ -n "$MAX_SESSIONS" ]]; then - benchmark_cmd+=(--max-sessions "$MAX_SESSIONS") - fi - if [[ -n "$MAX_TURNS_PER_SESSION" ]]; then - benchmark_cmd+=(--max-turns-per-session "$MAX_TURNS_PER_SESSION") - fi - if [[ "$IGNORE_WAITS" == "true" ]]; then - benchmark_cmd+=(--ignore-waits) - fi - if [[ "$ENGINE" == "vllm" ]]; then - if [[ -n "$VLLM_CPU_OFFLOAD_GB" ]]; then - benchmark_cmd+=(--metadata "vllm_cpu_offload_gb=$VLLM_CPU_OFFLOAD_GB") - fi - if [[ -n "$VLLM_SWAP_SPACE_GB" ]]; then - benchmark_cmd+=(--metadata "vllm_swap_space_gb=$VLLM_SWAP_SPACE_GB") - fi - else - benchmark_cmd+=(--metadata "sglang_mem_fraction_override=$SGLANG_MEM_FRACTION_STATIC") - benchmark_cmd+=(--metadata "sglang_chunked_prefill_override=$SGLANG_CHUNKED_PREFILL_SIZE") - fi - - "${benchmark_cmd[@]}" -else - echo "==> Server is healthy; starting trace replay ($TRACE_SOURCE)" - - trace_cmd=( - python3 "$REPO_ROOT/experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py" - --api-endpoint "http://localhost:$PORT" - --trace-directory "$TRACE_DIR" - --output-dir "$RUN_DIR" - --start-users "$MAX_CONCURRENCY" - --max-users "$MAX_CONCURRENCY" - --test-duration "${BENCHMARK_DURATION_S:-1800}" - --seed 42 - --no-color - ) - - "${trace_cmd[@]}" - - python3 "$SCRIPT_DIR/adapt_trace_replay_result.py" \ - --input-dir "$RUN_DIR" \ - --detailed-csv detailed_results.csv \ - --summary-json "$TRACE_REPLAY_SUMMARY_JSON" \ - --output-json "$RUN_DIR/${RESULT_STEM}.json" \ - --model-id "$MODEL_HF_ID" \ - --max-concurrency "$MAX_CONCURRENCY" \ - --request-mode "$REQUEST_MODE" \ - --support-status "$SUPPORT_STATUS" \ - --benchmark-certification-status "$BENCHMARK_CERTIFICATION_STATUS" \ - --result-stem "$RESULT_STEM" -fi - -echo "==> Processing ISB1 result" -( - cd "$RUN_DIR" - export RUNNER_TYPE="$RUNNER_TYPE" - export FRAMEWORK="$ENGINE" - export PRECISION="$PRECISION" - export RESULT_FILENAME="$RESULT_STEM" - export MODEL_PREFIX="$MODEL_PREFIX" - export IMAGE="$IMAGE" - export TP="$TP" - export EP_SIZE=1 - export DP_ATTENTION=false - export 
BENCHMARK_TYPE="$BENCHMARK_TYPE" - export EXPORT_FILE="$EXPORT_FILE" - export RUNTIME_STACK_ID="$RUNTIME_STACK_ID" - export HARDWARE_PROFILE_ID="$HARDWARE_PROFILE_ID" - export CANONICAL_MODEL_ID="$CANONICAL_MODEL_ID" - export REQUEST_MODE="$REQUEST_MODE" - export TRACE_SOURCE="$TRACE_SOURCE" - export WORKLOAD_TYPE="$WORKLOAD" - export MAX_CONCURRENCY="$MAX_CONCURRENCY" - export IGNORE_WAITS="$IGNORE_WAITS" - export DISPATCH_REF="manual:gmi-portable" - export MAX_MODEL_LEN="$MAX_MODEL_LEN" - export OFFLOAD_MODE="${OFFLOAD_MODE:-}" - export KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-}" - export DISABLE_PREFIX_CACHING="$DISABLE_PREFIX_CACHING" - if [[ -n "$BENCHMARK_DURATION_S" ]]; then - export BENCHMARK_DURATION_S="$BENCHMARK_DURATION_S" - fi - if [[ -n "$SUPPORT_STATUS" ]]; then - export SUPPORT_STATUS="$SUPPORT_STATUS" - fi - if [[ -n "$VLLM_CPU_OFFLOAD_GB" ]]; then - export VLLM_CPU_OFFLOAD_GB="$VLLM_CPU_OFFLOAD_GB" - fi - if [[ -n "$VLLM_SWAP_SPACE_GB" ]]; then - export VLLM_SWAP_SPACE_GB="$VLLM_SWAP_SPACE_GB" - fi - if [[ -n "${SGLANG_MEM_FRACTION_STATIC:-}" ]]; then - export SGLANG_MEM_FRACTION_OVERRIDE="$SGLANG_MEM_FRACTION_STATIC" - fi - if [[ -n "${SGLANG_CHUNKED_PREFILL_SIZE:-}" ]]; then - export SGLANG_CHUNKED_PREFILL_OVERRIDE="$SGLANG_CHUNKED_PREFILL_SIZE" - fi - python3 "$REPO_ROOT/utils/process_result_isb1.py" | tee "$SUMMARY_JSON" -) - -stop_gpu_profiler - -if [[ -f "$GPU_PROFILE_CSV" ]]; then - GPU_STATS=$(python3 - "$GPU_PROFILE_CSV" <<'PY' -import csv -import sys - -with open(sys.argv[1], newline="") as handle: - rows = list(csv.DictReader(handle)) - -if rows: - mems = [float(row.get("mem_used_mb", "0") or 0) for row in rows] - utils = [float(row.get("gpu_util_pct", "0") or 0) for row in rows] - print(f"{max(mems) / 1024:.2f} {sum(mems) / len(mems) / 1024:.2f} {sum(utils) / len(utils):.1f}") -else: - print("0 0 0") -PY - 2>/dev/null) || GPU_STATS="0 0 0" - read -r GPU_MEM_PEAK GPU_MEM_AVG GPU_UTIL_AVG <<< "$GPU_STATS" -fi - -if [[ "$BENCHMARK_TYPE" == "isb1_kv_stress" ]]; then - CAMPAIGN_METADATA_JSON="$RUN_DIR/kv_stress_campaign_metadata.json" - python3 - \ - "$CAMPAIGN_METADATA_JSON" \ - "$BENCHMARK_TYPE" \ - "$WORKLOAD" \ - "$MAX_CONCURRENCY" \ - "${OFFLOAD_MODE:-}" \ - "${KV_CACHE_DTYPE:-}" \ - "$DISABLE_PREFIX_CACHING" \ - "${BENCHMARK_DURATION_S:-}" <<'PY' -import json -import sys - -payload = { - "benchmark_type": sys.argv[2], - "campaign_class": "kv_stress", - "workload_type": sys.argv[3], - "max_concurrency": sys.argv[4], - "offload_mode": sys.argv[5] or None, - "kv_cache_dtype": sys.argv[6] or None, - "disable_prefix_caching": sys.argv[7], - "benchmark_duration_s": sys.argv[8] or None, -} -with open(sys.argv[1], "w", encoding="utf-8") as f: - json.dump(payload, f, indent=2, sort_keys=True) -PY -fi - -if [[ -f "$SUMMARY_JSON" ]] && command -v python3 >/dev/null 2>&1; then - db_ingest_cmd=( - python3 "$SCRIPT_DIR/isb1_results_db.py" ingest "$SUMMARY_JSON" - --gpu-type "$GPU_TYPE" - --model "$MODEL_KEY" - --engine "$ENGINE" - --context-band "$CONTEXT_BAND" - --workload-type "$WORKLOAD" - --trace-source "$TRACE_SOURCE" - --max-model-len "$MAX_MODEL_LEN" - --tp "$TP" - --gpu-mem-peak-gb "${GPU_MEM_PEAK:-0}" - --gpu-mem-avg-gb "${GPU_MEM_AVG:-0}" - --gpu-util-avg-pct "${GPU_UTIL_AVG:-0}" - --gpu-profile-csv "$GPU_PROFILE_CSV" - ) - if [[ -n "$ISB1_RESULTS_DB_PATH" ]]; then - db_ingest_cmd+=(--db-path "$ISB1_RESULTS_DB_PATH") - fi - if [[ -n "${OFFLOAD_MODE:-}" ]]; then - db_ingest_cmd+=(--offload-mode "$OFFLOAD_MODE") - fi - if [[ -n "${KV_CACHE_DTYPE:-}" ]]; then - 
db_ingest_cmd+=(--kv-cache-dtype "$KV_CACHE_DTYPE")
-    fi
-    if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then
-        db_ingest_cmd+=(--disable-prefix-caching 1)
-    fi
-    if [[ -n "$BENCHMARK_DURATION_S" ]]; then
-        db_ingest_cmd+=(--benchmark-duration-s "$BENCHMARK_DURATION_S")
-    fi
-    if [[ "$BENCHMARK_TYPE" == "isb1_kv_stress" ]]; then
-        db_ingest_cmd+=(--campaign-class kv_stress)
-    fi
-    if [[ "$ENGINE" == "vllm" ]]; then
-        if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then
-            db_ingest_cmd+=(--vllm-cpu-offload-gb "$VLLM_CPU_OFFLOAD_GB")
-        fi
-        if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then
-            db_ingest_cmd+=(--vllm-swap-space-gb "$VLLM_SWAP_SPACE_GB")
-        fi
-    else
-        db_ingest_cmd+=(--sglang-mem-fraction "$SGLANG_MEM_FRACTION_STATIC")
-        db_ingest_cmd+=(--sglang-chunked-prefill "$SGLANG_CHUNKED_PREFILL_SIZE")
-    fi
-    "${db_ingest_cmd[@]}" 2>/dev/null || echo "WARNING: DB ingest failed" >&2
-fi
-
-python3 - "$SUMMARY_JSON" <<'PY'
-import json
-import sys
-from pathlib import Path
-
-summary = json.loads(Path(sys.argv[1]).read_text())
-print("==> Summary")
-for key, value in [
-    ("result_filename", summary.get("result_filename")),
-    ("support_status", summary.get("support_status")),
-    ("benchmark_certification_status", summary.get("benchmark_certification_status")),
-    ("completed_sessions", f"{summary.get('completed_sessions')}/{summary.get('total_sessions')}"),
-    ("effective_max_context_depth", summary.get("effective_max_context_depth")),
-    ("context_pressure_class", summary.get("context_pressure_class")),
-    ("context_pressure_signal", summary.get("context_pressure_signal", {}).get("status")),
-    ("depth_coverage_ratio", summary.get("depth_coverage_ratio")),
-    ("depth_coverage_class", summary.get("depth_coverage_class")),
-    ("max_actual_context_len", summary.get("max_actual_context_len_per_turn")),
-    ("preemption_count", summary.get("preemption_count")),
-    ("session_throughput_sps", summary.get("session_throughput_sps")),
-    ("tput_per_gpu", summary.get("tput_per_gpu")),
-    ("output_tput_per_gpu", summary.get("output_tput_per_gpu")),
-    ("mean_ttft_s", summary.get("mean_ttft")),
-    ("p99_ttft_s", summary.get("p99_ttft")),
-    ("server_logs", Path(sys.argv[1]).with_name("server.log")),
-    ("raw_replay_result", Path(sys.argv[1]).with_name(summary.get("result_filename", "run") + ".json")),
-    ("processed_result", Path(sys.argv[1])),
-]:
-    print(f"  {key}: {value}")
-PY
diff --git a/datasets/isb1/scripts/gmi_test_matrix.sh b/datasets/isb1/scripts/gmi_test_matrix.sh
deleted file mode 100755
index 5deadb072..000000000
--- a/datasets/isb1/scripts/gmi_test_matrix.sh
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/env bash
-set -Eeuo pipefail
-
-usage() {
-    cat <<'EOF'
-Usage:
-  gmi_test_matrix.sh --gpu-type <h100|h200|b200>
-
-Runs a curated GMI Cloud matrix (chat and code variants):
-  - Qwen3.5 × vllm × 8k / 131k / 500k / 1m
-  - Qwen3.5 × sglang × 500k
-  - GPT-OSS × vllm × 131k / 500k
-  - DSR1 × sglang × 131k
-EOF
-}
-
-GPU_TYPE=""
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --gpu-type)
-            GPU_TYPE="$2"
-            shift 2
-            ;;
-        -h|--help)
-            usage
-            exit 0
-            ;;
-        *)
-            echo "Unknown argument: $1" >&2
-            usage >&2
-            exit 1
-            ;;
-    esac
-done
-
-[[ -n "$GPU_TYPE" ]] || {
-    usage >&2
-    exit 1
-}
-
-case "$GPU_TYPE" in
-    h100|h200|b200) ;;
-    *)
-        echo "Unsupported --gpu-type: $GPU_TYPE" >&2
-        exit 1
-        ;;
-esac
-
-SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
-PORTABLE_SCRIPT="$SCRIPT_DIR/gmi_portable_benchmark.sh"
-[[ -x "$PORTABLE_SCRIPT" ]] || {
-    echo "Expected executable helper at $PORTABLE_SCRIPT" >&2
-    exit 1
-}
-
-run_case() {
-    # Positional contract matches run_combo in gmi_full_suite.sh:
-    # model, engine, context band, optional workload (defaults to code).
-    local model="$1"
-    local engine="$2"
local context_band="$3" - local workload="${4:-code}" - - echo - echo "============================================================" - echo "Running: gpu=${GPU_TYPE} model=${model} engine=${engine} context=${context_band} workload=${workload}" - echo "============================================================" - - "$PORTABLE_SCRIPT" \ - --gpu-type "$GPU_TYPE" \ - --model "$model" \ - --engine "$engine" \ - --context-band "$context_band" \ - --workload "$workload" -} - -run_case qwen3.5 vllm 8k chat -run_case qwen3.5 vllm 131k code -run_case qwen3.5 vllm 500k code -run_case qwen3.5 sglang 500k chat -run_case gptoss vllm 131k code -run_case gptoss vllm 131k chat -run_case gptoss vllm 500k chat -run_case dsr1 sglang 131k code -run_case dsr1 sglang 131k chat -run_case qwen3.5 vllm 1m code - -echo -echo "Curated GMI test matrix completed successfully." diff --git a/datasets/isb1/scripts/gpu_profile_collector.sh b/datasets/isb1/scripts/gpu_profile_collector.sh deleted file mode 100755 index 4ba03f223..000000000 --- a/datasets/isb1/scripts/gpu_profile_collector.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env bash -set -Eeuo pipefail - -# Usage: gpu_profile_collector.sh --output /tmp/gpu.csv [--interval 2] -# Runs nvidia-smi polling until killed (SIGTERM/SIGINT) - -OUTPUT="" -INTERVAL=2 - -while [[ $# -gt 0 ]]; do - case "$1" in - --output) - OUTPUT="$2" - shift 2 - ;; - --interval) - INTERVAL="$2" - shift 2 - ;; - *) - echo "Unknown arg: $1" >&2 - exit 1 - ;; - esac -done - -[[ -n "$OUTPUT" ]] || { - echo "ERROR: --output required" >&2 - exit 1 -} - -mkdir -p "$(dirname "$OUTPUT")" -echo "timestamp,gpu_bus_id,gpu_util_pct,mem_util_pct,mem_used_mb,mem_total_mb,temp_c,power_w" > "$OUTPUT" - -trap 'exit 0' SIGTERM SIGINT - -while true; do - nvidia-smi \ - --query-gpu=timestamp,gpu_bus_id,utilization.gpu,utilization.memory,memory.used,memory.total,temperature.gpu,power.draw \ - --format=csv,noheader,nounits >> "$OUTPUT" 2>/dev/null || true - sleep "$INTERVAL" -done diff --git a/datasets/isb1/scripts/isb1_results_db.py b/datasets/isb1/scripts/isb1_results_db.py deleted file mode 100644 index e052fa766..000000000 --- a/datasets/isb1/scripts/isb1_results_db.py +++ /dev/null @@ -1,816 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import csv -import json -import sqlite3 -import sys -import uuid -from datetime import datetime, timezone -from pathlib import Path -from typing import Any, Iterable, Sequence - -SCRIPT_DIR = Path(__file__).resolve().parent -REPO_ROOT = SCRIPT_DIR.parent.parent.parent -DEFAULT_DB_PATH = REPO_ROOT / "datasets/isb1/results/isb1_results.db" -TABLE_NAME = "benchmark_runs" - -SCHEMA_SQL = f""" -CREATE TABLE IF NOT EXISTS {TABLE_NAME} ( - id INTEGER PRIMARY KEY, - run_id TEXT, - timestamp TEXT, - gpu_type TEXT, - model TEXT, - engine TEXT, - context_band TEXT, - workload_type TEXT, - max_model_len INTEGER, - tp INTEGER, - vllm_cpu_offload_gb REAL, - vllm_swap_space_gb REAL, - sglang_mem_fraction REAL, - sglang_chunked_prefill INTEGER, - ttft_p50_ms REAL, - ttft_p99_ms REAL, - tpot_p50_ms REAL, - tpot_p99_ms REAL, - throughput_tok_s REAL, - total_sessions INTEGER, - completed_sessions INTEGER, - total_turns INTEGER, - completed_turns INTEGER, - preemption_count INTEGER, - gpu_mem_peak_gb REAL, - gpu_mem_avg_gb REAL, - gpu_util_avg_pct REAL, - kv_cache_usage_pct REAL, - server_startup_s REAL, - benchmark_duration_s REAL, - campaign_class TEXT, - trace_source TEXT, - total_actual_input_tokens INTEGER, - max_actual_context_len INTEGER, - 
depth_coverage_ratio REAL, - depth_coverage_class TEXT, - producer_estimated_kv_bytes_peak INTEGER, - producer_expected_offload_mode TEXT, - offload_mode_match INTEGER, - offload_mode TEXT, - kv_cache_dtype TEXT, - disable_prefix_caching INTEGER, - cpu_cache_usage_peak_pct REAL, - raw_result_json TEXT, - status TEXT, - error_message TEXT -) -""" - -INSERT_COLUMNS = [ - "run_id", - "timestamp", - "gpu_type", - "model", - "engine", - "context_band", - "workload_type", - "max_model_len", - "tp", - "vllm_cpu_offload_gb", - "vllm_swap_space_gb", - "sglang_mem_fraction", - "sglang_chunked_prefill", - "ttft_p50_ms", - "ttft_p99_ms", - "tpot_p50_ms", - "tpot_p99_ms", - "throughput_tok_s", - "total_sessions", - "completed_sessions", - "total_turns", - "completed_turns", - "preemption_count", - "gpu_mem_peak_gb", - "gpu_mem_avg_gb", - "gpu_util_avg_pct", - "kv_cache_usage_pct", - "server_startup_s", - "benchmark_duration_s", - "campaign_class", - "trace_source", - "total_actual_input_tokens", - "max_actual_context_len", - "depth_coverage_ratio", - "depth_coverage_class", - "producer_estimated_kv_bytes_peak", - "producer_expected_offload_mode", - "offload_mode_match", - "offload_mode", - "kv_cache_dtype", - "disable_prefix_caching", - "cpu_cache_usage_peak_pct", - "raw_result_json", - "status", - "error_message", -] - -GROUPABLE_COLUMNS = { - "gpu_type", - "model", - "engine", - "context_band", - "workload_type", - "status", - "tp", - "max_model_len", - "depth_coverage_class", - "offload_mode", - "campaign_class", - "trace_source", -} - -DEFAULT_QUERY_COLUMNS = [ - "timestamp", - "gpu_type", - "model", - "engine", - "context_band", - "workload_type", - "status", - "ttft_p50_ms", - "ttft_p99_ms", - "throughput_tok_s", - "gpu_mem_peak_gb", - "gpu_util_avg_pct", - "preemption_count", - "depth_coverage_ratio", - "max_actual_context_len", - "depth_coverage_class", - "run_id", -] - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Store and analyze ISB1 benchmark runs in SQLite.") - subparsers = parser.add_subparsers(dest="command", required=True) - - ingest = subparsers.add_parser("ingest", help="Read a processed ISB1 JSON file and insert a benchmark run.") - ingest.add_argument("json_file", help="Path to utils/process_result_isb1.py output JSON.") - ingest.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") - ingest.add_argument("--gpu-type", required=True, choices=["h100", "h200", "b200"]) - ingest.add_argument("--model", required=True, choices=["qwen3.5", "gptoss", "dsr1"]) - ingest.add_argument("--engine", required=True, choices=["vllm", "sglang"]) - ingest.add_argument("--context-band", required=True, choices=["8k", "32k", "64k", "131k", "500k", "1m"]) - ingest.add_argument("--workload-type", choices=["chat", "code"], help="Workload type (chat or code)") - ingest.add_argument("--run-id", help="Optional run UUID. Generated if omitted.") - ingest.add_argument("--timestamp", help="Optional ISO-8601 timestamp. 
Uses current UTC time if omitted.") - ingest.add_argument("--max-model-len", type=int) - ingest.add_argument("--tp", type=int) - ingest.add_argument("--vllm-cpu-offload-gb", type=float) - ingest.add_argument("--vllm-swap-space-gb", type=float) - ingest.add_argument("--sglang-mem-fraction", type=float) - ingest.add_argument("--sglang-chunked-prefill", type=int) - ingest.add_argument("--ttft-p50-ms", type=float) - ingest.add_argument("--ttft-p99-ms", type=float) - ingest.add_argument("--tpot-p50-ms", type=float) - ingest.add_argument("--tpot-p99-ms", type=float) - ingest.add_argument("--throughput-tok-s", type=float) - ingest.add_argument("--total-sessions", type=int) - ingest.add_argument("--completed-sessions", type=int) - ingest.add_argument("--total-turns", type=int) - ingest.add_argument("--completed-turns", type=int) - ingest.add_argument("--preemption-count", type=int) - ingest.add_argument("--gpu-mem-peak-gb", type=float) - ingest.add_argument("--gpu-mem-avg-gb", type=float) - ingest.add_argument("--gpu-util-avg-pct", type=float) - ingest.add_argument("--kv-cache-usage-pct", type=float) - ingest.add_argument("--server-startup-s", type=float) - ingest.add_argument("--benchmark-duration-s", type=float) - ingest.add_argument("--campaign-class") - ingest.add_argument("--trace-source", choices=["isb1", "kv_cache_tester", "aiperf"]) - ingest.add_argument("--offload-mode", choices=["on", "off", "noprefix", "legacy"]) - ingest.add_argument("--kv-cache-dtype", choices=["auto", "fp8"]) - ingest.add_argument("--disable-prefix-caching", type=int, choices=[0, 1]) - ingest.add_argument("--gpu-profile-csv", help="Optional GPU profile CSV path to stash in raw_result_json metadata.") - ingest.add_argument("--status", default="success", choices=["success", "failed", "timeout"]) - ingest.add_argument("--error-message") - - query = subparsers.add_parser("query", help="Print runs or an aggregated grouped view.") - query.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") - query.add_argument("--group-by", help="Comma-separated columns to group by, for example gpu_type,context_band.") - - export_csv = subparsers.add_parser("export-csv", help="Export all benchmark rows to CSV.") - export_csv.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") - export_csv.add_argument("--output", help="Destination CSV path. 
Defaults to stdout.") - - summary = subparsers.add_parser("summary", help="Print a concise findings summary.") - summary.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") - - return parser.parse_args() - - -_MIGRATIONS = [ - f"ALTER TABLE {TABLE_NAME} ADD COLUMN total_actual_input_tokens INTEGER", - f"ALTER TABLE {TABLE_NAME} ADD COLUMN max_actual_context_len INTEGER", - f"ALTER TABLE {TABLE_NAME} ADD COLUMN depth_coverage_ratio REAL", - f"ALTER TABLE {TABLE_NAME} ADD COLUMN depth_coverage_class TEXT", - f"ALTER TABLE {TABLE_NAME} ADD COLUMN producer_estimated_kv_bytes_peak INTEGER", - f"ALTER TABLE {TABLE_NAME} ADD COLUMN producer_expected_offload_mode TEXT", - f"ALTER TABLE {TABLE_NAME} ADD COLUMN offload_mode_match INTEGER", - f"ALTER TABLE {TABLE_NAME} ADD COLUMN offload_mode TEXT", - f"ALTER TABLE {TABLE_NAME} ADD COLUMN kv_cache_dtype TEXT", - f"ALTER TABLE {TABLE_NAME} ADD COLUMN disable_prefix_caching INTEGER", - f"ALTER TABLE {TABLE_NAME} ADD COLUMN cpu_cache_usage_peak_pct REAL", - f"ALTER TABLE {TABLE_NAME} ADD COLUMN workload_type TEXT", - f"ALTER TABLE {TABLE_NAME} ADD COLUMN campaign_class TEXT", - f"ALTER TABLE {TABLE_NAME} ADD COLUMN trace_source TEXT", -] - - -def ensure_db(conn: sqlite3.Connection) -> None: - conn.execute(SCHEMA_SQL) - conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{TABLE_NAME}_run_id ON {TABLE_NAME}(run_id)") - conn.execute( - f"CREATE INDEX IF NOT EXISTS idx_{TABLE_NAME}_grouping " - f"ON {TABLE_NAME}(gpu_type, model, engine, context_band, status)" - ) - # Idempotent migrations for existing databases - for migration_sql in _MIGRATIONS: - try: - conn.execute(migration_sql) - except sqlite3.OperationalError: - pass # Column already exists - conn.commit() - - -def connect_db(db_path: str | Path) -> sqlite3.Connection: - db_path = Path(db_path) - db_path.parent.mkdir(parents=True, exist_ok=True) - conn = sqlite3.connect(db_path) - conn.row_factory = sqlite3.Row - ensure_db(conn) - return conn - - -def utc_now_iso() -> str: - return datetime.now(timezone.utc).replace(microsecond=0).isoformat() - - -def to_float(value: Any) -> float | None: - if value in (None, ""): - return None - try: - return float(value) - except (TypeError, ValueError): - return None - - -def to_int(value: Any) -> int | None: - if value in (None, ""): - return None - try: - return int(float(value)) - except (TypeError, ValueError): - return None - - -def seconds_to_ms(value: Any) -> float | None: - parsed = to_float(value) - return None if parsed is None else parsed * 1000.0 - - -def choose(*values: Any) -> Any: - for value in values: - if value not in (None, ""): - return value - return None - - -def load_payload(path: str | Path) -> dict[str, Any]: - payload = json.loads(Path(path).read_text()) - if not isinstance(payload, dict): - raise SystemExit(f"Expected a JSON object in {path}") - return payload - - -def derive_total_turns(payload: dict[str, Any], total_sessions: int | None) -> int | None: - max_turns = to_int(payload.get("max_turns")) - if max_turns is not None and total_sessions is not None: - return max_turns * total_sessions - per_turn_metrics = payload.get("per_turn_metrics") or {} - if isinstance(per_turn_metrics, dict) and total_sessions is not None: - return len(per_turn_metrics) * total_sessions - return None - - -def derive_completed_turns(payload: dict[str, Any]) -> int | None: - per_turn_metrics = payload.get("per_turn_metrics") or {} - if not isinstance(per_turn_metrics, dict): - return None - completed = 0 - saw_value = False - for 
turn_metrics in per_turn_metrics.values(): - if not isinstance(turn_metrics, dict): - continue - value = to_int(turn_metrics.get("completed")) - if value is None: - continue - completed += value - saw_value = True - return completed if saw_value else None - - -def build_raw_payload(payload: dict[str, Any], args: argparse.Namespace) -> dict[str, Any]: - enriched = dict(payload) - metadata = { - "source_json": str(Path(args.json_file).resolve()), - "db_path": str(Path(args.db_path).resolve()), - } - if args.gpu_profile_csv: - metadata["gpu_profile_csv"] = str(Path(args.gpu_profile_csv).resolve()) - if args.status != "success": - metadata["status_override"] = args.status - if args.error_message: - metadata["error_message"] = args.error_message - enriched["_isb1_results_db"] = metadata - return enriched - - -def insert_run(args: argparse.Namespace) -> None: - payload = load_payload(args.json_file) - aggregate = payload.get("aggregate_metrics") or {} - runtime_overrides = payload.get("runtime_overrides") or {} - server_metrics_summary = payload.get("server_metrics_summary") or {} - - total_sessions = to_int(choose(args.total_sessions, payload.get("total_sessions"), aggregate.get("total_sessions"))) - completed_sessions = to_int( - choose(args.completed_sessions, payload.get("completed_sessions"), aggregate.get("completed_sessions")) - ) - - gpu_cache_peak = to_float(server_metrics_summary.get("gpu_cache_usage_peak")) - if gpu_cache_peak is None: - gpu_cache_peak = to_float(payload.get("peak_gpu_cache_usage")) - - row = { - "run_id": args.run_id or str(uuid.uuid4()), - "timestamp": args.timestamp or utc_now_iso(), - "gpu_type": args.gpu_type, - "model": args.model, - "engine": args.engine, - "context_band": args.context_band, - "workload_type": choose( - getattr(args, 'workload_type', None), - payload.get("benchmark_surface"), - ), - "max_model_len": to_int(choose(args.max_model_len, payload.get("max_model_len"))), - "tp": to_int(choose(args.tp, payload.get("tp"))), - "vllm_cpu_offload_gb": to_float( - choose( - args.vllm_cpu_offload_gb, - runtime_overrides.get("vllm_cpu_offload_gb"), - payload.get("vllm_cpu_offload_gb"), - ) - ), - "vllm_swap_space_gb": to_float( - choose( - args.vllm_swap_space_gb, - runtime_overrides.get("vllm_swap_space_gb"), - payload.get("vllm_swap_space_gb"), - ) - ), - "sglang_mem_fraction": to_float( - choose( - args.sglang_mem_fraction, - runtime_overrides.get("sglang_mem_fraction_override"), - payload.get("sglang_mem_fraction_override"), - ) - ), - "sglang_chunked_prefill": to_int( - choose( - args.sglang_chunked_prefill, - runtime_overrides.get("sglang_chunked_prefill_override"), - payload.get("sglang_chunked_prefill_override"), - ) - ), - "ttft_p50_ms": to_float( - choose(args.ttft_p50_ms, aggregate.get("median_ttft_ms"), seconds_to_ms(payload.get("median_ttft"))) - ), - "ttft_p99_ms": to_float( - choose(args.ttft_p99_ms, aggregate.get("p99_ttft_ms"), seconds_to_ms(payload.get("p99_ttft"))) - ), - "tpot_p50_ms": to_float( - choose(args.tpot_p50_ms, aggregate.get("median_tpot_ms"), seconds_to_ms(payload.get("median_tpot"))) - ), - "tpot_p99_ms": to_float( - choose(args.tpot_p99_ms, aggregate.get("p99_tpot_ms"), seconds_to_ms(payload.get("p99_tpot"))) - ), - "throughput_tok_s": to_float( - choose(args.throughput_tok_s, aggregate.get("total_token_throughput_tps"), payload.get("throughput_tok_s")) - ), - "total_sessions": total_sessions, - "completed_sessions": completed_sessions, - "total_turns": to_int(choose(args.total_turns, derive_total_turns(payload, 
total_sessions))), - "completed_turns": to_int(choose(args.completed_turns, derive_completed_turns(payload))), - "preemption_count": to_int(choose(args.preemption_count, payload.get("preemption_count"))), - "gpu_mem_peak_gb": to_float(choose(args.gpu_mem_peak_gb, payload.get("gpu_mem_peak_gb"))), - "gpu_mem_avg_gb": to_float(choose(args.gpu_mem_avg_gb, payload.get("gpu_mem_avg_gb"))), - "gpu_util_avg_pct": to_float(choose(args.gpu_util_avg_pct, payload.get("gpu_util_avg_pct"))), - "kv_cache_usage_pct": to_float( - choose(args.kv_cache_usage_pct, payload.get("kv_cache_usage_pct"), gpu_cache_peak * 100.0 if gpu_cache_peak is not None else None) - ), - "server_startup_s": to_float(choose(args.server_startup_s, payload.get("server_startup_s"))), - "benchmark_duration_s": to_float( - choose(args.benchmark_duration_s, payload.get("benchmark_duration_s"), aggregate.get("total_wall_time_s")) - ), - "campaign_class": choose( - getattr(args, 'campaign_class', None), - payload.get("campaign_class"), - ), - "trace_source": choose( - getattr(args, 'trace_source', None), - payload.get("trace_source"), - ), - "total_actual_input_tokens": to_int( - (payload.get("depth_telemetry") or {}).get("total_actual_input_tokens") - or payload.get("total_actual_input_tokens") - ), - "max_actual_context_len": to_int( - (payload.get("depth_telemetry") or {}).get("max_actual_context_len_per_turn") - or payload.get("max_actual_context_len_per_turn") - ), - "depth_coverage_ratio": to_float(payload.get("depth_coverage_ratio")), - "depth_coverage_class": payload.get("depth_coverage_class"), - "producer_estimated_kv_bytes_peak": to_int(payload.get("producer_estimated_kv_bytes_peak")), - "producer_expected_offload_mode": payload.get("producer_expected_offload_mode"), - "offload_mode_match": ( - 1 if payload.get("producer_expectation_validation", {}).get("offload_mode_match") is True - else 0 if payload.get("producer_expectation_validation", {}).get("offload_mode_match") is False - else None - ), - "offload_mode": choose(getattr(args, 'offload_mode', None), payload.get("offload_mode")), - "kv_cache_dtype": choose(getattr(args, 'kv_cache_dtype', None), payload.get("kv_cache_dtype")), - "disable_prefix_caching": to_int( - choose( - getattr(args, 'disable_prefix_caching', None), - payload.get("disable_prefix_caching"), - ) - ), - "cpu_cache_usage_peak_pct": to_float( - payload.get("peak_cpu_cache_usage", 0.0) * 100.0 - if payload.get("peak_cpu_cache_usage") is not None else None - ), - "raw_result_json": json.dumps(build_raw_payload(payload, args), sort_keys=True), - "status": args.status, - "error_message": choose(args.error_message, payload.get("error_message")), - } - - conn = connect_db(args.db_path) - placeholders = ", ".join("?" for _ in INSERT_COLUMNS) - sql = f"INSERT INTO {TABLE_NAME} ({', '.join(INSERT_COLUMNS)}) VALUES ({placeholders})" - conn.execute(sql, [row[column] for column in INSERT_COLUMNS]) - conn.commit() - conn.close() - - print( - f"Inserted run {row['run_id']} into {Path(args.db_path)} " - f"({row['gpu_type']} {row['model']} {row['engine']} {row['context_band']}, status={row['status']})." 
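-        # Human-readable confirmation only: the full processed payload (plus
-        # ingest metadata) is preserved verbatim in the raw_result_json column.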
- ) - - -def fetch_rows(conn: sqlite3.Connection, sql: str, params: Sequence[Any] = ()) -> list[sqlite3.Row]: - return list(conn.execute(sql, params)) - - -def stringify(value: Any) -> str: - if value is None: - return "" - if isinstance(value, float): - return f"{value:.2f}" - return str(value) - - -def render_table(headers: Sequence[str], rows: Iterable[Sequence[Any]]) -> str: - normalized_rows = [[stringify(value) for value in row] for row in rows] - widths = [len(header) for header in headers] - for row in normalized_rows: - for idx, value in enumerate(row): - widths[idx] = max(widths[idx], len(value)) - - def fmt_row(row: Sequence[str]) -> str: - return " | ".join(value.ljust(widths[idx]) for idx, value in enumerate(row)) - - divider = "-+-".join("-" * width for width in widths) - lines = [fmt_row(headers), divider] - for row in normalized_rows: - lines.append(fmt_row(row)) - return "\n".join(lines) - - -def print_query(args: argparse.Namespace) -> None: - conn = connect_db(args.db_path) - - if args.group_by: - group_columns = [column.strip() for column in args.group_by.split(",") if column.strip()] - if not group_columns: - raise SystemExit("--group-by requires at least one column") - invalid = [column for column in group_columns if column not in GROUPABLE_COLUMNS] - if invalid: - raise SystemExit( - f"Unsupported --group-by columns: {', '.join(invalid)}. " - f"Allowed: {', '.join(sorted(GROUPABLE_COLUMNS))}" - ) - - select_prefix = ", ".join(group_columns) - sql = f""" - SELECT - {select_prefix}, - COUNT(*) AS runs, - SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) AS success_runs, - SUM(CASE WHEN status != 'success' THEN 1 ELSE 0 END) AS non_success_runs, - ROUND(AVG(ttft_p50_ms), 2) AS avg_ttft_p50_ms, - ROUND(AVG(throughput_tok_s), 2) AS avg_throughput_tok_s, - ROUND(MAX(gpu_mem_peak_gb), 2) AS max_gpu_mem_peak_gb, - SUM(CASE WHEN COALESCE(preemption_count, 0) > 0 THEN 1 ELSE 0 END) AS preemption_runs - FROM {TABLE_NAME} - GROUP BY {select_prefix} - ORDER BY {select_prefix} - """ - rows = fetch_rows(conn, sql) - headers = group_columns + [ - "runs", - "success_runs", - "non_success_runs", - "avg_ttft_p50_ms", - "avg_throughput_tok_s", - "max_gpu_mem_peak_gb", - "preemption_runs", - ] - print(render_table(headers, ([row[header] for header in headers] for row in rows))) - else: - sql = f"SELECT {', '.join(DEFAULT_QUERY_COLUMNS)} FROM {TABLE_NAME} ORDER BY id DESC" - rows = fetch_rows(conn, sql) - print(render_table(DEFAULT_QUERY_COLUMNS, ([row[column] for column in DEFAULT_QUERY_COLUMNS] for row in rows))) - - conn.close() - - -def export_csv_rows(args: argparse.Namespace) -> None: - conn = connect_db(args.db_path) - rows = fetch_rows(conn, f"SELECT * FROM {TABLE_NAME} ORDER BY id ASC") - headers = [description[0] for description in conn.execute(f"SELECT * FROM {TABLE_NAME} LIMIT 0").description] - - if args.output: - output_path = Path(args.output) - output_path.parent.mkdir(parents=True, exist_ok=True) - handle = output_path.open("w", newline="") - else: - handle = sys.stdout - - try: - writer = csv.writer(handle) - writer.writerow(headers) - for row in rows: - writer.writerow([row[header] for header in headers]) - finally: - if args.output: - handle.close() - print(f"Exported {len(rows)} rows to {args.output}") - - conn.close() - - -def print_summary(args: argparse.Namespace) -> None: - conn = connect_db(args.db_path) - total_runs = conn.execute(f"SELECT COUNT(*) FROM {TABLE_NAME}").fetchone()[0] - if total_runs == 0: - print(f"No runs found in {args.db_path}") - 
conn.close() - return - - status_rows = fetch_rows(conn, f"SELECT status, COUNT(*) AS count FROM {TABLE_NAME} GROUP BY status ORDER BY status") - preemption_rows = fetch_rows( - conn, - f""" - SELECT gpu_type, model, engine, context_band, preemption_count, status - FROM {TABLE_NAME} - WHERE COALESCE(preemption_count, 0) > 0 - ORDER BY preemption_count DESC, id DESC - LIMIT 10 - """, - ) - highest_memory_rows = fetch_rows( - conn, - f""" - SELECT gpu_type, model, engine, context_band, gpu_mem_peak_gb, kv_cache_usage_pct, status - FROM {TABLE_NAME} - WHERE gpu_mem_peak_gb IS NOT NULL - ORDER BY gpu_mem_peak_gb DESC, id DESC - LIMIT 5 - """, - ) - slowest_ttft_rows = fetch_rows( - conn, - f""" - SELECT gpu_type, model, engine, context_band, ttft_p50_ms, ttft_p99_ms, status - FROM {TABLE_NAME} - WHERE ttft_p50_ms IS NOT NULL - ORDER BY ttft_p50_ms DESC, id DESC - LIMIT 5 - """, - ) - highest_kv_rows = fetch_rows( - conn, - f""" - SELECT gpu_type, model, engine, context_band, kv_cache_usage_pct, gpu_mem_peak_gb, status - FROM {TABLE_NAME} - WHERE kv_cache_usage_pct IS NOT NULL - ORDER BY kv_cache_usage_pct DESC, id DESC - LIMIT 5 - """, - ) - long_context_rollup = fetch_rows( - conn, - f""" - SELECT - context_band, - COUNT(*) AS runs, - SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) AS success_runs, - ROUND(AVG(ttft_p50_ms), 2) AS avg_ttft_p50_ms, - ROUND(MAX(gpu_mem_peak_gb), 2) AS max_gpu_mem_peak_gb, - SUM(CASE WHEN COALESCE(preemption_count, 0) > 0 THEN 1 ELSE 0 END) AS preemption_runs - FROM {TABLE_NAME} - WHERE context_band IN ('131k', '500k', '1m') - GROUP BY context_band - ORDER BY CASE context_band WHEN '131k' THEN 1 WHEN '500k' THEN 2 WHEN '1m' THEN 3 ELSE 99 END - """, - ) - - print(f"ISB1 results summary ({args.db_path})") - print(f"Total runs: {total_runs}") - print(render_table(["status", "count"], ([row["status"], row["count"]] for row in status_rows))) - print() - - if long_context_rollup: - print("Long-context rollup") - print( - render_table( - ["context_band", "runs", "success_runs", "avg_ttft_p50_ms", "max_gpu_mem_peak_gb", "preemption_runs"], - ( - [ - row["context_band"], - row["runs"], - row["success_runs"], - row["avg_ttft_p50_ms"], - row["max_gpu_mem_peak_gb"], - row["preemption_runs"], - ] - for row in long_context_rollup - ), - ) - ) - print() - - # Depth coverage rollup - depth_coverage_rows = fetch_rows( - conn, - f""" - SELECT - context_band, - COUNT(*) AS runs, - ROUND(AVG(depth_coverage_ratio), 4) AS avg_depth_coverage, - MAX(max_actual_context_len) AS max_actual_ctx, - SUM(CASE WHEN depth_coverage_class = 'configuration_only' THEN 1 ELSE 0 END) AS config_only_runs, - SUM(CASE WHEN depth_coverage_class = 'full' THEN 1 ELSE 0 END) AS full_depth_runs - FROM {TABLE_NAME} - WHERE context_band IN ('131k', '500k', '1m') - AND depth_coverage_ratio IS NOT NULL - GROUP BY context_band - ORDER BY CASE context_band WHEN '131k' THEN 1 WHEN '500k' THEN 2 WHEN '1m' THEN 3 ELSE 99 END - """, - ) - if depth_coverage_rows: - print("Depth coverage (actual vs configured)") - print( - render_table( - ["context_band", "runs", "avg_depth_coverage", "max_actual_ctx", "config_only_runs", "full_depth_runs"], - ( - [ - row["context_band"], - row["runs"], - row["avg_depth_coverage"], - row["max_actual_ctx"], - row["config_only_runs"], - row["full_depth_runs"], - ] - for row in depth_coverage_rows - ), - ) - ) - print() - - if preemption_rows: - print("Runs with preemptions") - print( - render_table( - ["gpu_type", "model", "engine", "context_band", "preemption_count", "status"], - ( - 
[ - row["gpu_type"], - row["model"], - row["engine"], - row["context_band"], - row["preemption_count"], - row["status"], - ] - for row in preemption_rows - ), - ) - ) - print() - else: - print("Runs with preemptions: none") - print() - - if highest_memory_rows: - print("Highest peak GPU memory") - print( - render_table( - ["gpu_type", "model", "engine", "context_band", "gpu_mem_peak_gb", "kv_cache_usage_pct", "status"], - ( - [ - row["gpu_type"], - row["model"], - row["engine"], - row["context_band"], - row["gpu_mem_peak_gb"], - row["kv_cache_usage_pct"], - row["status"], - ] - for row in highest_memory_rows - ), - ) - ) - print() - - if slowest_ttft_rows: - print("Slowest TTFT p50 runs") - print( - render_table( - ["gpu_type", "model", "engine", "context_band", "ttft_p50_ms", "ttft_p99_ms", "status"], - ( - [ - row["gpu_type"], - row["model"], - row["engine"], - row["context_band"], - row["ttft_p50_ms"], - row["ttft_p99_ms"], - row["status"], - ] - for row in slowest_ttft_rows - ), - ) - ) - print() - - if highest_kv_rows: - print("Highest KV-cache usage") - print( - render_table( - ["gpu_type", "model", "engine", "context_band", "kv_cache_usage_pct", "gpu_mem_peak_gb", "status"], - ( - [ - row["gpu_type"], - row["model"], - row["engine"], - row["context_band"], - row["kv_cache_usage_pct"], - row["gpu_mem_peak_gb"], - row["status"], - ] - for row in highest_kv_rows - ), - ) - ) - - conn.close() - - -def main() -> int: - args = parse_args() - if args.command == "ingest": - insert_run(args) - elif args.command == "query": - print_query(args) - elif args.command == "export-csv": - export_csv_rows(args) - elif args.command == "summary": - print_summary(args) - else: - raise SystemExit(f"Unknown command: {args.command}") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/datasets/isb1/scripts/metrics_collector.py b/datasets/isb1/scripts/metrics_collector.py deleted file mode 100644 index 3de1f7615..000000000 --- a/datasets/isb1/scripts/metrics_collector.py +++ /dev/null @@ -1,356 +0,0 @@ -#!/usr/bin/env python3 -"""Prometheus metrics scraper for ISB1 KV stress benchmarks.""" - -from __future__ import annotations - -import argparse -import asyncio -import csv -import json -import re -import signal -import statistics -import time -from pathlib import Path -from typing import Dict -from urllib.request import Request, urlopen - -PROM_LINE_RE = re.compile( - r"^\s*([a-zA-Z_:][a-zA-Z0-9_:]*)(?:\{[^}]*\})?\s+([-+]?(?:\d+\.\d*|\d*\.\d+|\d+)(?:[eE][-+]?\d+)?)\s*$" -) - -CANONICAL_METRICS: dict[str, tuple[str, ...]] = { - # Required vLLM metrics - "vllm:gpu_cache_usage_perc": ( - "vllm:gpu_cache_usage_perc", - "vllm_gpu_cache_usage_perc", - ), - "vllm:cpu_cache_usage_perc": ( - "vllm:cpu_cache_usage_perc", - "vllm_cpu_cache_usage_perc", - ), - "vllm:num_preemptions_total": ( - "vllm:num_preemptions_total", - "vllm_num_preemptions_total", - ), - "vllm:num_requests_running": ( - "vllm:num_requests_running", - "vllm_num_requests_running", - ), - "vllm:num_requests_waiting": ( - "vllm:num_requests_waiting", - "vllm_num_requests_waiting", - ), - "vllm:kv_offload_bytes_gpu_to_cpu": ( - "vllm:kv_offload_bytes_gpu_to_cpu", - "vllm_kv_offload_bytes_gpu_to_cpu", - ), - "vllm:kv_offload_bytes_cpu_to_gpu": ( - "vllm:kv_offload_bytes_cpu_to_gpu", - "vllm_kv_offload_bytes_cpu_to_gpu", - ), - "vllm:prompt_tokens_total": ( - "vllm:prompt_tokens_total", - "vllm_prompt_tokens_total", - ), - "vllm:generation_tokens_total": ( - "vllm:generation_tokens_total", - 
"vllm_generation_tokens_total", - ), - # Optional but useful in vLLM - "vllm:num_requests_swapped": ( - "vllm:num_requests_swapped", - "vllm_num_requests_swapped", - ), - # PR #993 parity metrics (vLLM) - "vllm:prefix_cache_hit_rate": ( - "vllm:prefix_cache_hit_rate", - "vllm_prefix_cache_hit_rate", - ), - "vllm:cpu_prefix_cache_hit_rate": ( - "vllm:cpu_prefix_cache_hit_rate", - "vllm_cpu_prefix_cache_hit_rate", - ), - "vllm:kv_offload_time_gpu_to_cpu_seconds": ( - "vllm:kv_offload_time_gpu_to_cpu_seconds", - "vllm_kv_offload_time_gpu_to_cpu_seconds", - ), - "vllm:kv_offload_time_cpu_to_gpu_seconds": ( - "vllm:kv_offload_time_cpu_to_gpu_seconds", - "vllm_kv_offload_time_cpu_to_gpu_seconds", - ), - "vllm:prompt_tokens_local_compute": ( - "vllm:prompt_tokens_local_compute", - "vllm_prompt_tokens_local_compute", - ), - "vllm:prompt_tokens_local_cache_hit": ( - "vllm:prompt_tokens_local_cache_hit", - "vllm_prompt_tokens_local_cache_hit", - ), - "vllm:prompt_tokens_external_kv_transfer": ( - "vllm:prompt_tokens_external_kv_transfer", - "vllm_prompt_tokens_external_kv_transfer", - ), - # SGLang equivalents (best-effort) - "sglang:kv_cache_usage": ( - "sglang:kv_cache_usage", - "sglang_kv_cache_usage", - "sglang_kv_cache_utilization", - ), - "sglang:cache_hit_rate": ( - "sglang:cache_hit_rate", - "sglang_cache_hit_rate", - "sglang_radix_cache_hit_rate", - ), - "sglang:num_requests_running": ( - "sglang:num_requests_running", - "sglang_num_requests_running", - "sglang_scheduler_num_running_requests", - ), - "sglang:num_requests_waiting": ( - "sglang:num_requests_waiting", - "sglang_num_requests_waiting", - "sglang_scheduler_num_waiting_requests", - ), - "sglang:prompt_tokens_total": ( - "sglang:prompt_tokens_total", - "sglang_prompt_tokens_total", - "sglang_num_prompt_tokens_total", - ), - "sglang:generation_tokens_total": ( - "sglang:generation_tokens_total", - "sglang_generation_tokens_total", - "sglang_num_generation_tokens_total", - ), - # PR #993 parity metrics (SGLang) - "sglang:num_preemptions_total": ( - "sglang:num_preemptions_total", - "sglang_num_preemptions_total", - ), - "sglang:prefix_cache_queries_total": ( - "sglang:prefix_cache_queries_total", - "sglang_prefix_cache_queries_total", - ), -} - - -def _normalize_name(name: str) -> str: - return name.replace(":", "_") - - -def parse_prometheus_rows(payload: str) -> list[tuple[str, float]]: - rows: list[tuple[str, float]] = [] - for line in payload.splitlines(): - if not line or line.startswith("#"): - continue - match = PROM_LINE_RE.match(line) - if not match: - continue - name, raw_value = match.groups() - try: - rows.append((name, float(raw_value))) - except ValueError: - continue - return rows - - -def parse_prometheus_text(payload: str) -> Dict[str, float]: - samples: Dict[str, float] = {} - for name, value in parse_prometheus_rows(payload): - samples[name] = value - return samples - - -def map_canonical_metrics(samples: Dict[str, float]) -> Dict[str, float]: - mapped: Dict[str, float] = {} - - normalized_index: Dict[str, float] = {} - for key, value in samples.items(): - normalized_index[_normalize_name(key)] = value - - for canonical_name, aliases in CANONICAL_METRICS.items(): - value = None - for alias in aliases: - if alias in samples: - value = samples[alias] - break - alias_norm = _normalize_name(alias) - if alias_norm in normalized_index: - value = normalized_index[alias_norm] - break - if value is not None: - mapped[canonical_name] = value - - return mapped - - -def fetch_metrics(metrics_url: str, timeout_s: float = 5.0) 
-> str: - request = Request(metrics_url, headers={"Accept": "text/plain"}) - with urlopen(request, timeout=timeout_s) as response: # nosec B310 - return response.read().decode("utf-8", errors="replace") - - -def _percentile(values: list[float], p: float) -> float: - if not values: - return 0.0 - if len(values) == 1: - return values[0] - sorted_values = sorted(values) - rank = (len(sorted_values) - 1) * p - lo = int(rank) - hi = min(lo + 1, len(sorted_values) - 1) - frac = rank - lo - return sorted_values[lo] * (1.0 - frac) + sorted_values[hi] * frac - - -def _build_summary(metric_values: dict[str, list[float]]) -> dict[str, dict[str, float]]: - summary: dict[str, dict[str, float]] = {} - for metric_name, values in metric_values.items(): - if not values: - continue - summary[metric_name] = { - "count": float(len(values)), - "min": min(values), - "max": max(values), - "mean": statistics.fmean(values), - "p50": _percentile(values, 0.50), - "p99": _percentile(values, 0.99), - } - return summary - - -async def scrape_loop( - metrics_url: str, - output_path: Path, - interval_s: float, - duration_s: float, - wide: bool, - summary_json_path: Path | None, -) -> None: - output_path.parent.mkdir(parents=True, exist_ok=True) - - stop_event = asyncio.Event() - - def _request_stop(*_: object) -> None: - stop_event.set() - - try: - loop = asyncio.get_running_loop() - loop.add_signal_handler(signal.SIGINT, _request_stop) - loop.add_signal_handler(signal.SIGTERM, _request_stop) - except NotImplementedError: - pass - - started_at = time.time() - metric_values: dict[str, list[float]] = {} - - wide_path = output_path.with_name("kv_metrics_wide.csv") - - with output_path.open("w", newline="", encoding="utf-8") as f: - writer = csv.writer(f) - writer.writerow(["timestamp", "metric_name", "metric_value"]) - - wide_file = None - wide_writer = None - if wide: - wide_file = wide_path.open("w", newline="", encoding="utf-8") - wide_writer = csv.writer(wide_file) - wide_writer.writerow(["timestamp", "metric_name", "metric_value"]) - - try: - while not stop_event.is_set(): - now = time.time() - if duration_s > 0 and (now - started_at) >= duration_s: - break - - try: - raw_text = await asyncio.to_thread(fetch_metrics, metrics_url) - raw_rows = parse_prometheus_rows(raw_text) - samples = parse_prometheus_text(raw_text) - mapped = map_canonical_metrics(samples) - - if wide_writer is not None: - for raw_metric_name, raw_metric_value in raw_rows: - wide_writer.writerow( - [f"{now:.3f}", raw_metric_name, f"{raw_metric_value:.8f}"] - ) - wide_file.flush() - - for metric_name, metric_value in mapped.items(): - writer.writerow([f"{now:.3f}", metric_name, f"{metric_value:.8f}"]) - metric_values.setdefault(metric_name, []).append(metric_value) - f.flush() - except Exception as exc: - writer.writerow([f"{now:.3f}", "collector:error", repr(exc)]) - f.flush() - - await asyncio.sleep(interval_s) - finally: - if wide_file is not None: - wide_file.close() - - if summary_json_path is not None: - summary_json_path.parent.mkdir(parents=True, exist_ok=True) - summary_json_path.write_text( - json.dumps(_build_summary(metric_values), indent=2, sort_keys=True), - encoding="utf-8", - ) - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Scrape Prometheus metrics into CSV") - parser.add_argument( - "--metrics-url", - default="http://0.0.0.0:8888/metrics", - help="Prometheus endpoint URL", - ) - parser.add_argument( - "--output", - default="kv_metrics.csv", - help="CSV output path", - ) - 
parser.add_argument( - "--interval", - type=float, - default=2.0, - help="Scrape interval in seconds", - ) - parser.add_argument( - "--duration", - type=float, - default=0.0, - help="Optional max duration in seconds (0 means run until interrupted)", - ) - parser.add_argument( - "--wide", - action="store_true", - help="Also scrape all non-comment Prometheus metric lines into kv_metrics_wide.csv", - ) - parser.add_argument( - "--summary-json", - nargs="?", - const="kv_metrics_summary.json", - default=None, - help="Write per-metric min/max/mean/p50/p99 summary JSON (default: kv_metrics_summary.json)", - ) - return parser.parse_args() - - -def main() -> int: - args = parse_args() - summary_json_path = Path(args.summary_json) if args.summary_json else None - asyncio.run( - scrape_loop( - metrics_url=args.metrics_url, - output_path=Path(args.output), - interval_s=max(args.interval, 0.1), - duration_s=max(args.duration, 0.0), - wide=args.wide, - summary_json_path=summary_json_path, - ) - ) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/datasets/isb1/scripts/plot_pareto.py b/datasets/isb1/scripts/plot_pareto.py deleted file mode 100644 index 964696ad1..000000000 --- a/datasets/isb1/scripts/plot_pareto.py +++ /dev/null @@ -1,210 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import csv -import json -import sqlite3 -from pathlib import Path -from typing import Any - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Compute Pareto frontier for KV sweep throughput vs p99 TTFT") - parser.add_argument("--db-path", default=None, help="SQLite DB path (benchmark_runs)") - parser.add_argument("--json-dir", default=None, help="Directory containing sweep summary JSON files") - parser.add_argument("--output-dir", required=True, help="Directory for pareto outputs") - return parser.parse_args() - - -def _to_float(value: Any) -> float | None: - if value in (None, ""): - return None - try: - return float(value) - except (TypeError, ValueError): - return None - - -def load_rows_from_db(db_path: Path) -> list[dict[str, Any]]: - conn = sqlite3.connect(db_path) - conn.row_factory = sqlite3.Row - rows = conn.execute( - """ - SELECT offload_mode, ttft_p99_ms, throughput_tok_s, max_concurrency, raw_result_json - FROM benchmark_runs - WHERE offload_mode IS NOT NULL - AND ttft_p99_ms IS NOT NULL - AND throughput_tok_s IS NOT NULL - ORDER BY id ASC - """ - ).fetchall() - conn.close() - - normalized: list[dict[str, Any]] = [] - for row in rows: - concurrency = row["max_concurrency"] - if concurrency in (None, "") and row["raw_result_json"]: - try: - payload = json.loads(row["raw_result_json"]) - concurrency = payload.get("conc") or payload.get("max_concurrency") - except Exception: - pass - normalized.append( - { - "offload_mode": row["offload_mode"], - "concurrency": int(concurrency) if concurrency not in (None, "") else None, - "throughput_tok_s": _to_float(row["throughput_tok_s"]), - "ttft_p99_ms": _to_float(row["ttft_p99_ms"]), - "source": "db", - } - ) - return normalized - - -def load_rows_from_json_dir(json_dir: Path) -> list[dict[str, Any]]: - rows: list[dict[str, Any]] = [] - for path in sorted(json_dir.glob("*.json")): - try: - payload = json.loads(path.read_text(encoding="utf-8")) - except Exception: - continue - - if isinstance(payload, dict) and isinstance(payload.get("summary"), list): - for row in payload["summary"]: - rows.append( - { - "offload_mode": row.get("offload_mode"), - "concurrency": 
row.get("concurrency"), - "throughput_tok_s": _to_float(row.get("throughput_tok_s")), - "ttft_p99_ms": _to_float(row.get("ttft_p99_ms")), - "source": str(path.name), - } - ) - elif isinstance(payload, list): - for row in payload: - if isinstance(row, dict): - rows.append( - { - "offload_mode": row.get("offload_mode"), - "concurrency": row.get("concurrency"), - "throughput_tok_s": _to_float(row.get("throughput_tok_s")), - "ttft_p99_ms": _to_float(row.get("ttft_p99_ms")), - "source": str(path.name), - } - ) - return rows - - -def compute_pareto_frontier(points: list[dict[str, Any]]) -> list[dict[str, Any]]: - valid = [p for p in points if p["throughput_tok_s"] is not None and p["ttft_p99_ms"] is not None] - if not valid: - return [] - - # maximize throughput, minimize ttft_p99_ms - sorted_points = sorted(valid, key=lambda p: (p["throughput_tok_s"], -p["ttft_p99_ms"]), reverse=True) - frontier: list[dict[str, Any]] = [] - best_latency = float("inf") - for point in sorted_points: - latency = point["ttft_p99_ms"] - if latency <= best_latency: - frontier.append(point) - best_latency = latency - return sorted(frontier, key=lambda p: (p["throughput_tok_s"], p["ttft_p99_ms"])) - - -def write_csv(path: Path, rows: list[dict[str, Any]], frontier_keys: set[tuple[str, int | None, float, float]]) -> None: - with path.open("w", newline="", encoding="utf-8") as handle: - writer = csv.writer(handle) - writer.writerow(["offload_mode", "concurrency", "throughput_tok_s", "ttft_p99_ms", "is_frontier", "source"]) - for row in rows: - key = (row.get("offload_mode") or "", row.get("concurrency"), row.get("throughput_tok_s") or 0.0, row.get("ttft_p99_ms") or 0.0) - writer.writerow([ - row.get("offload_mode"), - row.get("concurrency"), - row.get("throughput_tok_s"), - row.get("ttft_p99_ms"), - key in frontier_keys, - row.get("source"), - ]) - - -def maybe_write_plot(output_path: Path, grouped_frontiers: dict[str, list[dict[str, Any]]]) -> bool: - try: - import matplotlib.pyplot as plt # type: ignore - except Exception: - return False - - plt.figure(figsize=(10, 6)) - for mode, frontier in sorted(grouped_frontiers.items()): - x = [p["throughput_tok_s"] for p in frontier] - y = [p["ttft_p99_ms"] for p in frontier] - if not x: - continue - plt.plot(x, y, marker="o", label=mode) - plt.xlabel("Throughput (tokens/sec)") - plt.ylabel("p99 TTFT (ms)") - plt.title("Pareto Frontier by Offload Mode") - plt.legend() - plt.grid(True, alpha=0.3) - output_path.parent.mkdir(parents=True, exist_ok=True) - plt.tight_layout() - plt.savefig(output_path) - plt.close() - return True - - -def main() -> int: - args = parse_args() - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - if not args.db_path and not args.json_dir: - raise SystemExit("Provide --db-path or --json-dir") - - rows: list[dict[str, Any]] = [] - if args.db_path: - rows.extend(load_rows_from_db(Path(args.db_path))) - if args.json_dir: - rows.extend(load_rows_from_json_dir(Path(args.json_dir))) - - grouped: dict[str, list[dict[str, Any]]] = {} - for row in rows: - mode = row.get("offload_mode") - if not mode: - continue - grouped.setdefault(mode, []).append(row) - - grouped_frontiers: dict[str, list[dict[str, Any]]] = {} - for mode, points in grouped.items(): - grouped_frontiers[mode] = compute_pareto_frontier(points) - - frontier_keys: set[tuple[str, int | None, float, float]] = set() - for mode, frontier in grouped_frontiers.items(): - for point in frontier: - frontier_keys.add((mode, point.get("concurrency"), 
point.get("throughput_tok_s") or 0.0, point.get("ttft_p99_ms") or 0.0)) - - csv_path = output_dir / "pareto_data.csv" - write_csv(csv_path, rows, frontier_keys) - - summary = { - "total_points": len(rows), - "offload_modes": sorted(grouped.keys()), - "frontier": {mode: frontier for mode, frontier in grouped_frontiers.items()}, - } - summary_path = output_dir / "pareto_summary.json" - summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True), encoding="utf-8") - - plot_written = maybe_write_plot(output_dir / "pareto_frontier.png", grouped_frontiers) - - print(f"Wrote: {csv_path}") - print(f"Wrote: {summary_path}") - if plot_written: - print(f"Wrote: {output_dir / 'pareto_frontier.png'}") - else: - print("Skipped pareto_frontier.png (matplotlib unavailable)") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/experimental/README.md b/experimental/README.md index 8ba1ba9b5..f39dfc4af 100644 --- a/experimental/README.md +++ b/experimental/README.md @@ -1,11 +1,5 @@ # Experimental -This folder contains experimental WIP code and planning material. +This folder contains experimental WIP code that is mostly Claude Code generated. -Relevant roadmap docs: - -For the current official ISB1 support statement, use: -- `datasets/isb1/SUPPORT_MATRIX.md` -- `datasets/isb1/README.md` - -**Warning:** code and notes in this directory may be incomplete, experimental, or future-looking. They are not by themselves the official statement of supported InferenceX ISB1 capability. +**Warning:** Code in this directory is very basic and likely contains errors or incomplete implementations. It is not intended for production use or as part of the official InferenceMAX results. diff --git a/experimental/multiturn/README.md b/experimental/multiturn/README.md index fd9114b37..05b22f67e 100644 --- a/experimental/multiturn/README.md +++ b/experimental/multiturn/README.md @@ -1,27 +1,16 @@ -# Experimental multiturn notes - -This directory contains working notes, investigations, and planning material for multiturn and long-context benchmarking. - -## Official ISB1 replay status lives elsewhere - -Do **not** treat this directory as the source of truth for the currently supported InferenceX ISB1 surface. - -For the official, reviewable statement of what is landed now, use: -- `datasets/isb1/SUPPORT_MATRIX.md` -- `datasets/isb1/README.md` -- `.github/configs/isb1-master.yaml` - -## Relevant roadmap docs - -- `ISB1_MULTITURN_LONG_CONTEXT_CANONICAL_SYNTHESIS_2026-04-09.md` — canonical synthesis for next implementation phases; use this first for planning context. -- `ISB1_INFERENCEX_PHASED_PR_ROADMAP_2026-04-09.md` — phased landing plan used to split schema/workflow/data/extension/polish work into mergeable stages. - -## Scope warning - -Files in this directory may discuss future or experimental directions such as: -- KV offload investigations -- synthetic multiturn ideas -- broader long-context expansion -- experiments outside the currently merged official replay lane - -Those notes are useful for planning, but they are **not** themselves an official support claim. 
+## Experimental WIP: Multi-Turn with/without CPU KV Cache Offloading
+
+Literature review:
+- https://lmsys.org/blog/2025-09-10-sglang-hicache/
+- SGLang refers to GPU HBM as L1 and CPU DRAM as L2
+- https://lmsys.org/images/blog/hicache/mooncake_benchmark.png
+- single-turn long-context Q&A https://arxiv.org/abs/2311.04939 (seems more like a shared-prefix style, similar to cascade attention, a precursor to SGLang radix attention) https://flashinfer.ai/2024/02/02/cascade-inference.html
+- synthetic & ShareGPT vLLM multi-turn datasets https://github.com/vllm-project/vllm/tree/main/benchmarks/multi_turn
+- Production Alibaba multi-turn dataset https://arxiv.org/abs/2506.02634 (appears not to provide the actual prompts and outputs, though; mostly just prompt lengths, output lengths, etc.)
+- SGLang synthetic multi-turn benchmark script: https://github.com/sgl-project/sglang/tree/main/benchmark/hicache
+- interestingly, the SGLang blog simulates P-D disaggregation by just setting OSL (output sequence length) to 1
+- MT-bench https://arxiv.org/abs/2402.14762
+```bash
+python3 benchmark/hicache/bench_multiturn.py --model-path $MODEL_PATH --disable-random-sample \
+--output-length 1 --request-length 2048  # simulate P-D disaggregation
+```
diff --git a/experimental/multiturn/vllm_benchmark/.gitignore b/experimental/multiturn/vllm_benchmark/.gitignore
deleted file mode 100644
index 5c371b81e..000000000
--- a/experimental/multiturn/vllm_benchmark/.gitignore
+++ /dev/null
@@ -1,7 +0,0 @@
-# Python
-__pycache__/
-*.pyc
-
-# Generated artifacts
-*.log
-*.tmp
diff --git a/experimental/multiturn/vllm_benchmark/README.md b/experimental/multiturn/vllm_benchmark/README.md
deleted file mode 100644
index b2ea6f175..000000000
--- a/experimental/multiturn/vllm_benchmark/README.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# vLLM Benchmark (Experimental)
-
-This directory tracks the PR #993 parity surface for multi-turn trace replay and KV stress experiments.
-
-## Trace sources
-
-- **ISB-1 exports**: existing committed replay exports.
-- **kv-cache-tester**: `kv-cache-tester/` is a placeholder for the external trace replay repo.
-- **AIPerf synthetic traces**: `aiperf_traces/` provides fallback synthetic traces.
-
-## Analysis tools
-
-The parity analysis scripts live under `datasets/isb1/scripts/`:
-
-- `plot_pareto.py`
-- `analyze_benchmark_distributions.py`
-- `collect_sweep_results.py`
-- `adapt_trace_replay_result.py`
-
-## LMCache variants
-
-LMCache launch helpers are under `launch/`:
-
-- `lmcache_vllm_h200.sh`
-- `lmcache_vllm_b200.sh`
-
-## Per-hardware replay scripts
-
-Trace replay scripts are under `scripts/` for per-model/per-engine/per-hardware combinations.
-
----
-
-**Experimental infrastructure. 
Not part of official ISB-1 support matrix.** diff --git a/experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json b/experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json deleted file mode 100644 index 683556038..000000000 --- a/experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json +++ /dev/null @@ -1,5559 +0,0 @@ -{ - "sessions": [ - { - "turns": [ - { - "role": "user", - "content_token_count": 4355, - "target_output_tokens": 229 - }, - { - "role": "user", - "content_token_count": 13955, - "target_output_tokens": 384 - }, - { - "role": "user", - "content_token_count": 1941, - "target_output_tokens": 89 - }, - { - "role": "user", - "content_token_count": 11403, - "target_output_tokens": 2247 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 13567, - "target_output_tokens": 663 - }, - { - "role": "user", - "content_token_count": 49742, - "target_output_tokens": 366 - }, - { - "role": "user", - "content_token_count": 13186, - "target_output_tokens": 686 - }, - { - "role": "user", - "content_token_count": 7600, - "target_output_tokens": 418 - }, - { - "role": "user", - "content_token_count": 5978, - "target_output_tokens": 385 - }, - { - "role": "user", - "content_token_count": 1998, - "target_output_tokens": 706 - }, - { - "role": "user", - "content_token_count": 1582, - "target_output_tokens": 667 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 14644, - "target_output_tokens": 467 - }, - { - "role": "user", - "content_token_count": 20321, - "target_output_tokens": 971 - }, - { - "role": "user", - "content_token_count": 2950, - "target_output_tokens": 274 - }, - { - "role": "user", - "content_token_count": 4932, - "target_output_tokens": 680 - }, - { - "role": "user", - "content_token_count": 9971, - "target_output_tokens": 706 - }, - { - "role": "user", - "content_token_count": 3348, - "target_output_tokens": 440 - }, - { - "role": "user", - "content_token_count": 13343, - "target_output_tokens": 431 - }, - { - "role": "user", - "content_token_count": 6230, - "target_output_tokens": 2231 - }, - { - "role": "user", - "content_token_count": 8168, - "target_output_tokens": 421 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 1487, - "target_output_tokens": 986 - }, - { - "role": "user", - "content_token_count": 2684, - "target_output_tokens": 549 - }, - { - "role": "user", - "content_token_count": 3065, - "target_output_tokens": 366 - }, - { - "role": "user", - "content_token_count": 12135, - "target_output_tokens": 1145 - }, - { - "role": "user", - "content_token_count": 14716, - "target_output_tokens": 1074 - }, - { - "role": "user", - "content_token_count": 16644, - "target_output_tokens": 1062 - }, - { - "role": "user", - "content_token_count": 12355, - "target_output_tokens": 285 - }, - { - "role": "user", - "content_token_count": 3108, - "target_output_tokens": 291 - }, - { - "role": "user", - "content_token_count": 7234, - "target_output_tokens": 1235 - }, - { - "role": "user", - "content_token_count": 25179, - "target_output_tokens": 493 - }, - { - "role": "user", - "content_token_count": 6480, - "target_output_tokens": 431 - }, - { - "role": "user", - "content_token_count": 13902, - "target_output_tokens": 652 - }, - { - "role": "user", - "content_token_count": 6014, - "target_output_tokens": 1037 - }, - { - "role": "user", - "content_token_count": 41352, - "target_output_tokens": 649 - }, - { - "role": "user", - 
"content_token_count": 8852, - "target_output_tokens": 319 - }, - { - "role": "user", - "content_token_count": 8795, - "target_output_tokens": 736 - }, - { - "role": "user", - "content_token_count": 27778, - "target_output_tokens": 373 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 6962, - "target_output_tokens": 1351 - }, - { - "role": "user", - "content_token_count": 2614, - "target_output_tokens": 248 - }, - { - "role": "user", - "content_token_count": 11529, - "target_output_tokens": 248 - }, - { - "role": "user", - "content_token_count": 5165, - "target_output_tokens": 653 - }, - { - "role": "user", - "content_token_count": 2132, - "target_output_tokens": 318 - }, - { - "role": "user", - "content_token_count": 5290, - "target_output_tokens": 614 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 23469, - "target_output_tokens": 546 - }, - { - "role": "user", - "content_token_count": 7665, - "target_output_tokens": 360 - }, - { - "role": "user", - "content_token_count": 27018, - "target_output_tokens": 1332 - }, - { - "role": "user", - "content_token_count": 1887, - "target_output_tokens": 326 - }, - { - "role": "user", - "content_token_count": 5249, - "target_output_tokens": 346 - }, - { - "role": "user", - "content_token_count": 7443, - "target_output_tokens": 828 - }, - { - "role": "user", - "content_token_count": 6496, - "target_output_tokens": 100 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 9221, - "target_output_tokens": 430 - }, - { - "role": "user", - "content_token_count": 7697, - "target_output_tokens": 1197 - }, - { - "role": "user", - "content_token_count": 5421, - "target_output_tokens": 277 - }, - { - "role": "user", - "content_token_count": 8799, - "target_output_tokens": 540 - }, - { - "role": "user", - "content_token_count": 14993, - "target_output_tokens": 768 - }, - { - "role": "user", - "content_token_count": 28612, - "target_output_tokens": 581 - }, - { - "role": "user", - "content_token_count": 42160, - "target_output_tokens": 366 - }, - { - "role": "user", - "content_token_count": 9846, - "target_output_tokens": 544 - }, - { - "role": "user", - "content_token_count": 15085, - "target_output_tokens": 302 - }, - { - "role": "user", - "content_token_count": 8267, - "target_output_tokens": 596 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 23256, - "target_output_tokens": 821 - }, - { - "role": "user", - "content_token_count": 36819, - "target_output_tokens": 183 - }, - { - "role": "user", - "content_token_count": 1590, - "target_output_tokens": 2201 - }, - { - "role": "user", - "content_token_count": 12229, - "target_output_tokens": 1265 - }, - { - "role": "user", - "content_token_count": 7483, - "target_output_tokens": 1819 - }, - { - "role": "user", - "content_token_count": 2288, - "target_output_tokens": 970 - }, - { - "role": "user", - "content_token_count": 33871, - "target_output_tokens": 703 - }, - { - "role": "user", - "content_token_count": 8650, - "target_output_tokens": 147 - }, - { - "role": "user", - "content_token_count": 10018, - "target_output_tokens": 487 - }, - { - "role": "user", - "content_token_count": 21103, - "target_output_tokens": 805 - }, - { - "role": "user", - "content_token_count": 17500, - "target_output_tokens": 493 - }, - { - "role": "user", - "content_token_count": 1678, - "target_output_tokens": 129 - }, - { - "role": "user", - "content_token_count": 29345, - "target_output_tokens": 303 - }, - { - "role": "user", - 
"content_token_count": 4555, - "target_output_tokens": 483 - }, - { - "role": "user", - "content_token_count": 39008, - "target_output_tokens": 631 - }, - { - "role": "user", - "content_token_count": 3284, - "target_output_tokens": 142 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 7400, - "target_output_tokens": 948 - }, - { - "role": "user", - "content_token_count": 3992, - "target_output_tokens": 387 - }, - { - "role": "user", - "content_token_count": 8450, - "target_output_tokens": 313 - }, - { - "role": "user", - "content_token_count": 8606, - "target_output_tokens": 89 - }, - { - "role": "user", - "content_token_count": 4775, - "target_output_tokens": 3004 - }, - { - "role": "user", - "content_token_count": 44546, - "target_output_tokens": 758 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 10548, - "target_output_tokens": 522 - }, - { - "role": "user", - "content_token_count": 23492, - "target_output_tokens": 463 - }, - { - "role": "user", - "content_token_count": 2803, - "target_output_tokens": 3146 - }, - { - "role": "user", - "content_token_count": 2080, - "target_output_tokens": 257 - }, - { - "role": "user", - "content_token_count": 8416, - "target_output_tokens": 1401 - }, - { - "role": "user", - "content_token_count": 3410, - "target_output_tokens": 4096 - }, - { - "role": "user", - "content_token_count": 20886, - "target_output_tokens": 246 - }, - { - "role": "user", - "content_token_count": 16891, - "target_output_tokens": 111 - }, - { - "role": "user", - "content_token_count": 4933, - "target_output_tokens": 654 - }, - { - "role": "user", - "content_token_count": 5560, - "target_output_tokens": 634 - }, - { - "role": "user", - "content_token_count": 8380, - "target_output_tokens": 158 - }, - { - "role": "user", - "content_token_count": 17894, - "target_output_tokens": 278 - }, - { - "role": "user", - "content_token_count": 4907, - "target_output_tokens": 312 - }, - { - "role": "user", - "content_token_count": 5810, - "target_output_tokens": 1418 - }, - { - "role": "user", - "content_token_count": 6056, - "target_output_tokens": 515 - }, - { - "role": "user", - "content_token_count": 6750, - "target_output_tokens": 279 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 6845, - "target_output_tokens": 83 - }, - { - "role": "user", - "content_token_count": 3847, - "target_output_tokens": 2093 - }, - { - "role": "user", - "content_token_count": 2327, - "target_output_tokens": 926 - }, - { - "role": "user", - "content_token_count": 11838, - "target_output_tokens": 453 - }, - { - "role": "user", - "content_token_count": 5787, - "target_output_tokens": 1590 - }, - { - "role": "user", - "content_token_count": 16091, - "target_output_tokens": 84 - }, - { - "role": "user", - "content_token_count": 15625, - "target_output_tokens": 168 - }, - { - "role": "user", - "content_token_count": 24568, - "target_output_tokens": 789 - }, - { - "role": "user", - "content_token_count": 25763, - "target_output_tokens": 605 - }, - { - "role": "user", - "content_token_count": 20307, - "target_output_tokens": 570 - }, - { - "role": "user", - "content_token_count": 6868, - "target_output_tokens": 294 - }, - { - "role": "user", - "content_token_count": 18094, - "target_output_tokens": 170 - }, - { - "role": "user", - "content_token_count": 4778, - "target_output_tokens": 511 - }, - { - "role": "user", - "content_token_count": 3934, - "target_output_tokens": 495 - }, - { - "role": "user", - "content_token_count": 12163, 
- "target_output_tokens": 795 - }, - { - "role": "user", - "content_token_count": 12752, - "target_output_tokens": 3072 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 17618, - "target_output_tokens": 1691 - }, - { - "role": "user", - "content_token_count": 12217, - "target_output_tokens": 164 - }, - { - "role": "user", - "content_token_count": 31341, - "target_output_tokens": 777 - }, - { - "role": "user", - "content_token_count": 2248, - "target_output_tokens": 1106 - }, - { - "role": "user", - "content_token_count": 11819, - "target_output_tokens": 812 - }, - { - "role": "user", - "content_token_count": 5636, - "target_output_tokens": 187 - }, - { - "role": "user", - "content_token_count": 5477, - "target_output_tokens": 403 - }, - { - "role": "user", - "content_token_count": 19604, - "target_output_tokens": 390 - }, - { - "role": "user", - "content_token_count": 8663, - "target_output_tokens": 865 - }, - { - "role": "user", - "content_token_count": 16969, - "target_output_tokens": 407 - }, - { - "role": "user", - "content_token_count": 22672, - "target_output_tokens": 371 - }, - { - "role": "user", - "content_token_count": 4500, - "target_output_tokens": 257 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 6952, - "target_output_tokens": 1454 - }, - { - "role": "user", - "content_token_count": 21170, - "target_output_tokens": 1383 - }, - { - "role": "user", - "content_token_count": 9252, - "target_output_tokens": 209 - }, - { - "role": "user", - "content_token_count": 6023, - "target_output_tokens": 155 - }, - { - "role": "user", - "content_token_count": 30200, - "target_output_tokens": 2025 - }, - { - "role": "user", - "content_token_count": 8146, - "target_output_tokens": 132 - }, - { - "role": "user", - "content_token_count": 15151, - "target_output_tokens": 300 - }, - { - "role": "user", - "content_token_count": 6381, - "target_output_tokens": 739 - }, - { - "role": "user", - "content_token_count": 3225, - "target_output_tokens": 454 - }, - { - "role": "user", - "content_token_count": 5177, - "target_output_tokens": 2094 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 17308, - "target_output_tokens": 484 - }, - { - "role": "user", - "content_token_count": 27306, - "target_output_tokens": 413 - }, - { - "role": "user", - "content_token_count": 24589, - "target_output_tokens": 1070 - }, - { - "role": "user", - "content_token_count": 7202, - "target_output_tokens": 256 - }, - { - "role": "user", - "content_token_count": 6018, - "target_output_tokens": 200 - }, - { - "role": "user", - "content_token_count": 3867, - "target_output_tokens": 593 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 16341, - "target_output_tokens": 1754 - }, - { - "role": "user", - "content_token_count": 4374, - "target_output_tokens": 1779 - }, - { - "role": "user", - "content_token_count": 5850, - "target_output_tokens": 290 - }, - { - "role": "user", - "content_token_count": 5391, - "target_output_tokens": 2242 - }, - { - "role": "user", - "content_token_count": 18534, - "target_output_tokens": 187 - }, - { - "role": "user", - "content_token_count": 1541, - "target_output_tokens": 1352 - }, - { - "role": "user", - "content_token_count": 512, - "target_output_tokens": 917 - }, - { - "role": "user", - "content_token_count": 6840, - "target_output_tokens": 397 - }, - { - "role": "user", - "content_token_count": 4664, - "target_output_tokens": 585 - }, - { - "role": "user", - "content_token_count": 
7184, - "target_output_tokens": 846 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 7488, - "target_output_tokens": 545 - }, - { - "role": "user", - "content_token_count": 6149, - "target_output_tokens": 180 - }, - { - "role": "user", - "content_token_count": 18544, - "target_output_tokens": 1062 - }, - { - "role": "user", - "content_token_count": 23779, - "target_output_tokens": 962 - }, - { - "role": "user", - "content_token_count": 7158, - "target_output_tokens": 624 - }, - { - "role": "user", - "content_token_count": 5401, - "target_output_tokens": 264 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 6126, - "target_output_tokens": 366 - }, - { - "role": "user", - "content_token_count": 10891, - "target_output_tokens": 787 - }, - { - "role": "user", - "content_token_count": 7206, - "target_output_tokens": 446 - }, - { - "role": "user", - "content_token_count": 14885, - "target_output_tokens": 534 - }, - { - "role": "user", - "content_token_count": 16761, - "target_output_tokens": 418 - }, - { - "role": "user", - "content_token_count": 8153, - "target_output_tokens": 322 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 6173, - "target_output_tokens": 792 - }, - { - "role": "user", - "content_token_count": 7491, - "target_output_tokens": 360 - }, - { - "role": "user", - "content_token_count": 11004, - "target_output_tokens": 522 - }, - { - "role": "user", - "content_token_count": 30822, - "target_output_tokens": 733 - }, - { - "role": "user", - "content_token_count": 16828, - "target_output_tokens": 660 - }, - { - "role": "user", - "content_token_count": 10930, - "target_output_tokens": 2180 - }, - { - "role": "user", - "content_token_count": 9511, - "target_output_tokens": 182 - }, - { - "role": "user", - "content_token_count": 9162, - "target_output_tokens": 683 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 28818, - "target_output_tokens": 245 - }, - { - "role": "user", - "content_token_count": 6134, - "target_output_tokens": 472 - }, - { - "role": "user", - "content_token_count": 6634, - "target_output_tokens": 813 - }, - { - "role": "user", - "content_token_count": 10762, - "target_output_tokens": 182 - }, - { - "role": "user", - "content_token_count": 5519, - "target_output_tokens": 1891 - }, - { - "role": "user", - "content_token_count": 9813, - "target_output_tokens": 544 - }, - { - "role": "user", - "content_token_count": 27459, - "target_output_tokens": 1087 - }, - { - "role": "user", - "content_token_count": 11085, - "target_output_tokens": 192 - }, - { - "role": "user", - "content_token_count": 13108, - "target_output_tokens": 444 - }, - { - "role": "user", - "content_token_count": 24568, - "target_output_tokens": 203 - }, - { - "role": "user", - "content_token_count": 12813, - "target_output_tokens": 800 - }, - { - "role": "user", - "content_token_count": 6876, - "target_output_tokens": 126 - }, - { - "role": "user", - "content_token_count": 9155, - "target_output_tokens": 4096 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 5653, - "target_output_tokens": 908 - }, - { - "role": "user", - "content_token_count": 2275, - "target_output_tokens": 410 - }, - { - "role": "user", - "content_token_count": 3348, - "target_output_tokens": 708 - }, - { - "role": "user", - "content_token_count": 7689, - "target_output_tokens": 448 - }, - { - "role": "user", - "content_token_count": 8998, - "target_output_tokens": 1126 - }, - { - "role": "user", 
- "content_token_count": 1847, - "target_output_tokens": 1767 - }, - { - "role": "user", - "content_token_count": 5015, - "target_output_tokens": 484 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 37087, - "target_output_tokens": 360 - }, - { - "role": "user", - "content_token_count": 9919, - "target_output_tokens": 3052 - }, - { - "role": "user", - "content_token_count": 3728, - "target_output_tokens": 265 - }, - { - "role": "user", - "content_token_count": 13398, - "target_output_tokens": 274 - }, - { - "role": "user", - "content_token_count": 5429, - "target_output_tokens": 994 - }, - { - "role": "user", - "content_token_count": 998, - "target_output_tokens": 116 - }, - { - "role": "user", - "content_token_count": 1326, - "target_output_tokens": 718 - }, - { - "role": "user", - "content_token_count": 9401, - "target_output_tokens": 712 - }, - { - "role": "user", - "content_token_count": 9097, - "target_output_tokens": 84 - }, - { - "role": "user", - "content_token_count": 5568, - "target_output_tokens": 126 - }, - { - "role": "user", - "content_token_count": 29693, - "target_output_tokens": 361 - }, - { - "role": "user", - "content_token_count": 4150, - "target_output_tokens": 804 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 13188, - "target_output_tokens": 1389 - }, - { - "role": "user", - "content_token_count": 20963, - "target_output_tokens": 792 - }, - { - "role": "user", - "content_token_count": 15129, - "target_output_tokens": 325 - }, - { - "role": "user", - "content_token_count": 7575, - "target_output_tokens": 149 - }, - { - "role": "user", - "content_token_count": 20166, - "target_output_tokens": 668 - }, - { - "role": "user", - "content_token_count": 7192, - "target_output_tokens": 332 - }, - { - "role": "user", - "content_token_count": 10367, - "target_output_tokens": 610 - }, - { - "role": "user", - "content_token_count": 5248, - "target_output_tokens": 157 - }, - { - "role": "user", - "content_token_count": 9240, - "target_output_tokens": 216 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 2873, - "target_output_tokens": 154 - }, - { - "role": "user", - "content_token_count": 10140, - "target_output_tokens": 2818 - }, - { - "role": "user", - "content_token_count": 4864, - "target_output_tokens": 1018 - }, - { - "role": "user", - "content_token_count": 10400, - "target_output_tokens": 210 - }, - { - "role": "user", - "content_token_count": 9931, - "target_output_tokens": 431 - }, - { - "role": "user", - "content_token_count": 19920, - "target_output_tokens": 1335 - }, - { - "role": "user", - "content_token_count": 12765, - "target_output_tokens": 479 - }, - { - "role": "user", - "content_token_count": 16121, - "target_output_tokens": 634 - }, - { - "role": "user", - "content_token_count": 16426, - "target_output_tokens": 303 - }, - { - "role": "user", - "content_token_count": 8657, - "target_output_tokens": 606 - }, - { - "role": "user", - "content_token_count": 3219, - "target_output_tokens": 126 - }, - { - "role": "user", - "content_token_count": 3934, - "target_output_tokens": 90 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 29139, - "target_output_tokens": 283 - }, - { - "role": "user", - "content_token_count": 11018, - "target_output_tokens": 2117 - }, - { - "role": "user", - "content_token_count": 12413, - "target_output_tokens": 123 - }, - { - "role": "user", - "content_token_count": 4620, - "target_output_tokens": 1279 - }, - { - "role": "user", 
- "content_token_count": 14998, - "target_output_tokens": 857 - }, - { - "role": "user", - "content_token_count": 6874, - "target_output_tokens": 377 - }, - { - "role": "user", - "content_token_count": 9962, - "target_output_tokens": 369 - }, - { - "role": "user", - "content_token_count": 35116, - "target_output_tokens": 178 - }, - { - "role": "user", - "content_token_count": 9970, - "target_output_tokens": 516 - }, - { - "role": "user", - "content_token_count": 11643, - "target_output_tokens": 543 - }, - { - "role": "user", - "content_token_count": 14700, - "target_output_tokens": 547 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 1351, - "target_output_tokens": 2192 - }, - { - "role": "user", - "content_token_count": 23550, - "target_output_tokens": 200 - }, - { - "role": "user", - "content_token_count": 2511, - "target_output_tokens": 347 - }, - { - "role": "user", - "content_token_count": 20677, - "target_output_tokens": 589 - }, - { - "role": "user", - "content_token_count": 3425, - "target_output_tokens": 1138 - }, - { - "role": "user", - "content_token_count": 22755, - "target_output_tokens": 1462 - }, - { - "role": "user", - "content_token_count": 6087, - "target_output_tokens": 840 - }, - { - "role": "user", - "content_token_count": 9876, - "target_output_tokens": 164 - }, - { - "role": "user", - "content_token_count": 5481, - "target_output_tokens": 787 - }, - { - "role": "user", - "content_token_count": 4935, - "target_output_tokens": 471 - }, - { - "role": "user", - "content_token_count": 4601, - "target_output_tokens": 373 - }, - { - "role": "user", - "content_token_count": 7449, - "target_output_tokens": 1129 - }, - { - "role": "user", - "content_token_count": 7437, - "target_output_tokens": 664 - }, - { - "role": "user", - "content_token_count": 18022, - "target_output_tokens": 609 - }, - { - "role": "user", - "content_token_count": 6651, - "target_output_tokens": 593 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 3803, - "target_output_tokens": 185 - }, - { - "role": "user", - "content_token_count": 4171, - "target_output_tokens": 471 - }, - { - "role": "user", - "content_token_count": 2991, - "target_output_tokens": 2486 - }, - { - "role": "user", - "content_token_count": 11107, - "target_output_tokens": 846 - }, - { - "role": "user", - "content_token_count": 12672, - "target_output_tokens": 1246 - }, - { - "role": "user", - "content_token_count": 9802, - "target_output_tokens": 404 - }, - { - "role": "user", - "content_token_count": 7244, - "target_output_tokens": 665 - }, - { - "role": "user", - "content_token_count": 11618, - "target_output_tokens": 1037 - }, - { - "role": "user", - "content_token_count": 4494, - "target_output_tokens": 365 - }, - { - "role": "user", - "content_token_count": 3666, - "target_output_tokens": 262 - }, - { - "role": "user", - "content_token_count": 10055, - "target_output_tokens": 395 - }, - { - "role": "user", - "content_token_count": 5900, - "target_output_tokens": 778 - }, - { - "role": "user", - "content_token_count": 2260, - "target_output_tokens": 112 - }, - { - "role": "user", - "content_token_count": 3803, - "target_output_tokens": 1263 - }, - { - "role": "user", - "content_token_count": 38195, - "target_output_tokens": 1187 - }, - { - "role": "user", - "content_token_count": 15430, - "target_output_tokens": 304 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 15126, - "target_output_tokens": 363 - }, - { - "role": "user", - 
"content_token_count": 11997, - "target_output_tokens": 65 - }, - { - "role": "user", - "content_token_count": 12124, - "target_output_tokens": 304 - }, - { - "role": "user", - "content_token_count": 2942, - "target_output_tokens": 722 - }, - { - "role": "user", - "content_token_count": 10438, - "target_output_tokens": 1058 - }, - { - "role": "user", - "content_token_count": 11401, - "target_output_tokens": 517 - }, - { - "role": "user", - "content_token_count": 22839, - "target_output_tokens": 1334 - }, - { - "role": "user", - "content_token_count": 4480, - "target_output_tokens": 409 - }, - { - "role": "user", - "content_token_count": 8627, - "target_output_tokens": 625 - }, - { - "role": "user", - "content_token_count": 2553, - "target_output_tokens": 1775 - }, - { - "role": "user", - "content_token_count": 5008, - "target_output_tokens": 1304 - }, - { - "role": "user", - "content_token_count": 14883, - "target_output_tokens": 920 - }, - { - "role": "user", - "content_token_count": 14845, - "target_output_tokens": 188 - }, - { - "role": "user", - "content_token_count": 7446, - "target_output_tokens": 116 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 1555, - "target_output_tokens": 87 - }, - { - "role": "user", - "content_token_count": 4544, - "target_output_tokens": 466 - }, - { - "role": "user", - "content_token_count": 3256, - "target_output_tokens": 560 - }, - { - "role": "user", - "content_token_count": 3753, - "target_output_tokens": 201 - }, - { - "role": "user", - "content_token_count": 12476, - "target_output_tokens": 1849 - }, - { - "role": "user", - "content_token_count": 8975, - "target_output_tokens": 1635 - }, - { - "role": "user", - "content_token_count": 2877, - "target_output_tokens": 355 - }, - { - "role": "user", - "content_token_count": 4514, - "target_output_tokens": 181 - }, - { - "role": "user", - "content_token_count": 5382, - "target_output_tokens": 458 - }, - { - "role": "user", - "content_token_count": 3729, - "target_output_tokens": 292 - }, - { - "role": "user", - "content_token_count": 23202, - "target_output_tokens": 850 - }, - { - "role": "user", - "content_token_count": 6266, - "target_output_tokens": 373 - }, - { - "role": "user", - "content_token_count": 2491, - "target_output_tokens": 651 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 5699, - "target_output_tokens": 448 - }, - { - "role": "user", - "content_token_count": 8399, - "target_output_tokens": 96 - }, - { - "role": "user", - "content_token_count": 24606, - "target_output_tokens": 892 - }, - { - "role": "user", - "content_token_count": 1881, - "target_output_tokens": 404 - }, - { - "role": "user", - "content_token_count": 14270, - "target_output_tokens": 302 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 2662, - "target_output_tokens": 159 - }, - { - "role": "user", - "content_token_count": 27451, - "target_output_tokens": 742 - }, - { - "role": "user", - "content_token_count": 6138, - "target_output_tokens": 752 - }, - { - "role": "user", - "content_token_count": 3040, - "target_output_tokens": 95 - }, - { - "role": "user", - "content_token_count": 3937, - "target_output_tokens": 394 - }, - { - "role": "user", - "content_token_count": 10143, - "target_output_tokens": 205 - }, - { - "role": "user", - "content_token_count": 4055, - "target_output_tokens": 665 - }, - { - "role": "user", - "content_token_count": 4486, - "target_output_tokens": 491 - } - ] - }, - { - "turns": [ - { - "role": "user", - 
"content_token_count": 11225, - "target_output_tokens": 3158 - }, - { - "role": "user", - "content_token_count": 5709, - "target_output_tokens": 206 - }, - { - "role": "user", - "content_token_count": 8289, - "target_output_tokens": 2061 - }, - { - "role": "user", - "content_token_count": 11501, - "target_output_tokens": 625 - }, - { - "role": "user", - "content_token_count": 3024, - "target_output_tokens": 131 - }, - { - "role": "user", - "content_token_count": 6949, - "target_output_tokens": 743 - }, - { - "role": "user", - "content_token_count": 3555, - "target_output_tokens": 205 - }, - { - "role": "user", - "content_token_count": 4155, - "target_output_tokens": 478 - }, - { - "role": "user", - "content_token_count": 11184, - "target_output_tokens": 279 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 15198, - "target_output_tokens": 865 - }, - { - "role": "user", - "content_token_count": 27300, - "target_output_tokens": 352 - }, - { - "role": "user", - "content_token_count": 4084, - "target_output_tokens": 694 - }, - { - "role": "user", - "content_token_count": 2879, - "target_output_tokens": 643 - }, - { - "role": "user", - "content_token_count": 8411, - "target_output_tokens": 1094 - }, - { - "role": "user", - "content_token_count": 3496, - "target_output_tokens": 845 - }, - { - "role": "user", - "content_token_count": 14540, - "target_output_tokens": 288 - }, - { - "role": "user", - "content_token_count": 4651, - "target_output_tokens": 385 - }, - { - "role": "user", - "content_token_count": 14792, - "target_output_tokens": 842 - }, - { - "role": "user", - "content_token_count": 6271, - "target_output_tokens": 317 - }, - { - "role": "user", - "content_token_count": 7613, - "target_output_tokens": 763 - }, - { - "role": "user", - "content_token_count": 5852, - "target_output_tokens": 418 - }, - { - "role": "user", - "content_token_count": 11166, - "target_output_tokens": 2196 - }, - { - "role": "user", - "content_token_count": 19005, - "target_output_tokens": 1055 - }, - { - "role": "user", - "content_token_count": 5886, - "target_output_tokens": 492 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 4062, - "target_output_tokens": 1211 - }, - { - "role": "user", - "content_token_count": 2190, - "target_output_tokens": 717 - }, - { - "role": "user", - "content_token_count": 7556, - "target_output_tokens": 257 - }, - { - "role": "user", - "content_token_count": 5768, - "target_output_tokens": 1324 - }, - { - "role": "user", - "content_token_count": 5463, - "target_output_tokens": 1404 - }, - { - "role": "user", - "content_token_count": 19173, - "target_output_tokens": 808 - }, - { - "role": "user", - "content_token_count": 7797, - "target_output_tokens": 808 - }, - { - "role": "user", - "content_token_count": 4039, - "target_output_tokens": 414 - }, - { - "role": "user", - "content_token_count": 2391, - "target_output_tokens": 436 - }, - { - "role": "user", - "content_token_count": 1957, - "target_output_tokens": 1098 - }, - { - "role": "user", - "content_token_count": 16198, - "target_output_tokens": 852 - }, - { - "role": "user", - "content_token_count": 3101, - "target_output_tokens": 532 - }, - { - "role": "user", - "content_token_count": 4035, - "target_output_tokens": 833 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 1220, - "target_output_tokens": 138 - }, - { - "role": "user", - "content_token_count": 14648, - "target_output_tokens": 168 - }, - { - "role": "user", - "content_token_count": 
8228, - "target_output_tokens": 537 - }, - { - "role": "user", - "content_token_count": 2352, - "target_output_tokens": 462 - }, - { - "role": "user", - "content_token_count": 7794, - "target_output_tokens": 259 - }, - { - "role": "user", - "content_token_count": 2734, - "target_output_tokens": 819 - }, - { - "role": "user", - "content_token_count": 17235, - "target_output_tokens": 1471 - }, - { - "role": "user", - "content_token_count": 1357, - "target_output_tokens": 762 - }, - { - "role": "user", - "content_token_count": 10804, - "target_output_tokens": 156 - }, - { - "role": "user", - "content_token_count": 16389, - "target_output_tokens": 983 - }, - { - "role": "user", - "content_token_count": 5074, - "target_output_tokens": 431 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 10280, - "target_output_tokens": 119 - }, - { - "role": "user", - "content_token_count": 4370, - "target_output_tokens": 817 - }, - { - "role": "user", - "content_token_count": 6854, - "target_output_tokens": 1795 - }, - { - "role": "user", - "content_token_count": 15223, - "target_output_tokens": 543 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 6116, - "target_output_tokens": 309 - }, - { - "role": "user", - "content_token_count": 6257, - "target_output_tokens": 1301 - }, - { - "role": "user", - "content_token_count": 16623, - "target_output_tokens": 1520 - }, - { - "role": "user", - "content_token_count": 9563, - "target_output_tokens": 1403 - }, - { - "role": "user", - "content_token_count": 9134, - "target_output_tokens": 840 - }, - { - "role": "user", - "content_token_count": 6453, - "target_output_tokens": 388 - }, - { - "role": "user", - "content_token_count": 2951, - "target_output_tokens": 376 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 3444, - "target_output_tokens": 414 - }, - { - "role": "user", - "content_token_count": 2321, - "target_output_tokens": 901 - }, - { - "role": "user", - "content_token_count": 3638, - "target_output_tokens": 1425 - }, - { - "role": "user", - "content_token_count": 7123, - "target_output_tokens": 1696 - }, - { - "role": "user", - "content_token_count": 2057, - "target_output_tokens": 351 - }, - { - "role": "user", - "content_token_count": 18346, - "target_output_tokens": 587 - }, - { - "role": "user", - "content_token_count": 9716, - "target_output_tokens": 640 - }, - { - "role": "user", - "content_token_count": 6768, - "target_output_tokens": 388 - }, - { - "role": "user", - "content_token_count": 3788, - "target_output_tokens": 250 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 2734, - "target_output_tokens": 1979 - }, - { - "role": "user", - "content_token_count": 4136, - "target_output_tokens": 2452 - }, - { - "role": "user", - "content_token_count": 7721, - "target_output_tokens": 550 - }, - { - "role": "user", - "content_token_count": 1881, - "target_output_tokens": 648 - }, - { - "role": "user", - "content_token_count": 6673, - "target_output_tokens": 406 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 6955, - "target_output_tokens": 1459 - }, - { - "role": "user", - "content_token_count": 1014, - "target_output_tokens": 1007 - }, - { - "role": "user", - "content_token_count": 13098, - "target_output_tokens": 1459 - }, - { - "role": "user", - "content_token_count": 4876, - "target_output_tokens": 947 - }, - { - "role": "user", - "content_token_count": 9889, - "target_output_tokens": 1563 - }, - { - "role": "user", - 
"content_token_count": 2544, - "target_output_tokens": 3149 - }, - { - "role": "user", - "content_token_count": 9006, - "target_output_tokens": 245 - }, - { - "role": "user", - "content_token_count": 18694, - "target_output_tokens": 1384 - }, - { - "role": "user", - "content_token_count": 1467, - "target_output_tokens": 1471 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 17406, - "target_output_tokens": 286 - }, - { - "role": "user", - "content_token_count": 3679, - "target_output_tokens": 636 - }, - { - "role": "user", - "content_token_count": 2184, - "target_output_tokens": 321 - }, - { - "role": "user", - "content_token_count": 7967, - "target_output_tokens": 187 - }, - { - "role": "user", - "content_token_count": 6174, - "target_output_tokens": 654 - }, - { - "role": "user", - "content_token_count": 7180, - "target_output_tokens": 270 - }, - { - "role": "user", - "content_token_count": 10946, - "target_output_tokens": 95 - }, - { - "role": "user", - "content_token_count": 2518, - "target_output_tokens": 430 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 6603, - "target_output_tokens": 646 - }, - { - "role": "user", - "content_token_count": 10518, - "target_output_tokens": 1096 - }, - { - "role": "user", - "content_token_count": 14848, - "target_output_tokens": 408 - }, - { - "role": "user", - "content_token_count": 2262, - "target_output_tokens": 499 - }, - { - "role": "user", - "content_token_count": 6591, - "target_output_tokens": 662 - }, - { - "role": "user", - "content_token_count": 5042, - "target_output_tokens": 540 - }, - { - "role": "user", - "content_token_count": 14974, - "target_output_tokens": 3408 - }, - { - "role": "user", - "content_token_count": 5658, - "target_output_tokens": 1060 - }, - { - "role": "user", - "content_token_count": 5558, - "target_output_tokens": 1785 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 3100, - "target_output_tokens": 849 - }, - { - "role": "user", - "content_token_count": 12776, - "target_output_tokens": 945 - }, - { - "role": "user", - "content_token_count": 2376, - "target_output_tokens": 1003 - }, - { - "role": "user", - "content_token_count": 6865, - "target_output_tokens": 462 - }, - { - "role": "user", - "content_token_count": 3111, - "target_output_tokens": 509 - }, - { - "role": "user", - "content_token_count": 16078, - "target_output_tokens": 342 - }, - { - "role": "user", - "content_token_count": 16493, - "target_output_tokens": 733 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 8957, - "target_output_tokens": 307 - }, - { - "role": "user", - "content_token_count": 19094, - "target_output_tokens": 427 - }, - { - "role": "user", - "content_token_count": 2869, - "target_output_tokens": 405 - }, - { - "role": "user", - "content_token_count": 18384, - "target_output_tokens": 185 - }, - { - "role": "user", - "content_token_count": 6443, - "target_output_tokens": 1522 - }, - { - "role": "user", - "content_token_count": 5348, - "target_output_tokens": 662 - }, - { - "role": "user", - "content_token_count": 3869, - "target_output_tokens": 175 - }, - { - "role": "user", - "content_token_count": 5106, - "target_output_tokens": 761 - }, - { - "role": "user", - "content_token_count": 16260, - "target_output_tokens": 2221 - }, - { - "role": "user", - "content_token_count": 3983, - "target_output_tokens": 90 - }, - { - "role": "user", - "content_token_count": 2900, - "target_output_tokens": 809 - } - ] - }, - { - "turns": [ - 
{ - "role": "user", - "content_token_count": 4829, - "target_output_tokens": 226 - }, - { - "role": "user", - "content_token_count": 2384, - "target_output_tokens": 491 - }, - { - "role": "user", - "content_token_count": 26292, - "target_output_tokens": 659 - }, - { - "role": "user", - "content_token_count": 12843, - "target_output_tokens": 692 - }, - { - "role": "user", - "content_token_count": 3004, - "target_output_tokens": 300 - }, - { - "role": "user", - "content_token_count": 21070, - "target_output_tokens": 1321 - }, - { - "role": "user", - "content_token_count": 12368, - "target_output_tokens": 129 - }, - { - "role": "user", - "content_token_count": 6159, - "target_output_tokens": 1480 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 5460, - "target_output_tokens": 249 - }, - { - "role": "user", - "content_token_count": 9185, - "target_output_tokens": 229 - }, - { - "role": "user", - "content_token_count": 29343, - "target_output_tokens": 319 - }, - { - "role": "user", - "content_token_count": 7542, - "target_output_tokens": 1027 - }, - { - "role": "user", - "content_token_count": 3182, - "target_output_tokens": 248 - }, - { - "role": "user", - "content_token_count": 9888, - "target_output_tokens": 1865 - }, - { - "role": "user", - "content_token_count": 7401, - "target_output_tokens": 854 - }, - { - "role": "user", - "content_token_count": 6561, - "target_output_tokens": 654 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 6488, - "target_output_tokens": 77 - }, - { - "role": "user", - "content_token_count": 6158, - "target_output_tokens": 374 - }, - { - "role": "user", - "content_token_count": 12575, - "target_output_tokens": 1325 - }, - { - "role": "user", - "content_token_count": 18730, - "target_output_tokens": 325 - }, - { - "role": "user", - "content_token_count": 2581, - "target_output_tokens": 1027 - }, - { - "role": "user", - "content_token_count": 65536, - "target_output_tokens": 1888 - }, - { - "role": "user", - "content_token_count": 1787, - "target_output_tokens": 970 - }, - { - "role": "user", - "content_token_count": 7304, - "target_output_tokens": 181 - }, - { - "role": "user", - "content_token_count": 4038, - "target_output_tokens": 2854 - }, - { - "role": "user", - "content_token_count": 9441, - "target_output_tokens": 985 - }, - { - "role": "user", - "content_token_count": 5386, - "target_output_tokens": 550 - }, - { - "role": "user", - "content_token_count": 895, - "target_output_tokens": 550 - }, - { - "role": "user", - "content_token_count": 3238, - "target_output_tokens": 467 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 9749, - "target_output_tokens": 594 - }, - { - "role": "user", - "content_token_count": 6586, - "target_output_tokens": 303 - }, - { - "role": "user", - "content_token_count": 13734, - "target_output_tokens": 1592 - }, - { - "role": "user", - "content_token_count": 4723, - "target_output_tokens": 2155 - }, - { - "role": "user", - "content_token_count": 19342, - "target_output_tokens": 161 - }, - { - "role": "user", - "content_token_count": 7921, - "target_output_tokens": 130 - }, - { - "role": "user", - "content_token_count": 26045, - "target_output_tokens": 613 - }, - { - "role": "user", - "content_token_count": 9327, - "target_output_tokens": 158 - }, - { - "role": "user", - "content_token_count": 5054, - "target_output_tokens": 652 - }, - { - "role": "user", - "content_token_count": 65536, - "target_output_tokens": 753 - }, - { - "role": "user", - 
"content_token_count": 13763, - "target_output_tokens": 501 - }, - { - "role": "user", - "content_token_count": 7809, - "target_output_tokens": 618 - }, - { - "role": "user", - "content_token_count": 1780, - "target_output_tokens": 1609 - }, - { - "role": "user", - "content_token_count": 13566, - "target_output_tokens": 219 - }, - { - "role": "user", - "content_token_count": 8244, - "target_output_tokens": 707 - }, - { - "role": "user", - "content_token_count": 3690, - "target_output_tokens": 2575 - }, - { - "role": "user", - "content_token_count": 8579, - "target_output_tokens": 289 - }, - { - "role": "user", - "content_token_count": 13461, - "target_output_tokens": 835 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 7460, - "target_output_tokens": 564 - }, - { - "role": "user", - "content_token_count": 12306, - "target_output_tokens": 643 - }, - { - "role": "user", - "content_token_count": 4237, - "target_output_tokens": 436 - }, - { - "role": "user", - "content_token_count": 2239, - "target_output_tokens": 1437 - }, - { - "role": "user", - "content_token_count": 4323, - "target_output_tokens": 1610 - }, - { - "role": "user", - "content_token_count": 8322, - "target_output_tokens": 628 - }, - { - "role": "user", - "content_token_count": 8307, - "target_output_tokens": 321 - }, - { - "role": "user", - "content_token_count": 8038, - "target_output_tokens": 221 - }, - { - "role": "user", - "content_token_count": 9312, - "target_output_tokens": 119 - }, - { - "role": "user", - "content_token_count": 8570, - "target_output_tokens": 1070 - }, - { - "role": "user", - "content_token_count": 43634, - "target_output_tokens": 801 - }, - { - "role": "user", - "content_token_count": 9896, - "target_output_tokens": 559 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 11595, - "target_output_tokens": 458 - }, - { - "role": "user", - "content_token_count": 8292, - "target_output_tokens": 942 - }, - { - "role": "user", - "content_token_count": 3946, - "target_output_tokens": 490 - }, - { - "role": "user", - "content_token_count": 2955, - "target_output_tokens": 712 - }, - { - "role": "user", - "content_token_count": 4839, - "target_output_tokens": 272 - }, - { - "role": "user", - "content_token_count": 4011, - "target_output_tokens": 335 - }, - { - "role": "user", - "content_token_count": 5086, - "target_output_tokens": 315 - }, - { - "role": "user", - "content_token_count": 5209, - "target_output_tokens": 764 - }, - { - "role": "user", - "content_token_count": 6710, - "target_output_tokens": 146 - }, - { - "role": "user", - "content_token_count": 2382, - "target_output_tokens": 277 - }, - { - "role": "user", - "content_token_count": 18762, - "target_output_tokens": 312 - }, - { - "role": "user", - "content_token_count": 3554, - "target_output_tokens": 393 - }, - { - "role": "user", - "content_token_count": 10240, - "target_output_tokens": 130 - }, - { - "role": "user", - "content_token_count": 10301, - "target_output_tokens": 986 - }, - { - "role": "user", - "content_token_count": 4008, - "target_output_tokens": 461 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 21422, - "target_output_tokens": 346 - }, - { - "role": "user", - "content_token_count": 5246, - "target_output_tokens": 217 - }, - { - "role": "user", - "content_token_count": 13646, - "target_output_tokens": 499 - }, - { - "role": "user", - "content_token_count": 5532, - "target_output_tokens": 249 - }, - { - "role": "user", - "content_token_count": 5178, - 
"target_output_tokens": 149 - }, - { - "role": "user", - "content_token_count": 1034, - "target_output_tokens": 316 - }, - { - "role": "user", - "content_token_count": 3570, - "target_output_tokens": 318 - }, - { - "role": "user", - "content_token_count": 9334, - "target_output_tokens": 1761 - }, - { - "role": "user", - "content_token_count": 4071, - "target_output_tokens": 227 - }, - { - "role": "user", - "content_token_count": 11734, - "target_output_tokens": 340 - }, - { - "role": "user", - "content_token_count": 5927, - "target_output_tokens": 302 - }, - { - "role": "user", - "content_token_count": 7918, - "target_output_tokens": 337 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 2647, - "target_output_tokens": 301 - }, - { - "role": "user", - "content_token_count": 14271, - "target_output_tokens": 1313 - }, - { - "role": "user", - "content_token_count": 5670, - "target_output_tokens": 954 - }, - { - "role": "user", - "content_token_count": 5014, - "target_output_tokens": 2103 - }, - { - "role": "user", - "content_token_count": 14137, - "target_output_tokens": 997 - }, - { - "role": "user", - "content_token_count": 8872, - "target_output_tokens": 1332 - }, - { - "role": "user", - "content_token_count": 2096, - "target_output_tokens": 4096 - }, - { - "role": "user", - "content_token_count": 16766, - "target_output_tokens": 587 - }, - { - "role": "user", - "content_token_count": 5742, - "target_output_tokens": 493 - }, - { - "role": "user", - "content_token_count": 21664, - "target_output_tokens": 696 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 3432, - "target_output_tokens": 203 - }, - { - "role": "user", - "content_token_count": 4013, - "target_output_tokens": 79 - }, - { - "role": "user", - "content_token_count": 23484, - "target_output_tokens": 220 - }, - { - "role": "user", - "content_token_count": 1546, - "target_output_tokens": 289 - }, - { - "role": "user", - "content_token_count": 4542, - "target_output_tokens": 515 - }, - { - "role": "user", - "content_token_count": 5260, - "target_output_tokens": 378 - }, - { - "role": "user", - "content_token_count": 5487, - "target_output_tokens": 654 - }, - { - "role": "user", - "content_token_count": 7881, - "target_output_tokens": 380 - }, - { - "role": "user", - "content_token_count": 3358, - "target_output_tokens": 687 - }, - { - "role": "user", - "content_token_count": 11898, - "target_output_tokens": 180 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 38833, - "target_output_tokens": 534 - }, - { - "role": "user", - "content_token_count": 5781, - "target_output_tokens": 725 - }, - { - "role": "user", - "content_token_count": 7261, - "target_output_tokens": 165 - }, - { - "role": "user", - "content_token_count": 1280, - "target_output_tokens": 129 - }, - { - "role": "user", - "content_token_count": 5792, - "target_output_tokens": 466 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 10544, - "target_output_tokens": 692 - }, - { - "role": "user", - "content_token_count": 15136, - "target_output_tokens": 836 - }, - { - "role": "user", - "content_token_count": 5686, - "target_output_tokens": 1758 - }, - { - "role": "user", - "content_token_count": 12712, - "target_output_tokens": 2240 - }, - { - "role": "user", - "content_token_count": 4875, - "target_output_tokens": 482 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 60523, - "target_output_tokens": 271 - }, - { - "role": "user", - 
"content_token_count": 10297, - "target_output_tokens": 631 - }, - { - "role": "user", - "content_token_count": 16059, - "target_output_tokens": 648 - }, - { - "role": "user", - "content_token_count": 20684, - "target_output_tokens": 487 - }, - { - "role": "user", - "content_token_count": 6343, - "target_output_tokens": 637 - }, - { - "role": "user", - "content_token_count": 29821, - "target_output_tokens": 436 - }, - { - "role": "user", - "content_token_count": 2615, - "target_output_tokens": 187 - }, - { - "role": "user", - "content_token_count": 4564, - "target_output_tokens": 980 - }, - { - "role": "user", - "content_token_count": 7889, - "target_output_tokens": 907 - }, - { - "role": "user", - "content_token_count": 14777, - "target_output_tokens": 361 - }, - { - "role": "user", - "content_token_count": 5646, - "target_output_tokens": 1521 - }, - { - "role": "user", - "content_token_count": 13268, - "target_output_tokens": 554 - }, - { - "role": "user", - "content_token_count": 10637, - "target_output_tokens": 1013 - }, - { - "role": "user", - "content_token_count": 5757, - "target_output_tokens": 1339 - }, - { - "role": "user", - "content_token_count": 5184, - "target_output_tokens": 628 - }, - { - "role": "user", - "content_token_count": 12479, - "target_output_tokens": 792 - }, - { - "role": "user", - "content_token_count": 18012, - "target_output_tokens": 167 - }, - { - "role": "user", - "content_token_count": 14643, - "target_output_tokens": 532 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 1938, - "target_output_tokens": 1098 - }, - { - "role": "user", - "content_token_count": 685, - "target_output_tokens": 986 - }, - { - "role": "user", - "content_token_count": 3023, - "target_output_tokens": 292 - }, - { - "role": "user", - "content_token_count": 26370, - "target_output_tokens": 332 - }, - { - "role": "user", - "content_token_count": 7935, - "target_output_tokens": 179 - }, - { - "role": "user", - "content_token_count": 2052, - "target_output_tokens": 99 - }, - { - "role": "user", - "content_token_count": 5165, - "target_output_tokens": 747 - }, - { - "role": "user", - "content_token_count": 13734, - "target_output_tokens": 435 - }, - { - "role": "user", - "content_token_count": 979, - "target_output_tokens": 760 - }, - { - "role": "user", - "content_token_count": 4084, - "target_output_tokens": 604 - }, - { - "role": "user", - "content_token_count": 19546, - "target_output_tokens": 183 - }, - { - "role": "user", - "content_token_count": 1609, - "target_output_tokens": 191 - }, - { - "role": "user", - "content_token_count": 3857, - "target_output_tokens": 1024 - }, - { - "role": "user", - "content_token_count": 21131, - "target_output_tokens": 1830 - }, - { - "role": "user", - "content_token_count": 4129, - "target_output_tokens": 343 - }, - { - "role": "user", - "content_token_count": 30740, - "target_output_tokens": 635 - }, - { - "role": "user", - "content_token_count": 10871, - "target_output_tokens": 995 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 8416, - "target_output_tokens": 664 - }, - { - "role": "user", - "content_token_count": 6856, - "target_output_tokens": 360 - }, - { - "role": "user", - "content_token_count": 12991, - "target_output_tokens": 1554 - }, - { - "role": "user", - "content_token_count": 2681, - "target_output_tokens": 1392 - }, - { - "role": "user", - "content_token_count": 2083, - "target_output_tokens": 1322 - }, - { - "role": "user", - "content_token_count": 2529, - 
"target_output_tokens": 862 - }, - { - "role": "user", - "content_token_count": 4854, - "target_output_tokens": 412 - }, - { - "role": "user", - "content_token_count": 5826, - "target_output_tokens": 904 - }, - { - "role": "user", - "content_token_count": 1412, - "target_output_tokens": 197 - }, - { - "role": "user", - "content_token_count": 16884, - "target_output_tokens": 319 - }, - { - "role": "user", - "content_token_count": 2209, - "target_output_tokens": 370 - }, - { - "role": "user", - "content_token_count": 6010, - "target_output_tokens": 1294 - }, - { - "role": "user", - "content_token_count": 19805, - "target_output_tokens": 2855 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 7510, - "target_output_tokens": 354 - }, - { - "role": "user", - "content_token_count": 20508, - "target_output_tokens": 390 - }, - { - "role": "user", - "content_token_count": 14364, - "target_output_tokens": 234 - }, - { - "role": "user", - "content_token_count": 5578, - "target_output_tokens": 672 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 7461, - "target_output_tokens": 2138 - }, - { - "role": "user", - "content_token_count": 8915, - "target_output_tokens": 721 - }, - { - "role": "user", - "content_token_count": 827, - "target_output_tokens": 458 - }, - { - "role": "user", - "content_token_count": 5858, - "target_output_tokens": 252 - }, - { - "role": "user", - "content_token_count": 3199, - "target_output_tokens": 864 - }, - { - "role": "user", - "content_token_count": 17479, - "target_output_tokens": 387 - }, - { - "role": "user", - "content_token_count": 6488, - "target_output_tokens": 768 - }, - { - "role": "user", - "content_token_count": 11265, - "target_output_tokens": 797 - }, - { - "role": "user", - "content_token_count": 6991, - "target_output_tokens": 802 - }, - { - "role": "user", - "content_token_count": 12962, - "target_output_tokens": 559 - }, - { - "role": "user", - "content_token_count": 6638, - "target_output_tokens": 2509 - }, - { - "role": "user", - "content_token_count": 2297, - "target_output_tokens": 803 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 11614, - "target_output_tokens": 248 - }, - { - "role": "user", - "content_token_count": 3234, - "target_output_tokens": 64 - }, - { - "role": "user", - "content_token_count": 18001, - "target_output_tokens": 64 - }, - { - "role": "user", - "content_token_count": 17797, - "target_output_tokens": 792 - }, - { - "role": "user", - "content_token_count": 15525, - "target_output_tokens": 341 - }, - { - "role": "user", - "content_token_count": 11380, - "target_output_tokens": 308 - }, - { - "role": "user", - "content_token_count": 20150, - "target_output_tokens": 336 - }, - { - "role": "user", - "content_token_count": 10705, - "target_output_tokens": 149 - }, - { - "role": "user", - "content_token_count": 5871, - "target_output_tokens": 432 - }, - { - "role": "user", - "content_token_count": 5526, - "target_output_tokens": 406 - }, - { - "role": "user", - "content_token_count": 7675, - "target_output_tokens": 1587 - }, - { - "role": "user", - "content_token_count": 2277, - "target_output_tokens": 1478 - }, - { - "role": "user", - "content_token_count": 9244, - "target_output_tokens": 168 - }, - { - "role": "user", - "content_token_count": 9135, - "target_output_tokens": 141 - }, - { - "role": "user", - "content_token_count": 6477, - "target_output_tokens": 847 - }, - { - "role": "user", - "content_token_count": 5213, - "target_output_tokens": 381 - 
} - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 11902, - "target_output_tokens": 363 - }, - { - "role": "user", - "content_token_count": 4133, - "target_output_tokens": 763 - }, - { - "role": "user", - "content_token_count": 34974, - "target_output_tokens": 595 - }, - { - "role": "user", - "content_token_count": 3005, - "target_output_tokens": 748 - }, - { - "role": "user", - "content_token_count": 13140, - "target_output_tokens": 1585 - }, - { - "role": "user", - "content_token_count": 10800, - "target_output_tokens": 451 - }, - { - "role": "user", - "content_token_count": 7703, - "target_output_tokens": 308 - }, - { - "role": "user", - "content_token_count": 6180, - "target_output_tokens": 421 - }, - { - "role": "user", - "content_token_count": 7095, - "target_output_tokens": 2469 - }, - { - "role": "user", - "content_token_count": 27521, - "target_output_tokens": 645 - }, - { - "role": "user", - "content_token_count": 14207, - "target_output_tokens": 615 - }, - { - "role": "user", - "content_token_count": 7467, - "target_output_tokens": 736 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 20561, - "target_output_tokens": 111 - }, - { - "role": "user", - "content_token_count": 1000, - "target_output_tokens": 934 - }, - { - "role": "user", - "content_token_count": 32461, - "target_output_tokens": 115 - }, - { - "role": "user", - "content_token_count": 7010, - "target_output_tokens": 128 - }, - { - "role": "user", - "content_token_count": 65536, - "target_output_tokens": 567 - }, - { - "role": "user", - "content_token_count": 9176, - "target_output_tokens": 146 - }, - { - "role": "user", - "content_token_count": 11138, - "target_output_tokens": 2089 - }, - { - "role": "user", - "content_token_count": 24757, - "target_output_tokens": 204 - }, - { - "role": "user", - "content_token_count": 6580, - "target_output_tokens": 1229 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 4856, - "target_output_tokens": 587 - }, - { - "role": "user", - "content_token_count": 4192, - "target_output_tokens": 631 - }, - { - "role": "user", - "content_token_count": 7377, - "target_output_tokens": 358 - }, - { - "role": "user", - "content_token_count": 4030, - "target_output_tokens": 437 - }, - { - "role": "user", - "content_token_count": 8482, - "target_output_tokens": 404 - }, - { - "role": "user", - "content_token_count": 10934, - "target_output_tokens": 397 - }, - { - "role": "user", - "content_token_count": 5271, - "target_output_tokens": 105 - }, - { - "role": "user", - "content_token_count": 1504, - "target_output_tokens": 207 - }, - { - "role": "user", - "content_token_count": 12542, - "target_output_tokens": 497 - }, - { - "role": "user", - "content_token_count": 3169, - "target_output_tokens": 418 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 34022, - "target_output_tokens": 920 - }, - { - "role": "user", - "content_token_count": 4306, - "target_output_tokens": 383 - }, - { - "role": "user", - "content_token_count": 3490, - "target_output_tokens": 1086 - }, - { - "role": "user", - "content_token_count": 3939, - "target_output_tokens": 1038 - }, - { - "role": "user", - "content_token_count": 26508, - "target_output_tokens": 1136 - }, - { - "role": "user", - "content_token_count": 7044, - "target_output_tokens": 3317 - }, - { - "role": "user", - "content_token_count": 2441, - "target_output_tokens": 962 - }, - { - "role": "user", - "content_token_count": 2360, - "target_output_tokens": 442 - 
} - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 13707, - "target_output_tokens": 159 - }, - { - "role": "user", - "content_token_count": 3362, - "target_output_tokens": 495 - }, - { - "role": "user", - "content_token_count": 3014, - "target_output_tokens": 156 - }, - { - "role": "user", - "content_token_count": 9534, - "target_output_tokens": 430 - }, - { - "role": "user", - "content_token_count": 8037, - "target_output_tokens": 724 - }, - { - "role": "user", - "content_token_count": 12462, - "target_output_tokens": 814 - }, - { - "role": "user", - "content_token_count": 18227, - "target_output_tokens": 371 - }, - { - "role": "user", - "content_token_count": 2077, - "target_output_tokens": 867 - }, - { - "role": "user", - "content_token_count": 10950, - "target_output_tokens": 412 - }, - { - "role": "user", - "content_token_count": 12169, - "target_output_tokens": 331 - }, - { - "role": "user", - "content_token_count": 4436, - "target_output_tokens": 260 - }, - { - "role": "user", - "content_token_count": 2961, - "target_output_tokens": 952 - }, - { - "role": "user", - "content_token_count": 21323, - "target_output_tokens": 1066 - }, - { - "role": "user", - "content_token_count": 14035, - "target_output_tokens": 1134 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 14500, - "target_output_tokens": 1813 - }, - { - "role": "user", - "content_token_count": 4751, - "target_output_tokens": 1726 - }, - { - "role": "user", - "content_token_count": 14083, - "target_output_tokens": 444 - }, - { - "role": "user", - "content_token_count": 2668, - "target_output_tokens": 199 - }, - { - "role": "user", - "content_token_count": 6391, - "target_output_tokens": 3392 - }, - { - "role": "user", - "content_token_count": 33050, - "target_output_tokens": 2319 - }, - { - "role": "user", - "content_token_count": 19617, - "target_output_tokens": 401 - }, - { - "role": "user", - "content_token_count": 9052, - "target_output_tokens": 220 - }, - { - "role": "user", - "content_token_count": 21741, - "target_output_tokens": 1047 - }, - { - "role": "user", - "content_token_count": 19064, - "target_output_tokens": 340 - }, - { - "role": "user", - "content_token_count": 1184, - "target_output_tokens": 804 - }, - { - "role": "user", - "content_token_count": 50708, - "target_output_tokens": 1268 - }, - { - "role": "user", - "content_token_count": 1043, - "target_output_tokens": 528 - }, - { - "role": "user", - "content_token_count": 7976, - "target_output_tokens": 600 - }, - { - "role": "user", - "content_token_count": 2967, - "target_output_tokens": 193 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 4241, - "target_output_tokens": 1292 - }, - { - "role": "user", - "content_token_count": 8073, - "target_output_tokens": 1244 - }, - { - "role": "user", - "content_token_count": 21650, - "target_output_tokens": 603 - }, - { - "role": "user", - "content_token_count": 30704, - "target_output_tokens": 109 - }, - { - "role": "user", - "content_token_count": 3793, - "target_output_tokens": 486 - }, - { - "role": "user", - "content_token_count": 65536, - "target_output_tokens": 455 - }, - { - "role": "user", - "content_token_count": 12867, - "target_output_tokens": 244 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 5205, - "target_output_tokens": 190 - }, - { - "role": "user", - "content_token_count": 9530, - "target_output_tokens": 323 - }, - { - "role": "user", - "content_token_count": 5813, - "target_output_tokens": 
662 - }, - { - "role": "user", - "content_token_count": 6079, - "target_output_tokens": 710 - }, - { - "role": "user", - "content_token_count": 3766, - "target_output_tokens": 319 - }, - { - "role": "user", - "content_token_count": 10983, - "target_output_tokens": 419 - }, - { - "role": "user", - "content_token_count": 38098, - "target_output_tokens": 897 - }, - { - "role": "user", - "content_token_count": 7410, - "target_output_tokens": 1273 - }, - { - "role": "user", - "content_token_count": 6534, - "target_output_tokens": 439 - }, - { - "role": "user", - "content_token_count": 2603, - "target_output_tokens": 363 - }, - { - "role": "user", - "content_token_count": 4395, - "target_output_tokens": 72 - }, - { - "role": "user", - "content_token_count": 6739, - "target_output_tokens": 424 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 23588, - "target_output_tokens": 164 - }, - { - "role": "user", - "content_token_count": 17832, - "target_output_tokens": 506 - }, - { - "role": "user", - "content_token_count": 22461, - "target_output_tokens": 198 - }, - { - "role": "user", - "content_token_count": 10329, - "target_output_tokens": 1380 - }, - { - "role": "user", - "content_token_count": 16613, - "target_output_tokens": 523 - }, - { - "role": "user", - "content_token_count": 18924, - "target_output_tokens": 1091 - }, - { - "role": "user", - "content_token_count": 6640, - "target_output_tokens": 936 - }, - { - "role": "user", - "content_token_count": 5752, - "target_output_tokens": 1079 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 16422, - "target_output_tokens": 611 - }, - { - "role": "user", - "content_token_count": 8736, - "target_output_tokens": 1393 - }, - { - "role": "user", - "content_token_count": 30989, - "target_output_tokens": 357 - }, - { - "role": "user", - "content_token_count": 32378, - "target_output_tokens": 365 - }, - { - "role": "user", - "content_token_count": 4826, - "target_output_tokens": 1142 - }, - { - "role": "user", - "content_token_count": 7705, - "target_output_tokens": 2254 - }, - { - "role": "user", - "content_token_count": 1630, - "target_output_tokens": 1219 - }, - { - "role": "user", - "content_token_count": 5323, - "target_output_tokens": 838 - }, - { - "role": "user", - "content_token_count": 21581, - "target_output_tokens": 654 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 8355, - "target_output_tokens": 529 - }, - { - "role": "user", - "content_token_count": 33639, - "target_output_tokens": 650 - }, - { - "role": "user", - "content_token_count": 9794, - "target_output_tokens": 355 - }, - { - "role": "user", - "content_token_count": 5952, - "target_output_tokens": 608 - }, - { - "role": "user", - "content_token_count": 7696, - "target_output_tokens": 163 - }, - { - "role": "user", - "content_token_count": 8151, - "target_output_tokens": 108 - }, - { - "role": "user", - "content_token_count": 11377, - "target_output_tokens": 486 - }, - { - "role": "user", - "content_token_count": 2795, - "target_output_tokens": 765 - }, - { - "role": "user", - "content_token_count": 8478, - "target_output_tokens": 361 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 3254, - "target_output_tokens": 524 - }, - { - "role": "user", - "content_token_count": 13573, - "target_output_tokens": 1371 - }, - { - "role": "user", - "content_token_count": 4347, - "target_output_tokens": 538 - }, - { - "role": "user", - "content_token_count": 52807, - 
"target_output_tokens": 1303 - }, - { - "role": "user", - "content_token_count": 6319, - "target_output_tokens": 278 - }, - { - "role": "user", - "content_token_count": 4295, - "target_output_tokens": 640 - }, - { - "role": "user", - "content_token_count": 2030, - "target_output_tokens": 358 - }, - { - "role": "user", - "content_token_count": 13300, - "target_output_tokens": 504 - }, - { - "role": "user", - "content_token_count": 4151, - "target_output_tokens": 1040 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 10729, - "target_output_tokens": 621 - }, - { - "role": "user", - "content_token_count": 6674, - "target_output_tokens": 433 - }, - { - "role": "user", - "content_token_count": 11618, - "target_output_tokens": 156 - }, - { - "role": "user", - "content_token_count": 13713, - "target_output_tokens": 934 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 9731, - "target_output_tokens": 318 - }, - { - "role": "user", - "content_token_count": 65536, - "target_output_tokens": 507 - }, - { - "role": "user", - "content_token_count": 3019, - "target_output_tokens": 450 - }, - { - "role": "user", - "content_token_count": 10288, - "target_output_tokens": 668 - }, - { - "role": "user", - "content_token_count": 22301, - "target_output_tokens": 815 - }, - { - "role": "user", - "content_token_count": 5283, - "target_output_tokens": 275 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 3544, - "target_output_tokens": 843 - }, - { - "role": "user", - "content_token_count": 7783, - "target_output_tokens": 332 - }, - { - "role": "user", - "content_token_count": 2684, - "target_output_tokens": 845 - }, - { - "role": "user", - "content_token_count": 10549, - "target_output_tokens": 275 - }, - { - "role": "user", - "content_token_count": 9460, - "target_output_tokens": 608 - }, - { - "role": "user", - "content_token_count": 3164, - "target_output_tokens": 542 - }, - { - "role": "user", - "content_token_count": 3760, - "target_output_tokens": 494 - }, - { - "role": "user", - "content_token_count": 5991, - "target_output_tokens": 458 - }, - { - "role": "user", - "content_token_count": 3873, - "target_output_tokens": 800 - }, - { - "role": "user", - "content_token_count": 4054, - "target_output_tokens": 400 - }, - { - "role": "user", - "content_token_count": 3102, - "target_output_tokens": 2786 - }, - { - "role": "user", - "content_token_count": 5452, - "target_output_tokens": 3343 - }, - { - "role": "user", - "content_token_count": 2904, - "target_output_tokens": 483 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 2269, - "target_output_tokens": 738 - }, - { - "role": "user", - "content_token_count": 18252, - "target_output_tokens": 64 - }, - { - "role": "user", - "content_token_count": 16077, - "target_output_tokens": 369 - }, - { - "role": "user", - "content_token_count": 2591, - "target_output_tokens": 1498 - }, - { - "role": "user", - "content_token_count": 955, - "target_output_tokens": 964 - }, - { - "role": "user", - "content_token_count": 15421, - "target_output_tokens": 1148 - }, - { - "role": "user", - "content_token_count": 26417, - "target_output_tokens": 282 - }, - { - "role": "user", - "content_token_count": 2450, - "target_output_tokens": 641 - }, - { - "role": "user", - "content_token_count": 3723, - "target_output_tokens": 1544 - }, - { - "role": "user", - "content_token_count": 24848, - "target_output_tokens": 1652 - }, - { - "role": "user", - "content_token_count": 1198, - 
"target_output_tokens": 303 - }, - { - "role": "user", - "content_token_count": 3660, - "target_output_tokens": 378 - }, - { - "role": "user", - "content_token_count": 8385, - "target_output_tokens": 971 - }, - { - "role": "user", - "content_token_count": 17089, - "target_output_tokens": 146 - }, - { - "role": "user", - "content_token_count": 13626, - "target_output_tokens": 1436 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 6980, - "target_output_tokens": 779 - }, - { - "role": "user", - "content_token_count": 14266, - "target_output_tokens": 998 - }, - { - "role": "user", - "content_token_count": 19395, - "target_output_tokens": 931 - }, - { - "role": "user", - "content_token_count": 27605, - "target_output_tokens": 864 - }, - { - "role": "user", - "content_token_count": 7245, - "target_output_tokens": 462 - }, - { - "role": "user", - "content_token_count": 3242, - "target_output_tokens": 90 - }, - { - "role": "user", - "content_token_count": 2781, - "target_output_tokens": 1296 - }, - { - "role": "user", - "content_token_count": 1676, - "target_output_tokens": 1609 - }, - { - "role": "user", - "content_token_count": 9287, - "target_output_tokens": 1339 - }, - { - "role": "user", - "content_token_count": 7842, - "target_output_tokens": 686 - }, - { - "role": "user", - "content_token_count": 7397, - "target_output_tokens": 133 - }, - { - "role": "user", - "content_token_count": 12946, - "target_output_tokens": 579 - }, - { - "role": "user", - "content_token_count": 6842, - "target_output_tokens": 1282 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 14195, - "target_output_tokens": 466 - }, - { - "role": "user", - "content_token_count": 4463, - "target_output_tokens": 558 - }, - { - "role": "user", - "content_token_count": 1089, - "target_output_tokens": 2126 - }, - { - "role": "user", - "content_token_count": 9114, - "target_output_tokens": 483 - }, - { - "role": "user", - "content_token_count": 4745, - "target_output_tokens": 810 - }, - { - "role": "user", - "content_token_count": 11648, - "target_output_tokens": 395 - }, - { - "role": "user", - "content_token_count": 2438, - "target_output_tokens": 444 - }, - { - "role": "user", - "content_token_count": 15094, - "target_output_tokens": 357 - }, - { - "role": "user", - "content_token_count": 5004, - "target_output_tokens": 1692 - }, - { - "role": "user", - "content_token_count": 17422, - "target_output_tokens": 161 - }, - { - "role": "user", - "content_token_count": 18830, - "target_output_tokens": 350 - }, - { - "role": "user", - "content_token_count": 3203, - "target_output_tokens": 1336 - }, - { - "role": "user", - "content_token_count": 4912, - "target_output_tokens": 1071 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 10200, - "target_output_tokens": 315 - }, - { - "role": "user", - "content_token_count": 43481, - "target_output_tokens": 953 - }, - { - "role": "user", - "content_token_count": 6381, - "target_output_tokens": 473 - }, - { - "role": "user", - "content_token_count": 2352, - "target_output_tokens": 361 - }, - { - "role": "user", - "content_token_count": 11246, - "target_output_tokens": 486 - }, - { - "role": "user", - "content_token_count": 38916, - "target_output_tokens": 252 - }, - { - "role": "user", - "content_token_count": 29292, - "target_output_tokens": 332 - }, - { - "role": "user", - "content_token_count": 7163, - "target_output_tokens": 737 - }, - { - "role": "user", - "content_token_count": 4145, - "target_output_tokens": 
316 - }, - { - "role": "user", - "content_token_count": 4769, - "target_output_tokens": 298 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 5594, - "target_output_tokens": 1686 - }, - { - "role": "user", - "content_token_count": 4311, - "target_output_tokens": 398 - }, - { - "role": "user", - "content_token_count": 13684, - "target_output_tokens": 419 - }, - { - "role": "user", - "content_token_count": 33855, - "target_output_tokens": 188 - }, - { - "role": "user", - "content_token_count": 2118, - "target_output_tokens": 1128 - }, - { - "role": "user", - "content_token_count": 2030, - "target_output_tokens": 184 - }, - { - "role": "user", - "content_token_count": 10739, - "target_output_tokens": 561 - }, - { - "role": "user", - "content_token_count": 5555, - "target_output_tokens": 366 - }, - { - "role": "user", - "content_token_count": 16640, - "target_output_tokens": 668 - }, - { - "role": "user", - "content_token_count": 23253, - "target_output_tokens": 884 - }, - { - "role": "user", - "content_token_count": 3965, - "target_output_tokens": 740 - }, - { - "role": "user", - "content_token_count": 8551, - "target_output_tokens": 1807 - }, - { - "role": "user", - "content_token_count": 3578, - "target_output_tokens": 766 - }, - { - "role": "user", - "content_token_count": 4639, - "target_output_tokens": 1157 - }, - { - "role": "user", - "content_token_count": 6212, - "target_output_tokens": 437 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 5004, - "target_output_tokens": 178 - }, - { - "role": "user", - "content_token_count": 5596, - "target_output_tokens": 867 - }, - { - "role": "user", - "content_token_count": 12366, - "target_output_tokens": 1221 - }, - { - "role": "user", - "content_token_count": 5092, - "target_output_tokens": 167 - }, - { - "role": "user", - "content_token_count": 11259, - "target_output_tokens": 286 - }, - { - "role": "user", - "content_token_count": 18357, - "target_output_tokens": 1419 - }, - { - "role": "user", - "content_token_count": 12445, - "target_output_tokens": 425 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 1753, - "target_output_tokens": 457 - }, - { - "role": "user", - "content_token_count": 4410, - "target_output_tokens": 138 - }, - { - "role": "user", - "content_token_count": 3759, - "target_output_tokens": 295 - }, - { - "role": "user", - "content_token_count": 11816, - "target_output_tokens": 830 - }, - { - "role": "user", - "content_token_count": 16209, - "target_output_tokens": 141 - }, - { - "role": "user", - "content_token_count": 46023, - "target_output_tokens": 2056 - }, - { - "role": "user", - "content_token_count": 5420, - "target_output_tokens": 422 - }, - { - "role": "user", - "content_token_count": 2445, - "target_output_tokens": 2119 - }, - { - "role": "user", - "content_token_count": 3724, - "target_output_tokens": 1277 - }, - { - "role": "user", - "content_token_count": 3168, - "target_output_tokens": 391 - }, - { - "role": "user", - "content_token_count": 9061, - "target_output_tokens": 1199 - }, - { - "role": "user", - "content_token_count": 4255, - "target_output_tokens": 1880 - }, - { - "role": "user", - "content_token_count": 20542, - "target_output_tokens": 449 - }, - { - "role": "user", - "content_token_count": 18541, - "target_output_tokens": 211 - }, - { - "role": "user", - "content_token_count": 17405, - "target_output_tokens": 878 - }, - { - "role": "user", - "content_token_count": 7086, - "target_output_tokens": 396 - }, - { - 
"role": "user", - "content_token_count": 4469, - "target_output_tokens": 189 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 4594, - "target_output_tokens": 567 - }, - { - "role": "user", - "content_token_count": 15961, - "target_output_tokens": 276 - }, - { - "role": "user", - "content_token_count": 18817, - "target_output_tokens": 296 - }, - { - "role": "user", - "content_token_count": 8980, - "target_output_tokens": 446 - }, - { - "role": "user", - "content_token_count": 13739, - "target_output_tokens": 476 - }, - { - "role": "user", - "content_token_count": 4954, - "target_output_tokens": 1124 - }, - { - "role": "user", - "content_token_count": 7155, - "target_output_tokens": 2553 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 8108, - "target_output_tokens": 337 - }, - { - "role": "user", - "content_token_count": 7213, - "target_output_tokens": 198 - }, - { - "role": "user", - "content_token_count": 6441, - "target_output_tokens": 932 - }, - { - "role": "user", - "content_token_count": 25889, - "target_output_tokens": 494 - }, - { - "role": "user", - "content_token_count": 5672, - "target_output_tokens": 322 - }, - { - "role": "user", - "content_token_count": 6174, - "target_output_tokens": 984 - }, - { - "role": "user", - "content_token_count": 13080, - "target_output_tokens": 594 - }, - { - "role": "user", - "content_token_count": 23119, - "target_output_tokens": 64 - }, - { - "role": "user", - "content_token_count": 10812, - "target_output_tokens": 939 - }, - { - "role": "user", - "content_token_count": 27801, - "target_output_tokens": 925 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 3640, - "target_output_tokens": 108 - }, - { - "role": "user", - "content_token_count": 2053, - "target_output_tokens": 655 - }, - { - "role": "user", - "content_token_count": 16255, - "target_output_tokens": 1911 - }, - { - "role": "user", - "content_token_count": 13439, - "target_output_tokens": 629 - }, - { - "role": "user", - "content_token_count": 25472, - "target_output_tokens": 1323 - }, - { - "role": "user", - "content_token_count": 10114, - "target_output_tokens": 674 - }, - { - "role": "user", - "content_token_count": 1708, - "target_output_tokens": 1493 - }, - { - "role": "user", - "content_token_count": 5384, - "target_output_tokens": 1587 - }, - { - "role": "user", - "content_token_count": 6730, - "target_output_tokens": 408 - }, - { - "role": "user", - "content_token_count": 1746, - "target_output_tokens": 413 - }, - { - "role": "user", - "content_token_count": 1684, - "target_output_tokens": 1349 - }, - { - "role": "user", - "content_token_count": 22551, - "target_output_tokens": 426 - }, - { - "role": "user", - "content_token_count": 10297, - "target_output_tokens": 772 - }, - { - "role": "user", - "content_token_count": 13002, - "target_output_tokens": 1444 - }, - { - "role": "user", - "content_token_count": 16737, - "target_output_tokens": 1199 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 7675, - "target_output_tokens": 354 - }, - { - "role": "user", - "content_token_count": 5654, - "target_output_tokens": 220 - }, - { - "role": "user", - "content_token_count": 946, - "target_output_tokens": 515 - }, - { - "role": "user", - "content_token_count": 6573, - "target_output_tokens": 1712 - }, - { - "role": "user", - "content_token_count": 47344, - "target_output_tokens": 554 - }, - { - "role": "user", - "content_token_count": 10099, - "target_output_tokens": 1064 - } - 
] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 4184, - "target_output_tokens": 213 - }, - { - "role": "user", - "content_token_count": 20020, - "target_output_tokens": 727 - }, - { - "role": "user", - "content_token_count": 5788, - "target_output_tokens": 464 - }, - { - "role": "user", - "content_token_count": 16426, - "target_output_tokens": 188 - }, - { - "role": "user", - "content_token_count": 6170, - "target_output_tokens": 1080 - }, - { - "role": "user", - "content_token_count": 12316, - "target_output_tokens": 659 - }, - { - "role": "user", - "content_token_count": 2817, - "target_output_tokens": 148 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 14649, - "target_output_tokens": 769 - }, - { - "role": "user", - "content_token_count": 13707, - "target_output_tokens": 314 - }, - { - "role": "user", - "content_token_count": 1901, - "target_output_tokens": 480 - }, - { - "role": "user", - "content_token_count": 4892, - "target_output_tokens": 562 - }, - { - "role": "user", - "content_token_count": 18481, - "target_output_tokens": 195 - }, - { - "role": "user", - "content_token_count": 3762, - "target_output_tokens": 564 - }, - { - "role": "user", - "content_token_count": 8463, - "target_output_tokens": 286 - }, - { - "role": "user", - "content_token_count": 11078, - "target_output_tokens": 90 - }, - { - "role": "user", - "content_token_count": 1106, - "target_output_tokens": 2149 - }, - { - "role": "user", - "content_token_count": 3393, - "target_output_tokens": 1477 - }, - { - "role": "user", - "content_token_count": 65536, - "target_output_tokens": 285 - }, - { - "role": "user", - "content_token_count": 11370, - "target_output_tokens": 417 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 19821, - "target_output_tokens": 217 - }, - { - "role": "user", - "content_token_count": 20454, - "target_output_tokens": 689 - }, - { - "role": "user", - "content_token_count": 6158, - "target_output_tokens": 495 - }, - { - "role": "user", - "content_token_count": 10407, - "target_output_tokens": 172 - }, - { - "role": "user", - "content_token_count": 6777, - "target_output_tokens": 244 - }, - { - "role": "user", - "content_token_count": 52928, - "target_output_tokens": 476 - }, - { - "role": "user", - "content_token_count": 42478, - "target_output_tokens": 223 - }, - { - "role": "user", - "content_token_count": 4347, - "target_output_tokens": 593 - }, - { - "role": "user", - "content_token_count": 12237, - "target_output_tokens": 123 - }, - { - "role": "user", - "content_token_count": 17586, - "target_output_tokens": 598 - }, - { - "role": "user", - "content_token_count": 2461, - "target_output_tokens": 501 - }, - { - "role": "user", - "content_token_count": 4825, - "target_output_tokens": 168 - }, - { - "role": "user", - "content_token_count": 2679, - "target_output_tokens": 2852 - }, - { - "role": "user", - "content_token_count": 7837, - "target_output_tokens": 492 - }, - { - "role": "user", - "content_token_count": 65536, - "target_output_tokens": 277 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 5214, - "target_output_tokens": 2004 - }, - { - "role": "user", - "content_token_count": 11163, - "target_output_tokens": 2005 - }, - { - "role": "user", - "content_token_count": 25193, - "target_output_tokens": 211 - }, - { - "role": "user", - "content_token_count": 2010, - "target_output_tokens": 256 - }, - { - "role": "user", - "content_token_count": 9992, - "target_output_tokens": 1115 - 
}, - { - "role": "user", - "content_token_count": 12896, - "target_output_tokens": 623 - }, - { - "role": "user", - "content_token_count": 3791, - "target_output_tokens": 998 - }, - { - "role": "user", - "content_token_count": 8003, - "target_output_tokens": 338 - }, - { - "role": "user", - "content_token_count": 4495, - "target_output_tokens": 552 - }, - { - "role": "user", - "content_token_count": 1634, - "target_output_tokens": 2271 - }, - { - "role": "user", - "content_token_count": 5760, - "target_output_tokens": 97 - }, - { - "role": "user", - "content_token_count": 10434, - "target_output_tokens": 609 - }, - { - "role": "user", - "content_token_count": 23376, - "target_output_tokens": 112 - }, - { - "role": "user", - "content_token_count": 8046, - "target_output_tokens": 544 - }, - { - "role": "user", - "content_token_count": 1341, - "target_output_tokens": 1666 - }, - { - "role": "user", - "content_token_count": 12979, - "target_output_tokens": 341 - }, - { - "role": "user", - "content_token_count": 8061, - "target_output_tokens": 463 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 14288, - "target_output_tokens": 1379 - }, - { - "role": "user", - "content_token_count": 7502, - "target_output_tokens": 164 - }, - { - "role": "user", - "content_token_count": 2894, - "target_output_tokens": 68 - }, - { - "role": "user", - "content_token_count": 28437, - "target_output_tokens": 318 - }, - { - "role": "user", - "content_token_count": 9110, - "target_output_tokens": 780 - }, - { - "role": "user", - "content_token_count": 7833, - "target_output_tokens": 1300 - }, - { - "role": "user", - "content_token_count": 35537, - "target_output_tokens": 227 - }, - { - "role": "user", - "content_token_count": 6575, - "target_output_tokens": 341 - }, - { - "role": "user", - "content_token_count": 5057, - "target_output_tokens": 747 - }, - { - "role": "user", - "content_token_count": 1020, - "target_output_tokens": 566 - }, - { - "role": "user", - "content_token_count": 29797, - "target_output_tokens": 461 - }, - { - "role": "user", - "content_token_count": 6275, - "target_output_tokens": 244 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 5975, - "target_output_tokens": 713 - }, - { - "role": "user", - "content_token_count": 4182, - "target_output_tokens": 813 - }, - { - "role": "user", - "content_token_count": 31157, - "target_output_tokens": 394 - }, - { - "role": "user", - "content_token_count": 5352, - "target_output_tokens": 628 - }, - { - "role": "user", - "content_token_count": 5323, - "target_output_tokens": 468 - }, - { - "role": "user", - "content_token_count": 8404, - "target_output_tokens": 603 - }, - { - "role": "user", - "content_token_count": 10457, - "target_output_tokens": 528 - }, - { - "role": "user", - "content_token_count": 21616, - "target_output_tokens": 1002 - }, - { - "role": "user", - "content_token_count": 11231, - "target_output_tokens": 266 - }, - { - "role": "user", - "content_token_count": 3555, - "target_output_tokens": 981 - }, - { - "role": "user", - "content_token_count": 2347, - "target_output_tokens": 311 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 512, - "target_output_tokens": 1289 - }, - { - "role": "user", - "content_token_count": 14824, - "target_output_tokens": 595 - }, - { - "role": "user", - "content_token_count": 2459, - "target_output_tokens": 491 - }, - { - "role": "user", - "content_token_count": 5155, - "target_output_tokens": 854 - }, - { - "role": "user", - 
"content_token_count": 1706, - "target_output_tokens": 335 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 4693, - "target_output_tokens": 552 - }, - { - "role": "user", - "content_token_count": 3717, - "target_output_tokens": 321 - }, - { - "role": "user", - "content_token_count": 11640, - "target_output_tokens": 525 - }, - { - "role": "user", - "content_token_count": 7120, - "target_output_tokens": 1424 - }, - { - "role": "user", - "content_token_count": 6218, - "target_output_tokens": 1656 - }, - { - "role": "user", - "content_token_count": 11256, - "target_output_tokens": 3945 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 6313, - "target_output_tokens": 1528 - }, - { - "role": "user", - "content_token_count": 5148, - "target_output_tokens": 196 - }, - { - "role": "user", - "content_token_count": 15406, - "target_output_tokens": 461 - }, - { - "role": "user", - "content_token_count": 2451, - "target_output_tokens": 404 - }, - { - "role": "user", - "content_token_count": 9688, - "target_output_tokens": 847 - }, - { - "role": "user", - "content_token_count": 14736, - "target_output_tokens": 366 - }, - { - "role": "user", - "content_token_count": 8049, - "target_output_tokens": 1021 - }, - { - "role": "user", - "content_token_count": 5751, - "target_output_tokens": 3843 - }, - { - "role": "user", - "content_token_count": 11137, - "target_output_tokens": 390 - }, - { - "role": "user", - "content_token_count": 34636, - "target_output_tokens": 895 - }, - { - "role": "user", - "content_token_count": 11915, - "target_output_tokens": 599 - }, - { - "role": "user", - "content_token_count": 8409, - "target_output_tokens": 86 - }, - { - "role": "user", - "content_token_count": 3406, - "target_output_tokens": 2233 - }, - { - "role": "user", - "content_token_count": 15118, - "target_output_tokens": 677 - }, - { - "role": "user", - "content_token_count": 11251, - "target_output_tokens": 203 - }, - { - "role": "user", - "content_token_count": 7848, - "target_output_tokens": 198 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 19708, - "target_output_tokens": 526 - }, - { - "role": "user", - "content_token_count": 6199, - "target_output_tokens": 262 - }, - { - "role": "user", - "content_token_count": 5688, - "target_output_tokens": 957 - }, - { - "role": "user", - "content_token_count": 8993, - "target_output_tokens": 1558 - }, - { - "role": "user", - "content_token_count": 14718, - "target_output_tokens": 207 - }, - { - "role": "user", - "content_token_count": 10274, - "target_output_tokens": 744 - }, - { - "role": "user", - "content_token_count": 10756, - "target_output_tokens": 330 - }, - { - "role": "user", - "content_token_count": 55245, - "target_output_tokens": 171 - }, - { - "role": "user", - "content_token_count": 14177, - "target_output_tokens": 343 - }, - { - "role": "user", - "content_token_count": 11266, - "target_output_tokens": 370 - }, - { - "role": "user", - "content_token_count": 5359, - "target_output_tokens": 1273 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 1649, - "target_output_tokens": 218 - }, - { - "role": "user", - "content_token_count": 8871, - "target_output_tokens": 629 - }, - { - "role": "user", - "content_token_count": 11623, - "target_output_tokens": 247 - }, - { - "role": "user", - "content_token_count": 17643, - "target_output_tokens": 536 - }, - { - "role": "user", - "content_token_count": 1355, - "target_output_tokens": 127 - }, - { - "role": 
"user", - "content_token_count": 10824, - "target_output_tokens": 363 - }, - { - "role": "user", - "content_token_count": 3760, - "target_output_tokens": 810 - }, - { - "role": "user", - "content_token_count": 13120, - "target_output_tokens": 179 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 2614, - "target_output_tokens": 270 - }, - { - "role": "user", - "content_token_count": 4555, - "target_output_tokens": 271 - }, - { - "role": "user", - "content_token_count": 5387, - "target_output_tokens": 216 - }, - { - "role": "user", - "content_token_count": 3338, - "target_output_tokens": 694 - }, - { - "role": "user", - "content_token_count": 9274, - "target_output_tokens": 488 - }, - { - "role": "user", - "content_token_count": 41006, - "target_output_tokens": 1179 - }, - { - "role": "user", - "content_token_count": 11764, - "target_output_tokens": 336 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 4551, - "target_output_tokens": 391 - }, - { - "role": "user", - "content_token_count": 7744, - "target_output_tokens": 590 - }, - { - "role": "user", - "content_token_count": 6922, - "target_output_tokens": 1285 - }, - { - "role": "user", - "content_token_count": 15085, - "target_output_tokens": 881 - }, - { - "role": "user", - "content_token_count": 23696, - "target_output_tokens": 380 - }, - { - "role": "user", - "content_token_count": 13825, - "target_output_tokens": 1441 - }, - { - "role": "user", - "content_token_count": 7353, - "target_output_tokens": 686 - } - ] - }, - { - "turns": [ - { - "role": "user", - "content_token_count": 4844, - "target_output_tokens": 520 - }, - { - "role": "user", - "content_token_count": 11126, - "target_output_tokens": 170 - }, - { - "role": "user", - "content_token_count": 2742, - "target_output_tokens": 549 - }, - { - "role": "user", - "content_token_count": 4533, - "target_output_tokens": 309 - } - ] - } - ] -} \ No newline at end of file diff --git a/experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py b/experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py deleted file mode 100644 index ccc51ca7a..000000000 --- a/experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python3 -"""Generate synthetic AIPerf-style trace sessions for kv-cache-tester-compatible replay.""" - -from __future__ import annotations - -import argparse -import json -import math -import random -from pathlib import Path - - -def lognormal_sigma(p50: float, p95: float) -> float: - return math.log(p95 / p50) / 1.645 - - -def sample_tokens(rng: random.Random, p50: float, p95: float, min_v: int, max_v: int) -> int: - sigma = lognormal_sigma(p50, p95) - mu = math.log(p50) - sampled = int(round(rng.lognormvariate(mu, sigma))) - return max(min_v, min(max_v, sampled)) - - -def generate_sessions(count: int, seed: int) -> dict: - rng = random.Random(seed) - sessions = [] - - # Target coding-workload distributions: - # ISL p50~8k, p95~32k - # OSL p50~512, p95~2k - for _ in range(count): - num_turns = rng.randint(4, 18) - turns = [] - for _ in range(num_turns): - turns.append( - { - "role": "user", - "content_token_count": sample_tokens( - rng, - p50=8000, - p95=32000, - min_v=512, - max_v=65536, - ), - "target_output_tokens": sample_tokens( - rng, - p50=512, - p95=2000, - min_v=64, - max_v=4096, - ), - } - ) - sessions.append({"turns": turns}) - - return {"sessions": sessions} - - -def parse_args() -> argparse.Namespace: - 
parser = argparse.ArgumentParser(description="Generate synthetic AIPerf traces") - parser.add_argument("--sessions", type=int, default=100, help="Number of sessions") - parser.add_argument("--seed", type=int, default=993, help="Random seed") - parser.add_argument( - "--output", - type=Path, - default=Path(__file__).with_name("aiperf_synthetic_traces.json"), - help="Output JSON path", - ) - return parser.parse_args() - - -def main() -> int: - args = parse_args() - payload = generate_sessions(args.sessions, args.seed) - args.output.parent.mkdir(parents=True, exist_ok=True) - args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester/README.md b/experimental/multiturn/vllm_benchmark/kv-cache-tester/README.md deleted file mode 100644 index 94731fd42..000000000 --- a/experimental/multiturn/vllm_benchmark/kv-cache-tester/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# kv-cache-tester placeholder - -This directory should be populated with the external `kv-cache-tester` repository. - -Expected structure includes trace replay tooling and real trace assets used by experimental multiturn benchmarks. - -## Initialization - -If/when access is available, initialize this directory by checking out the kv-cache-tester repo contents here (for example via approved submodule setup or direct clone workflow owned by maintainers). - -Do not replace this placeholder with unapproved external URLs in this branch. diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/.gitkeep b/experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/experimental/multiturn/vllm_benchmark/launch/README.md b/experimental/multiturn/vllm_benchmark/launch/README.md deleted file mode 100644 index 00d33ecba..000000000 --- a/experimental/multiturn/vllm_benchmark/launch/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# LMCache launch scripts (experimental) - -These scripts launch vLLM with LMCache KV transfer enabled: - -- `lmcache_vllm_h200.sh` -- `lmcache_vllm_b200.sh` - -They are experimental parity utilities and are not wired into the standard InferenceX benchmark dispatch lanes. 
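The deleted `generate_aiperf_traces.py` above parameterizes its lognormal token sampler from (p50, p95) targets rather than from (mu, sigma) directly: for a lognormal distribution the median is exp(mu) and the 95th percentile is exp(mu + 1.645 * sigma), so sigma = ln(p95 / p50) / 1.645. A minimal standalone sketch (illustrative only, not part of this patch; the helper name `lognormal_params` is ours) that checks the parameterization empirically:

```python
# Sanity check for the (p50, p95) -> (mu, sigma) lognormal parameterization
# used by the deleted generate_aiperf_traces.py. Illustrative sketch only.
import math
import random


def lognormal_params(p50: float, p95: float) -> tuple[float, float]:
    # median = exp(mu)                 =>  mu = ln(p50)
    # p95 = exp(mu + z95 * sigma)      =>  sigma = ln(p95 / p50) / z95
    # where z95 ~= 1.645 is the standard-normal 95th-percentile z-score.
    mu = math.log(p50)
    sigma = math.log(p95 / p50) / 1.645
    return mu, sigma


rng = random.Random(993)  # same default seed the generator used
mu, sigma = lognormal_params(p50=8000.0, p95=32000.0)  # coding-workload ISL targets
samples = sorted(rng.lognormvariate(mu, sigma) for _ in range(100_000))
print(f"empirical p50 ~= {samples[50_000]:,.0f}")  # expect ~8,000
print(f"empirical p95 ~= {samples[95_000]:,.0f}")  # expect ~32,000
```

Note that the generator clamps each sample to [min_v, max_v] after drawing, so the empirical percentiles of the shipped traces sit slightly inside these targets at the tails.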
diff --git a/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_b200.sh b/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_b200.sh deleted file mode 100755 index f83b4b7f2..000000000 --- a/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_b200.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" - -check_env_vars MODEL TP - -PORT=${PORT:-8888} -SERVER_LOG=/workspace/server.log -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} - -cat > config.yaml << EOF -kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -python3 -m pip install -q lmcache - -launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code --kv-transfer-config '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}' - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -echo "LMCache vLLM server running (PID=$SERVER_PID, log=$SERVER_LOG)" -wait "$SERVER_PID" diff --git a/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh b/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh deleted file mode 100755 index f83b4b7f2..000000000 --- a/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" - -check_env_vars MODEL TP - -PORT=${PORT:-8888} -SERVER_LOG=/workspace/server.log -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} - -cat > config.yaml << EOF -kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -python3 -m pip install -q lmcache - -launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code --kv-transfer-config '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}' - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -echo "LMCache vLLM server running (PID=$SERVER_PID, log=$SERVER_LOG)" -wait "$SERVER_PID" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_b200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_b200_vllm.sh deleted file mode 100755 index f917c03c3..000000000 --- a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_b200_vllm.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" - -check_env_vars MODEL TP CONC RESULT_FILENAME - -PORT=${PORT:-8888} -TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} -BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} -SERVER_LOG=/workspace/server.log - -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} -cat > config.yaml << EOF -kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -start_gpu_monitor -start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 - -set -x -python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" 
--output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color -set +x - -stop_kv_metrics_collector -stop_gpu_monitor - -python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_h200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_h200_vllm.sh deleted file mode 100755 index f917c03c3..000000000 --- a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_h200_vllm.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" - -check_env_vars MODEL TP CONC RESULT_FILENAME - -PORT=${PORT:-8888} -TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} -BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} -SERVER_LOG=/workspace/server.log - -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} -cat > config.yaml << EOF -kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -start_gpu_monitor -start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 - -set -x -python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color -set +x - -stop_kv_metrics_collector -stop_gpu_monitor - -python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_sglang.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_sglang.sh deleted file mode 100755 index 7c46b0c31..000000000 --- a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_sglang.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" - -check_env_vars MODEL TP CONC RESULT_FILENAME - -PORT=${PORT:-8888} -TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} -BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} -SERVER_LOG=/workspace/server.log - -CONTEXT_LENGTH=${MAX_MODEL_LEN:-131272} -RADIX_CACHE_ARGS="" -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_sglang_offload_config -fi - -launch_sglang_server "$MODEL" "$PORT" --trust-remote-code --ep-size "${EP_SIZE:-1}" --reasoning-parser "${SGLANG_REASONING_PARSER:-gpt-oss}" --max-running-requests "${SGLANG_MAX_RUNNING_REQUESTS:-256}" --cuda-graph-max-bs "${SGLANG_CUDA_GRAPH_MAX_BS:-256}" --chunked-prefill-size 
"${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" --max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -start_gpu_monitor -start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 - -set -x -python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color -set +x - -stop_kv_metrics_collector -stop_gpu_monitor - -python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_vllm.sh deleted file mode 100755 index f917c03c3..000000000 --- a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_vllm.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" - -check_env_vars MODEL TP CONC RESULT_FILENAME - -PORT=${PORT:-8888} -TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} -BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} -SERVER_LOG=/workspace/server.log - -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} -cat > config.yaml << EOF -kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -start_gpu_monitor -start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 - -set -x -python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color -set +x - -stop_kv_metrics_collector -stop_gpu_monitor - -python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_sglang.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_sglang.sh deleted file mode 100755 index 7c46b0c31..000000000 --- a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_sglang.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" - -check_env_vars MODEL TP CONC RESULT_FILENAME - 
-PORT=${PORT:-8888} -TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} -BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} -SERVER_LOG=/workspace/server.log - -CONTEXT_LENGTH=${MAX_MODEL_LEN:-131272} -RADIX_CACHE_ARGS="" -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_sglang_offload_config -fi - -launch_sglang_server "$MODEL" "$PORT" --trust-remote-code --ep-size "${EP_SIZE:-1}" --reasoning-parser "${SGLANG_REASONING_PARSER:-gpt-oss}" --max-running-requests "${SGLANG_MAX_RUNNING_REQUESTS:-256}" --cuda-graph-max-bs "${SGLANG_CUDA_GRAPH_MAX_BS:-256}" --chunked-prefill-size "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" --max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -start_gpu_monitor -start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 - -set -x -python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color -set +x - -stop_kv_metrics_collector -stop_gpu_monitor - -python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_vllm.sh deleted file mode 100755 index f917c03c3..000000000 --- a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_vllm.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" - -check_env_vars MODEL TP CONC RESULT_FILENAME - -PORT=${PORT:-8888} -TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} -BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} -SERVER_LOG=/workspace/server.log - -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} -cat > config.yaml << EOF -kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -start_gpu_monitor -start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 - -set -x -python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color -set +x - -stop_kv_metrics_collector -stop_gpu_monitor - -python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode 
"${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_sglang.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_sglang.sh deleted file mode 100755 index 7c46b0c31..000000000 --- a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_sglang.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" - -check_env_vars MODEL TP CONC RESULT_FILENAME - -PORT=${PORT:-8888} -TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} -BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} -SERVER_LOG=/workspace/server.log - -CONTEXT_LENGTH=${MAX_MODEL_LEN:-131272} -RADIX_CACHE_ARGS="" -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_sglang_offload_config -fi - -launch_sglang_server "$MODEL" "$PORT" --trust-remote-code --ep-size "${EP_SIZE:-1}" --reasoning-parser "${SGLANG_REASONING_PARSER:-gpt-oss}" --max-running-requests "${SGLANG_MAX_RUNNING_REQUESTS:-256}" --cuda-graph-max-bs "${SGLANG_CUDA_GRAPH_MAX_BS:-256}" --chunked-prefill-size "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" --max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -start_gpu_monitor -start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 - -set -x -python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color -set +x - -stop_kv_metrics_collector -stop_gpu_monitor - -python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_vllm.sh deleted file mode 100755 index f917c03c3..000000000 --- a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_vllm.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" - -check_env_vars MODEL TP CONC RESULT_FILENAME - -PORT=${PORT:-8888} -TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} -BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} -SERVER_LOG=/workspace/server.log - -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} -cat > config.yaml << EOF -kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -start_gpu_monitor -start_kv_metrics_collector "$PORT" 
/workspace/kv_metrics.csv 2.0 - -set -x -python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color -set +x - -stop_kv_metrics_collector -stop_gpu_monitor - -python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_sglang.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_sglang.sh deleted file mode 100755 index 7c46b0c31..000000000 --- a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_sglang.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" - -check_env_vars MODEL TP CONC RESULT_FILENAME - -PORT=${PORT:-8888} -TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} -BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} -SERVER_LOG=/workspace/server.log - -CONTEXT_LENGTH=${MAX_MODEL_LEN:-131272} -RADIX_CACHE_ARGS="" -if [[ -n "${OFFLOAD_MODE:-}" ]]; then - apply_sglang_offload_config -fi - -launch_sglang_server "$MODEL" "$PORT" --trust-remote-code --ep-size "${EP_SIZE:-1}" --reasoning-parser "${SGLANG_REASONING_PARSER:-gpt-oss}" --max-running-requests "${SGLANG_MAX_RUNNING_REQUESTS:-256}" --cuda-graph-max-bs "${SGLANG_CUDA_GRAPH_MAX_BS:-256}" --chunked-prefill-size "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" --max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -start_gpu_monitor -start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 - -set -x -python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color -set +x - -stop_kv_metrics_collector -stop_gpu_monitor - -python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_vllm.sh deleted file mode 100755 index f917c03c3..000000000 --- a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_vllm.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" - -check_env_vars MODEL TP CONC RESULT_FILENAME - -PORT=${PORT:-8888} 
-TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} -BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} -SERVER_LOG=/workspace/server.log - -CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} -cat > config.yaml << EOF -kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -start_gpu_monitor -start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 - -set -x -python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color -set +x - -stop_kv_metrics_collector -stop_gpu_monitor - -python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 746d0645d..3f081c240 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,9 @@ +- config-keys: + - qwen3.5-fp4-mi355x-sglang + description: + - "Update SGLang image from 'lmsysorg/sglang:v0.5.10-rocm720-mi35x' to 'rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413'" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1041 + - config-keys: - kimik2.5-int4-mi300x-vllm description: @@ -1308,6 +1314,16 @@ - "Configs: 1k1k (TP4 conc 4-128), 8k1k (TP4 conc 4-128)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/820 +- config-keys: + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + description: + - "Update CLI args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI300X and MI325X to achieve better performance" + - "Use lmsysorg/sglang:v0.5.10-rocm720-mi30x" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/986 + - config-keys: - glm5-fp4-b200-sglang description: @@ -1348,3 +1364,35 @@ description: - "Enable SGLANG_ENABLE_SPEC_V2=1 for Qwen3.5 FP8 H200 SGLang MTP" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1017 + +- config-keys: + - qwen3.5-fp8-mi355x-sglang + - qwen3.5-bf16-mi355x-sglang + description: + - "Update CLI args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI355X to achieve better performance" + - "Use lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 for BF16 benchmark" + - "Use lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 for FP8 benchmark" + - "Image includes upstream SGLang PRs: https://github.com/sgl-project/sglang/pull/21188, https://github.com/sgl-project/sglang/pull/21421, https://github.com/sgl-project/sglang/pull/20736" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1036 + +- config-keys: + - glm5-fp4-b200-sglang + description: + - "Update SGLang image from nightly-dev-cu13-20260328-a27651d5 to v0.5.10.post1-cu130" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1031 + +- config-keys: + - qwen3.5-fp8-b300-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B-FP8 B300 SGLang MTP benchmark" + -
"Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "EAGLE speculative decoding with MTP, TP=4, concurrency 4-256 for 1k1k and 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1035 + +- config-keys: + - qwen3.5-fp8-b300-sglang + description: + - "Add Qwen3.5-397B-A17B-FP8 B300 SGLang benchmark (non-MTP)" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "TP=4, concurrency 4-256 for 1k1k and 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1048 diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 644b2c3a4..847b7ee80 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -1,7 +1,5 @@ #!/usr/bin/bash -source "$(dirname "$0")/lib_single_node_script.sh" - # System-specific configuration for B200 DGXC Slurm cluster SLURM_PARTITION="gpu" SLURM_ACCOUNT="benchmark" @@ -217,7 +215,8 @@ else HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache" SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "b200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 + FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') + SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') LOCK_FILE="${SQUASH_FILE}.lock" salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" @@ -244,7 +243,5 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash "$SCRIPT_PATH" - - scancel $JOB_ID + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh fi diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index caa1e8364..f8c614936 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -1,9 +1,8 @@ #!/usr/bin/bash -source "$(dirname "$0")/lib_single_node_script.sh" - HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/" -SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "b200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 +FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') +SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') PORT=8888 # Create unique cache directory based on model parameters @@ -31,17 +30,13 @@ docker run --rm --init --network host --name $server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \ --e SPEC_DECODING -e DISAGG \ --e BENCHMARK_TYPE -e EXPORT_FILE -e RUNTIME_STACK_ID -e HARDWARE_PROFILE_ID -e CANONICAL_MODEL_ID -e REQUEST_MODE -e MAX_CONCURRENCY \ --e SUPPORT_STATUS -e VLLM_CPU_OFFLOAD_GB -e VLLM_SWAP_SPACE_GB -e SGLANG_MEM_FRACTION_OVERRIDE -e SGLANG_CHUNKED_PREFILL_OVERRIDE \ --e MAX_SESSIONS -e MAX_TURNS_PER_SESSION -e MAX_OUTPUT_LEN -e NUM_WARMUP_SESSIONS -e IGNORE_WAITS -e IGNORE_EOS \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 
's/#/\//') \ -"$SCRIPT_PATH" +benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" # Try graceful first docker stop -t 90 "$server_name" || true diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index cbcc7469b..c321ee0f9 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -1,10 +1,9 @@ #!/usr/bin/bash -source "$(dirname "$0")/lib_single_node_script.sh" - HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" PARTITION="main" -SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "b200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 +FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') +SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') UCX_NET_DEVICES=eth0 @@ -18,4 +17,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \ -bash "$SCRIPT_PATH" \ No newline at end of file +bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh \ No newline at end of file diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 68da9f2b7..3d863b54c 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -6,6 +6,8 @@ SLURM_ACCOUNT="benchmark" set -x +if [[ "$IS_MULTINODE" == "true" ]]; then + # Validate framework if [[ $FRAMEWORK != "dynamo-sglang" && $FRAMEWORK != "dynamo-trt" ]]; then echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang" @@ -211,3 +213,28 @@ for i in 1 2 3 4 5; do sleep 10 done find . -name '.nfs*' -delete 2>/dev/null || true + +else + + HF_HUB_CACHE_MOUNT="/scratch/models" + export MODEL="/scratch/models/${MODEL#*/}" + SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') + SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') + + # Pin to one of the known-good B300 nodes; others have hardware/network + # issues that cause benchmarks to hang or fail to start. 
+ salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" + JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) + + srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" + + srun --jobid=$JOB_ID \ + --container-image=$SQUASH_FILE \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ + --no-container-mount-home \ + --container-workdir=/workspace/ \ + --no-container-entrypoint --export=ALL,PORT=8888 \ + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b300${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh + +fi diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 44c46600d..5100419b9 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -1,10 +1,7 @@ #!/usr/bin/bash -source "$(dirname "$0")/lib_single_node_script.sh" - HF_HUB_CACHE_MOUNT="/home/ubuntu/hf_hub_cache/" PORT=8888 -SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h100" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 server_name="bmk-server" @@ -13,13 +10,9 @@ docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e EP_SIZE -e DP_ATTENTION -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ --e SPEC_DECODING -e DISAGG \ --e BENCHMARK_TYPE -e EXPORT_FILE -e RUNTIME_STACK_ID -e HARDWARE_PROFILE_ID -e CANONICAL_MODEL_ID -e REQUEST_MODE -e MAX_CONCURRENCY \ --e SUPPORT_STATUS -e VLLM_CPU_OFFLOAD_GB -e VLLM_SWAP_SPACE_GB -e SGLANG_MEM_FRACTION_OVERRIDE -e SGLANG_CHUNKED_PREFILL_OVERRIDE \ --e MAX_SESSIONS -e MAX_TURNS_PER_SESSION -e MAX_OUTPUT_LEN -e NUM_WARMUP_SESSIONS -e IGNORE_WAITS -e IGNORE_EOS \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ -"$SCRIPT_PATH" +benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_h100.sh" diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index bb10dcb6d..49a42e981 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -1,12 +1,9 @@ #!/usr/bin/env bash -source "$(dirname "$0")/lib_single_node_script.sh" - export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" PARTITION="h100" SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" LOCK_FILE="${SQUASH_FILE}.lock" -SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h100" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 set -x @@ -34,7 +31,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ -bash "$SCRIPT_PATH" +bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff --git a/runners/launch_h100-dgxc-slurm.sh 
b/runners/launch_h100-dgxc-slurm.sh index 11570289a..bb0335955 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -1,7 +1,5 @@ #!/usr/bin/bash -source "$(dirname "$0")/lib_single_node_script.sh" - # System-specific configuration for H100 DGXC Slurm cluster SLURM_PARTITION="hpc-gpu-1" SLURM_ACCOUNT="customer" @@ -232,7 +230,6 @@ else HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/" SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h100" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 salloc --exclude="$SLURM_EXCLUDED_NODELIST" --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) @@ -250,7 +247,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash "$SCRIPT_PATH" + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh scancel $JOB_ID diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 5a49efcc6..657f84792 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -1,12 +1,11 @@ #!/usr/bin/env bash -source "$(dirname "$0")/lib_single_node_script.sh" - export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" -SCRIPT_PATH=$(resolve_single_node_benchmark_script "$MODEL_CODE" "$PRECISION" "h200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 +FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') +SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') PARTITION="h200" SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" @@ -45,7 +44,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash "$SCRIPT_PATH" +bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index a6f4d2986..9b3b771a5 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -1,7 +1,5 @@ #!/usr/bin/bash -source "$(dirname "$0")/lib_single_node_script.sh" - # System-specific configuration for H200 DGXC Slurm cluster SLURM_PARTITION="main" SLURM_ACCOUNT="sa-shared" @@ -235,7 +233,6 @@ else # Convert pyxis image format (nvcr.io#path) to docker format (nvcr.io/path) for enroot import DOCKER_IMAGE=$(echo "$IMAGE" | sed 's/#/\//g') LOCK_FILE="${SQUASH_FILE}.lock" - SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) @@ -261,7 +258,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash "$SCRIPT_PATH" + bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h200$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt')$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp').sh scancel $JOB_ID diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh 
index 3b697fb51..9d157a858 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -1,12 +1,11 @@ #!/usr/bin/bash -source "$(dirname "$0")/lib_single_node_script.sh" - export HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" -SCRIPT_PATH=$(resolve_single_node_benchmark_script "$MODEL_CODE" "$PRECISION" "h200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 +FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') +SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') PARTITION="main" @@ -20,4 +19,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash "$SCRIPT_PATH" +bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh diff --git a/runners/lib_single_node_script.sh b/runners/lib_single_node_script.sh deleted file mode 100644 index 194668856..000000000 --- a/runners/lib_single_node_script.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash - -resolve_single_node_benchmark_script() { - local model_code="$1" - local precision="$2" - local runner_code="$3" - local framework="${4:-}" - local spec_decoding="${5:-}" - local script_base="benchmarks/single_node/${model_code}_${precision}_${runner_code}" - - if [[ "${BENCHMARK_TYPE:-}" == "isb1_replay" ]] && [[ "$framework" == "sglang" || "$framework" == "vllm" ]]; then - local runtime_candidate="${script_base}_${framework}.sh" - if [[ -f "$runtime_candidate" ]]; then - printf '%s\n' "$runtime_candidate" - return 0 - fi - fi - - local framework_suffix="" - local spec_suffix="" - if [[ "$framework" == "trt" ]]; then - framework_suffix="_trt" - fi - if [[ "$spec_decoding" == "mtp" ]]; then - spec_suffix="_mtp" - fi - - local legacy_candidate="${script_base}${framework_suffix}${spec_suffix}.sh" - if [[ -f "$legacy_candidate" ]]; then - printf '%s\n' "$legacy_candidate" - return 0 - fi - - echo "ERROR: Could not resolve single-node benchmark script." 
>&2 - echo " model=$model_code precision=$precision runner=$runner_code framework=${framework:-} spec_decoding=${spec_decoding:-} benchmark_type=${BENCHMARK_TYPE:-}" >&2 - if [[ "${BENCHMARK_TYPE:-}" == "isb1_replay" ]] && [[ "$framework" == "sglang" || "$framework" == "vllm" ]]; then - echo " checked runtime-aware candidate: ${script_base}_${framework}.sh" >&2 - fi - echo " checked legacy candidate: $legacy_candidate" >&2 - return 1 -} diff --git a/utils/gate_isb1.py b/utils/gate_isb1.py deleted file mode 100644 index e223e8c29..000000000 --- a/utils/gate_isb1.py +++ /dev/null @@ -1,298 +0,0 @@ -import argparse -import json -from pathlib import Path -from typing import Any, Callable - - -Row = dict[str, Any] -Criterion = tuple[str, Callable[[Row], bool]] - -EXPECTED_131K_COVERAGE = { - ("b200", "vllm"), - ("b200", "sglang"), - ("h100", "vllm"), - ("h100", "sglang"), - ("h200", "vllm"), - ("h200", "sglang"), -} -EXPECTED_1M_COVERAGE = { - ("b200", "vllm"), - ("b200", "sglang"), -} - - -def normalize_hw_label(hw: str | None) -> str: - """Normalize runner labels like h200-cw-1 to coverage labels like h200.""" - if not hw: - return "" - return hw.split("-", 1)[0] - - -def load_rows(report_path: Path) -> list[Row]: - """Load aggregated ISB1 rows from JSON.""" - payload = json.loads(report_path.read_text()) - if isinstance(payload, list): - return [row for row in payload if isinstance(row, dict)] - if isinstance(payload, dict): - return [payload] - raise ValueError(f"Unsupported ISB1 payload type: {type(payload)!r}") - - -def build_row_reference(row: Row, failed_criteria: list[str] | None = None) -> Row: - """Build a concise row reference for gate reports.""" - reference: Row = { - "result_filename": row.get("result_filename"), - "artifact_stems": row.get("artifact_stems") or {}, - "hw": row.get("hw"), - "framework": row.get("framework"), - "infmax_model_prefix": row.get("infmax_model_prefix"), - "support_status": row.get("support_status"), - "context_pressure_status": (row.get("context_pressure_signal") or {}).get("status"), - } - if failed_criteria: - reference["failed_criteria"] = failed_criteria - return reference - - -def completed_sessions_match(row: Row) -> bool: - return row.get("completed_sessions") == row.get("total_sessions") - - -def throughput_positive(row: Row) -> bool: - return float(row.get("session_throughput_sps") or 0.0) > 0.0 - - -def certification_verified(row: Row) -> bool: - return row.get("benchmark_certification_status") == "dataset_replay_verified" - - -def context_not_suspicious(row: Row) -> bool: - return not bool(row.get("context_pressure_suspicious")) - - -def vllm_context_ok(row: Row) -> bool: - if row.get("framework") != "vllm": - return True - signal = row.get("context_pressure_signal") or {} - return signal.get("status") == "ok" and not bool(row.get("context_pressure_suspicious")) - - -def get_present_coverage(rows: list[Row]) -> set[tuple[str, str]]: - return { - (normalize_hw_label(row.get("hw")), row.get("framework", "")) - for row in rows - } - - -def evaluate_gate( - gate_id: str, - label: str, - rows: list[Row], - criteria: list[Criterion], - *, - expected_coverage: set[tuple[str, str]] | None = None, - exact_coverage: bool = False, -) -> Row: - """Evaluate a gate definition over matching rows.""" - if not rows: - return { - "id": gate_id, - "label": label, - "status": "no_rows", - "matched_rows": 0, - "failing_rows": [], - "review_required_rows": [], - "missing_coverage": [], - "unexpected_coverage": [], - } - - failing_rows = [] - review_required_rows = 
[] - for row in rows: - failed_criteria = [description for description, checker in criteria if not checker(row)] - if failed_criteria: - failing_rows.append(build_row_reference(row, failed_criteria)) - signal = row.get("context_pressure_signal") or {} - if signal.get("requires_log_review"): - review_required_rows.append(build_row_reference(row)) - - missing_coverage: list[list[str]] = [] - unexpected_coverage: list[list[str]] = [] - if expected_coverage is not None: - present_coverage = get_present_coverage(rows) - missing_coverage = [list(item) for item in sorted(expected_coverage - present_coverage)] - if exact_coverage: - unexpected_coverage = [list(item) for item in sorted(present_coverage - expected_coverage)] - - status = "pass" - if failing_rows or missing_coverage or unexpected_coverage: - status = "fail" - - return { - "id": gate_id, - "label": label, - "status": status, - "matched_rows": len(rows), - "failing_rows": failing_rows, - "review_required_rows": review_required_rows, - "missing_coverage": missing_coverage, - "unexpected_coverage": unexpected_coverage, - } - - -def build_gate_report(rows: list[Row], advisory: bool = True) -> Row: - """Build the full advisory gate report for an aggregated ISB1 result set.""" - gates = [ - evaluate_gate( - "control_lanes", - "DSR1/GPT-OSS control lanes", - [ - row - for row in rows - if row.get("infmax_model_prefix") in {"dsr1", "gptoss"} - and row.get("support_status") == "supported" - ], - [ - ("completed_sessions == total_sessions", completed_sessions_match), - ("session_throughput_sps > 0", throughput_positive), - ], - ), - evaluate_gate( - "qwen_131k", - "Qwen 131k preview lanes", - [ - row - for row in rows - if row.get("infmax_model_prefix") == "qwen3.5" - and row.get("support_status") == "reviewed_preview" - and (row.get("effective_max_context_depth") or 0) < 200000 - ], - [ - ("completed_sessions == total_sessions", completed_sessions_match), - ("session_throughput_sps > 0", throughput_positive), - ], - expected_coverage=EXPECTED_131K_COVERAGE, - ), - evaluate_gate( - "qwen_500k", - "Qwen 500k preview lanes", - [ - row - for row in rows - if row.get("infmax_model_prefix") == "qwen3.5" - and row.get("effective_max_context_depth") == 524288 - and row.get("context_pressure_class") == "extended_500k" - ], - [ - ("completed_sessions == total_sessions", completed_sessions_match), - ( - "benchmark_certification_status == dataset_replay_verified", - certification_verified, - ), - ("context_pressure_suspicious == false", context_not_suspicious), - ("vllm context_pressure_signal.status == ok", vllm_context_ok), - ], - ), - evaluate_gate( - "qwen_1m", - "Qwen 1M preview lanes", - [ - row - for row in rows - if row.get("infmax_model_prefix") == "qwen3.5" - and row.get("effective_max_context_depth") == 1048576 - and row.get("context_pressure_class") == "extended_1m" - ], - [ - ("completed_sessions == total_sessions", completed_sessions_match), - ("context_pressure_suspicious == false", context_not_suspicious), - ("vllm context_pressure_signal.status == ok", vllm_context_ok), - ], - expected_coverage=EXPECTED_1M_COVERAGE, - exact_coverage=True, - ), - ] - - statuses = {gate["status"] for gate in gates} - if "fail" in statuses: - overall = "fail" - elif statuses == {"pass"}: - overall = "pass" - else: - overall = "partial" - - return { - "gates": gates, - "overall": overall, - "advisory": advisory, - } - - -def render_markdown(report: Row) -> str: - """Render a concise markdown advisory summary for workflow step summaries.""" - lines = [ - "## 
ISB1 Advisory Gates", - "", - f"Overall: **{report['overall'].upper()}** ({'advisory' if report['advisory'] else 'strict'})", - "", - ] - - for gate in report["gates"]: - lines.append(f"### {gate['label']} — {gate['status'].upper()}") - lines.append("") - lines.append(f"- Matched rows: {gate['matched_rows']}") - if gate["missing_coverage"]: - formatted = ", ".join(f"{hw}/{framework}" for hw, framework in gate["missing_coverage"]) - lines.append(f"- Missing coverage: {formatted}") - if gate["unexpected_coverage"]: - formatted = ", ".join( - f"{hw}/{framework}" for hw, framework in gate["unexpected_coverage"] - ) - lines.append(f"- Unexpected coverage: {formatted}") - if gate["failing_rows"]: - lines.append("- Failing rows:") - for row in gate["failing_rows"]: - failed_criteria = ", ".join(row.get("failed_criteria", [])) or "unknown" - lines.append( - f" - `{row.get('result_filename', 'unknown')}` ({row.get('hw', '-')}/" - f"{row.get('framework', '-')}) failed: {failed_criteria}" - ) - elif gate["matched_rows"]: - lines.append("- No failing rows.") - if gate["review_required_rows"]: - review_rows = ", ".join( - f"`{row.get('result_filename', 'unknown')}`" for row in gate["review_required_rows"] - ) - lines.append( - "- Manual log review still required for: " - f"{review_rows}" - ) - lines.append("") - - return "\n".join(lines).rstrip() + "\n" - - -def parse_args(argv: list[str] | None = None) -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Evaluate advisory ISB1 gates.") - parser.add_argument("report_path", type=Path) - parser.add_argument("--strict", action="store_true") - parser.add_argument("--format", choices=["json", "markdown"], default="json") - return parser.parse_args(argv) - - -def main(argv: list[str] | None = None) -> int: - args = parse_args(argv) - report = build_gate_report(load_rows(args.report_path), advisory=not args.strict) - - if args.format == "markdown": - print(render_markdown(report)) - else: - print(json.dumps(report, indent=2)) - - if args.strict and report["overall"] == "fail": - return 1 - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 14c69d3e9..bc4562415 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -10,11 +10,7 @@ from validation import ( validate_matrix_entry, - validate_isb1_matrix_entry, - validate_isb1_kv_stress_matrix_entry, load_config_files, - load_isb1_config_files, - load_isb1_kv_stress_config_files, load_runner_file, Fields ) @@ -378,243 +374,6 @@ def generate_full_sweep(args, all_config_data, runner_data): return matrix_values -def generate_isb1_sweep(args, all_config_data, runner_data): - """Generate ISB1 replay sweep configurations with optional filtering.""" - if args.runner_type: - valid_runner_types = set(runner_data.keys()) - invalid_runners = set(args.runner_type) - valid_runner_types - if invalid_runners: - raise ValueError( - f"Invalid runner type(s): {invalid_runners}. 
" - f"Valid runner types are: {', '.join(sorted(valid_runner_types))}" - ) - - matrix_values = [] - - for _, val in all_config_data.items(): - if args.model_prefix and val[Fields.MODEL_PREFIX.value] not in args.model_prefix: - continue - - if args.precision and val[Fields.PRECISION.value] not in args.precision: - continue - - if args.framework and val[Fields.FRAMEWORK.value] not in args.framework: - continue - - if args.runner_type and val[Fields.RUNNER.value] not in args.runner_type: - continue - - image = val[Fields.IMAGE.value] - model = val[Fields.MODEL.value] - model_code = val[Fields.MODEL_PREFIX.value] - precision = val[Fields.PRECISION.value] - framework = val[Fields.FRAMEWORK.value] - runner = val[Fields.RUNNER.value] - benchmark_type = val[Fields.BENCHMARK_TYPE.value] - runtime_stack_id = val[Fields.RUNTIME_STACK_ID.value] - hardware_profile_id = val[Fields.HARDWARE_PROFILE_ID.value] - canonical_model_id = val[Fields.CANONICAL_MODEL_ID.value] - max_model_len = val.get(Fields.MAX_MODEL_LEN.value) - - runner_nodes_to_use = None - if args.runner_node_filter: - runner_nodes = runner_data.get(runner, []) - runner_nodes_to_use = [ - node for node in runner_nodes if args.runner_node_filter in node - ] - if not runner_nodes_to_use: - continue - - replay_configs = val[Fields.REPLAY_CONFIGS.value] - for replay_config in replay_configs: - export_file = replay_config[Fields.EXPORT_FILE.value] - request_mode = replay_config[Fields.REQUEST_MODE.value] - support_status = replay_config.get(Fields.SUPPORT_STATUS.value) - - for replay_space in replay_config[Fields.SEARCH_SPACE.value]: - max_concurrency = replay_space[Fields.MAX_CONCURRENCY.value] - - if args.max_concurrency is not None: - if args.max_concurrency <= 0: - continue - max_concurrency = min(max_concurrency, args.max_concurrency) - - runners_for_entry = ( - runner_nodes_to_use if runner_nodes_to_use else [runner] - ) - for runner_value in runners_for_entry: - entry = { - Fields.IMAGE.value: image, - Fields.MODEL.value: model, - Fields.MODEL_PREFIX.value: model_code, - Fields.PRECISION.value: precision, - Fields.FRAMEWORK.value: framework, - Fields.RUNNER.value: runner_value, - Fields.BENCHMARK_TYPE.value: benchmark_type, - Fields.EXPORT_FILE.value: export_file, - Fields.RUNTIME_STACK_ID.value: runtime_stack_id, - Fields.HARDWARE_PROFILE_ID.value: hardware_profile_id, - Fields.CANONICAL_MODEL_ID.value: canonical_model_id, - Fields.SUPPORT_STATUS.value: support_status, - Fields.REQUEST_MODE.value: request_mode, - Fields.MAX_CONCURRENCY.value: max_concurrency, - Fields.MAX_SESSIONS.value: replay_space.get(Fields.MAX_SESSIONS.value), - Fields.MAX_TURNS_PER_SESSION.value: replay_space.get(Fields.MAX_TURNS_PER_SESSION.value), - Fields.MAX_OUTPUT_LEN.value: replay_space.get(Fields.MAX_OUTPUT_LEN.value), - Fields.NUM_WARMUP_SESSIONS.value: replay_space.get( - Fields.NUM_WARMUP_SESSIONS.value, 0 - ), - Fields.IGNORE_WAITS.value: replay_space.get( - Fields.IGNORE_WAITS.value, False - ), - Fields.IGNORE_EOS.value: replay_space.get( - Fields.IGNORE_EOS.value, False - ), - Fields.MAX_MODEL_LEN.value: max_model_len, - Fields.OFFLOAD_MODE.value: val.get(Fields.OFFLOAD_MODE.value), - Fields.KV_CACHE_DTYPE.value: val.get(Fields.KV_CACHE_DTYPE.value), - Fields.DISABLE_PREFIX_CACHING.value: val.get( - Fields.DISABLE_PREFIX_CACHING.value - ), - 'benchmark-duration-s': replay_space.get('benchmark-duration-s'), - Fields.EXP_NAME.value: f"{model_code}_isb1", - } - validate_isb1_matrix_entry(entry) - matrix_values.append(entry) - - return matrix_values - - 
-def generate_isb1_kv_stress_sweep(args, all_config_data, runner_data): - """Generate ISB1 KV stress sweep configurations with optional filtering.""" - if args.runner_type: - valid_runner_types = set(runner_data.keys()) - invalid_runners = set(args.runner_type) - valid_runner_types - if invalid_runners: - raise ValueError( - f"Invalid runner type(s): {invalid_runners}. " - f"Valid runner types are: {', '.join(sorted(valid_runner_types))}" - ) - - matrix_values = [] - - for _, val in all_config_data.items(): - if args.model_prefix and val[Fields.MODEL_PREFIX.value] not in args.model_prefix: - continue - - if args.precision and val[Fields.PRECISION.value] not in args.precision: - continue - - if args.framework and val[Fields.FRAMEWORK.value] not in args.framework: - continue - - if args.runner_type and val[Fields.RUNNER.value] not in args.runner_type: - continue - - image = val[Fields.IMAGE.value] - model = val[Fields.MODEL.value] - model_code = val[Fields.MODEL_PREFIX.value] - precision = val[Fields.PRECISION.value] - framework = val[Fields.FRAMEWORK.value] - runner = val[Fields.RUNNER.value] - benchmark_type = val[Fields.BENCHMARK_TYPE.value] - runtime_stack_id = val[Fields.RUNTIME_STACK_ID.value] - hardware_profile_id = val[Fields.HARDWARE_PROFILE_ID.value] - canonical_model_id = val[Fields.CANONICAL_MODEL_ID.value] - max_model_len = val.get(Fields.MAX_MODEL_LEN.value) - kv_cache_dtype = val[Fields.KV_CACHE_DTYPE.value] - - runner_nodes_to_use = None - if args.runner_node_filter: - runner_nodes = runner_data.get(runner, []) - runner_nodes_to_use = [ - node for node in runner_nodes if args.runner_node_filter in node - ] - if not runner_nodes_to_use: - continue - - kv_stress_configs = val[Fields.KV_STRESS_CONFIGS.value] - for kv_stress_config in kv_stress_configs: - export_file = kv_stress_config[Fields.EXPORT_FILE.value] - request_mode = kv_stress_config[Fields.REQUEST_MODE.value] - support_status = kv_stress_config.get(Fields.SUPPORT_STATUS.value) - workload_type = kv_stress_config[Fields.WORKLOAD_TYPE.value] - - runners_for_entry = ( - runner_nodes_to_use if runner_nodes_to_use else [runner] - ) - - def _append_kv_stress_entry( - max_concurrency: int, - offload_mode: str, - duration_s: int, - *, - tp: int | None = None, - ep: int | None = None, - ) -> None: - disable_prefix_caching = offload_mode == "noprefix" - for runner_value in runners_for_entry: - entry = { - Fields.IMAGE.value: image, - Fields.MODEL.value: model, - Fields.MODEL_PREFIX.value: model_code, - Fields.PRECISION.value: precision, - Fields.FRAMEWORK.value: framework, - Fields.RUNNER.value: runner_value, - Fields.BENCHMARK_TYPE.value: benchmark_type, - Fields.EXPORT_FILE.value: export_file, - Fields.RUNTIME_STACK_ID.value: runtime_stack_id, - Fields.HARDWARE_PROFILE_ID.value: hardware_profile_id, - Fields.CANONICAL_MODEL_ID.value: canonical_model_id, - Fields.SUPPORT_STATUS.value: support_status, - Fields.REQUEST_MODE.value: request_mode, - Fields.MAX_CONCURRENCY.value: max_concurrency, - Fields.OFFLOAD_MODE.value: offload_mode, - Fields.KV_CACHE_DTYPE.value: kv_cache_dtype, - Fields.DISABLE_PREFIX_CACHING.value: disable_prefix_caching, - 'benchmark-duration-s': duration_s, - Fields.WORKLOAD_TYPE.value: workload_type, - Fields.MAX_MODEL_LEN.value: max_model_len, - Fields.EXP_NAME.value: f"{model_code}_isb1_kv_stress", - } - if tp is not None: - entry[Fields.TP.value] = tp - if ep is not None: - entry[Fields.EP.value] = ep - validate_isb1_kv_stress_matrix_entry(entry) - matrix_values.append(entry) - - tp_configs = 
kv_stress_config.get('tp-configs') - if tp_configs: - for tp_config in tp_configs: - tp_value = tp_config[Fields.TP.value] - ep_value = tp_config.get(Fields.EP.value, 1) - users = tp_config[Fields.USERS.value] - offload_modes = tp_config[Fields.OFFLOAD_MODES.value] - duration_s = tp_config[Fields.DURATION_S.value] - - for max_concurrency in users: - for offload_mode in offload_modes: - _append_kv_stress_entry( - max_concurrency, - offload_mode, - duration_s, - tp=tp_value, - ep=ep_value, - ) - else: - for stress_space in kv_stress_config[Fields.SEARCH_SPACE.value]: - users = stress_space[Fields.USERS.value] - offload_modes = stress_space[Fields.OFFLOAD_MODES.value] - duration_s = stress_space[Fields.DURATION_S.value] - - for max_concurrency in users: - for offload_mode in offload_modes: - _append_kv_stress_entry(max_concurrency, offload_mode, duration_s) - - return matrix_values - - def generate_runner_model_sweep_config(args, all_config_data, runner_data): """Generate runner-model sweep configurations. @@ -1126,86 +885,6 @@ def main(): help='Show this help message and exit' ) - # Subcommand: isb1-sweep - isb1_sweep_parser = subparsers.add_parser( - 'isb1-sweep', - parents=[parent_parser], - add_help=False, - help='Generate ISB1 replay sweep configurations' - ) - isb1_sweep_parser.add_argument( - '--model-prefix', - nargs='+', - required=False, - help='Model prefix(es) to filter configurations (optional, can specify multiple)' - ) - isb1_sweep_parser.add_argument( - '--precision', - nargs='+', - required=False, - help='Precision(s) to filter by (optional, can specify multiple)' - ) - isb1_sweep_parser.add_argument( - '--framework', - nargs='+', - required=False, - help='Framework(s) to filter by (optional, can specify multiple)' - ) - isb1_sweep_parser.add_argument( - '--runner-type', - nargs='+', - required=False, - help='Runner type(s) to filter by (e.g., h200, b200) (optional, can specify multiple)' - ) - isb1_sweep_parser.add_argument( - '--max-concurrency', - type=int, - required=False, - help='Maximum replay concurrency value to include (caps higher values)' - ) - isb1_sweep_parser.add_argument( - '-h', '--help', - action='help', - help='Show this help message and exit' - ) - - # Subcommand: isb1-kv-stress-sweep - isb1_kv_stress_sweep_parser = subparsers.add_parser( - 'isb1-kv-stress-sweep', - parents=[parent_parser], - add_help=False, - help='Generate ISB1 KV stress sweep configurations' - ) - isb1_kv_stress_sweep_parser.add_argument( - '--model-prefix', - nargs='+', - required=False, - help='Model prefix(es) to filter configurations (optional, can specify multiple)' - ) - isb1_kv_stress_sweep_parser.add_argument( - '--precision', - nargs='+', - required=False, - help='Precision(s) to filter by (optional, can specify multiple)' - ) - isb1_kv_stress_sweep_parser.add_argument( - '--framework', - nargs='+', - required=False, - help='Framework(s) to filter by (optional, can specify multiple)' - ) - isb1_kv_stress_sweep_parser.add_argument( - '--runner-type', - nargs='+', - required=False, - help='Runner type(s) to filter by (e.g., h200, b200) (optional, can specify multiple)' - ) - isb1_kv_stress_sweep_parser.add_argument( - '-h', '--help', - action='help', - help='Show this help message and exit' - ) - # Subcommand: test-config test_config_keys_parser = subparsers.add_parser( 'test-config', @@ -1236,12 +915,7 @@ def main(): apply_node_type_defaults(args) # Load and validate configuration files (validation happens by default in load functions) - if args.command == 'isb1-sweep': - 
all_config_data = load_isb1_config_files(args.config_files) - elif args.command == 'isb1-kv-stress-sweep': - all_config_data = load_isb1_kv_stress_config_files(args.config_files) - else: - all_config_data = load_config_files(args.config_files) + all_config_data = load_config_files(args.config_files) runner_data = load_runner_file(args.runner_config) # Route to appropriate function based on subcommand @@ -1250,17 +924,13 @@ def main(): elif args.command == 'runner-model-sweep': matrix_values = generate_runner_model_sweep_config( args, all_config_data, runner_data) - elif args.command == 'isb1-sweep': - matrix_values = generate_isb1_sweep(args, all_config_data, runner_data) - elif args.command == 'isb1-kv-stress-sweep': - matrix_values = generate_isb1_kv_stress_sweep(args, all_config_data, runner_data) elif args.command == 'test-config': matrix_values = generate_test_config_sweep(args, all_config_data) else: parser.error(f"Unknown command: {args.command}") # Handle eval options (mutually exclusive: --no-evals or --evals-only) - if args.command not in ('isb1-sweep', 'isb1-kv-stress-sweep') and not args.no_evals: + if not args.no_evals: matrix_values = mark_eval_entries(matrix_values) if args.evals_only: matrix_values = [e for e in matrix_values if e.get(Fields.RUN_EVAL.value, False)] diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py index 90d7bf0ff..d05299472 100644 --- a/utils/matrix_logic/test_generate_sweep_configs.py +++ b/utils/matrix_logic/test_generate_sweep_configs.py @@ -1,73 +1,22 @@ """Comprehensive tests for generate_sweep_configs.py""" import pytest import argparse -import json -from pathlib import Path from generate_sweep_configs import ( seq_len_stoi, seq_len_itos, seq_len_to_str, generate_full_sweep, - generate_isb1_sweep, - generate_isb1_kv_stress_sweep, generate_runner_model_sweep_config, apply_node_type_defaults, expand_config_keys, mark_eval_entries, ) -from validation import ( - load_config_files, - load_isb1_config_files, - load_isb1_kv_stress_config_files, -) # ============================================================================= # Test Fixtures # ============================================================================= - -def _write_isb1_export_fixture( - root: Path, - relative_path: str, - *, - runtime_stack_id: str, - hardware_profile_id: str, - canonical_model_id: str, - support_status: str, - benchmark_certification_status: str = "dataset_replay_verified", -) -> None: - export_path = root / relative_path - export_path.parent.mkdir(parents=True, exist_ok=True) - export_path.write_text( - json.dumps( - { - "adapter_id": "inferencex_multiturn", - "exports": [ - { - "trace_id": f"{export_path.stem}-trace", - "runtime_stack_id": runtime_stack_id, - "hardware_profile_id": hardware_profile_id, - "canonical_model_id": canonical_model_id, - "support_status": support_status, - "benchmark_certification_status": benchmark_certification_status, - "session": { - "session_id": "fixture-session", - "turns": [ - { - "turn_idx": 0, - "turn_id": 0, - "messages": [{"role": "user", "content": "hi"}], - "expected_output_tokens": 8, - } - ], - }, - } - ], - } - ) - ) - @pytest.fixture def sample_single_node_config(): """Single node config based on dsr1-fp8-mi300x-sglang.""" @@ -200,161 +149,6 @@ def full_sweep_args_multi_node(): return args -@pytest.fixture -def sample_isb1_config(): - """ISB1 replay config based on NVIDIA H200 replay lane.""" - return { - "dsr1-isb1-h200-vllm": { - "image": 
"vllm/vllm-openai:v0.8.5", - "model": "deepseek-ai/DeepSeek-R1-0528", - "model-prefix": "dsr1", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "benchmark-type": "isb1_replay", - "runtime-stack-id": "vllm-0.8.5-h200", - "hardware-profile-id": "h200-8gpu", - "canonical-model-id": "deepseek-r1-0528", - "max-model-len": 16384, - "replay-configs": [ - { - "export-file": "datasets/isb1/exports/core/chat_8k1k.json", - "request-mode": "multi-turn", - "support-status": "supported", - "search-space": [ - { - "max-concurrency": 4, - "max-sessions": 2, - "max-turns-per-session": 6, - "max-output-len": 512, - "num-warmup-sessions": 1, - "ignore-waits": True, - "ignore-eos": False, - }, - {"max-concurrency": 8}, - {"max-concurrency": 16}, - ], - }, - { - "export-file": "datasets/isb1/exports/core/code_8k1k.json", - "request-mode": "multi-turn", - "support-status": "supported", - "search-space": [ - {"max-concurrency": 4}, - {"max-concurrency": 8}, - ], - }, - ], - } - } - - -@pytest.fixture -def isb1_sweep_args(): - """Args for isb1-sweep command.""" - args = argparse.Namespace() - args.model_prefix = None - args.precision = None - args.framework = None - args.runner_type = None - args.max_concurrency = None - args.runner_node_filter = None - return args - - -@pytest.fixture -def sample_isb1_kv_stress_config(): - """ISB1 KV stress config with users/offload-mode search space.""" - return { - "gptoss-fp4-h200-isb1-kv-stress-vllm-code": { - "image": "vllm/vllm-openai:v0.18.0", - "model": "openai/gpt-oss-120b", - "model-prefix": "gptoss", - "precision": "fp4", - "framework": "vllm", - "runner": "h200", - "benchmark-type": "isb1_kv_stress", - "runtime-stack-id": "standalone:vllm", - "hardware-profile-id": "nvidia:h200_sxm_141gb", - "canonical-model-id": "gpt_oss_120b", - "max-model-len": 131272, - "kv-cache-dtype": "fp8", - "kv-stress-configs": [ - { - "export-file": "datasets/isb1/exports/extension_131k/vllm/code_131k1k.json", - "request-mode": "multi-turn", - "support-status": "reviewed_preview", - "workload-type": "code", - "search-space": [ - { - "users": [2, 4, 8], - "offload-modes": ["on", "off", "noprefix"], - "duration-s": 1800, - } - ], - } - ], - } - } - - -@pytest.fixture -def sample_isb1_kv_stress_tp_config(): - """ISB1 KV stress config using per-TP expansion.""" - return { - "gptoss-fp4-h200-isb1-kv-stress-vllm-code-tp": { - "image": "vllm/vllm-openai:v0.18.0", - "model": "openai/gpt-oss-120b", - "model-prefix": "gptoss", - "precision": "fp4", - "framework": "vllm", - "runner": "h200", - "benchmark-type": "isb1_kv_stress", - "runtime-stack-id": "standalone:vllm", - "hardware-profile-id": "nvidia:h200_sxm_141gb", - "canonical-model-id": "gpt_oss_120b", - "max-model-len": 131272, - "kv-cache-dtype": "fp8", - "kv-stress-configs": [ - { - "export-file": "datasets/isb1/exports/extension_131k/vllm/code_131k1k.json", - "request-mode": "multi-turn", - "support-status": "reviewed_preview", - "workload-type": "code", - "search-space": [ - { - "users": [1], - "offload-modes": ["off"], - "duration-s": 10, - } - ], - "tp-configs": [ - { - "tp": 8, - "ep": 1, - "users": [2, 4, 8], - "offload-modes": ["on", "off", "noprefix"], - "duration-s": 1800, - } - ], - } - ], - } - } - - -@pytest.fixture -def isb1_kv_stress_sweep_args(): - """Args for isb1-kv-stress-sweep command.""" - args = argparse.Namespace() - args.model_prefix = None - args.precision = None - args.framework = None - args.runner_type = None - args.runner_node_filter = None - return args - - # 
============================================================================= # Test seq_len mappings # ============================================================================= @@ -387,570 +181,6 @@ def test_unknown_sequence_lengths(self): assert seq_len_to_str(4096, 1024) == "4096_1024" -# ============================================================================= -# Test generate_isb1_sweep -# ============================================================================= - -class TestGenerateISB1Sweep: - """Tests for generate_isb1_sweep.""" - - def test_basic_sweep_generation(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - assert len(result) == 5 - - def test_matrix_entry_structure(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - entry = result[0] - assert entry["benchmark-type"] == "isb1_replay" - assert entry["export-file"].endswith("chat_8k1k.json") - assert entry["runtime-stack-id"] == "vllm-0.8.5-h200" - assert entry["hardware-profile-id"] == "h200-8gpu" - assert entry["canonical-model-id"] == "deepseek-r1-0528" - assert entry["support-status"] == "supported" - assert entry["request-mode"] == "multi-turn" - assert entry["max-concurrency"] == 4 - assert entry["max-sessions"] == 2 - assert entry["max-turns-per-session"] == 6 - assert entry["max-output-len"] == 512 - assert entry["num-warmup-sessions"] == 1 - assert entry["ignore-waits"] is True - assert entry["ignore-eos"] is False - assert entry["max-model-len"] == 16384 - assert entry["exp-name"] == "dsr1_isb1" - assert "run-eval" not in entry - - def test_filter_by_model_prefix(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): - isb1_sweep_args.model_prefix = ["dsr1"] - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - assert len(result) == 5 - - isb1_sweep_args.model_prefix = ["gptoss"] - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - assert result == [] - - def test_filter_by_precision(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): - isb1_sweep_args.precision = ["fp8"] - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - assert len(result) == 5 - - isb1_sweep_args.precision = ["fp4"] - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - assert result == [] - - def test_filter_by_framework(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): - isb1_sweep_args.framework = ["vllm"] - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - assert len(result) == 5 - - isb1_sweep_args.framework = ["sglang"] - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - assert result == [] - - def test_filter_by_runner_type(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): - isb1_sweep_args.runner_type = ["h200"] - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - assert len(result) == 5 - - isb1_sweep_args.runner_type = ["h100"] - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - assert result == [] - - def 
test_invalid_runner_type_raises_error(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): - isb1_sweep_args.runner_type = ["not-a-runner"] - with pytest.raises(ValueError, match="Invalid runner type"): - generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - - def test_max_concurrency_cap(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): - isb1_sweep_args.max_concurrency = 6 - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - assert len(result) == 5 - assert sorted(entry["max-concurrency"] for entry in result) == [4, 4, 6, 6, 6] - - def test_non_positive_max_concurrency_skips_all(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): - isb1_sweep_args.max_concurrency = 0 - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - assert result == [] - - def test_max_model_len_passthrough_optional(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - assert all(entry["max-model-len"] == 16384 for entry in result) - - sample_isb1_config["dsr1-isb1-h200-vllm"].pop("max-model-len") - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - assert all(entry["max-model-len"] is None for entry in result) - - def test_runner_node_filter_expands_runner_nodes(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): - isb1_sweep_args.runner_node_filter = "cw" - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - assert len(result) == 10 - assert all(entry["runner"].startswith("h200-cw") for entry in result) - - def test_runner_node_filter_no_match_returns_empty(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): - isb1_sweep_args.runner_node_filter = "does-not-exist" - result = generate_isb1_sweep( - isb1_sweep_args, - sample_isb1_config, - sample_runner_config, - ) - assert result == [] - - def test_main_routes_isb1_sweep(self, tmp_path, sample_isb1_config, sample_runner_config, monkeypatch): - import yaml - import sys - from generate_sweep_configs import main - - sample_entry = sample_isb1_config["dsr1-isb1-h200-vllm"] - for replay_config in sample_entry["replay-configs"]: - _write_isb1_export_fixture( - tmp_path, - replay_config["export-file"], - runtime_stack_id=sample_entry["runtime-stack-id"], - hardware_profile_id=sample_entry["hardware-profile-id"], - canonical_model_id=sample_entry["canonical-model-id"], - support_status=replay_config["support-status"], - ) - - config_file = tmp_path / "isb1.yaml" - runner_file = tmp_path / "runners.yaml" - config_file.write_text(yaml.dump(sample_isb1_config)) - runner_file.write_text(yaml.dump(sample_runner_config)) - - monkeypatch.setattr( - sys, - "argv", - [ - "generate_sweep_configs.py", - "isb1-sweep", - "--config-files", - str(config_file), - "--runner-config", - str(runner_file), - ], - ) - - result = main() - assert len(result) == 5 - assert all(entry["benchmark-type"] == "isb1_replay" for entry in result) - - -class TestKVStressSweep: - """Tests for generate_isb1_kv_stress_sweep.""" - - def test_basic_kv_stress_sweep_generation( - self, - sample_isb1_kv_stress_config, - sample_runner_config, - isb1_kv_stress_sweep_args, - ): - result = generate_isb1_kv_stress_sweep( - isb1_kv_stress_sweep_args, - sample_isb1_kv_stress_config, - sample_runner_config, - ) 
- # users(3) * offload-modes(3) = 9 flattened rows - assert len(result) == 9 - - def test_flatten_users_x_offload_modes( - self, - sample_isb1_kv_stress_config, - sample_runner_config, - isb1_kv_stress_sweep_args, - ): - result = generate_isb1_kv_stress_sweep( - isb1_kv_stress_sweep_args, - sample_isb1_kv_stress_config, - sample_runner_config, - ) - - assert all(entry["benchmark-type"] == "isb1_kv_stress" for entry in result) - assert all(isinstance(entry["max-concurrency"], int) for entry in result) - assert all(isinstance(entry["offload-mode"], str) for entry in result) - assert all(entry["benchmark-duration-s"] == 1800 for entry in result) - assert all(entry["kv-cache-dtype"] == "fp8" for entry in result) - assert all(entry["workload-type"] == "code" for entry in result) - - pairs = {(entry["max-concurrency"], entry["offload-mode"]) for entry in result} - assert pairs == { - (2, "on"), - (2, "off"), - (2, "noprefix"), - (4, "on"), - (4, "off"), - (4, "noprefix"), - (8, "on"), - (8, "off"), - (8, "noprefix"), - } - - def test_tp_config_expansion_produces_expected_rows( - self, - sample_isb1_kv_stress_tp_config, - sample_runner_config, - isb1_kv_stress_sweep_args, - ): - result = generate_isb1_kv_stress_sweep( - isb1_kv_stress_sweep_args, - sample_isb1_kv_stress_tp_config, - sample_runner_config, - ) - - # users(3) * offload-modes(3) = 9 rows from tp-configs expansion - assert len(result) == 9 - assert {entry["tp"] for entry in result} == {8} - assert {entry["ep"] for entry in result} == {1} - - def test_repo_kv_stress_config_loads_and_expands(self, isb1_kv_stress_sweep_args): - repo_root = Path(__file__).resolve().parents[2] - config_data = load_isb1_kv_stress_config_files( - [str(repo_root / ".github/configs/isb1-kv-stress.yaml")] - ) - runner_data = { - "b200": ["b200-nb_0"], - "h200": ["h200-cw_2"], - } - - matrix = generate_isb1_kv_stress_sweep( - isb1_kv_stress_sweep_args, - config_data, - runner_data, - ) - - # isb1-kv-stress.yaml covers many configs across multiple models, hardware - # profiles, and TP/PP shapes; the expanded matrix pairs each config with - # the users x offload-modes cross-product. The post-PR1032 kv-stress config - # declares explicit tp/ep per stanza via tp-configs expansion, so those - # keys MUST be present on every row. - assert len(matrix) > 0 - assert all(entry["benchmark-type"] == "isb1_kv_stress" for entry in matrix) - assert all("tp" in entry for entry in matrix) - assert all("ep" in entry for entry in matrix) - # Ensure every row resolves to an existing bundle on disk. - repo_root = Path(__file__).resolve().parents[2] - assert all((repo_root / entry["export-file"]).exists() for entry in matrix) - - -class TestISB1SweepIsolation: - """Tests for ISB1 sweep isolation from throughput config lane.""" - - def test_repo_isb1_master_includes_runtime_expansion_cells(self, isb1_sweep_args): - repo_root = Path(__file__).resolve().parents[2] - config_data = load_isb1_config_files( - [str(repo_root / ".github/configs/isb1-master.yaml")] - ) - runner_data = { - "b200": ["b200-nb_0"], - "h100": ["h100-cw_0"], - "h200": ["h200-cw_2"], - } - - matrix = generate_isb1_sweep(isb1_sweep_args, config_data, runner_data) - config_keys = set(config_data) - matrix_key_triples = { - (entry["model-prefix"], entry["framework"], entry["runner"]) - for entry in matrix - } - - # Current closure: standalone:vllm core/extension for dsr1/gptoss/qwen3.5, - # plus bounded 500k code preview on standalone:sglang. 
SGLang core/extension - # lanes and vllm 500k/1M previews are deferred until matching cells are - # materialized. - assert "dsr1-fp8-b200-isb1-vllm" in config_keys - assert "dsr1-fp8-h200-isb1-vllm" in config_keys - assert "gptoss-fp4-b200-isb1-vllm" in config_keys - assert "gptoss-fp4-h100-isb1-vllm" in config_keys - assert "gptoss-fp4-h200-isb1-vllm" in config_keys - assert "qwen3.5-fp8-b200-isb1-vllm" in config_keys - assert "qwen3.5-fp8-h100-isb1-vllm" in config_keys - assert "qwen3.5-fp8-h200-isb1-vllm" in config_keys - assert "gptoss-fp4-b200-isb1-vllm-extension" in config_keys - assert "gptoss-fp4-h100-isb1-vllm-extension" in config_keys - assert "gptoss-fp4-h200-isb1-vllm-extension" in config_keys - assert "qwen3.5-fp8-b200-isb1-vllm-extension" in config_keys - assert "qwen3.5-fp8-h100-isb1-vllm-extension" in config_keys - assert "qwen3.5-fp8-h200-isb1-vllm-extension" in config_keys - assert "gptoss-fp4-b200-isb1-sglang-500k-preview-code" in config_keys - assert "gptoss-fp4-h100-isb1-sglang-500k-preview-code" in config_keys - assert "gptoss-fp4-h200-isb1-sglang-500k-preview-code" in config_keys - assert "qwen3.5-fp8-b200-isb1-sglang-500k-preview-code" in config_keys - assert "qwen3.5-fp8-h100-isb1-sglang-500k-preview-code" in config_keys - assert "qwen3.5-fp8-h200-isb1-sglang-500k-preview-code" in config_keys - - assert ("dsr1", "vllm", "b200") in matrix_key_triples - assert ("dsr1", "vllm", "h200") in matrix_key_triples - assert ("gptoss", "vllm", "b200") in matrix_key_triples - assert ("gptoss", "vllm", "h100") in matrix_key_triples - assert ("gptoss", "vllm", "h200") in matrix_key_triples - assert ("qwen3.5", "vllm", "b200") in matrix_key_triples - assert ("qwen3.5", "vllm", "h100") in matrix_key_triples - assert ("qwen3.5", "vllm", "h200") in matrix_key_triples - assert ("gptoss", "sglang", "b200") in matrix_key_triples - assert ("gptoss", "sglang", "h100") in matrix_key_triples - assert ("gptoss", "sglang", "h200") in matrix_key_triples - assert ("qwen3.5", "sglang", "b200") in matrix_key_triples - assert ("qwen3.5", "sglang", "h100") in matrix_key_triples - assert ("qwen3.5", "sglang", "h200") in matrix_key_triples - - # Deferred stanzas must not appear in the master closure. 
- for deferred in [ - "dsr1-fp8-h100-isb1-sglang", - "dsr1-fp8-h100-isb1-vllm", - "dsr1-fp8-b200-isb1-sglang", - "dsr1-fp8-h200-isb1-sglang", - "gptoss-fp4-b200-isb1-sglang", - "gptoss-fp4-h100-isb1-sglang", - "gptoss-fp4-h200-isb1-sglang", - "qwen3.5-fp8-b200-isb1-sglang", - "qwen3.5-fp8-h100-isb1-sglang", - "qwen3.5-fp8-h200-isb1-sglang", - "gptoss-fp4-b200-isb1-sglang-extension", - "gptoss-fp4-h100-isb1-sglang-extension", - "gptoss-fp4-h200-isb1-sglang-extension", - "qwen3.5-fp8-h100-isb1-sglang-extension", - "gptoss-fp4-b200-isb1-vllm-500k-preview-code", - "gptoss-fp4-h100-isb1-vllm-500k-preview-code", - "gptoss-fp4-h200-isb1-vllm-500k-preview-code", - "qwen3.5-fp8-b200-isb1-vllm-500k-preview-code", - "qwen3.5-fp8-h100-isb1-vllm-500k-preview-code", - "qwen3.5-fp8-h200-isb1-vllm-500k-preview-code", - "gptoss-fp4-b200-isb1-sglang-offload-core-preview-chat", - "gptoss-fp4-h100-isb1-sglang-offload-core-preview-chat", - "gptoss-fp4-h200-isb1-sglang-offload-core-preview-chat", - "gptoss-fp4-b200-isb1-vllm-offload-core-preview-code", - "gptoss-fp4-h100-isb1-vllm-offload-core-preview-code", - "gptoss-fp4-h200-isb1-vllm-offload-core-preview-code", - ]: - assert deferred not in config_keys, ( - f"{deferred} must not be in isb1-master.yaml until matching " - f"cells are materialized in the corresponding bundle" - ) - - # Bundle path flatness: no per-engine subdirs and no __engine suffixes. - assert all("/vllm/" not in entry["export-file"] for entry in matrix) - assert all("/sglang/" not in entry["export-file"] for entry in matrix) - assert all("__vllm.json" not in entry["export-file"] for entry in matrix) - assert all("__sglang.json" not in entry["export-file"] for entry in matrix) - - # Core dsr1 coverage: chat is supported, code is reviewed_preview only. - assert any( - entry["export-file"].endswith("core/chat_8k1k.json") - and entry["support-status"] == "supported" - for entry in matrix - ) - assert any( - entry["export-file"].endswith("core/code_8k1k.json") - and entry["support-status"] == "reviewed_preview" - for entry in matrix - ) - assert not any( - entry["export-file"].endswith("core/code_8k1k.json") - and entry["support-status"] == "supported" - for entry in matrix - ) - - # Extension coverage on flat paths. - assert any( - entry["export-file"].endswith("extension_32k/chat_32k1k.json") - and entry["support-status"] == "supported" - for entry in matrix - ) - assert any( - entry["export-file"].endswith("extension_32k/code_32k1k.json") - and entry["support-status"] == "reviewed_preview" - for entry in matrix - ) - assert any( - entry["export-file"].endswith("extension_64k/code_64k1k.json") - and entry["support-status"] == "supported" - for entry in matrix - ) - assert any( - entry["export-file"].endswith("extension_131k/chat_131k1k.json") - and entry["support-status"] == "reviewed_preview" - for entry in matrix - ) - assert any( - entry["export-file"].endswith("extension_131k/code_131k1k.json") - and entry["support-status"] == "unsupported" - for entry in matrix - ) - - # Qwen flat-path bundles (no _qwen3.5 bundles at the engine-subdir level). - # After path-flattening, both vllm and sglang cells resolve to the same - # flat bundle path, so filter by framework explicitly. 
- qwen_131k_all = [ - entry - for entry in matrix - if entry["export-file"].endswith("extension_131k/code_131k1k_qwen3.5.json") - ] - assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_131k_all) - assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_131k_all) - - qwen_vllm_131k = [e for e in qwen_131k_all if e["framework"] == "vllm"] - assert len(qwen_vllm_131k) == 6 - - qwen_sglang_131k = [e for e in qwen_131k_all if e["framework"] == "sglang"] - assert len(qwen_sglang_131k) == 4 - - # 500k sglang preview: gptoss and qwen, one bundle per surface. - gptoss_sglang_500k = [ - entry - for entry in matrix - if entry["export-file"].endswith( - "preview/long_context_500k/" - "inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json" - ) - ] - assert len(gptoss_sglang_500k) == 3 - assert all(entry["framework"] == "sglang" for entry in gptoss_sglang_500k) - assert all(entry["support-status"] == "reviewed_preview" for entry in gptoss_sglang_500k) - assert all(entry["max-model-len"] == 524288 for entry in gptoss_sglang_500k) - assert all(entry["max-concurrency"] == 1 for entry in gptoss_sglang_500k) - - qwen_sglang_500k = [ - entry - for entry in matrix - if entry["export-file"].endswith( - "preview/long_context_500k/" - "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json" - ) - ] - assert len(qwen_sglang_500k) == 3 - assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_sglang_500k) - assert all(entry["framework"] == "sglang" for entry in qwen_sglang_500k) - assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_sglang_500k) - assert all(entry["max-model-len"] == 524288 for entry in qwen_sglang_500k) - - # 1M qwen preview: gated-only, not part of isb1-master.yaml. - assert not any( - "long_context_1m" in entry["export-file"] for entry in matrix - ) - - # Every produced row must resolve to an existing bundle on disk. - assert all((repo_root / entry["export-file"]).exists() for entry in matrix) - - def test_repo_qwen_1m_preview_config_is_manual_and_separate(self, isb1_sweep_args): - repo_root = Path(__file__).resolve().parents[2] - config_data = load_isb1_config_files( - [str(repo_root / ".github/configs/isb1-qwen-1m-preview.yaml")] - ) - runner_data = { - "b200": ["b200-nb_0"], - "h100": ["h100-cw_0"], - "h200": ["h200-cw_2"], - } - - matrix = generate_isb1_sweep(isb1_sweep_args, config_data, runner_data) - config_keys = set(config_data) - - # 1M preview bundle currently carries standalone:sglang cells only. The - # vllm 1M stanza is deferred until matching cells are materialized. 
- assert config_keys == { - "qwen3.5-fp8-b200-isb1-sglang-1m-gated-preview-code", - } - assert len(matrix) == 1 - assert {entry["runner"] for entry in matrix} == {"b200"} - assert {entry["framework"] for entry in matrix} == {"sglang"} - assert {entry["model-prefix"] for entry in matrix} == {"qwen3.5"} - assert {entry["support-status"] for entry in matrix} == {"reviewed_preview"} - assert {entry["max-model-len"] for entry in matrix} == {1048576} - assert {entry["max-concurrency"] for entry in matrix} == {1} - assert {entry["max-sessions"] for entry in matrix} == {1} - assert {entry["max-turns-per-session"] for entry in matrix} == {3} - assert { - entry["canonical-model-id"] for entry in matrix - } == {"qwen3_5_397b_a17b"} - assert { - entry["export-file"] for entry in matrix - } == { - "datasets/isb1/exports/preview/long_context_1m/" - "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json", - } - assert all((repo_root / entry["export-file"]).exists() for entry in matrix) - - - def test_isb1_config_does_not_validate_as_throughput(self, tmp_path, sample_isb1_config): - import yaml - - config_file = tmp_path / "isb1.yaml" - config_file.write_text(yaml.dump(sample_isb1_config)) - - with pytest.raises(ValueError): - load_config_files([str(config_file)]) - - def test_throughput_config_does_not_validate_as_isb1(self, tmp_path, sample_single_node_config): - import yaml - - config_file = tmp_path / "throughput.yaml" - config_file.write_text(yaml.dump(sample_single_node_config)) - - with pytest.raises(ValueError): - load_isb1_config_files([str(config_file)]) - - # ============================================================================= # Test generate_full_sweep for single-node # ============================================================================= diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py index 06267da22..0f1f44c27 100644 --- a/utils/matrix_logic/test_validation.py +++ b/utils/matrix_logic/test_validation.py @@ -1,31 +1,20 @@ """Comprehensive tests for validation.py""" -import json -from pathlib import Path - import pytest -import yaml from validation import ( Fields, SingleNodeMatrixEntry, MultiNodeMatrixEntry, - ISB1ReplayMatrixEntry, WorkerConfig, SingleNodeSearchSpaceEntry, MultiNodeSearchSpaceEntry, - ISB1ReplaySearchSpaceEntry, - ISB1ReplayConfigEntry, SingleNodeSeqLenConfig, MultiNodeSeqLenConfig, SingleNodeMasterConfigEntry, MultiNodeMasterConfigEntry, - ISB1MasterConfigEntry, validate_matrix_entry, - validate_isb1_matrix_entry, validate_master_config, - validate_isb1_master_config, validate_runner_config, load_config_files, - load_isb1_config_files, load_runner_file, ) @@ -34,68 +23,6 @@ # Test Fixtures # ============================================================================= - -def _write_isb1_export_fixture( - root: Path, - relative_path: str, - *, - runtime_stack_id: str, - hardware_profile_id: str, - canonical_model_id: str, - support_status: str, - benchmark_certification_status: str = "dataset_replay_verified", -) -> None: - export_path = root / relative_path - export_path.parent.mkdir(parents=True, exist_ok=True) - export_path.write_text( - json.dumps( - { - "adapter_id": "inferencex_multiturn", - "exports": [ - { - "trace_id": f"{export_path.stem}-trace", - "runtime_stack_id": runtime_stack_id, - "hardware_profile_id": hardware_profile_id, - "canonical_model_id": canonical_model_id, - "support_status": support_status, - "benchmark_certification_status": benchmark_certification_status, - "session": { - 
"session_id": "fixture-session", - "turns": [ - { - "turn_idx": 0, - "turn_id": 0, - "messages": [{"role": "user", "content": "hello"}], - "expected_output_tokens": 8, - } - ], - }, - } - ], - } - ) - ) - - -def _write_manifest_fixture( - root: Path, - relative_path: str, - *, - export_file: str, - max_model_len: int, -) -> None: - manifest_path = root / relative_path - manifest_path.parent.mkdir(parents=True, exist_ok=True) - manifest_path.write_text( - json.dumps( - { - "manifest_version": "0.1.0", - "max_model_len": max_model_len, - "exports": [{"export_file": export_file}], - } - ) - ) - @pytest.fixture def valid_single_node_matrix_entry(): """Valid single node matrix entry based on dsr1-fp4-mi355x-sglang config.""" @@ -232,74 +159,6 @@ def valid_multinode_master_config(): } -@pytest.fixture -def valid_isb1_master_config(): - """Valid ISB1 replay master config for NVIDIA PR1a.""" - return { - "image": "vllm/vllm-openai:v0.8.5", - "model": "deepseek-ai/DeepSeek-R1-0528", - "model-prefix": "dsr1", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "benchmark-type": "isb1_replay", - "runtime-stack-id": "vllm-0.8.5-h200", - "hardware-profile-id": "h200-8gpu", - "canonical-model-id": "deepseek-r1-0528", - "max-model-len": 16384, - "replay-configs": [ - { - "export-file": "datasets/isb1/exports/core/chat_8k1k.json", - "request-mode": "multi-turn", - "support-status": "supported", - "search-space": [ - { - "max-concurrency": 4, - "max-sessions": 2, - "max-turns-per-session": 6, - "max-output-len": 512, - "num-warmup-sessions": 1, - "ignore-waits": True, - "ignore-eos": False, - }, - { - "max-concurrency": 8, - }, - ], - } - ], - } - - -@pytest.fixture -def valid_isb1_matrix_entry(valid_isb1_master_config): - """Valid ISB1 replay matrix entry.""" - return { - "image": valid_isb1_master_config["image"], - "model": valid_isb1_master_config["model"], - "model-prefix": valid_isb1_master_config["model-prefix"], - "precision": valid_isb1_master_config["precision"], - "framework": valid_isb1_master_config["framework"], - "runner": valid_isb1_master_config["runner"], - "benchmark-type": valid_isb1_master_config["benchmark-type"], - "export-file": valid_isb1_master_config["replay-configs"][0]["export-file"], - "runtime-stack-id": valid_isb1_master_config["runtime-stack-id"], - "hardware-profile-id": valid_isb1_master_config["hardware-profile-id"], - "canonical-model-id": valid_isb1_master_config["canonical-model-id"], - "support-status": valid_isb1_master_config["replay-configs"][0]["support-status"], - "request-mode": valid_isb1_master_config["replay-configs"][0]["request-mode"], - "max-concurrency": 4, - "max-sessions": 2, - "max-turns-per-session": 6, - "max-output-len": 512, - "num-warmup-sessions": 1, - "ignore-waits": True, - "ignore-eos": False, - "max-model-len": valid_isb1_master_config["max-model-len"], - "exp-name": "dsr1_isb1", - } - - @pytest.fixture def valid_runner_config(): """Valid runner config based on .github/configs/runners.yaml.""" @@ -334,10 +193,6 @@ def test_key_fields_exist(self): assert Fields.SPEC_DECODING.value == "spec-decoding" assert Fields.PREFILL.value == "prefill" assert Fields.DECODE.value == "decode" - assert Fields.BENCHMARK_TYPE.value == "benchmark-type" - assert Fields.SUPPORT_STATUS.value == "support-status" - assert Fields.MAX_CONCURRENCY.value == "max-concurrency" - assert Fields.REPLAY_CONFIGS.value == "replay-configs" # ============================================================================= @@ -803,153 +658,6 @@ def 
test_disagg_default_false(self, valid_single_node_master_config): assert config.disagg is False -# ============================================================================= -# Test ISB1 replay models -# ============================================================================= - -class TestISB1ReplaySearchSpaceEntry: - """Tests for ISB1ReplaySearchSpaceEntry model.""" - - def test_valid_with_required_only(self): - config = ISB1ReplaySearchSpaceEntry(**{ - "max-concurrency": 4, - }) - assert config.max_concurrency == 4 - assert config.num_warmup_sessions == 0 - assert config.ignore_waits is False - assert config.ignore_eos is False - - def test_valid_with_all_fields(self): - config = ISB1ReplaySearchSpaceEntry(**{ - "max-concurrency": 8, - "max-sessions": 2, - "max-turns-per-session": 6, - "max-output-len": 512, - "num-warmup-sessions": 1, - "ignore-waits": True, - "ignore-eos": True, - }) - assert config.max_sessions == 2 - assert config.max_turns_per_session == 6 - assert config.max_output_len == 512 - assert config.num_warmup_sessions == 1 - assert config.ignore_waits is True - assert config.ignore_eos is True - - def test_missing_required_field(self): - with pytest.raises(Exception): - ISB1ReplaySearchSpaceEntry(**{ - "max-sessions": 2, - }) - - def test_extra_field_forbidden(self): - with pytest.raises(Exception): - ISB1ReplaySearchSpaceEntry(**{ - "max-concurrency": 4, - "unknown-field": "value", - }) - - -class TestISB1ReplayConfigEntry: - """Tests for ISB1ReplayConfigEntry model.""" - - def test_valid_entry(self): - config = ISB1ReplayConfigEntry(**{ - "export-file": "datasets/isb1/exports/core/chat_8k1k.json", - "request-mode": "multi-turn", - "support-status": "supported", - "search-space": [{"max-concurrency": 4}], - }) - assert config.export_file.endswith("chat_8k1k.json") - assert config.request_mode == "multi-turn" - assert config.support_status == "supported" - assert len(config.search_space) == 1 - - def test_invalid_support_status(self): - with pytest.raises(Exception): - ISB1ReplayConfigEntry(**{ - "export-file": "datasets/isb1/exports/core/chat_8k1k.json", - "request-mode": "multi-turn", - "support-status": "definitely_supported", - "search-space": [{"max-concurrency": 4}], - }) - - def test_missing_export_file(self): - with pytest.raises(Exception): - ISB1ReplayConfigEntry(**{ - "request-mode": "multi-turn", - "search-space": [{"max-concurrency": 4}], - }) - - def test_missing_request_mode(self): - with pytest.raises(Exception): - ISB1ReplayConfigEntry(**{ - "export-file": "datasets/isb1/exports/core/chat_8k1k.json", - "search-space": [{"max-concurrency": 4}], - }) - - def test_empty_search_space(self): - with pytest.raises(Exception): - ISB1ReplayConfigEntry(**{ - "export-file": "datasets/isb1/exports/core/chat_8k1k.json", - "request-mode": "multi-turn", - "search-space": [], - }) - - -class TestISB1MasterConfigEntry: - """Tests for ISB1MasterConfigEntry model.""" - - def test_valid_isb1_master_config(self, valid_isb1_master_config): - config = ISB1MasterConfigEntry(**valid_isb1_master_config) - assert config.benchmark_type == "isb1_replay" - assert config.model_prefix == "dsr1" - assert config.runner == "h200" - assert config.max_model_len == 16384 - assert len(config.replay_configs) == 1 - - def test_max_model_len_optional(self, valid_isb1_master_config): - del valid_isb1_master_config["max-model-len"] - config = ISB1MasterConfigEntry(**valid_isb1_master_config) - assert config.max_model_len is None - - def test_benchmark_type_must_match(self, 
valid_isb1_master_config): - valid_isb1_master_config["benchmark-type"] = "throughput" - with pytest.raises(Exception): - ISB1MasterConfigEntry(**valid_isb1_master_config) - - def test_throughput_only_field_rejected(self, valid_isb1_master_config): - valid_isb1_master_config["multinode"] = False - with pytest.raises(Exception): - ISB1MasterConfigEntry(**valid_isb1_master_config) - - def test_missing_required_field(self, valid_isb1_master_config): - del valid_isb1_master_config["runtime-stack-id"] - with pytest.raises(Exception): - ISB1MasterConfigEntry(**valid_isb1_master_config) - - -class TestISB1ReplayMatrixEntry: - """Tests for ISB1ReplayMatrixEntry model.""" - - def test_valid_entry(self, valid_isb1_matrix_entry): - entry = ISB1ReplayMatrixEntry(**valid_isb1_matrix_entry) - assert entry.benchmark_type == "isb1_replay" - assert entry.support_status == "supported" - assert entry.max_concurrency == 4 - assert entry.exp_name == "dsr1_isb1" - - def test_missing_required_field(self, valid_isb1_matrix_entry): - del valid_isb1_matrix_entry["export-file"] - with pytest.raises(Exception): - ISB1ReplayMatrixEntry(**valid_isb1_matrix_entry) - - def test_extra_throughput_field_forbidden(self, valid_isb1_matrix_entry): - valid_isb1_matrix_entry["tp"] = 8 - with pytest.raises(Exception): - ISB1ReplayMatrixEntry(**valid_isb1_matrix_entry) - - # ============================================================================= # Test validate_master_config function # ============================================================================= @@ -988,37 +696,6 @@ def test_invalid_config_raises_valueerror(self, valid_single_node_master_config) assert "failed validation" in str(exc_info.value) -class TestValidateISB1MasterConfig: - """Tests for validate_isb1_master_config function.""" - - def test_valid_isb1_config(self, valid_isb1_master_config): - configs = {"dsr1-isb1-h200-vllm": valid_isb1_master_config} - result = validate_isb1_master_config(configs) - assert result == configs - - def test_invalid_isb1_config_raises_valueerror(self, valid_isb1_master_config): - del valid_isb1_master_config["model"] - configs = {"broken-isb1-config": valid_isb1_master_config} - with pytest.raises(ValueError) as exc_info: - validate_isb1_master_config(configs) - assert "broken-isb1-config" in str(exc_info.value) - assert "failed validation" in str(exc_info.value) - - -class TestValidateISB1MatrixEntry: - """Tests for validate_isb1_matrix_entry function.""" - - def test_valid_entry(self, valid_isb1_matrix_entry): - result = validate_isb1_matrix_entry(valid_isb1_matrix_entry) - assert result == valid_isb1_matrix_entry - - def test_invalid_entry_raises_valueerror(self, valid_isb1_matrix_entry): - del valid_isb1_matrix_entry["benchmark-type"] - with pytest.raises(ValueError) as exc_info: - validate_isb1_matrix_entry(valid_isb1_matrix_entry) - assert "failed validation" in str(exc_info.value) - - # ============================================================================= # Test validate_runner_config function # ============================================================================= @@ -1146,224 +823,6 @@ def test_validation_runs_by_default(self, tmp_path): assert "failed validation" in str(exc_info.value) -class TestLoadISB1ConfigFiles: - """Tests for load_isb1_config_files function.""" - - def test_load_single_file_with_validation(self, tmp_path, valid_isb1_master_config): - config_file = tmp_path / "isb1-config.yaml" - _write_isb1_export_fixture( - tmp_path, - 
valid_isb1_master_config["replay-configs"][0]["export-file"], - runtime_stack_id=valid_isb1_master_config["runtime-stack-id"], - hardware_profile_id=valid_isb1_master_config["hardware-profile-id"], - canonical_model_id=valid_isb1_master_config["canonical-model-id"], - support_status=valid_isb1_master_config["replay-configs"][0]["support-status"], - ) - - config_file.write_text( - yaml.dump({"dsr1-isb1-h200-vllm": valid_isb1_master_config}) - ) - result = load_isb1_config_files([str(config_file)]) - assert "dsr1-isb1-h200-vllm" in result - assert result["dsr1-isb1-h200-vllm"]["benchmark-type"] == "isb1_replay" - - def test_export_contract_rejects_mismatched_support_status( - self, tmp_path, valid_isb1_master_config - ): - config_file = tmp_path / "isb1-config.yaml" - _write_isb1_export_fixture( - tmp_path, - valid_isb1_master_config["replay-configs"][0]["export-file"], - runtime_stack_id=valid_isb1_master_config["runtime-stack-id"], - hardware_profile_id=valid_isb1_master_config["hardware-profile-id"], - canonical_model_id=valid_isb1_master_config["canonical-model-id"], - support_status="reviewed_preview", - ) - config_file.write_text( - yaml.dump({"dsr1-isb1-h200-vllm": valid_isb1_master_config}) - ) - - with pytest.raises(ValueError) as exc_info: - load_isb1_config_files([str(config_file)]) - assert "support-status" in str(exc_info.value) - assert "Available support tiers" in str(exc_info.value) - - def test_export_contract_requires_dataset_replay_verified_certification( - self, tmp_path, valid_isb1_master_config - ): - config_file = tmp_path / "isb1-config.yaml" - _write_isb1_export_fixture( - tmp_path, - valid_isb1_master_config["replay-configs"][0]["export-file"], - runtime_stack_id=valid_isb1_master_config["runtime-stack-id"], - hardware_profile_id=valid_isb1_master_config["hardware-profile-id"], - canonical_model_id=valid_isb1_master_config["canonical-model-id"], - support_status=valid_isb1_master_config["replay-configs"][0]["support-status"], - benchmark_certification_status="pending_review", - ) - config_file.write_text( - yaml.dump({"dsr1-isb1-h200-vllm": valid_isb1_master_config}) - ) - - with pytest.raises(ValueError) as exc_info: - load_isb1_config_files([str(config_file)]) - assert "benchmark_certification_status" in str(exc_info.value) - assert "dataset_replay_verified" in str(exc_info.value) - - def test_export_contract_requires_max_model_len_for_preview_style_export( - self, tmp_path, valid_isb1_master_config - ): - config_file = tmp_path / "isb1-config.yaml" - preview_config = { - **valid_isb1_master_config, - "replay-configs": [ - { - **valid_isb1_master_config["replay-configs"][0], - "export-file": ( - "datasets/isb1/exports/preview/offload_core/" - "inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json" - ), - "support-status": "reviewed_preview", - } - ], - } - del preview_config["max-model-len"] - - _write_isb1_export_fixture( - tmp_path, - preview_config["replay-configs"][0]["export-file"], - runtime_stack_id=preview_config["runtime-stack-id"], - hardware_profile_id=preview_config["hardware-profile-id"], - canonical_model_id=preview_config["canonical-model-id"], - support_status="reviewed_preview", - ) - config_file.write_text(yaml.dump({"preview-row": preview_config})) - - with pytest.raises(ValueError) as exc_info: - load_isb1_config_files([str(config_file)]) - assert "max-model-len" in str(exc_info.value) - - def test_export_contract_accepts_preview_style_export_with_explicit_max_model_len( - self, tmp_path, valid_isb1_master_config - ): - 
config_file = tmp_path / "isb1-config.yaml" - preview_config = { - **valid_isb1_master_config, - "runtime-stack-id": "standalone:vllm", - "hardware-profile-id": "nvidia:h100_sxm_80gb", - "canonical-model-id": "gpt_oss_120b", - "max-model-len": 524288, - "replay-configs": [ - { - **valid_isb1_master_config["replay-configs"][0], - "export-file": ( - "datasets/isb1/exports/preview/long_context_500k/" - "inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json" - ), - "support-status": "reviewed_preview", - } - ], - } - - _write_isb1_export_fixture( - tmp_path, - preview_config["replay-configs"][0]["export-file"], - runtime_stack_id=preview_config["runtime-stack-id"], - hardware_profile_id=preview_config["hardware-profile-id"], - canonical_model_id=preview_config["canonical-model-id"], - support_status="reviewed_preview", - ) - config_file.write_text(yaml.dump({"preview-row": preview_config})) - - result = load_isb1_config_files([str(config_file)]) - assert "preview-row" in result - - def test_export_contract_warns_when_manifest_max_model_len_mismatches_config( - self, tmp_path, valid_isb1_master_config - ): - config_file = tmp_path / "isb1-config.yaml" - preview_config = { - **valid_isb1_master_config, - "runtime-stack-id": "standalone:vllm", - "hardware-profile-id": "nvidia:h100_sxm_80gb", - "canonical-model-id": "qwen3_5_397b_a17b", - "max-model-len": 524288, - "replay-configs": [ - { - **valid_isb1_master_config["replay-configs"][0], - "export-file": ( - "datasets/isb1/exports/preview/long_context_500k/" - "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json" - ), - "support-status": "reviewed_preview", - } - ], - } - - export_file = preview_config["replay-configs"][0]["export-file"] - _write_isb1_export_fixture( - tmp_path, - export_file, - runtime_stack_id=preview_config["runtime-stack-id"], - hardware_profile_id=preview_config["hardware-profile-id"], - canonical_model_id=preview_config["canonical-model-id"], - support_status="reviewed_preview", - ) - _write_manifest_fixture( - tmp_path, - "datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json", - export_file=export_file, - max_model_len=1048576, - ) - config_file.write_text(yaml.dump({"preview-row": preview_config})) - - with pytest.warns(UserWarning, match="max-model-len"): - result = load_isb1_config_files([str(config_file)]) - assert "preview-row" in result - - def test_load_single_file_without_validation(self, tmp_path): - config_file = tmp_path / "isb1-config.yaml" - config_file.write_text(""" -test-isb1: - image: test-image - benchmark-type: isb1_replay -""") - result = load_isb1_config_files([str(config_file)], validate=False) - assert "test-isb1" in result - assert result["test-isb1"]["benchmark-type"] == "isb1_replay" - - def test_validation_runs_by_default(self, tmp_path): - config_file = tmp_path / "isb1-config.yaml" - config_file.write_text(""" -invalid-isb1: - image: test-image - benchmark-type: isb1_replay -""") - with pytest.raises(ValueError) as exc_info: - load_isb1_config_files([str(config_file)]) - assert "failed validation" in str(exc_info.value) - - def test_duplicate_keys_raise_error(self, tmp_path): - config1 = tmp_path / "config1.yaml" - config1.write_text(""" -duplicate-key: - benchmark-type: isb1_replay -""") - config2 = tmp_path / "config2.yaml" - config2.write_text(""" -duplicate-key: - benchmark-type: isb1_replay -""") - with pytest.raises(ValueError) as exc_info: - load_isb1_config_files([str(config1), str(config2)], validate=False) - assert "Duplicate 
configuration keys" in str(exc_info.value) - - def test_nonexistent_file_raises_error(self): - with pytest.raises(ValueError) as exc_info: - load_isb1_config_files(["nonexistent-isb1.yaml"]) - assert "does not exist" in str(exc_info.value) - - # ============================================================================= # Test load_runner_file # ============================================================================= diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index 331e374b4..312952b96 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -2,12 +2,8 @@ from typing import List, Optional, Union, Literal from enum import Enum -import json import pprint -import re -import warnings import yaml -from pathlib import Path """ The below class defines the field names expected to be present in the JSON entries @@ -59,31 +55,6 @@ class Fields(Enum): RUN_EVAL = 'run-eval' EVAL_ONLY = 'eval-only' - # ISB1 replay fields - BENCHMARK_TYPE = 'benchmark-type' - EXPORT_FILE = 'export-file' - RUNTIME_STACK_ID = 'runtime-stack-id' - HARDWARE_PROFILE_ID = 'hardware-profile-id' - CANONICAL_MODEL_ID = 'canonical-model-id' - REQUEST_MODE = 'request-mode' - MAX_CONCURRENCY = 'max-concurrency' - SUPPORT_STATUS = 'support-status' - MAX_SESSIONS = 'max-sessions' - MAX_TURNS_PER_SESSION = 'max-turns-per-session' - MAX_OUTPUT_LEN = 'max-output-len' - NUM_WARMUP_SESSIONS = 'num-warmup-sessions' - IGNORE_WAITS = 'ignore-waits' - IGNORE_EOS = 'ignore-eos' - REPLAY_CONFIGS = 'replay-configs' - KV_STRESS_CONFIGS = 'kv-stress-configs' - OFFLOAD_MODE = 'offload-mode' - OFFLOAD_MODES = 'offload-modes' - KV_CACHE_DTYPE = 'kv-cache-dtype' - DISABLE_PREFIX_CACHING = 'disable-prefix-caching' - USERS = 'users' - DURATION_S = 'duration-s' - WORKLOAD_TYPE = 'workload-type' - """ Below is the validation logic for the OUTPUT of utils/matrix_logic/generate_sweep_configs.py, i.e., @@ -176,119 +147,6 @@ def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: return entry -class ISB1ReplayMatrixEntry(BaseModel): - """Pydantic model for validating ISB1 replay matrix entry structure.""" - model_config = ConfigDict(extra='forbid', populate_by_name=True) - - image: str - model: str - model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) - precision: str - framework: str - runner: str - benchmark_type: Literal["isb1_replay"] = Field( - alias=Fields.BENCHMARK_TYPE.value - ) - export_file: str = Field(alias=Fields.EXPORT_FILE.value) - runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) - hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) - canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) - support_status: Optional[ - Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] - ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) - request_mode: str = Field(alias=Fields.REQUEST_MODE.value) - max_concurrency: int = Field(alias=Fields.MAX_CONCURRENCY.value, gt=0) - max_sessions: Optional[int] = Field( - default=None, alias=Fields.MAX_SESSIONS.value, gt=0 - ) - max_turns_per_session: Optional[int] = Field( - default=None, alias=Fields.MAX_TURNS_PER_SESSION.value, gt=0 - ) - max_output_len: Optional[int] = Field( - default=None, alias=Fields.MAX_OUTPUT_LEN.value, gt=0 - ) - num_warmup_sessions: int = Field( - default=0, alias=Fields.NUM_WARMUP_SESSIONS.value, ge=0 - ) - ignore_waits: bool = Field(default=False, alias=Fields.IGNORE_WAITS.value) - ignore_eos: bool = 
Field(default=False, alias=Fields.IGNORE_EOS.value) - max_model_len: Optional[int] = Field( - default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 - ) - offload_mode: Optional[Literal["on", "off", "noprefix", "legacy"]] = Field( - default=None, alias=Fields.OFFLOAD_MODE.value - ) - kv_cache_dtype: Optional[Literal["auto", "fp8"]] = Field( - default=None, alias=Fields.KV_CACHE_DTYPE.value - ) - disable_prefix_caching: Optional[bool] = Field( - default=None, alias=Fields.DISABLE_PREFIX_CACHING.value - ) - benchmark_duration_s: Optional[int] = Field( - default=None, alias='benchmark-duration-s', gt=0 - ) - exp_name: str = Field(alias=Fields.EXP_NAME.value) - - -def validate_isb1_matrix_entry(entry: dict) -> dict: - """Validate that ISB1 replay matrix entries match the expected structure.""" - try: - ISB1ReplayMatrixEntry(**entry) - except ValidationError as e: - raise ValueError( - f"The following ISB1 matrix entry failed validation:\n{pprint.pformat(entry)}\n{e}" - ) - return entry - - -class ISB1KVStressMatrixEntry(BaseModel): - """Pydantic model for validating ISB1 KV stress matrix entry structure.""" - model_config = ConfigDict(extra='forbid', populate_by_name=True) - - image: str - model: str - model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) - precision: str - framework: str - runner: str - benchmark_type: Literal["isb1_kv_stress"] = Field( - alias=Fields.BENCHMARK_TYPE.value - ) - export_file: str = Field(alias=Fields.EXPORT_FILE.value) - runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) - hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) - canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) - support_status: Optional[ - Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] - ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) - request_mode: str = Field(alias=Fields.REQUEST_MODE.value) - max_concurrency: int = Field(alias=Fields.MAX_CONCURRENCY.value, gt=0) - offload_mode: Literal["on", "off", "noprefix", "legacy"] = Field( - alias=Fields.OFFLOAD_MODE.value - ) - kv_cache_dtype: Literal["auto", "fp8"] = Field(alias=Fields.KV_CACHE_DTYPE.value) - disable_prefix_caching: bool = Field(alias=Fields.DISABLE_PREFIX_CACHING.value) - benchmark_duration_s: int = Field(alias='benchmark-duration-s', gt=0) - workload_type: Literal["chat", "code"] = Field(alias=Fields.WORKLOAD_TYPE.value) - tp: Optional[int] = Field(default=None, alias=Fields.TP.value, gt=0) - ep: Optional[int] = Field(default=None, alias=Fields.EP.value, gt=0) - max_model_len: Optional[int] = Field( - default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 - ) - exp_name: str = Field(alias=Fields.EXP_NAME.value) - - -def validate_isb1_kv_stress_matrix_entry(entry: dict) -> dict: - """Validate that ISB1 KV stress matrix entries match the expected structure.""" - try: - ISB1KVStressMatrixEntry(**entry) - except ValidationError as e: - raise ValueError( - f"The following ISB1 KV stress matrix entry failed validation:\n{pprint.pformat(entry)}\n{e}" - ) - return entry - - """ Below is the validation logic for the INPUT to utils/matrix_logic/generate_sweep_configs.py, i.e., the master configuration files found in .github/configs. 
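The matrix-entry models deleted here all share one validation pattern: kebab-case YAML keys map onto snake_case attributes via `Field(alias=...)`, `populate_by_name=True` accepts either spelling, and `extra='forbid'` turns unknown keys into hard errors. A minimal, self-contained sketch of that pattern (not the repo's actual model):

```python
from pydantic import BaseModel, ConfigDict, Field, ValidationError


class MiniEntry(BaseModel):
    model_config = ConfigDict(extra='forbid', populate_by_name=True)

    max_concurrency: int = Field(alias='max-concurrency', gt=0)
    offload_mode: str = Field(default='off', alias='offload-mode')


MiniEntry(**{'max-concurrency': 8, 'offload-mode': 'on'})  # kebab-case YAML keys
MiniEntry(max_concurrency=8)                               # snake_case via populate_by_name

try:
    MiniEntry(**{'max-concurrency': 0, 'offload_modes': ['on']})
except ValidationError as e:
    # gt=0 rejects the zero concurrency; extra='forbid' rejects the typo'd key.
    print(e)
```

The `extra='forbid'` choice is what makes `validate_isb1_matrix_entry` fail loudly on a mistyped YAML key instead of silently dropping it.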
The validation enforces a strict set of @@ -379,89 +237,6 @@ def validate_conc_fields(self): return _validate_conc_fields(self) -class ISB1ReplaySearchSpaceEntry(BaseModel): - """ISB1 replay search space configuration.""" - model_config = ConfigDict(extra='forbid', populate_by_name=True) - - max_concurrency: int = Field(alias=Fields.MAX_CONCURRENCY.value, gt=0) - max_sessions: Optional[int] = Field( - default=None, alias=Fields.MAX_SESSIONS.value, gt=0 - ) - max_turns_per_session: Optional[int] = Field( - default=None, alias=Fields.MAX_TURNS_PER_SESSION.value, gt=0 - ) - max_output_len: Optional[int] = Field( - default=None, alias=Fields.MAX_OUTPUT_LEN.value, gt=0 - ) - num_warmup_sessions: int = Field( - default=0, alias=Fields.NUM_WARMUP_SESSIONS.value, ge=0 - ) - ignore_waits: bool = Field(default=False, alias=Fields.IGNORE_WAITS.value) - ignore_eos: bool = Field(default=False, alias=Fields.IGNORE_EOS.value) - benchmark_duration_s: Optional[int] = Field( - default=None, alias='benchmark-duration-s', gt=0 - ) - - -class ISB1ReplayConfigEntry(BaseModel): - """Per-export replay configuration for ISB1.""" - model_config = ConfigDict(extra='forbid', populate_by_name=True) - - export_file: str = Field(alias=Fields.EXPORT_FILE.value) - request_mode: str = Field(alias=Fields.REQUEST_MODE.value) - support_status: Optional[ - Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] - ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) - search_space: List[ISB1ReplaySearchSpaceEntry] = Field( - alias=Fields.SEARCH_SPACE.value, min_length=1 - ) - - -class ISB1KVStressSearchSpaceEntry(BaseModel): - """ISB1 KV stress search space configuration.""" - model_config = ConfigDict(extra='forbid', populate_by_name=True) - - users: List[int] = Field(alias=Fields.USERS.value, min_length=1) - offload_modes: List[Literal["on", "off", "noprefix", "legacy"]] = Field( - alias=Fields.OFFLOAD_MODES.value, - min_length=1, - ) - duration_s: int = Field(alias=Fields.DURATION_S.value, gt=0) - - -class ISB1KVStressTPConfig(BaseModel): - """Per-TP KV stress configuration for ISB1 parity sweeps.""" - model_config = ConfigDict(extra='forbid', populate_by_name=True) - - tp: int = Field(gt=0) - ep: int = Field(default=1, gt=0) - users: List[int] = Field(alias=Fields.USERS.value, min_length=1) - offload_modes: List[Literal["on", "off", "noprefix", "legacy"]] = Field( - alias=Fields.OFFLOAD_MODES.value, - min_length=1, - ) - duration_s: int = Field(alias=Fields.DURATION_S.value, gt=0) - - -class ISB1KVStressConfigEntry(BaseModel): - """Per-export KV stress configuration for ISB1.""" - model_config = ConfigDict(extra='forbid', populate_by_name=True) - - export_file: str = Field(alias=Fields.EXPORT_FILE.value) - request_mode: str = Field(alias=Fields.REQUEST_MODE.value) - support_status: Optional[ - Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] - ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) - workload_type: Literal["chat", "code"] = Field(alias=Fields.WORKLOAD_TYPE.value) - search_space: List[ISB1KVStressSearchSpaceEntry] = Field( - alias=Fields.SEARCH_SPACE.value, min_length=1 - ) - tp_configs: Optional[List[ISB1KVStressTPConfig]] = Field( - default=None, - alias='tp-configs', - ) - - class SingleNodeSeqLenConfig(BaseModel): """Single node sequence length configuration.""" model_config = ConfigDict(extra='forbid', populate_by_name=True) @@ -514,335 +289,6 @@ class MultiNodeMasterConfigEntry(BaseModel): alias=Fields.SEQ_LEN_CONFIGS.value) -class 
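For intuition, a KV stress search-space entry is the compact form of a cross-product sweep: each `users` value is paired with each offload mode at a fixed duration. `generate_sweep_configs.py` itself is not shown in this diff, so the expansion below is a plausible sketch of that step rather than the actual implementation (field names follow the Pydantic aliases above):

```python
from itertools import product


def expand_kv_stress_search_space(search_space: list[dict]) -> list[dict]:
    entries = []
    for space in search_space:
        for users, offload_mode in product(space['users'], space['offload-modes']):
            entries.append({
                'max-concurrency': users,
                'offload-mode': offload_mode,
                'benchmark-duration-s': space['duration-s'],
            })
    return entries


# A 6-user-level x 3-mode sweep yields 18 matrix entries per export.
rows = expand_kv_stress_search_space([
    {'users': [2, 8, 32, 64, 128, 256],
     'offload-modes': ['on', 'off', 'noprefix'],
     'duration-s': 1800},
])
assert len(rows) == 18
```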
ISB1MasterConfigEntry(BaseModel): - """Top-level ISB1 replay master configuration entry.""" - model_config = ConfigDict(extra='forbid', populate_by_name=True) - - image: str - model: str - model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) - precision: str - framework: str - runner: str - benchmark_type: Literal["isb1_replay"] = Field( - alias=Fields.BENCHMARK_TYPE.value - ) - runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) - hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) - canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) - max_model_len: Optional[int] = Field( - default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 - ) - offload_mode: Optional[Literal["on", "off", "noprefix", "legacy"]] = Field( - default=None, alias=Fields.OFFLOAD_MODE.value - ) - kv_cache_dtype: Optional[Literal["auto", "fp8"]] = Field( - default=None, alias=Fields.KV_CACHE_DTYPE.value - ) - disable_prefix_caching: Optional[bool] = Field( - default=None, alias=Fields.DISABLE_PREFIX_CACHING.value - ) - replay_configs: List[ISB1ReplayConfigEntry] = Field( - alias=Fields.REPLAY_CONFIGS.value, min_length=1 - ) - - -class ISB1KVStressMasterConfigEntry(BaseModel): - """Top-level ISB1 KV stress master configuration entry.""" - model_config = ConfigDict(extra='forbid', populate_by_name=True) - - image: str - model: str - model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) - precision: str - framework: str - runner: str - benchmark_type: Literal["isb1_kv_stress"] = Field( - alias=Fields.BENCHMARK_TYPE.value - ) - runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) - hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) - canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) - max_model_len: Optional[int] = Field( - default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 - ) - kv_cache_dtype: Literal["auto", "fp8"] = Field(alias=Fields.KV_CACHE_DTYPE.value) - kv_stress_configs: List[ISB1KVStressConfigEntry] = Field( - alias=Fields.KV_STRESS_CONFIGS.value, - min_length=1, - ) - - -ISB1_SHAPE_STEM_RE = re.compile(r"(?P<isl>\d+)k(?P<osl>\d+)k") -ISB1_RUNNABLE_CERTIFICATION_STATUSES = ["dataset_replay_verified"] - - -def _candidate_config_roots(config_file: str) -> list[Path]: - """Return candidate repo roots for resolving relative export-file paths.""" - config_path = Path(config_file).resolve() - parent_candidates = [config_path.parents[i] for i in range(min(3, len(config_path.parents)))] - candidates = [ - config_path.parent, - *parent_candidates, - Path.cwd().resolve(), - ] - - unique_candidates: list[Path] = [] - for candidate in candidates: - if candidate not in unique_candidates: - unique_candidates.append(candidate) - return unique_candidates - - -def _resolve_export_path(config_file: str, export_file: str) -> Path: - """Resolve an export file relative to the config file or current repo root.""" - export_path = Path(export_file) - if export_path.is_absolute(): - return export_path - - candidate_roots = _candidate_config_roots(config_file) - for candidate_root in candidate_roots: - candidate = candidate_root / export_path - if candidate.exists(): - return candidate - - return candidate_roots[0] / export_path - - -def _load_export_payload(export_path: Path) -> dict: - """Load an ISB1 export payload from disk.""" - try: - with export_path.open("r") as handle: - payload = json.load(handle) - except FileNotFoundError as exc: - raise ValueError(f"Referenced ISB1 export file does not exist: '{export_path}'.") from
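The resolver above is deliberately forgiving about where a config file lives: it probes the config file's directory, up to three of its parents, and the current working directory, and the first root under which the relative export path actually exists wins. Roughly, and assuming a checkout where the repo root is the third parent of `.github/configs/`:

```python
# Illustration only; mirrors the probe order of _resolve_export_path above.
from pathlib import Path

config_path = Path('.github/configs/isb1-master.yaml').resolve()  # example input
export_file = Path('datasets/isb1/exports/core/chat_8k1k.json')   # relative path

roots = [config_path.parent, *config_path.parents[:3], Path.cwd().resolve()]
seen, candidates = set(), []
for root in roots:                      # de-duplicate while preserving order
    if root not in seen:
        seen.add(root)
        candidates.append(root)

resolved = next((r / export_file for r in candidates if (r / export_file).exists()),
                candidates[0] / export_file)  # fall back to the first root
```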
exc - except json.JSONDecodeError as exc: - raise ValueError(f"Referenced ISB1 export file is not valid JSON: '{export_path}'.") from exc - - exports = payload.get("exports") - if not isinstance(exports, list) or not exports: - raise ValueError( - f"Referenced ISB1 export file must contain a non-empty 'exports' list: '{export_path}'." - ) - return payload - - -def _identity_cells(payload: dict, entry: dict) -> list[dict]: - """Return export cells matching the configured runtime/hardware/model identity.""" - return [ - cell - for cell in payload["exports"] - if cell.get("runtime_stack_id") == entry[Fields.RUNTIME_STACK_ID.value] - and cell.get("hardware_profile_id") == entry[Fields.HARDWARE_PROFILE_ID.value] - and cell.get("canonical_model_id") == entry[Fields.CANONICAL_MODEL_ID.value] - ] - - -def _warn_manifest_max_model_len_mismatch( - *, - export_path: Path, - export_file: str, - max_model_len: Optional[int], - key: str, -) -> None: - """Emit advisory warning if sibling manifest max_model_len disagrees with config.""" - if max_model_len is None: - return - - for manifest_path in sorted(export_path.parent.glob("manifest*.json")): - try: - manifest_payload = json.loads(manifest_path.read_text()) - except (OSError, json.JSONDecodeError): - continue - - manifest_exports = manifest_payload.get("exports") - if isinstance(manifest_exports, list): - export_files = { - item.get("export_file") - for item in manifest_exports - if isinstance(item, dict) and isinstance(item.get("export_file"), str) - } - if export_files and export_file not in export_files: - continue - - manifest_max_model_len = manifest_payload.get("max_model_len") - if manifest_max_model_len is None: - continue - - try: - manifest_max_model_len = int(manifest_max_model_len) - except (TypeError, ValueError): - continue - - if manifest_max_model_len != max_model_len: - warnings.warn( - f"ISB1 master config entry '{key}' sets '{Fields.MAX_MODEL_LEN.value}'=" - f"{max_model_len} for export '{export_file}', but sibling manifest " - f"'{manifest_path}' declares max_model_len={manifest_max_model_len}.", - stacklevel=2, - ) - - -def certify_isb1_replay_contract(master_configs: dict, config_file: str) -> dict: - """Validate that every replay-config resolves to a real, runnable export selection.""" - for key, entry in master_configs.items(): - max_model_len = entry.get(Fields.MAX_MODEL_LEN.value) - - for replay_config in entry[Fields.REPLAY_CONFIGS.value]: - export_file = replay_config[Fields.EXPORT_FILE.value] - support_status = replay_config.get(Fields.SUPPORT_STATUS.value) - export_path = _resolve_export_path(config_file, export_file) - payload = _load_export_payload(export_path) - _warn_manifest_max_model_len_mismatch( - export_path=export_path, - export_file=export_file, - max_model_len=max_model_len, - key=key, - ) - - if not ISB1_SHAPE_STEM_RE.search(export_path.stem) and max_model_len is None: - raise ValueError( - f"ISB1 master config entry '{key}' references mixed-shape export " - f"'{export_file}' without '{Fields.MAX_MODEL_LEN.value}'." 
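A worked example of the identity filter above: only cells whose runtime/hardware/model triple matches the config entry survive, and the support tier is then read off the surviving cells. Values below are taken from elsewhere in this patch and arranged for illustration:

```python
payload = {
    "exports": [
        {"runtime_stack_id": "standalone:vllm",
         "hardware_profile_id": "nvidia:h100_sxm_80gb",
         "canonical_model_id": "gpt_oss_120b",
         "support_status": "reviewed_preview",
         "benchmark_certification_status": "dataset_replay_verified"},
        {"runtime_stack_id": "standalone:sglang",  # different stack: filtered out
         "hardware_profile_id": "nvidia:h100_sxm_80gb",
         "canonical_model_id": "gpt_oss_120b",
         "support_status": "supported",
         "benchmark_certification_status": "dataset_replay_verified"},
    ]
}
entry = {
    "runtime-stack-id": "standalone:vllm",
    "hardware-profile-id": "nvidia:h100_sxm_80gb",
    "canonical-model-id": "gpt_oss_120b",
}
cells = [
    cell for cell in payload["exports"]
    if cell.get("runtime_stack_id") == entry["runtime-stack-id"]
    and cell.get("hardware_profile_id") == entry["hardware-profile-id"]
    and cell.get("canonical_model_id") == entry["canonical-model-id"]
]
assert len(cells) == 1 and cells[0]["support_status"] == "reviewed_preview"
```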
- ) - - identity_cells = _identity_cells(payload, entry) - identity_statuses = sorted( - { - cell.get("support_status") - for cell in identity_cells - if cell.get("support_status") is not None - } - ) - matching_cells = [ - cell - for cell in identity_cells - if support_status is None or cell.get("support_status") == support_status - ] - - if support_status is None and len(identity_statuses) > 1: - raise ValueError( - f"ISB1 master config entry '{key}' must pin " - f"'{Fields.SUPPORT_STATUS.value}' for export '{export_file}'. " - f"Matching cells span multiple tiers: {identity_statuses}." - ) - - if not matching_cells: - available_statuses = identity_statuses or [""] - raise ValueError( - f"ISB1 master config entry '{key}' requests export '{export_file}' " - f"with support-status '{support_status}', but no export cell matches " - f"runtime_stack_id='{entry[Fields.RUNTIME_STACK_ID.value]}', " - f"hardware_profile_id='{entry[Fields.HARDWARE_PROFILE_ID.value]}', " - f"canonical_model_id='{entry[Fields.CANONICAL_MODEL_ID.value]}'. " - f"Available support tiers for that identity: {available_statuses}." - ) - - certification_statuses = sorted( - { - cell.get("benchmark_certification_status") - for cell in matching_cells - if cell.get("benchmark_certification_status") is not None - } - ) - if not certification_statuses: - raise ValueError( - f"ISB1 master config entry '{key}' requests export '{export_file}' " - "but the selected export cells do not declare " - "'benchmark_certification_status'." - ) - if certification_statuses != ISB1_RUNNABLE_CERTIFICATION_STATUSES: - raise ValueError( - f"ISB1 master config entry '{key}' requests export '{export_file}' " - "with runnable support tier selection, but the selected export cells " - f"have benchmark_certification_status values {certification_statuses}. " - "Current InferenceX consumer lanes only accept " - f"{ISB1_RUNNABLE_CERTIFICATION_STATUSES}." - ) - - return master_configs - - -def certify_isb1_kv_stress_contract(master_configs: dict, config_file: str) -> dict: - """Validate that every kv-stress-config resolves to a real, runnable export selection.""" - for key, entry in master_configs.items(): - max_model_len = entry.get(Fields.MAX_MODEL_LEN.value) - - for kv_stress_config in entry[Fields.KV_STRESS_CONFIGS.value]: - export_file = kv_stress_config[Fields.EXPORT_FILE.value] - support_status = kv_stress_config.get(Fields.SUPPORT_STATUS.value) - export_path = _resolve_export_path(config_file, export_file) - payload = _load_export_payload(export_path) - _warn_manifest_max_model_len_mismatch( - export_path=export_path, - export_file=export_file, - max_model_len=max_model_len, - key=key, - ) - - if not ISB1_SHAPE_STEM_RE.search(export_path.stem) and max_model_len is None: - raise ValueError( - f"ISB1 KV stress config entry '{key}' references mixed-shape export " - f"'{export_file}' without '{Fields.MAX_MODEL_LEN.value}'." - ) - - identity_cells = _identity_cells(payload, entry) - identity_statuses = sorted( - { - cell.get("support_status") - for cell in identity_cells - if cell.get("support_status") is not None - } - ) - matching_cells = [ - cell - for cell in identity_cells - if support_status is None or cell.get("support_status") == support_status - ] - - if support_status is None and len(identity_statuses) > 1: - raise ValueError( - f"ISB1 KV stress config entry '{key}' must pin " - f"'{Fields.SUPPORT_STATUS.value}' for export '{export_file}'. " - f"Matching cells span multiple tiers: {identity_statuses}." 
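Condensed, the decision logic of the certify step reads as below. This is a sketch for intuition only; the real function additionally resolves export paths and emits the manifest advisory warning:

```python
RUNNABLE = ["dataset_replay_verified"]


def certify(identity_cells, support_status, shape_in_stem, max_model_len):
    # Mixed-shape exports (no "NkMk" stem) must pin max-model-len explicitly.
    if not shape_in_stem and max_model_len is None:
        raise ValueError("mixed-shape export needs max-model-len")
    statuses = sorted({c["support_status"] for c in identity_cells})
    # An unpinned support-status is ambiguous when cells span multiple tiers.
    if support_status is None and len(statuses) > 1:
        raise ValueError(f"must pin support-status; tiers: {statuses}")
    matching = [c for c in identity_cells
                if support_status is None or c["support_status"] == support_status]
    if not matching:
        raise ValueError("no export cell matches the requested tier")
    cert = sorted({c["benchmark_certification_status"] for c in matching})
    # Consumer lanes only run replay-verified exports.
    if cert != RUNNABLE:
        raise ValueError(f"unrunnable certification statuses: {cert}")
```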
- ) - - if not matching_cells: - available_statuses = identity_statuses or [""] - raise ValueError( - f"ISB1 KV stress config entry '{key}' requests export '{export_file}' " - f"with support-status '{support_status}', but no export cell matches " - f"runtime_stack_id='{entry[Fields.RUNTIME_STACK_ID.value]}', " - f"hardware_profile_id='{entry[Fields.HARDWARE_PROFILE_ID.value]}', " - f"canonical_model_id='{entry[Fields.CANONICAL_MODEL_ID.value]}'. " - f"Available support tiers for that identity: {available_statuses}." - ) - - certification_statuses = sorted( - { - cell.get("benchmark_certification_status") - for cell in matching_cells - if cell.get("benchmark_certification_status") is not None - } - ) - if not certification_statuses: - raise ValueError( - f"ISB1 KV stress config entry '{key}' requests export '{export_file}' " - "but the selected export cells do not declare " - "'benchmark_certification_status'." - ) - if certification_statuses != ISB1_RUNNABLE_CERTIFICATION_STATUSES: - raise ValueError( - f"ISB1 KV stress config entry '{key}' requests export '{export_file}' " - "with runnable support tier selection, but the selected export cells " - f"have benchmark_certification_status values {certification_statuses}. " - "Current InferenceX consumer lanes only accept " - f"{ISB1_RUNNABLE_CERTIFICATION_STATUSES}." - ) - - return master_configs - - def validate_master_config(master_configs: dict) -> List[dict]: """Validate input master configuration structure.""" for key, entry in master_configs.items(): @@ -858,30 +304,6 @@ def validate_master_config(master_configs: dict) -> List[dict]: f"Master config entry '{key}' failed validation:\n{e}") return master_configs - -def validate_isb1_master_config(master_configs: dict) -> List[dict]: - """Validate ISB1 replay master configuration structure.""" - for key, entry in master_configs.items(): - try: - ISB1MasterConfigEntry(**entry) - except ValidationError as e: - raise ValueError( - f"ISB1 master config entry '{key}' failed validation:\n{e}" - ) - return master_configs - - -def validate_isb1_kv_stress_master_config(master_configs: dict) -> List[dict]: - """Validate ISB1 KV stress master configuration structure.""" - for key, entry in master_configs.items(): - try: - ISB1KVStressMasterConfigEntry(**entry) - except ValidationError as e: - raise ValueError( - f"ISB1 KV stress master config entry '{key}' failed validation:\n{e}" - ) - return master_configs - # Runner Config Validation @@ -949,17 +371,26 @@ class ChangelogMatrixEntry(BaseModel): # ============================================================================= -def _load_and_merge_yaml_files(config_files: List[str]) -> dict: - """Load and merge YAML configuration files.""" +def load_config_files(config_files: List[str], validate: bool = True) -> dict: + """Load and merge configuration files. + + Args: + config_files: List of paths to YAML configuration files. + validate: If True, run validate_master_config on loaded data. Defaults to True. + + Returns: + Merged configuration dictionary. + + Raises: + ValueError: If file doesn't exist, isn't a dict, or has duplicate keys. 
+ """ all_config_data = {} for config_file in config_files: try: with open(config_file, 'r') as f: config_data = yaml.safe_load(f) - if not isinstance(config_data, dict): - raise ValueError( - f"Config file '{config_file}' must contain a dictionary" - ) + assert isinstance( + config_data, dict), f"Config file '{config_file}' must contain a dictionary" # Don't allow '*' wildcard in master config keys as we need to reserve these # for expansion in process_changelog.py @@ -980,60 +411,12 @@ def _load_and_merge_yaml_files(config_files: List[str]) -> dict: except FileNotFoundError: raise ValueError(f"Input file '{config_file}' does not exist.") - return all_config_data - - -def load_config_files(config_files: List[str], validate: bool = True) -> dict: - """Load and merge throughput configuration files. - - Args: - config_files: List of paths to YAML configuration files. - validate: If True, run validate_master_config on loaded data. Defaults to True. - - Returns: - Merged configuration dictionary. - - Raises: - ValueError: If file doesn't exist, isn't a dict, or has duplicate keys. - """ - all_config_data = _load_and_merge_yaml_files(config_files) - if validate: validate_master_config(all_config_data) return all_config_data -def load_isb1_config_files(config_files: List[str], validate: bool = True) -> dict: - """Load and merge ISB1 replay configuration files.""" - all_config_data = _load_and_merge_yaml_files(config_files) - - if validate: - validate_isb1_master_config(all_config_data) - for config_file in config_files: - certify_isb1_replay_contract( - _load_and_merge_yaml_files([config_file]), - config_file=config_file, - ) - - return all_config_data - - -def load_isb1_kv_stress_config_files(config_files: List[str], validate: bool = True) -> dict: - """Load and merge ISB1 KV stress configuration files.""" - all_config_data = _load_and_merge_yaml_files(config_files) - - if validate: - validate_isb1_kv_stress_master_config(all_config_data) - for config_file in config_files: - certify_isb1_kv_stress_contract( - _load_and_merge_yaml_files([config_file]), - config_file=config_file, - ) - - return all_config_data - - def load_runner_file(runner_file: str, validate: bool = True) -> dict: """Load runner configuration file. 
diff --git a/utils/summarize_isb1.py b/utils/summarize_isb1.py deleted file mode 100644 index 3c2428a4b..000000000 --- a/utils/summarize_isb1.py +++ /dev/null @@ -1,238 +0,0 @@ -import argparse -import json -from pathlib import Path -from typing import Any - -try: - from tabulate import tabulate as _tabulate -except ImportError: # pragma: no cover - fallback for minimal local environments - _tabulate = None - - -SUPPORT_STATUS_ORDER = { - "supported": 0, - "reviewed_preview": 1, - "gated": 2, - "artifact_only": 3, - "unsupported": 4, - None: 5, -} - - -def load_isb1_rows(results_dir: Path) -> list[dict[str, Any]]: - """Load processed ISB1 rows from a results directory.""" - rows: list[dict[str, Any]] = [] - for result_path in results_dir.rglob("*.json"): - try: - payload = json.loads(result_path.read_text()) - except (OSError, json.JSONDecodeError): - continue - - candidates = payload if isinstance(payload, list) else [payload] - for candidate in candidates: - if isinstance(candidate, dict) and candidate.get("benchmark_type") == "isb1_replay": - rows.append(candidate) - return rows - - -def sort_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: - """Sort rows in an operator-friendly order.""" - return sorted( - rows, - key=lambda row: ( - SUPPORT_STATUS_ORDER.get(row.get("support_status"), 99), - row.get("infmax_model_prefix", ""), - row.get("hw", ""), - row.get("framework", ""), - row.get("effective_max_context_depth", 0) or 0, - row.get("result_filename", ""), - ), - ) - - -def format_float(value: Any, precision: int = 2) -> str: - """Format a numeric value for markdown tables.""" - if value is None: - return "-" - try: - return f"{float(value):.{precision}f}" - except (TypeError, ValueError): - return str(value) - - -def format_bool(value: Any) -> str: - """Format a truthy value as yes/no for operators.""" - return "yes" if bool(value) else "no" - - -def render_table(headers: list[str], rows: list[list[Any]], tablefmt: str) -> str: - """Render a markdown/plain table with a lightweight fallback if tabulate is absent.""" - normalized_rows = [[str(cell) for cell in row] for row in rows] - if _tabulate is not None: - return _tabulate(normalized_rows, headers=headers, tablefmt=tablefmt) - - widths = [len(header) for header in headers] - for row in normalized_rows: - for index, cell in enumerate(row): - widths[index] = max(widths[index], len(cell)) - - def render_row(row: list[str]) -> str: - cells = [cell.ljust(widths[index]) for index, cell in enumerate(row)] - return f"| {' | '.join(cells)} |" - - divider = f"| {' | '.join('-' * width for width in widths)} |" - lines = [render_row(headers), divider] - lines.extend(render_row(row) for row in normalized_rows) - return "\n".join(lines) - - -def build_lane_summary_table(rows: list[dict[str, Any]], tablefmt: str) -> str: - """Render the main operator lane summary table.""" - headers = [ - "Lane", - "Model", - "HW", - "Framework", - "Support", - "Cert", - "Max Ctx", - "Context Class", - "Sessions", - "Session Tput", - "TTFT Median (s)", - "Ctx Pressure", - "Log Review", - "KV Offload", - "GPU Cache Peak", - "CPU Cache Peak", - ] - table_rows = [ - [ - row.get("result_filename", "-"), - row.get("infmax_model_prefix", "-"), - row.get("hw", "-"), - row.get("framework", "-"), - row.get("support_status", "-"), - row.get("benchmark_certification_status", "-"), - row.get("effective_max_context_depth", "-"), - row.get("context_pressure_class", "-"), - f"{row.get('completed_sessions', 0)}/{row.get('total_sessions', 0)}", - 
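Given the `render_table` defined above, the fallback path (when tabulate is not importable) pads each column to its widest cell and joins with pipes, producing roughly the following for a one-row table:

```python
headers = ["Lane", "TTFT Median (s)"]
rows = [["isb1_control_vllm_b200", "0.180"]]
print(render_table(headers, rows, tablefmt="github"))
# Fallback output when tabulate is unavailable (roughly):
# | Lane                   | TTFT Median (s) |
# | ---------------------- | --------------- |
# | isb1_control_vllm_b200 | 0.180           |
```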
format_float(row.get("session_throughput_sps"), 2), - format_float(row.get("median_ttft"), 3), - (row.get("context_pressure_signal") or {}).get("status", "-"), - format_bool((row.get("context_pressure_signal") or {}).get("requires_log_review")), - format_bool(row.get("kv_offload_observed")), - format_float(row.get("peak_gpu_cache_usage"), 2), - format_float(row.get("peak_cpu_cache_usage"), 2), - ] - for row in rows - ] - return render_table(headers, table_rows, tablefmt) - - -def build_runtime_override_table(rows: list[dict[str, Any]], tablefmt: str) -> str | None: - """Render the runtime override table when any override is present.""" - override_rows = [] - for row in rows: - runtime_overrides = row.get("runtime_overrides") or {} - if not any(value not in (None, "") for value in runtime_overrides.values()): - continue - override_rows.append( - [ - row.get("result_filename", "-"), - row.get("infmax_model_prefix", "-"), - row.get("hw", "-"), - row.get("framework", "-"), - runtime_overrides.get("vllm_cpu_offload_gb") or "-", - runtime_overrides.get("vllm_swap_space_gb") or "-", - runtime_overrides.get("sglang_mem_fraction_override") or "-", - runtime_overrides.get("sglang_chunked_prefill_override") or "-", - row.get("dispatch_ref") or "-", - ] - ) - - if not override_rows: - return None - - headers = [ - "Lane", - "Model", - "HW", - "Framework", - "VLLM CPU Offload GB", - "VLLM Swap GB", - "SGLang Mem Fraction", - "SGLang Chunked Prefill", - "Dispatch Ref", - ] - return render_table(headers, override_rows, tablefmt) - - -def build_action_items(rows: list[dict[str, Any]]) -> list[str]: - """Build operator action items for suspicious or manual-review rows.""" - items: list[str] = [] - for row in rows: - signal = row.get("context_pressure_signal") or {} - if not row.get("context_pressure_suspicious") and not signal.get("requires_log_review"): - continue - - artifact_stems = row.get("artifact_stems") or {} - items.append( - "- " - f"`{row.get('result_filename', 'unknown')}` ({row.get('infmax_model_prefix', '-')}/" - f"{row.get('hw', '-')}/{row.get('framework', '-')}) " - f"requires follow-up: context pressure `{signal.get('status', 'unknown')}`; " - f"review replay `{artifact_stems.get('raw_replay', '-')}`, " - f"logs `{artifact_stems.get('server_logs', '-')}`, " - f"GPU metrics `{artifact_stems.get('gpu_metrics', '-')}`" - + ( - f", dispatch `{row.get('dispatch_ref')}`" - if row.get("dispatch_ref") - else "" - ) - + "." - ) - return items - - -def generate_summary(results_dir: Path, tablefmt: str = "github") -> str: - """Generate an ISB1-specific operator summary in markdown/plain text.""" - rows = sort_rows(load_isb1_rows(results_dir)) - sections = ["## ISB1 Operator Summary", ""] - - if not rows: - sections.append("No ISB1 replay rows found.") - return "\n".join(sections).rstrip() + "\n" - - sections.extend(["### Lane Summary", "", build_lane_summary_table(rows, tablefmt), ""]) - - runtime_override_table = build_runtime_override_table(rows, tablefmt) - if runtime_override_table: - sections.extend(["### Runtime Overrides", "", runtime_override_table, ""]) - - action_items = build_action_items(rows) - sections.append("### Action Items") - sections.append("") - if action_items: - sections.extend(action_items) - else: - sections.append("- None. 
No suspicious or manual-log-review rows were detected.") - - return "\n".join(sections).rstrip() + "\n" - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Generate an ISB1-specific operator summary.") - parser.add_argument("results_dir", type=Path) - parser.add_argument("--format", choices=["github", "plain"], default="github") - return parser.parse_args() - - -def main() -> int: - args = parse_args() - print(generate_summary(args.results_dir, tablefmt=args.format)) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/utils/test_gate_isb1.py b/utils/test_gate_isb1.py deleted file mode 100644 index 3a9e590e0..000000000 --- a/utils/test_gate_isb1.py +++ /dev/null @@ -1,218 +0,0 @@ -import json -from pathlib import Path - -from gate_isb1 import build_gate_report, load_rows, main - - -def make_row( - *, - result_filename: str, - model: str, - hw: str, - framework: str, - support_status: str, - effective_max_context_depth: int, - context_pressure_class: str, - context_status: str, - requires_log_review: bool = False, - context_pressure_suspicious: bool = False, - completed_sessions: int = 2, - total_sessions: int = 2, - session_throughput_sps: float = 1.0, - benchmark_certification_status: str = "dataset_replay_verified", -): - return { - "benchmark_type": "isb1_replay", - "result_filename": result_filename, - "artifact_stems": { - "processed": f"isb1_{result_filename}", - "raw_replay": f"replay_{result_filename}", - "server_logs": f"server_logs_{result_filename}", - "gpu_metrics": f"gpu_metrics_{result_filename}", - }, - "infmax_model_prefix": model, - "hw": hw, - "framework": framework, - "support_status": support_status, - "effective_max_context_depth": effective_max_context_depth, - "context_pressure_class": context_pressure_class, - "context_pressure_signal": { - "status": context_status, - "requires_log_review": requires_log_review, - }, - "context_pressure_suspicious": context_pressure_suspicious, - "completed_sessions": completed_sessions, - "total_sessions": total_sessions, - "session_throughput_sps": session_throughput_sps, - "benchmark_certification_status": benchmark_certification_status, - } - - -def test_build_gate_report_passes_with_sglang_observability_gap(): - rows = [ - make_row( - result_filename="dsr1_control_b200_vllm", - model="dsr1", - hw="b200-cw-1", - framework="vllm", - support_status="supported", - effective_max_context_depth=9416, - context_pressure_class="standard", - context_status="not_applicable", - ), - make_row( - result_filename="gptoss_control_h100_vllm", - model="gptoss", - hw="h100-cw-1", - framework="vllm", - support_status="supported", - effective_max_context_depth=9416, - context_pressure_class="standard", - context_status="not_applicable", - ), - ] - - for hw in ("b200-cw-1", "h100-cw-1", "h200-cw-1"): - for framework in ("vllm", "sglang"): - rows.append( - make_row( - result_filename=f"qwen_131k_{hw}_{framework}", - model="qwen3.5", - hw=hw, - framework=framework, - support_status="reviewed_preview", - effective_max_context_depth=131272, - context_pressure_class="standard", - context_status="not_applicable", - ) - ) - rows.append( - make_row( - result_filename=f"qwen_500k_{hw}_{framework}", - model="qwen3.5", - hw=hw, - framework=framework, - support_status="reviewed_preview", - effective_max_context_depth=524288, - context_pressure_class="extended_500k", - context_status="ok" if framework == "vllm" else "observability_gap", - requires_log_review=framework == "sglang", - ) - ) - - 
rows.extend( - [ - make_row( - result_filename="qwen_1m_b200_vllm", - model="qwen3.5", - hw="b200-cw-1", - framework="vllm", - support_status="reviewed_preview", - effective_max_context_depth=1048576, - context_pressure_class="extended_1m", - context_status="ok", - ), - make_row( - result_filename="qwen_1m_b200_sglang", - model="qwen3.5", - hw="b200-cw-1", - framework="sglang", - support_status="reviewed_preview", - effective_max_context_depth=1048576, - context_pressure_class="extended_1m", - context_status="observability_gap", - requires_log_review=True, - ), - ] - ) - - report = build_gate_report(rows) - - assert report["overall"] == "pass" - assert all(gate["status"] == "pass" for gate in report["gates"]) - qwen_500k_gate = next(gate for gate in report["gates"] if gate["id"] == "qwen_500k") - assert qwen_500k_gate["review_required_rows"] - assert any( - row["result_filename"] == "qwen_500k_b200-cw-1_sglang" - for row in qwen_500k_gate["review_required_rows"] - ) - - -def test_build_gate_report_fails_control_lane_and_preserves_artifact_refs(): - rows = [ - make_row( - result_filename="dsr1_control_b200_vllm", - model="dsr1", - hw="b200-cw-1", - framework="vllm", - support_status="supported", - effective_max_context_depth=9416, - context_pressure_class="standard", - context_status="not_applicable", - completed_sessions=1, - total_sessions=2, - session_throughput_sps=0.0, - ) - ] - - report = build_gate_report(rows) - - assert report["overall"] == "fail" - control_gate = next(gate for gate in report["gates"] if gate["id"] == "control_lanes") - assert control_gate["status"] == "fail" - assert control_gate["failing_rows"][0]["result_filename"] == "dsr1_control_b200_vllm" - assert control_gate["failing_rows"][0]["artifact_stems"]["server_logs"] == "server_logs_dsr1_control_b200_vllm" - assert "completed_sessions == total_sessions" in control_gate["failing_rows"][0]["failed_criteria"] - assert "session_throughput_sps > 0" in control_gate["failing_rows"][0]["failed_criteria"] - - -def test_build_gate_report_fails_when_qwen_131k_coverage_is_missing(): - rows = [ - make_row( - result_filename="qwen_131k_b200_vllm", - model="qwen3.5", - hw="b200-cw-1", - framework="vllm", - support_status="reviewed_preview", - effective_max_context_depth=131272, - context_pressure_class="standard", - context_status="not_applicable", - ) - ] - - report = build_gate_report(rows) - - assert report["overall"] == "fail" - qwen_131k_gate = next(gate for gate in report["gates"] if gate["id"] == "qwen_131k") - assert qwen_131k_gate["status"] == "fail" - assert ["b200", "sglang"] in qwen_131k_gate["missing_coverage"] - assert ["h200", "vllm"] in qwen_131k_gate["missing_coverage"] - - -def test_build_gate_report_handles_no_rows(): - report = build_gate_report([]) - - assert report["overall"] == "partial" - assert all(gate["status"] == "no_rows" for gate in report["gates"]) - - -def test_gate_main_strict_returns_nonzero_on_failure(tmp_path): - payload = [ - make_row( - result_filename="dsr1_control_b200_vllm", - model="dsr1", - hw="b200-cw-1", - framework="vllm", - support_status="supported", - effective_max_context_depth=9416, - context_pressure_class="standard", - context_status="not_applicable", - completed_sessions=1, - total_sessions=2, - ) - ] - report_path = tmp_path / "agg_isb1.json" - report_path.write_text(json.dumps(payload)) - - assert load_rows(report_path)[0]["result_filename"] == "dsr1_control_b200_vllm" - assert main([str(report_path), "--strict"]) == 1 diff --git a/utils/test_summarize_isb1.py 
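The report these assertions consume has the following shape, inferred from the tests themselves (`build_gate_report` lives in gate_isb1.py, which is not shown in this hunk):

```python
# Illustrative values only; gate ids and row fields are taken from the tests above.
report = {
    "overall": "pass",  # "pass" | "fail" | "partial"
    "gates": [
        {"id": "control_lanes", "status": "pass", "failing_rows": []},
        {"id": "qwen_131k", "status": "fail",
         "missing_coverage": [["b200", "sglang"], ["h200", "vllm"]]},
        {"id": "qwen_500k", "status": "pass",
         "review_required_rows": [
             {"result_filename": "qwen_500k_b200-cw-1_sglang"}]},
    ],
}
```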
b/utils/test_summarize_isb1.py deleted file mode 100644 index 3f4320594..000000000 --- a/utils/test_summarize_isb1.py +++ /dev/null @@ -1,105 +0,0 @@ -import json -from pathlib import Path - -from summarize_isb1 import generate_summary - - -def write_result(path: Path, payload: dict) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(payload)) - - -def make_row(**overrides): - row = { - "benchmark_type": "isb1_replay", - "result_filename": "isb1_control_vllm_b200", - "artifact_stems": { - "processed": "isb1_isb1_control_vllm_b200", - "raw_replay": "replay_isb1_control_vllm_b200", - "server_logs": "server_logs_isb1_control_vllm_b200", - "gpu_metrics": "gpu_metrics_isb1_control_vllm_b200", - }, - "dispatch_ref": "refs/heads/test-summary", - "infmax_model_prefix": "dsr1", - "hw": "b200-cw-1", - "framework": "vllm", - "support_status": "supported", - "benchmark_certification_status": "dataset_replay_verified", - "effective_max_context_depth": 9416, - "context_pressure_class": "standard", - "context_pressure_signal": { - "status": "not_applicable", - "requires_log_review": False, - }, - "context_pressure_suspicious": False, - "completed_sessions": 2, - "total_sessions": 2, - "session_throughput_sps": 1.25, - "median_ttft": 0.18, - "kv_offload_observed": True, - "peak_gpu_cache_usage": 0.78, - "peak_cpu_cache_usage": 0.31, - "runtime_overrides": { - "vllm_cpu_offload_gb": None, - "vllm_swap_space_gb": None, - "sglang_mem_fraction_override": None, - "sglang_chunked_prefill_override": None, - }, - } - row.update(overrides) - return row - - -def test_generate_summary_surfaces_lane_override_and_action_sections(tmp_path): - control_row = make_row() - review_row = make_row( - result_filename="isb1_qwen_500k_sglang", - artifact_stems={ - "processed": "isb1_isb1_qwen_500k_sglang", - "raw_replay": "replay_isb1_qwen_500k_sglang", - "server_logs": "server_logs_isb1_qwen_500k_sglang", - "gpu_metrics": "gpu_metrics_isb1_qwen_500k_sglang", - }, - infmax_model_prefix="qwen3.5", - hw="h200-cw-1", - framework="sglang", - support_status="reviewed_preview", - effective_max_context_depth=524288, - context_pressure_class="extended_500k", - context_pressure_signal={ - "status": "observability_gap", - "requires_log_review": True, - }, - runtime_overrides={ - "vllm_cpu_offload_gb": None, - "vllm_swap_space_gb": None, - "sglang_mem_fraction_override": "0.77", - "sglang_chunked_prefill_override": "65536", - }, - kv_offload_observed=False, - peak_gpu_cache_usage=0.88, - peak_cpu_cache_usage=0.0, - ) - non_isb1_row = {"benchmark_type": "throughput", "ignored": True} - - write_result(tmp_path / "results" / "control.json", control_row) - write_result(tmp_path / "results" / "review.json", review_row) - write_result(tmp_path / "results" / "non_isb1.json", non_isb1_row) - - summary = generate_summary(tmp_path / "results") - - assert "## ISB1 Operator Summary" in summary - assert "### Lane Summary" in summary - assert "### Runtime Overrides" in summary - assert "### Action Items" in summary - assert "isb1_qwen_500k_sglang" in summary - assert "observability_gap" in summary - assert "65536" in summary - assert "server_logs_isb1_qwen_500k_sglang" in summary - assert "non_isb1" not in summary - - -def test_generate_summary_handles_empty_results(tmp_path): - summary = generate_summary(tmp_path / "results") - assert "No ISB1 replay rows found." 
in summary - assert "Lane Summary" not in summary diff --git a/utils/test_verify_producer_sync.py b/utils/test_verify_producer_sync.py deleted file mode 100644 index 071d42ba8..000000000 --- a/utils/test_verify_producer_sync.py +++ /dev/null @@ -1,102 +0,0 @@ -import json -import subprocess -import sys -from pathlib import Path - -SCRIPT_PATH = Path(__file__).parent / "verify_producer_sync.py" - - -RELEVANT_FILES = { - "core/sglang/chat_8k1k_qwen3.5.json": {"name": "core"}, - "extension_32k/sglang/chat_32k1k.json": {"name": "e32k"}, - "extension_64k/sglang/chat_64k1k.json": {"name": "e64k"}, - "extension_131k/sglang/code_131k1k_qwen3.5.json": {"name": "e131k"}, - "preview/long_context_500k/manifest_qwen3.5.json": {"name": "500k"}, - "preview/long_context_1m/manifest.json": {"name": "1m"}, -} - - -def _write_tree(root: Path, files: dict[str, dict]) -> None: - for relative_path, payload in files.items(): - file_path = root / relative_path - file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text(json.dumps(payload, sort_keys=True)) - - -def _run_verify(producer_root: Path, consumer_root: Path) -> subprocess.CompletedProcess[str]: - return subprocess.run( - [ - sys.executable, - str(SCRIPT_PATH), - "--producer-root", - str(producer_root), - "--consumer-root", - str(consumer_root), - ], - capture_output=True, - text=True, - check=False, - ) - - -def test_verify_producer_sync_passes_for_identical_trees(tmp_path: Path) -> None: - producer_root = tmp_path / "producer" - consumer_root = tmp_path / "consumer" - _write_tree(producer_root, RELEVANT_FILES) - _write_tree(consumer_root, RELEVANT_FILES) - - result = _run_verify(producer_root, consumer_root) - - assert result.returncode == 0 - assert "sync check passed" in result.stdout - - -def test_verify_producer_sync_fails_on_content_mismatch(tmp_path: Path) -> None: - producer_root = tmp_path / "producer" - consumer_root = tmp_path / "consumer" - _write_tree(producer_root, RELEVANT_FILES) - _write_tree(consumer_root, RELEVANT_FILES) - - mismatched_path = consumer_root / "preview/long_context_500k/manifest_qwen3.5.json" - mismatched_path.write_text(json.dumps({"name": "changed"}, sort_keys=True)) - - result = _run_verify(producer_root, consumer_root) - - assert result.returncode == 1 - assert "content_mismatch" in result.stderr - assert "preview/long_context_500k/manifest_qwen3.5.json" in result.stderr - - -def test_verify_producer_sync_skips_subtrees_missing_on_both_sides(tmp_path: Path) -> None: - # Only one subtree is populated — others are legitimately empty on both - # sides (e.g. a producer that has not materialized 1M previews yet, run - # against a consumer that has not committed them). This must pass. - producer_root = tmp_path / "producer" - consumer_root = tmp_path / "consumer" - partial = { - "extension_131k/sglang/code_131k1k_qwen3.5.json": {"name": "only"}, - } - _write_tree(producer_root, partial) - _write_tree(consumer_root, partial) - - result = _run_verify(producer_root, consumer_root) - - assert result.returncode == 0 - assert "sync check passed" in result.stdout - - -def test_verify_producer_sync_reports_one_sided_subtree(tmp_path: Path) -> None: - # Producer has a subtree but consumer is missing it — must fail. 
- producer_root = tmp_path / "producer" - consumer_root = tmp_path / "consumer" - _write_tree( - producer_root, - {"extension_131k/sglang/code_131k1k_qwen3.5.json": {"name": "p"}}, - ) - consumer_root.mkdir(parents=True, exist_ok=True) - - result = _run_verify(producer_root, consumer_root) - - assert result.returncode == 1 - assert "missing_consumer_subtree" in result.stderr - assert "extension_131k" in result.stderr diff --git a/utils/verify_producer_sync.py b/utils/verify_producer_sync.py deleted file mode 100644 index 0b60957e0..000000000 --- a/utils/verify_producer_sync.py +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env python3 -"""Verify producer/consumer sync for ISB1 committed export subtrees. - -Covers every export root that is intentionally mirrored from the Inferscope -producer into the InferenceX consumer tree: the 8k core bundle, the 32k / 64k / -131k extension bundles, and the gated 500k / 1M preview bundles. - -Subtrees that exist on neither side are silently skipped (there is nothing to -sync). Subtrees that exist on only one side are reported as sync issues. -""" - -from __future__ import annotations - -import argparse -import sys -from dataclasses import dataclass -from pathlib import Path - - -RELEVANT_SUBTREES = ( - "core", - "extension_32k", - "extension_64k", - "extension_131k", - "preview/long_context_500k", - "preview/long_context_1m", -) - - -@dataclass -class SyncIssue: - kind: str - path: str - - -def _json_files(root: Path) -> set[str]: - if not root.exists(): - return set() - return { - str(path.relative_to(root)) - for path in root.rglob("*.json") - if path.is_file() - } - - -def _compare_subtree(producer_root: Path, consumer_root: Path, subtree: str) -> list[SyncIssue]: - issues: list[SyncIssue] = [] - - producer_subtree = producer_root / subtree - consumer_subtree = consumer_root / subtree - - producer_exists = producer_subtree.exists() - consumer_exists = consumer_subtree.exists() - - # Nothing on either side: nothing to sync, skip silently. 
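As the rest of `_compare_subtree` below spells out, identical trees yield no issues, a subtree present on only one side yields a single `missing_*_subtree` issue, and byte-level differences yield one `content_mismatch` per file. A small runnable illustration of those outcomes under the same layout the tests use:

```python
import json
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    producer = Path(tmp, "producer")
    consumer = Path(tmp, "consumer")
    rel = "extension_131k/sglang/code_131k1k_qwen3.5.json"
    for root in (producer, consumer):
        target = root / rel
        target.parent.mkdir(parents=True)
        target.write_text(json.dumps({"name": "e131k"}))
    # Identical bytes on both sides -> verify_sync(producer, consumer) == []
    (consumer / rel).write_text(json.dumps({"name": "changed"}))
    # Now exactly one issue is expected:
    # ("content_mismatch", "extension_131k/sglang/code_131k1k_qwen3.5.json")
```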
- if not producer_exists and not consumer_exists: - return issues - - producer_files = _json_files(producer_subtree) - consumer_files = _json_files(consumer_subtree) - - if not producer_exists: - issues.append(SyncIssue("missing_producer_subtree", subtree)) - return issues - if not consumer_exists: - issues.append(SyncIssue("missing_consumer_subtree", subtree)) - return issues - - for relative_path in sorted(producer_files - consumer_files): - issues.append(SyncIssue("missing_in_consumer", f"{subtree}/{relative_path}")) - - for relative_path in sorted(consumer_files - producer_files): - issues.append(SyncIssue("extra_in_consumer", f"{subtree}/{relative_path}")) - - for relative_path in sorted(producer_files & consumer_files): - producer_file = producer_subtree / relative_path - consumer_file = consumer_subtree / relative_path - if producer_file.read_bytes() != consumer_file.read_bytes(): - issues.append(SyncIssue("content_mismatch", f"{subtree}/{relative_path}")) - - return issues - - -def verify_sync(producer_root: Path, consumer_root: Path) -> list[SyncIssue]: - issues: list[SyncIssue] = [] - for subtree in RELEVANT_SUBTREES: - issues.extend(_compare_subtree(producer_root, consumer_root, subtree)) - return issues - - -def _default_consumer_root() -> Path: - return Path(__file__).resolve().parents[1] / "datasets" / "isb1" / "exports" - - -def parse_args(argv: list[str] | None = None) -> argparse.Namespace: - parser = argparse.ArgumentParser( - description=( - "Verify that committed ISB1 consumer preview/extension exports are " - "synced with producer exports." - ) - ) - parser.add_argument( - "--producer-root", - required=True, - type=Path, - help="Path to ISB1 producer exports root (…/upstream/inferencex/exports)", - ) - parser.add_argument( - "--consumer-root", - default=_default_consumer_root(), - type=Path, - help="Path to InferenceX consumer exports root (default: datasets/isb1/exports)", - ) - return parser.parse_args(argv) - - -def main(argv: list[str] | None = None) -> int: - args = parse_args(argv) - issues = verify_sync(args.producer_root.resolve(), args.consumer_root.resolve()) - - if not issues: - print( - "Producer/consumer export sync check passed for: " - + ", ".join(RELEVANT_SUBTREES) - ) - return 0 - - print("Producer/consumer export sync check failed:", file=sys.stderr) - for issue in issues: - print(f"- {issue.kind}: {issue.path}", file=sys.stderr) - return 1 - - -if __name__ == "__main__": - raise SystemExit(main()) From 3c10c055df63d9ccc63d0eac53c34a4114d008a7 Mon Sep 17 00:00:00 2001 From: William Chen <57119977+OCWC22@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:37:45 -0700 Subject: [PATCH 07/18] chore(isb1): drop remaining non-scope infra files from PR #1032 --- .github/configs/amd-master.yaml | 65 +- .github/configs/nvidia-master.yaml | 1254 +++++++++++++++++++------- .github/workflows/benchmark-tmpl.yml | 30 +- perf-changelog.yaml | 244 ++++- runners/launch_b300-nv.sh | 113 ++- 5 files changed, 1308 insertions(+), 398 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a2c424c91..f1181b941 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -131,6 +131,24 @@ qwen3.5-bf16-mi355x-sglang: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } +qwen3.5-bf16-mi355x-sglang-mtp: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + runner: mi355x + precision: bf16 + framework: sglang + multinode: false + 
seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + qwen3.5-bf16-mi300x-sglang: image: lmsysorg/sglang:v0.5.10-rocm720-mi30x model: Qwen/Qwen3.5-397B-A17B @@ -206,6 +224,27 @@ qwen3.5-fp8-mi355x-sglang: - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 } - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } +qwen3.5-fp8-mi355x-sglang-mtp: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 64, conc-end: 256, spec-decoding: mtp } + - { tp: 2, ep: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp } + qwen3.5-fp4-mi355x-sglang: image: rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413 model: amd/Qwen3.5-397B-A17B-MXFP4 @@ -245,7 +284,7 @@ qwen3.5-fp8-mi300x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } glm5-fp8-mi355x-sglang: - image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219 + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi355x @@ -262,6 +301,24 @@ glm5-fp8-mi355x-sglang: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } +glm5-fp8-mi355x-sglang-mtp: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413 + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + glm5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post model: zai-org/GLM-5-FP8 @@ -470,7 +527,7 @@ gptoss-fp4-mi300x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 1, conc-start: 64, conc-end: 256 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } @@ -896,7 +953,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg @@ -1104,7 +1161,7 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 5b550879c..90a430b9d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -19,7 +19,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml" decode: num-worker: 2 @@ -34,7 +34,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -49,7 +49,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -64,7 +64,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -79,7 +79,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml" decode: num-worker: 4 @@ -94,7 +94,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml" decode: num-worker: 5 @@ -110,7 +110,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -124,7 +124,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml" decode: num-worker: 2 @@ -138,7 +138,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -152,7 +152,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -166,7 +166,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -180,7 +180,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml" decode: num-worker: 6 @@ -199,7 +199,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -214,7 +214,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml" decode: num-worker: 3 @@ -229,7 +229,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml - 
"CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -244,7 +244,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -259,7 +259,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -274,7 +274,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml" decode: num-worker: 1 @@ -289,7 +289,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml" decode: num-worker: 2 @@ -305,7 +305,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -319,7 +319,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -333,7 +333,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -347,7 +347,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: 
- # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -361,7 +361,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -375,7 +375,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" decode: num-worker: 2 @@ -405,7 +405,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml" decode: num-worker: 8 @@ -420,7 +420,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml" decode: num-worker: 8 @@ -435,7 +435,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml" decode: num-worker: 8 @@ -450,7 +450,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml" decode: num-worker: 8 @@ -466,7 +466,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml" decode: num-worker: 7 @@ -481,7 +481,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml" decode: num-worker: 4 @@ -496,7 +496,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml" decode: num-worker: 3 @@ -511,7 +511,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml" decode: num-worker: 2 @@ -527,7 +527,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml" decode: num-worker: 3 @@ -541,7 +541,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml" decode: num-worker: 3 @@ -555,7 +555,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml" decode: num-worker: 3 @@ -570,7 +570,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml" decode: num-worker: 5 @@ -584,7 +584,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml" decode: num-worker: 1 @@ -598,7 +598,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml" decode: num-worker: 5 @@ -618,7 +618,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml" decode: num-worker: 6 @@ -633,7 +633,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml" decode: num-worker: 2 @@ -648,7 +648,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml" decode: num-worker: 6 @@ -663,7 +663,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml" decode: num-worker: 4 @@ -679,7 +679,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml" decode: num-worker: 3 @@ -694,7 +694,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml" decode: num-worker: 1 @@ -709,7 +709,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml" decode: num-worker: 1 @@ -725,7 +725,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml" decode: num-worker: 1 @@ -739,7 +739,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml" decode: num-worker: 4 @@ -753,7 +753,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml" decode: num-worker: 4 @@ -767,7 +767,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml" decode: num-worker: 6 @@ -782,7 +782,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml" decode: num-worker: 1 @@ -796,7 +796,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml" decode: num-worker: 2 @@ -810,7 +810,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml" decode: num-worker: 1 @@ -824,7 +824,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml" decode: num-worker: 1 @@ -854,7 +854,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -869,7 +869,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml" decode: num-worker: 2 @@ -884,7 +884,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -899,7 +899,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml - 
"CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -914,7 +914,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml" decode: num-worker: 1 @@ -929,7 +929,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml" decode: num-worker: 2 @@ -944,7 +944,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml" decode: num-worker: 2 @@ -960,7 +960,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml" decode: num-worker: 2 @@ -974,7 +974,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 @@ -988,7 +988,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -1002,7 +1002,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -1016,7 +1016,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true 
additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -1030,7 +1030,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -1044,7 +1044,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml" decode: num-worker: 2 @@ -1063,7 +1063,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" decode: num-worker: 1 @@ -1078,7 +1078,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml" decode: num-worker: 4 @@ -1093,7 +1093,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 4 @@ -1108,7 +1108,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" decode: num-worker: 4 @@ -1123,7 +1123,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -1138,7 +1138,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml" decode: num-worker: 1 @@ -1154,7 +1154,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -1168,7 +1168,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -1182,7 +1182,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -1196,7 +1196,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml" decode: num-worker: 4 @@ -1210,7 +1210,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 2 @@ -1224,7 +1224,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -1238,7 +1238,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -1268,7 +1268,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml" decode: num-worker: 8 @@ -1283,7 +1283,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml" decode: num-worker: 8 @@ -1298,7 +1298,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml" decode: num-worker: 1 @@ -1313,7 +1313,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml" decode: num-worker: 2 @@ -1328,7 +1328,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml" decode: num-worker: 5 @@ -1343,7 +1343,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml" decode: num-worker: 2 @@ -1361,7 +1361,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml" decode: num-worker: 1 @@ -1375,7 +1375,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml" decode: num-worker: 2 @@ -1389,7 +1389,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml" decode: num-worker: 3 @@ -1403,7 +1403,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml" decode: num-worker: 8 @@ -1417,7 +1417,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml" decode: num-worker: 8 @@ -1431,7 +1431,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml" decode: num-worker: 8 @@ -1445,7 +1445,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml" decode: num-worker: 1 @@ -1464,7 +1464,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml" decode: num-worker: 2 @@ -1479,7 +1479,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml" decode: num-worker: 4 @@ -1494,7 +1494,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml" decode: num-worker: 4 @@ -1509,7 +1509,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml" decode: num-worker: 1 @@ -1524,7 +1524,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml" decode: num-worker: 1 @@ -1539,7 +1539,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml" decode: num-worker: 1 @@ -1557,7 +1557,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml"
       decode:
         num-worker: 4
@@ -1571,7 +1571,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml"
       decode:
         num-worker: 8
@@ -1585,7 +1585,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml"
       decode:
         num-worker: 1
@@ -1599,7 +1599,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml"
       decode:
         num-worker: 1
@@ -1613,7 +1613,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml"
       decode:
         num-worker: 5
@@ -1627,7 +1627,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml"
       decode:
         num-worker: 1
@@ -1641,7 +1641,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml"
       decode:
         num-worker: 1
@@ -1669,6 +1669,29 @@ dsr1-fp4-b200-sglang:
         - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
         - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 }
 
+# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
+# does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4
+# B200 SGLang recipe as-is until B300-specific tuning is available.
+dsr1-fp4-b300-sglang:
+  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  model: nvidia/DeepSeek-R1-0528-FP4-V2
+  model-prefix: dsr1
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
+        - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
+        - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 }
+
 dsr1-fp4-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2
   model: nvidia/DeepSeek-R1-0528-FP4-V2
@@ -1751,6 +1774,28 @@ dsr1-fp8-b200-sglang:
         - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
         - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
 
+# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
+# does not have a B300-specific recipe, so this config reuses the existing DSR1 FP8
+# B200 SGLang recipe as-is until B300-specific tuning is available.
+dsr1-fp8-b300-sglang:
+  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: b300
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
+
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
   model: Qwen/Qwen3.5-397B-A17B
@@ -1769,6 +1814,24 @@ qwen3.5-bf16-b200-sglang:
       search-space:
         - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
 
+qwen3.5-bf16-b200-sglang-mtp:
+  image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  runner: b200
+  precision: bf16
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+
 qwen3.5-fp8-b200-sglang:
   image: lmsysorg/sglang:v0.5.9-cu130-amd64
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -1807,6 +1870,24 @@ qwen3.5-fp4-b200-sglang:
       search-space:
         - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 }
 
+qwen3.5-fp4-b200-sglang-mtp:
+  image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6
+  model: nvidia/Qwen3.5-397B-A17B-NVFP4
+  model-prefix: qwen3.5
+  runner: b200
+  precision: fp4
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp }
+
 glm5-fp8-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448
   model: zai-org/GLM-5-FP8
@@ -1825,6 +1906,63 @@ glm5-fp8-b200-sglang:
       search-space:
         - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
 
+glm5-fp8-b200-sglang-mtp:
+  image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: b200
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+
+# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1
+# does not have a B300-specific recipe, so this config reuses the existing GLM5 FP8
+# B200 SGLang recipe as-is until B300-specific tuning is available.
+glm5-fp8-b300-sglang:
+  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: b300
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
+
+glm5-fp8-b300-sglang-mtp:
+  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: b300
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+
 glm5-fp4-b200-sglang:
   image: lmsysorg/sglang:v0.5.10.post1-cu130
   model: nvidia/GLM-5-NVFP4
@@ -1845,6 +1983,69 @@ glm5-fp4-b200-sglang:
         - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
         - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
 
+glm5-fp4-b200-sglang-mtp:
+  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  model: nvidia/GLM-5-NVFP4
+  model-prefix: glm5
+  runner: b200
+  precision: fp4
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+
+# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5
+# does not have a B300-specific recipe, so this config reuses the existing
+# GLM-5 FP4 B200 SGLang recipe as-is until B300-specific tuning is available.
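+# NOTE (editor clarification, inferred from this diff rather than from the cookbook):
+# the reuse in these B300 configs is field-for-field. Apart from `runner: b300` and,
+# where a NOTE calls it out, the `image:` tag, each B300 block below copies its B200
+# counterpart unchanged, including the tp/ep layout and the conc-start/conc-end
+# concurrency sweep bounds.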
+glm5-fp4-b300-sglang:
+  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  model: nvidia/GLM-5-NVFP4
+  model-prefix: glm5
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
+
+glm5-fp4-b300-sglang-mtp:
+  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  model: nvidia/GLM-5-NVFP4
+  model-prefix: glm5
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+
 qwen3.5-fp8-b200-sglang-mtp:
   image: lmsysorg/sglang:v0.5.9-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -1900,6 +2101,86 @@ qwen3.5-fp8-b300-sglang:
       search-space:
         - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
 
+qwen3.5-fp4-b300-sglang:
+  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  model: nvidia/Qwen3.5-397B-A17B-NVFP4
+  model-prefix: qwen3.5
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 }
+        - { tp: 2, ep: 2, conc-start: 4, conc-end: 128 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 }
+        - { tp: 2, ep: 2, conc-start: 4, conc-end: 128 }
+
+qwen3.5-fp4-b300-sglang-mtp:
+  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  model: nvidia/Qwen3.5-397B-A17B-NVFP4
+  model-prefix: qwen3.5
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp }
+        - { tp: 2, ep: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp }
+        - { tp: 2, ep: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
+
+qwen3.5-bf16-b300-sglang:
+  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  runner: b300
+  precision: bf16
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
+
+qwen3.5-bf16-b300-sglang-mtp:
+  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  runner: b300
+  precision: bf16
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+        - { tp: 4, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+
 kimik2.5-int4-b200-vllm:
   image: vllm/vllm-openai:v0.15.1
   model: moonshotai/Kimi-K2.5
@@ -1974,6 +2255,28 @@ dsr1-fp8-b200-sglang-mtp:
       search-space:
         - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp }
 
+# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
+# does not have a B300-specific recipe, so this config reuses the existing DSR1 FP8
+# B200 SGLang MTP recipe as-is until B300-specific tuning is available. Image bumped
+# to v0.5.10.post1-cu130 to match the standard B300 SGLang image used by other B300 configs.
+dsr1-fp8-b300-sglang-mtp:
+  image: lmsysorg/sglang:v0.5.10.post1-cu130
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: b300
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp }
+
 dsr1-fp8-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2
   model: deepseek-ai/DeepSeek-R1-0528
@@ -2158,7 +2461,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml"
       decode:
         num-worker: 11
@@ -2173,7 +2476,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
      additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
       decode:
         num-worker: 11
@@ -2188,7 +2491,7 @@ dsr1-fp8-h200-dynamo-trt:
      ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
       decode:
         num-worker: 11
@@ -2203,7 +2506,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml"
       decode:
         num-worker: 9
@@ -2218,7 +2521,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
       decode:
         num-worker: 11
@@ -2233,7 +2536,7 @@
dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml" decode: num-worker: 8 @@ -2248,7 +2551,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml" decode: num-worker: 7 @@ -2263,7 +2566,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml" decode: num-worker: 4 @@ -2278,7 +2581,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml" decode: num-worker: 2 @@ -2293,7 +2596,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 9 @@ -2307,7 +2610,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" decode: num-worker: 9 @@ -2321,7 +2624,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" decode: num-worker: 9 @@ -2335,7 +2638,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" decode: num-worker: 9 @@ -2349,7 +2652,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" decode: num-worker: 9 @@ -2363,7 +2666,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" decode: num-worker: 9 @@ -2377,7 +2680,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml" decode: num-worker: 9 @@ -2391,7 +2694,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml" decode: num-worker: 6 @@ -2405,7 +2708,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml" decode: num-worker: 7 @@ -2424,7 +2727,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 7 @@ -2439,7 +2742,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml" decode: num-worker: 7 @@ -2454,7 +2757,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml" decode: num-worker: 6 @@ -2469,7 +2772,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml" decode: num-worker: 3 @@ -2484,7 +2787,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -2499,7 +2802,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml" decode: num-worker: 1 @@ -2514,7 +2817,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml" decode: num-worker: 1 @@ -2529,7 +2832,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml" decode: num-worker: 1 @@ -2544,7 +2847,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml - 
"CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml" decode: num-worker: 1 @@ -2559,7 +2862,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 7 @@ -2573,7 +2876,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 7 @@ -2587,7 +2890,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml" decode: num-worker: 6 @@ -2601,7 +2904,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -2615,7 +2918,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -2629,7 +2932,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -2643,7 +2946,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -2657,7 +2960,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 
dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -2671,7 +2974,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -2701,7 +3004,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" decode: num-worker: 3 @@ -2716,7 +3019,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" decode: num-worker: 3 @@ -2731,7 +3034,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" decode: num-worker: 3 @@ -2746,7 +3049,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml" decode: num-worker: 3 @@ -2761,7 +3064,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml" decode: num-worker: 3 @@ -2776,7 +3079,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml" decode: num-worker: 3 @@ -2791,7 +3094,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml" decode: num-worker: 3 @@ -2806,7 +3109,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml" decode: num-worker: 1 @@ -2821,7 +3124,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml" decode: num-worker: 1 @@ -2836,7 +3139,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -2850,7 +3153,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -2864,7 +3167,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -2878,7 +3181,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -2892,7 +3195,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -2906,7 +3209,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -2920,7 +3223,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -2934,7 +3237,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -2948,7 +3251,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -2967,7 +3270,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" decode: num-worker: 3 @@ -2982,7 +3285,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml - 
"CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" decode: num-worker: 3 @@ -2997,7 +3300,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" decode: num-worker: 3 @@ -3012,7 +3315,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -3029,7 +3332,7 @@ dsr1-fp8-h100-dynamo-trt: # ep: 16 # dp-attn: true # additional-settings: - # # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml + # # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml # - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml" # decode: # num-worker: 2 @@ -3044,7 +3347,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -3059,7 +3362,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -3073,7 +3376,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -3087,7 +3390,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -3101,7 +3404,7 @@ 
dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
      additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml"
     decode:
       num-worker: 2
@@ -3115,7 +3418,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
      additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml"
     decode:
       num-worker: 1
@@ -3197,6 +3500,31 @@ minimaxm2.5-fp8-b200-vllm:
       - { tp: 2, conc-start: 4, conc-end: 512 }
       - { tp: 4, conc-start: 4, conc-end: 512 }
 
+# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
+# does not have a B300-specific recipe, so this config reuses the existing
+# MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available.
+minimaxm2.5-fp8-b300-vllm:
+  image: vllm/vllm-openai:v0.19.0-cu130
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: b300
+  precision: fp8
+  framework: vllm
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 2, conc-start: 4, conc-end: 512 }
+        - { tp: 4, conc-start: 4, conc-end: 512 }
+        - { tp: 2, ep: 2, conc-start: 512, conc-end: 512 }
+        - { tp: 4, ep: 4, conc-start: 256, conc-end: 512 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 2, conc-start: 4, conc-end: 512 }
+        - { tp: 4, conc-start: 4, conc-end: 512 }
+
 minimaxm2.5-fp4-b200-vllm:
   image: vllm/vllm-openai:v0.19.0-cu130
   model: nvidia/MiniMax-M2.5-NVFP4
@@ -3226,6 +3554,38 @@ minimaxm2.5-fp4-b200-vllm:
       - { tp: 4, conc-start: 4, conc-end: 512 }
       - { tp: 8, conc-start: 4, conc-end: 4 }
 
+# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
+# does not have a B300-specific recipe, so this config reuses the existing
+# MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
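+# A worked reading of one entry in the sweep below (assumed harness semantics,
+# mirroring the other configs in this file):
+#   { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512 }
+# pins a single concurrency point (512) with tensor-parallel 2, expert-parallel 2,
+# and data-parallel attention enabled, while entries with conc-start < conc-end
+# sweep the whole concurrency range.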
+minimaxm2.5-fp4-b300-vllm:
+  image: vllm/vllm-openai:v0.19.0-cu130
+  model: nvidia/MiniMax-M2.5-NVFP4
+  model-prefix: minimaxm2.5
+  runner: b300
+  precision: fp4
+  framework: vllm
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 1, conc-start: 4, conc-end: 4 }
+        - { tp: 2, conc-start: 4, conc-end: 512 }
+        - { tp: 2, ep: 2, conc-start: 128, conc-end: 256 }
+        - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512 }
+        - { tp: 4, conc-start: 4, conc-end: 512 }
+        - { tp: 4, ep: 4, conc-start: 32, conc-end: 128 }
+        - { tp: 8, conc-start: 4, conc-end: 4 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 1, conc-start: 4, conc-end: 32 }
+        - { tp: 1, conc-start: 256, conc-end: 512 }
+        - { tp: 2, conc-start: 4, conc-end: 512 }
+        - { tp: 2, ep: 2, conc-start: 128, conc-end: 512 }
+        - { tp: 4, conc-start: 4, conc-end: 512 }
+        - { tp: 8, conc-start: 4, conc-end: 4 }
+
 gptoss-fp4-h100-vllm:
   image: vllm/vllm-openai:v0.18.0
   model: openai/gpt-oss-120b
@@ -3490,7 +3850,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
      additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml"
     decode:
       num-worker: 1
@@ -3505,7 +3865,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
      additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml"
     decode:
       num-worker: 4
@@ -3520,7 +3880,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
      additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml"
     decode:
       num-worker: 1
@@ -3535,7 +3895,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
      additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml"
     decode:
       num-worker: 1
@@ -3550,7 +3910,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
      additional-settings:
-        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml"
     decode:
       num-worker: 5
@@ -3567,7 +3927,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
      additional-settings:
-        #
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -3581,7 +3941,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -3595,7 +3955,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml" decode: num-worker: 2 @@ -3609,7 +3969,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 4 @@ -3623,7 +3983,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 @@ -3637,7 +3997,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml" decode: num-worker: 1 @@ -3651,7 +4011,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -3671,7 +4031,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 4 @@ -3686,7 +4046,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -3701,7 +4061,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" decode: num-worker: 1 @@ -3716,7 +4076,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -3731,7 +4091,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" decode: num-worker: 1 @@ -3747,7 +4107,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" decode: num-worker: 4 @@ -3761,7 +4121,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 @@ -3775,7 +4135,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -3789,7 +4149,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -3803,7 +4163,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -3817,7 +4177,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml" decode: num-worker: 1 @@ -3848,7 +4208,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml" decode: num-worker: 1 @@ -3863,7 +4223,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml" decode: num-worker: 1 @@ -3878,7 +4238,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" decode: num-worker: 1 @@ -3893,7 +4253,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml" decode: num-worker: 1 @@ -3908,7 +4268,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml" decode: num-worker: 3 @@ -3923,7 +4283,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml" decode: num-worker: 3 @@ -3938,7 +4298,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml" decode: num-worker: 3 @@ -3953,7 +4313,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml" decode: num-worker: 1 @@ -3967,7 +4327,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml" decode: num-worker: 1 @@ -3981,7 +4341,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml" decode: num-worker: 1 @@ -3995,7 +4355,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml" decode: num-worker: 1 @@ -4009,7 +4369,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml" decode: num-worker: 1 @@ -4023,7 +4383,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml" decode: num-worker: 3 @@ -4037,7 +4397,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml" decode: num-worker: 3 @@ -4056,7 +4416,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml" decode: num-worker: 1 @@ -4071,7 +4431,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml" decode: num-worker: 1 @@ -4086,7 +4446,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml" decode: num-worker: 1 @@ -4101,7 +4461,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml" decode: num-worker: 1 @@ -4116,7 +4476,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml" decode: num-worker: 1 @@ -4131,7 +4491,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml" decode: num-worker: 3 @@ -4146,7 +4506,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml" decode: num-worker: 3 @@ -4161,7 +4521,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" decode: num-worker: 1 @@ -4175,7 +4535,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" decode: num-worker: 1 @@ -4189,7 +4549,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml" decode: num-worker: 1 @@ -4203,7 +4563,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" decode: num-worker: 1 @@ -4217,7 +4577,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" decode: num-worker: 3 @@ -4231,7 +4591,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" decode: num-worker: 3 @@ -4245,7 +4605,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" decode: num-worker: 3 @@ -4275,7 +4635,7 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 1 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/low-latency.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/low-latency.yaml - "CONFIG_FILE=recipes/gb200-fp8/1k1k/low-latency.yaml" decode: num-worker: 1 @@ -4291,7 +4651,7 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/mid-curve.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/mid-curve.yaml - "CONFIG_FILE=recipes/gb200-fp8/1k1k/mid-curve.yaml" decode: num-worker: 1 @@ -4307,7 +4667,7 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/max-tpt.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/max-tpt.yaml - "CONFIG_FILE=recipes/gb200-fp8/1k1k/max-tpt.yaml" decode: num-worker: 1 @@ -4323,7 +4683,7 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/ultra-tpt.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/ultra-tpt.yaml - "CONFIG_FILE=recipes/gb200-fp8/1k1k/ultra-tpt.yaml" decode: num-worker: 1 @@ -4342,7 +4702,7 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 1 dp-attn: false additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/low-latency.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/low-latency.yaml - "CONFIG_FILE=recipes/gb200-fp8/8k1k/low-latency.yaml" decode: num-worker: 1 @@ -4358,7 +4718,7 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/mid-curve.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/mid-curve.yaml - "CONFIG_FILE=recipes/gb200-fp8/8k1k/mid-curve.yaml" decode: num-worker: 1 @@ -4374,7 +4734,7 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/max_tpt.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/max_tpt.yaml - "CONFIG_FILE=recipes/gb200-fp8/8k1k/max_tpt.yaml" decode: num-worker: 1 @@ -4403,7 +4763,7 @@ dsr1-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/low-latency.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/low-latency.yaml - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/low-latency.yaml" decode: num-worker: 4 @@ -4419,7 +4779,7 @@ dsr1-fp8-gb300-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/mid.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/mid.yaml - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/mid.yaml" decode: num-worker: 1 @@ -4435,7 +4795,7 @@ dsr1-fp8-gb300-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/max.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/max.yaml - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/max.yaml" decode: num-worker: 1 @@ -4454,7 +4814,7 @@ dsr1-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/low-latency.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/low-latency.yaml - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/low-latency.yaml" decode: num-worker: 1 @@ -4470,7 +4830,7 @@ dsr1-fp8-gb300-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/mid.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/mid.yaml - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/mid.yaml" decode: num-worker: 1 @@ -4486,7 +4846,7 @@ dsr1-fp8-gb300-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/max.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/max.yaml - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/max.yaml" decode: num-worker: 1 @@ -4630,7 +4990,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" decode: num-worker: 1 @@ -4645,7 +5005,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" decode: num-worker: 1 @@ -4660,7 +5020,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 4 @@ -4675,7 +5035,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 4 @@ -4690,7 +5050,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" decode: num-worker: 1 @@ -4705,7 +5065,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" decode: num-worker: 1 @@ -4720,7 +5080,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 @@ -4734,7 +5094,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 4 @@ -4748,7 +5108,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -4762,7 +5122,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -4776,7 +5136,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" decode: num-worker: 1 @@ -4790,7 +5150,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -4809,7 +5169,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 3 @@ -4824,7 +5184,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 4 @@ -4839,7 +5199,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" decode: num-worker: 4 @@ -4854,7 +5214,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -4869,7 +5229,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -4884,7 +5244,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" decode: num-worker: 1 @@ -4899,7 +5259,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -4914,7 +5274,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" decode: num-worker: 1 @@ -4929,7 +5289,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -4943,7 +5303,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 @@ -4957,7 +5317,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml" decode: num-worker: 4 @@ -4971,7 +5331,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -4985,7 +5345,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -4999,7 +5359,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -5013,7 +5373,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -5027,7 +5387,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -5171,7 +5531,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" decode: num-worker: 4 @@ -5186,7 +5546,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" decode: num-worker: 4 @@ -5201,7 +5561,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml" decode: num-worker: 1 @@ -5216,7 +5576,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml" decode: num-worker: 1 @@ -5231,7 +5591,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml" decode: num-worker: 1 @@ -5246,7 +5606,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml" decode: num-worker: 1 @@ -5261,7 +5621,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml" decode: num-worker: 2 @@ -5276,7 +5636,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" decode: num-worker: 4 @@ -5290,7 +5650,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" decode: num-worker: 4 @@ -5304,7 +5664,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml" decode: num-worker: 4 @@ -5318,7 +5678,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml" decode: num-worker: 1 @@ -5332,7 +5692,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml" decode: num-worker: 1 @@ -5346,7 +5706,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml" decode: num-worker: 2 @@ -5360,7 +5720,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml" decode: num-worker: 2 @@ -5379,7 +5739,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" decode: num-worker: 4 @@ -5394,7 +5754,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" decode: num-worker: 4 @@ -5409,7 +5769,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml" decode: num-worker: 1 @@ -5424,7 +5784,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml" decode: num-worker: 1 @@ -5439,7 +5799,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" decode: num-worker: 1 @@ -5454,7 +5814,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml" decode: num-worker: 1 @@ -5469,7 +5829,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" decode: num-worker: 4 @@ -5483,7 +5843,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" decode: num-worker: 4 @@ -5497,7 +5857,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" decode: num-worker: 4 @@ -5511,7 +5871,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" decode: num-worker: 1 @@ -5525,7 +5885,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" decode: num-worker: 1 @@ -5539,7 +5899,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" decode: num-worker: 1 @@ -5553,7 +5913,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" decode: num-worker: 1 @@ -6109,7 +6469,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml" + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[0]" decode: num-worker: 5 tp: 8 @@ -6122,7 +6482,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml" + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[1]" decode: num-worker: 6 tp: 8 @@ -6135,7 +6495,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - 
"CONFIG_FILE=recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml" + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[0]" decode: num-worker: 1 tp: 8 @@ -6148,7 +6508,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml" + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[1]" decode: num-worker: 2 tp: 8 @@ -6165,7 +6525,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml" + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[0]" decode: num-worker: 1 tp: 8 @@ -6178,7 +6538,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml" + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[1]" decode: num-worker: 5 tp: 8 @@ -6191,7 +6551,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml" + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[2]" decode: num-worker: 5 tp: 8 @@ -6204,7 +6564,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 1 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml" + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_stp_tp4" decode: num-worker: 1 tp: 8 @@ -6217,7 +6577,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml" + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_stp_maxtpt_7p2d" decode: num-worker: 2 tp: 8 @@ -6245,7 +6605,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p1d.yaml" + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_lowlat[0]" decode: num-worker: 1 tp: 8 @@ -6258,7 +6618,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p3d.yaml" + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_lowlat[1]" decode: num-worker: 3 tp: 8 @@ -6271,7 +6631,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k/stp/max-tpt-dep8-1p5d.yaml" + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[0]" decode: num-worker: 5 tp: 8 @@ -6284,7 +6644,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k/stp/max-tpt-dep8-2p5d.yaml" + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[1]" decode: num-worker: 5 tp: 8 @@ -6301,7 +6661,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_stp_lowlat_0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_0.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_0.yaml" decode: num-worker: 3 @@ -6315,7 +6675,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_stp_lowlat_1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_1.yaml - 
"CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_1.yaml" decode: num-worker: 4 @@ -6329,7 +6689,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_stp_lowlat_2.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_2.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_2.yaml" decode: num-worker: 6 @@ -6344,7 +6704,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml" decode: num-worker: 2 @@ -6358,7 +6718,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml" decode: num-worker: 1 @@ -6372,7 +6732,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml" decode: num-worker: 1 @@ -6386,7 +6746,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml" decode: num-worker: 1 @@ -6416,7 +6776,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 8 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k/mtp/low-latency-tep8-1p1d.yaml" + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[0]" decode: num-worker: 1 tp: 8 @@ -6431,7 +6791,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 8 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k/mtp/low-latency-tep8-1p3d.yaml" + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[1]" decode: num-worker: 3 tp: 8 @@ -6446,7 +6806,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-1p5d.yaml" + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[1]" decode: num-worker: 5 tp: 8 @@ -6461,7 +6821,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-2p5d.yaml" + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[2]" decode: num-worker: 5 tp: 8 @@ -6476,7 +6836,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-1p2d.yaml" + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:override_mtp_maxtpt_1p2d" decode: num-worker: 2 tp: 8 @@ -6494,7 +6854,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 1 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml" decode: num-worker: 3 @@ -6509,7 +6869,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml" decode: num-worker: 4 @@ -6524,7 +6884,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml" decode: num-worker: 6 @@ -6540,7 +6900,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml" decode: num-worker: 2 @@ -6555,7 +6915,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml" decode: num-worker: 1 @@ -6570,7 +6930,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml" decode: num-worker: 1 @@ -6585,7 +6945,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 1 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml" decode: num-worker: 1 @@ -6614,8 +6974,8 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/1k1k/mtp/low-latency-dep4-1p-tep8-5d.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k/mtp/low-latency-dep4-1p-tep8-5d.yaml" + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]" decode: num-worker: 5 tp: 8 @@ -6629,8 +6989,8 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/1k1k/mtp/low-latency-dep4-1p-tep8-6d.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k/mtp/low-latency-dep4-1p-tep8-6d.yaml" + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]" decode: num-worker: 6 tp: 8 @@ -6644,8 +7004,8 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/1k1k/mtp/max-tpt-dep4-1p-dep8-1d.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k/mtp/max-tpt-dep4-1p-dep8-1d.yaml" + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]" decode: num-worker: 1 tp: 8 @@ -6659,8 +7019,8 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/1k1k/mtp/max-tpt-dep4-1p-dep8-2d.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k/mtp/max-tpt-dep4-1p-dep8-2d.yaml" + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]" decode: num-worker: 2 tp: 8 @@ -6680,8 +7040,8 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/8k1k/mtp/low-latency-dep4-1p-tep8-1d.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k/mtp/low-latency-dep4-1p-tep8-1d.yaml" + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[0]" decode: num-worker: 1 tp: 8 @@ -6695,8 +7055,8 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/8k1k/mtp/low-latency-dep4-1p-tep8-5d.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k/mtp/low-latency-dep4-1p-tep8-5d.yaml" + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[1]" decode: num-worker: 5 tp: 8 @@ -6710,8 +7070,8 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/8k1k/mtp/low-latency-dep4-2p-tep8-5d.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k/mtp/low-latency-dep4-2p-tep8-5d.yaml" + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[2]" decode: num-worker: 5 tp: 8 @@ -6725,14 +7085,230 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 1 dp-attn: false additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/8k1k/mtp/low-latency-tp4-1p-tp8-1d.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k/mtp/low-latency-tp4-1p-tp8-1d.yaml" + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_mtp_tp4" decode: num-worker: 1 tp: 8 ep: 1 dp-attn: false
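Editor's note on the CONFIG_FILE references above: the old one-file-per-variant recipes (e.g. max-tpt-dep4-1p-dep8-2d.yaml) are consolidated into a single recipe per hardware/sequence-length pair, with a ":<list>[i]" or ":<name>" suffix selecting one override block inside that file. The helper below is a minimal sketch of how a launcher might split such a reference; resolve_config_ref and its return shape are illustrative assumptions, not code from this repository.

import re

def resolve_config_ref(ref: str):
    """Split a CONFIG_FILE reference into (yaml_path, selector, index).

    'recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[2]' names entry 2 of
    the zip_override_mtp_lowlat list inside 8k1k.yaml; 'file.yaml:override_mtp_tp4'
    names a single override block; a bare path is a classic one-file recipe.
    """
    if ":" not in ref:
        return ref, None, None          # classic one-file-per-variant recipe
    path, selector = ref.split(":", 1)
    m = re.fullmatch(r"(\w+)\[(\d+)\]", selector)
    if m:                               # indexed entry in an override list
        return path, m.group(1), int(m.group(2))
    return path, selector, None         # single named override block

For example, resolve_config_ref("recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[1]") would return ("recipes/b200-fp4/8k1k.yaml", "zip_override_stp_lowlat", 1).

+kimik2.5-fp4-gb200-dynamo-trt: + image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2 + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + runner: gb200 + precision: fp4 + framework: dynamo-trt + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # Non-MTP configurations (default spec_decoding="none") + - 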
conc-list: [ 4, 192, 360, 668 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [ 5, 15, 30, 55 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 666 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 2253 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [ 4301, 6452 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [ 4301 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 4301 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [ 4 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [ 156 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 5, 15, 30, 60, 105 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 333 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 615 ] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 2151 ] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [ 2253 ] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + kimik2.5-fp4-gb200-dynamo-vllm: image: vllm/vllm-openai:v0.18.0-cu130 model: nvidia/Kimi-K2.5-NVFP4 diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index d5a6cc1f4..561a3fbb8 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -116,22 +116,13 @@ jobs: # 
Cleanup SLURM resources if command -v squeue >/dev/null 2>&1; then - if [[ "${{ runner.name }}" == mi355x-amds* || "${{ runner.name }}" == mi325x-amd* || "${{ runner.name }}" == mi300x-amds* || "${{ runner.name }}" == gb200-nv* || "${{ runner.name }}" == gb300-nv* || "${{ runner.name }}" == h100-cw* || "${{ runner.name }}" == h200-cw* || "${{ runner.name }}" == b200-nb* || "${{ runner.name }}" == h200-nb* || "${{ runner.name }}" == h100-dgxc-slurm* || "${{ runner.name }}" == h200-dgxc-slurm* || "${{ runner.name }}" == b200-dgxc-slurm* || "${{ runner.name }}" == b300-nv* ]]; then - echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." - scancel --name="${{ runner.name }}" || true - while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do - squeue --name="${{ runner.name }}" - sleep 5 - done - else - echo "[Slurm] Cleaning up jobs for user: $USER ..." - scancel -u "$USER" || true - while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do - squeue -u "$USER" - sleep 5 - done - fi - fi + echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." + scancel --name="${{ runner.name }}" || true + while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do + squeue --name="${{ runner.name }}" + sleep 5 + done + fi - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: @@ -140,6 +131,13 @@ ref: ${{ inputs.ref || github.ref }} clean: false + - name: Cleanup stale eval outputs (pre-run) + if: ${{ inputs.run-eval || inputs.eval-only }} + run: | + rm -f meta_env.json || true + rm -f results*.json || true + rm -f sample*.jsonl || true + - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3f081c240..ac0ef5d79 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,13 @@ +- config-keys: + - kimik2.5-fp4-gb200-dynamo-trt + description: + - "Add Kimi K2.5 NVFP4 GB200 disaggregated TRT-LLM benchmarks via Dynamo (14 STP configs)" + - "New framework: dynamo-trt (Dynamo frontend + TensorRT-LLM backend)" + - "Container: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" + - "Runner script updated to support kimik2.5 model prefix with dynamo-trt framework" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1026 + - config-keys: - qwen3.5-fp4-mi355x-sglang description: @@ -1322,7 +1332,7 @@ description: - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI300X and MI325X to achieve better performance" - "Use lmsysorg/sglang:v0.5.10-rocm720-mi30x" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/986 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1063 - config-keys: - glm5-fp4-b200-sglang description: @@ -1359,6 +1369,13 @@ - "TP2/TP4 search space exploration for Qwen3.5 fp4 on SGL" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1022 +- config-keys: + - glm5-fp8-mi355x-sglang + description: + - "Upgrade GLM5 FP8 MI355X SGLang image to v0.5.10rc0-rocm720-mi35x-20260413" + - "Set --kv-cache-dtype fp8_e4m3 and --disable-radix-cache" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1023 + - config-keys: - qwen3.5-fp8-h200-sglang-mtp description: @@ -1396,3 +1413,228 @@ - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - "TP=4, concurrency 4-256 for 1k1k and 8k1k" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1048 + +- config-keys: + - dsr1-fp4-b300-sglang + description: + - "Add 
DeepSeek-R1-0528 FP4 B300 SGLang benchmark (non-MTP)" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 does not have a B300-specific recipe, so this reuses the existing DSR1 FP4 B200 SGLang recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1049 + +- config-keys: + - dsr1-fp8-b300-sglang + description: + - "Add DeepSeek-R1-0528 FP8 B300 SGLang benchmark (non-MTP)" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 does not have a B300-specific recipe, so this reuses the existing DSR1 FP8 B200 SGLang recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1050 + +- config-keys: + - glm5-fp8-b300-sglang + description: + - "Add GLM-5 FP8 B300 SGLang benchmark" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1 does not have a B300-specific recipe, so this reuses the existing GLM5 FP8 B200 SGLang recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1051 + +- config-keys: + - glm5-fp4-b300-sglang + description: + - "Add GLM-5 FP4 (NVFP4) B300 SGLang benchmark" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5 does not have a B300-specific recipe, so this reuses the existing GLM-5 FP4 B200 SGLang recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1058 + +- config-keys: + - dsr1-fp8-b300-sglang-mtp + description: + - "Add DeepSeek-R1-0528 FP8 B300 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "EAGLE speculative decoding with MTP, TP=8, concurrency 4-512 for 1k1k and 8k1k" + - "At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 does not have a B300-specific recipe, so this reuses the existing DSR1 FP8 B200 SGLang MTP recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1059 + +- config-keys: + - minimaxm2.5-fp4-b300-vllm + description: + - "Add MiniMax-M2.5 FP4 (NVFP4) B300 vLLM benchmark" + - "Image: vllm/vllm-openai:v0.19.0-cu130" + - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP4 B200 vLLM recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1055 + +- config-keys: + - minimaxm2.5-fp8-b300-vllm + description: + - "Add MiniMax-M2.5 FP8 B300 vLLM benchmark" + - "Image: vllm/vllm-openai:v0.19.0-cu130" + - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP8 B200 vLLM recipe as-is" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1054 + +- config-keys: + - gptoss-fp4-mi300x-vllm + description: + - "Expand GPT-OSS 120B FP4 MI300X TP=1 concurrency from 64 to 256 for 1k1k" + - "Higher concurrency improves MoE weight amortization: 8552 total TPS at conc=256 vs 4016 at conc=64 (2.1x)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1053 + +- config-keys: + - dsr1-fp4-b200-dynamo-trt + - dsr1-fp8-b200-dynamo-trt + - dsr1-fp4-b200-dynamo-sglang + - dsr1-fp8-b200-dynamo-sglang + - dsr1-fp8-b200-dynamo-sglang-mtp + - dsr1-fp4-b200-dynamo-sglang-mtp + - dsr1-fp4-b300-dynamo-trt 
+ - dsr1-fp8-b300-dynamo-trt + - dsr1-fp4-gb300-dynamo-trt + - dsr1-fp8-gb300-dynamo-trt + - dsr1-fp4-gb300-dynamo-sglang + - dsr1-fp8-gb300-dynamo-sglang + - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-mi355x-sglang-disagg-mtp + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp + description: + - "Add multi-node lm-eval accuracy runs" + - "Eval picks the config with highest max eligible concurrency per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) group on 8k1k" + - "Eval concurrency set to the median eligible conc (>= MIN_EVAL_CONC=16) of the selected config to avoid OOM" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1000 + evals-only: true + +- config-keys: + - qwen3.5-fp4-b300-sglang + description: + - "Add Qwen3.5-397B-A17B NVFP4 B300 SGLang benchmark" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" + - "Follows the SGLang cookbook recipe at https://cookbook.sglang.io/autoregressive/Qwen/Qwen3.5 as of 2026-04-17" + - "Mirrors the B200 FP4 recipe with mem-fraction-static lowered to 0.8 and an extra TP2/EP2 search-space entry" + - "Configs: 1k1k and 8k1k, TP4/EP1 conc 4-128 + TP2/EP2 conc 4-128" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - qwen3.5-bf16-b300-sglang + description: + - "Add Qwen3.5-397B-A17B BF16 B300 SGLang benchmark" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "Model: Qwen/Qwen3.5-397B-A17B" + - "Mirrors the B200 BF16 recipe with an extra TP4/EP1 search-space entry alongside the existing TP8/EP1 sweep" + - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-64 + TP4/EP1 conc 4-64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - qwen3.5-bf16-b200-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B BF16 B200 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e" + - "Model: Qwen/Qwen3.5-397B-A17B" + - "Mirrors the qwen3.5-bf16-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-64 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - qwen3.5-bf16-b300-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B BF16 B300 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "Model: Qwen/Qwen3.5-397B-A17B" + - "Mirrors the qwen3.5-bf16-b300-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-64 + TP4/EP1 conc 4-64, spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - qwen3.5-fp4-b300-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B NVFP4 B300 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" + - "Mirrors the qwen3.5-fp4-b300-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k and 8k1k, TP4/EP1 conc 4-128 + TP2/EP2 conc 4-128, spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - glm5-fp8-b300-sglang-mtp + description: + - "Add GLM-5 FP8 B300 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "Model: zai-org/GLM-5-FP8" + - "Mirrors the glm5-fp8-b300-sglang non-MTP recipe and adds EAGLE speculative decoding 
(num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" + - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - qwen3.5-bf16-mi355x-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B BF16 MI355X SGLang MTP benchmark" + - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" + - "Model: Qwen/Qwen3.5-397B-A17B" + - "Mirrors the qwen3.5-bf16-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - glm5-fp8-b200-sglang-mtp + description: + - "Add GLM-5 FP8 B200 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448" + - "Model: zai-org/GLM-5-FP8" + - "Mirrors the glm5-fp8-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" + - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - glm5-fp4-b300-sglang-mtp + description: + - "Add GLM-5 NVFP4 B300 SGLang MTP benchmark (draft)" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "Model: nvidia/GLM-5-NVFP4" + - "Follows the glm5-fp8-b300-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" + - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - qwen3.5-fp8-mi355x-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B FP8 MI355X SGLang MTP benchmark" + - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414" + - "Model: Qwen/Qwen3.5-397B-A17B-FP8" + - "Mirrors the qwen3.5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k (TP8/EP1, TP8/EP8, TP2/EP2) and 8k1k (TP2/EP2, TP4/EP1) with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - glm5-fp8-mi355x-sglang-mtp + description: + - "Add GLM-5 FP8 MI355X SGLang MTP benchmark" + - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413" + - "Model: zai-org/GLM-5-FP8" + - "Mirrors the glm5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" + - "Configs: 1k1k and 8k1k, TP=8 conc 4-64 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - qwen3.5-fp4-b200-sglang-mtp + description: + - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang MTP benchmark" + - "Image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6" + - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" + - "Mirrors the qwen3.5-fp4-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Configs: 1k1k and 8k1k, TP=4/EP=1 conc 4-128 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - glm5-fp4-b200-sglang-mtp + description: + - "Add GLM-5 NVFP4 B200 SGLang MTP benchmark (draft)" + - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" + - "Model: nvidia/GLM-5-NVFP4" + - 
"Follows the glm5-fp8-b200-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" + - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - kimik2.5-fp4-b200-vllm + description: + - "Add kv-cache-dtype fp8, max-cudagraph-capture-size 2048, max-num-batched-tokens, and stream-interval 20 to server launch args" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1047 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 3d863b54c..b49391a3c 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -18,11 +18,11 @@ fi # The yaml files specify HuggingFace model IDs for portability, but we use # local paths to avoid repeated downloading on the shared B300 cluster. if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then - export MODEL_PATH="/scratch/models/deepseek-r1-0528-nvfp4-v2" + export MODEL_PATH="/data/models/dsr1-fp4" export SERVED_MODEL_NAME="deepseek-r1-fp4" export SRT_SLURM_MODEL_PREFIX="dsr1" elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then - export MODEL_PATH="/scratch/models/deepseek-r1-0528" + export MODEL_PATH="/data/models/dsr1-fp8" export SERVED_MODEL_NAME="deepseek-r1-fp8" export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" else @@ -37,9 +37,9 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" +git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 -git checkout sa-submission-q1-2026 +git checkout sa-submission-q2-2026 echo "Installing srtctl..." export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" @@ -66,6 +66,7 @@ srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $NGINX export ISL="$ISL" export OSL="$OSL" +export EVAL_ONLY="${EVAL_ONLY:-false}" # Create srtslurm.yaml for srtctl SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" @@ -101,7 +102,17 @@ cat srtslurm.yaml echo "Running make setup..." make setup ARCH=x86_64 +# Export eval-related env vars for srt-slurm post-benchmark eval +export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" + echo "Submitting job with srtctl..." + +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "b300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) @@ -165,45 +176,66 @@ echo "Found logs directory: $LOGS_DIR" cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 
-# Find all result subdirectories -RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_{concurrency}_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi -if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" + echo "All result files processed" else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') - - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" - - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi +# Collect eval results if eval was requested +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + shopt -s nullglob + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] || continue + cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied
eval artifact: $(basename "$eval_file")" done - done + shopt -u nullglob + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi fi -echo "All result files processed" - # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." @@ -217,7 +249,12 @@ find . -name '.nfs*' -delete 2>/dev/null || true else HF_HUB_CACHE_MOUNT="/scratch/models" - export MODEL="/scratch/models/${MODEL#*/}" + # Qwen3.5-397B-A17B-FP8 is pre-staged under /scratch/models on the B300 cluster, + # so point MODEL at the local copy. Other models fall through and use `hf download` + # against the mounted cache from their benchmark script. + if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then + export MODEL="/scratch/models/${MODEL#*/}" + fi SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') From 638e62ab039e4d6c54c6b595a55350669fd9895c Mon Sep 17 00:00:00 2001 From: William Chen <57119977+OCWC22@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:39:56 -0700 Subject: [PATCH 08/18] chore(isb1): drop remaining non-scope files relative to PR base --- .github/configs/amd-master.yaml | 65 +- .github/configs/nvidia-master.yaml | 1272 ++++++-------------- .github/workflows/benchmark-tmpl.yml | 30 +- benchmarks/single_node/qwen3.5_fp8_b300.sh | 83 -- perf-changelog.yaml | 252 +--- runners/launch_b300-nv.sh | 117 +- 6 files changed, 399 insertions(+), 1420 deletions(-) delete mode 100644 benchmarks/single_node/qwen3.5_fp8_b300.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index f1181b941..a2c424c91 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -131,24 +131,6 @@ qwen3.5-bf16-mi355x-sglang: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } -qwen3.5-bf16-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 - model: Qwen/Qwen3.5-397B-A17B - model-prefix: qwen3.5 - runner: mi355x - precision: bf16 - framework: sglang - multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - qwen3.5-bf16-mi300x-sglang: image: lmsysorg/sglang:v0.5.10-rocm720-mi30x model: Qwen/Qwen3.5-397B-A17B @@ -224,27 +206,6 @@ qwen3.5-fp8-mi355x-sglang: - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 } - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } -qwen3.5-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 2, ep: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp } - qwen3.5-fp4-mi355x-sglang: image: rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413 model: amd/Qwen3.5-397B-A17B-MXFP4 @@ 
-284,7 +245,7 @@ qwen3.5-fp8-mi300x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } glm5-fp8-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413 + image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi355x @@ -301,24 +262,6 @@ glm5-fp8-mi355x-sglang: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } -glm5-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413 - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } - glm5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post model: zai-org/GLM-5-FP8 @@ -527,7 +470,7 @@ gptoss-fp4-mi300x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 1, conc-start: 64, conc-end: 256 } + - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } @@ -953,7 +896,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg @@ -1161,7 +1104,7 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 90a430b9d..484be6899 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -19,7 +19,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml" decode: num-worker: 2 @@ -34,7 +34,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -49,7 +49,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -64,7 +64,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -79,7 +79,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml" decode: num-worker: 4 @@ -94,7 +94,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml" decode: num-worker: 5 @@ -110,7 +110,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -124,7 +124,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml" decode: num-worker: 2 @@ -138,7 +138,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -152,7 +152,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -166,7 +166,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml + # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -180,7 +180,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml" decode: num-worker: 6 @@ -199,7 +199,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -214,7 +214,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml" decode: num-worker: 3 @@ -229,7 +229,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -244,7 +244,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -259,7 +259,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -274,7 +274,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml - 
"CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml" decode: num-worker: 1 @@ -289,7 +289,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml" decode: num-worker: 2 @@ -305,7 +305,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -319,7 +319,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -333,7 +333,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -347,7 +347,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -361,7 +361,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -375,7 +375,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" decode: num-worker: 2 @@ -405,7 +405,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true 
additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml" decode: num-worker: 8 @@ -420,7 +420,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml" decode: num-worker: 8 @@ -435,7 +435,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml" decode: num-worker: 8 @@ -450,7 +450,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml" decode: num-worker: 8 @@ -466,7 +466,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml" decode: num-worker: 7 @@ -481,7 +481,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml" decode: num-worker: 4 @@ -496,7 +496,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml" decode: num-worker: 3 @@ -511,7 +511,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml" decode: num-worker: 2 @@ -527,7 +527,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml" decode: num-worker: 3 @@ -541,7 +541,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml" decode: num-worker: 3 @@ -555,7 +555,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml" decode: num-worker: 3 @@ -570,7 +570,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml" decode: num-worker: 5 @@ -584,7 +584,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml" decode: num-worker: 1 @@ -598,7 +598,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml" decode: num-worker: 5 @@ -618,7 +618,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml" decode: num-worker: 6 @@ -633,7 +633,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml" decode: num-worker: 2 @@ -648,7 +648,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml" decode: num-worker: 6 @@ -663,7 +663,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml" decode: num-worker: 4 @@ -679,7 +679,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml" decode: num-worker: 3 @@ -694,7 +694,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml" decode: num-worker: 1 @@ -709,7 +709,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml" decode: num-worker: 1 @@ -725,7 +725,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml 
+ # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml" decode: num-worker: 1 @@ -739,7 +739,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml" decode: num-worker: 4 @@ -753,7 +753,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml" decode: num-worker: 4 @@ -767,7 +767,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml" decode: num-worker: 6 @@ -782,7 +782,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml" decode: num-worker: 1 @@ -796,7 +796,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml" decode: num-worker: 2 @@ -810,7 +810,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml" decode: num-worker: 1 @@ -824,7 +824,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml + # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml" decode: num-worker: 1 @@ -854,7 +854,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -869,7 +869,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml" decode: num-worker: 2 @@ -884,7 +884,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -899,7 +899,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" decode: num-worker: 5 @@ -914,7 +914,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml" decode: num-worker: 1 @@ -929,7 +929,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml" decode: num-worker: 2 @@ -944,7 +944,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml - 
"CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml" decode: num-worker: 2 @@ -960,7 +960,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml" decode: num-worker: 2 @@ -974,7 +974,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 @@ -988,7 +988,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -1002,7 +1002,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -1016,7 +1016,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -1030,7 +1030,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -1044,7 +1044,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml" decode: num-worker: 2 @@ -1063,7 +1063,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 2 dp-attn: 
       true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml"
       decode:
         num-worker: 1
@@ -1078,7 +1078,7 @@ dsr1-fp4-b300-dynamo-trt:
       ep: 2
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml"
       decode:
         num-worker: 4
@@ -1093,7 +1093,7 @@ dsr1-fp4-b300-dynamo-trt:
       ep: 2
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml"
       decode:
         num-worker: 4
@@ -1108,7 +1108,7 @@ dsr1-fp4-b300-dynamo-trt:
       ep: 2
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml"
       decode:
         num-worker: 4
@@ -1123,7 +1123,7 @@ dsr1-fp4-b300-dynamo-trt:
       ep: 2
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml"
       decode:
         num-worker: 1
@@ -1138,7 +1138,7 @@ dsr1-fp4-b300-dynamo-trt:
       ep: 2
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml"
       decode:
         num-worker: 1
@@ -1154,7 +1154,7 @@ dsr1-fp4-b300-dynamo-trt:
       ep: 2
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -1168,7 +1168,7 @@ dsr1-fp4-b300-dynamo-trt:
       ep: 2
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -1182,7 +1182,7 @@ dsr1-fp4-b300-dynamo-trt:
       ep: 2
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -1196,7 +1196,7 @@ dsr1-fp4-b300-dynamo-trt:
       ep: 2
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml"
       decode:
         num-worker: 4
@@ -1210,7 +1210,7 @@ dsr1-fp4-b300-dynamo-trt:
       ep: 2
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml"
       decode:
         num-worker: 2
@@ -1224,7 +1224,7 @@ dsr1-fp4-b300-dynamo-trt:
       ep: 2
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml"
       decode:
         num-worker: 1
@@ -1238,7 +1238,7 @@ dsr1-fp4-b300-dynamo-trt:
       ep: 2
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml"
       decode:
         num-worker: 1
@@ -1268,7 +1268,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml"
       decode:
         num-worker: 8
@@ -1283,7 +1283,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml"
       decode:
         num-worker: 8
@@ -1298,7 +1298,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml"
       decode:
         num-worker: 1
@@ -1313,7 +1313,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml"
       decode:
         num-worker: 2
@@ -1328,7 +1328,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml"
       decode:
         num-worker: 5
@@ -1343,7 +1343,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml"
       decode:
         num-worker: 2
@@ -1361,7 +1361,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml"
       decode:
         num-worker: 1
@@ -1375,7 +1375,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml"
       decode:
         num-worker: 2
@@ -1389,7 +1389,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml"
       decode:
         num-worker: 3
@@ -1403,7 +1403,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml"
       decode:
         num-worker: 8
@@ -1417,7 +1417,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml"
       decode:
         num-worker: 8
@@ -1431,7 +1431,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml"
       decode:
         num-worker: 8
@@ -1445,7 +1445,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml"
       decode:
         num-worker: 1
@@ -1464,7 +1464,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml"
       decode:
         num-worker: 2
@@ -1479,7 +1479,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml"
       decode:
         num-worker: 4
@@ -1494,7 +1494,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml"
       decode:
         num-worker: 4
@@ -1509,7 +1509,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml"
       decode:
         num-worker: 1
@@ -1524,7 +1524,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml"
       decode:
         num-worker: 1
@@ -1539,7 +1539,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml"
       decode:
         num-worker: 1
@@ -1557,7 +1557,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml"
       decode:
         num-worker: 4
@@ -1571,7 +1571,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml"
       decode:
         num-worker: 8
@@ -1585,7 +1585,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml"
       decode:
         num-worker: 1
@@ -1599,7 +1599,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml"
       decode:
         num-worker: 1
@@ -1613,7 +1613,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml"
       decode:
         num-worker: 5
@@ -1627,7 +1627,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml"
       decode:
         num-worker: 1
@@ -1641,7 +1641,7 @@ dsr1-fp8-b300-dynamo-trt:
       ep: 1
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml
         - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml"
       decode:
         num-worker: 1
@@ -1669,29 +1669,6 @@ dsr1-fp4-b200-sglang:
         - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
         - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 }

-# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
-# does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4
-# B200 SGLang recipe as-is until B300-specific tuning is available.
-dsr1-fp4-b300-sglang:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
-  model: nvidia/DeepSeek-R1-0528-FP4-V2
-  model-prefix: dsr1
-  runner: b300
-  precision: fp4
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
-        - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
-        - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 }
-
 dsr1-fp4-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2
   model: nvidia/DeepSeek-R1-0528-FP4-V2
@@ -1774,28 +1751,6 @@ dsr1-fp8-b200-sglang:
         - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
         - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }

-# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
-# does not have a B300-specific recipe, so this config reuses the existing DSR1 FP8
-# B200 SGLang recipe as-is until B300-specific tuning is available.
-dsr1-fp8-b300-sglang:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  runner: b300
-  precision: fp8
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
-
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
   model: Qwen/Qwen3.5-397B-A17B
@@ -1814,24 +1769,6 @@ qwen3.5-bf16-b200-sglang:
       search-space:
         - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }

-qwen3.5-bf16-b200-sglang-mtp:
-  image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
-  model: Qwen/Qwen3.5-397B-A17B
-  model-prefix: qwen3.5
-  runner: b200
-  precision: bf16
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-
 qwen3.5-fp8-b200-sglang:
   image: lmsysorg/sglang:v0.5.9-cu130-amd64
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -1870,24 +1807,6 @@ qwen3.5-fp4-b200-sglang:
       search-space:
         - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 }

-qwen3.5-fp4-b200-sglang-mtp:
-  image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6
-  model: nvidia/Qwen3.5-397B-A17B-NVFP4
-  model-prefix: qwen3.5
-  runner: b200
-  precision: fp4
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-
 glm5-fp8-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448
   model: zai-org/GLM-5-FP8
@@ -1906,63 +1825,6 @@ glm5-fp8-b200-sglang:
       search-space:
         - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }

-glm5-fp8-b200-sglang-mtp:
-  image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  runner: b200
-  precision: fp8
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-
-# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1
-# does not have a B300-specific recipe, so this config reuses the existing GLM5 FP8
-# B200 SGLang recipe as-is until B300-specific tuning is available.
-glm5-fp8-b300-sglang:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  runner: b300
-  precision: fp8
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
-
-glm5-fp8-b300-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  runner: b300
-  precision: fp8
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-
 glm5-fp4-b200-sglang:
   image: lmsysorg/sglang:v0.5.10.post1-cu130
   model: nvidia/GLM-5-NVFP4
@@ -1983,69 +1845,6 @@ glm5-fp4-b200-sglang:
         - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
         - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }

-glm5-fp4-b200-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
-  model: nvidia/GLM-5-NVFP4
-  model-prefix: glm5
-  runner: b200
-  precision: fp4
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-
-# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5
-# does not have a B300-specific recipe, so this config reuses the existing
-# GLM-5 FP4 B200 SGLang recipe as-is until B300-specific tuning is available.
-glm5-fp4-b300-sglang:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
-  model: nvidia/GLM-5-NVFP4
-  model-prefix: glm5
-  runner: b300
-  precision: fp4
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
-
-glm5-fp4-b300-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
-  model: nvidia/GLM-5-NVFP4
-  model-prefix: glm5
-  runner: b300
-  precision: fp4
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-
 qwen3.5-fp8-b200-sglang-mtp:
   image: lmsysorg/sglang:v0.5.9-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -2083,104 +1882,6 @@ qwen3.5-fp8-b300-sglang-mtp:
       search-space:
         - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }

-qwen3.5-fp8-b300-sglang:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: b300
-  precision: fp8
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
-
-qwen3.5-fp4-b300-sglang:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
-  model: nvidia/Qwen3.5-397B-A17B-NVFP4
-  model-prefix: qwen3.5
-  runner: b300
-  precision: fp4
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 }
-        - { tp: 2, ep: 2, conc-start: 4, conc-end: 128 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 }
-        - { tp: 2, ep: 2, conc-start: 4, conc-end: 128 }
-
-qwen3.5-fp4-b300-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
-  model: nvidia/Qwen3.5-397B-A17B-NVFP4
-  model-prefix: qwen3.5
-  runner: b300
-  precision: fp4
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-        - { tp: 2, ep: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-        - { tp: 2, ep: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-
-qwen3.5-bf16-b300-sglang:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
-  model: Qwen/Qwen3.5-397B-A17B
-  model-prefix: qwen3.5
-  runner: b300
-  precision: bf16
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
-
-qwen3.5-bf16-b300-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
-  model: Qwen/Qwen3.5-397B-A17B
-  model-prefix: qwen3.5
-  runner: b300
-  precision: bf16
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-        - { tp: 4, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-
 kimik2.5-int4-b200-vllm:
   image: vllm/vllm-openai:v0.15.1
   model: moonshotai/Kimi-K2.5
@@ -2255,28 +1956,6 @@ dsr1-fp8-b200-sglang-mtp:
       search-space:
         - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp }

-# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
-# does not have a B300-specific recipe, so this config reuses the existing DSR1 FP8
-# B200 SGLang MTP recipe as-is until B300-specific tuning is available. Image bumped
-# to v0.5.10.post1-cu130 to match the standard B300 SGLang image used by other B300 configs.
-dsr1-fp8-b300-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.10.post1-cu130
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  runner: b300
-  precision: fp8
-  framework: sglang
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp }
-
 dsr1-fp8-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2
   model: deepseek-ai/DeepSeek-R1-0528
@@ -2461,7 +2140,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml"
       decode:
         num-worker: 11
@@ -2476,7 +2155,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
       decode:
         num-worker: 11
@@ -2491,7 +2170,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
       decode:
         num-worker: 11
@@ -2506,7 +2185,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml"
       decode:
         num-worker: 9
@@ -2521,7 +2200,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
       decode:
         num-worker: 11
@@ -2536,7 +2215,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml"
       decode:
         num-worker: 8
@@ -2551,7 +2230,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml"
       decode:
         num-worker: 7
@@ -2566,7 +2245,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml"
       decode:
         num-worker: 4
@@ -2581,7 +2260,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml"
       decode:
         num-worker: 2
@@ -2596,7 +2275,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml"
       decode:
         num-worker: 9
@@ -2610,7 +2289,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
       decode:
         num-worker: 9
@@ -2624,7 +2303,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
       decode:
         num-worker: 9
@@ -2638,7 +2317,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
       decode:
         num-worker: 9
@@ -2652,7 +2331,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
       decode:
         num-worker: 9
@@ -2666,7 +2345,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
       decode:
         num-worker: 9
@@ -2680,7 +2359,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml"
       decode:
         num-worker: 9
@@ -2694,7 +2373,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml"
       decode:
         num-worker: 6
@@ -2708,7 +2387,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml"
       decode:
         num-worker: 7
@@ -2727,7 +2406,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml"
       decode:
         num-worker: 7
@@ -2742,7 +2421,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml"
       decode:
         num-worker: 7
@@ -2757,7 +2436,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml"
       decode:
         num-worker: 6
@@ -2772,7 +2451,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml"
       decode:
         num-worker: 3
@@ -2787,7 +2466,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml"
       decode:
         num-worker: 5
@@ -2802,7 +2481,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml"
       decode:
         num-worker: 1
@@ -2817,7 +2496,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml"
       decode:
         num-worker: 1
@@ -2832,7 +2511,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml"
       decode:
         num-worker: 1
@@ -2847,7 +2526,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml"
       decode:
         num-worker: 1
@@ -2862,7 +2541,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml"
       decode:
         num-worker: 7
@@ -2876,7 +2555,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml"
       decode:
         num-worker: 7
@@ -2890,7 +2569,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml"
       decode:
         num-worker: 6
@@ -2904,7 +2583,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -2918,7 +2597,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml"
       decode:
         num-worker: 5
@@ -2932,7 +2611,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -2946,7 +2625,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml"
       decode:
         num-worker: 1
@@ -2960,7 +2639,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -2974,7 +2653,7 @@ dsr1-fp8-h200-dynamo-trt:
       ep: 8
       dp-attn: false
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml"
       decode:
         num-worker: 1
@@ -3004,7 +2683,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml"
       decode:
         num-worker: 3
@@ -3019,7 +2698,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml"
       decode:
         num-worker: 3
@@ -3034,7 +2713,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml"
       decode:
         num-worker: 3
@@ -3049,7 +2728,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml"
       decode:
         num-worker: 3
@@ -3064,7 +2743,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml"
       decode:
         num-worker: 3
@@ -3079,7 +2758,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml"
       decode:
         num-worker: 3
@@ -3094,7 +2773,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml"
       decode:
         num-worker: 3
@@ -3109,7 +2788,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml"
       decode:
         num-worker: 1
@@ -3124,7 +2803,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml"
       decode:
         num-worker: 1
@@ -3139,7 +2818,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -3153,7 +2832,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -3167,7 +2846,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -3181,7 +2860,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -3195,7 +2874,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -3209,7 +2888,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -3223,7 +2902,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -3237,7 +2916,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -3251,7 +2930,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml"
       decode:
         num-worker: 1
@@ -3270,7 +2949,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml"
       decode:
         num-worker: 3
@@ -3285,7 +2964,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml"
       decode:
         num-worker: 3
@@ -3300,7 +2979,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml"
       decode:
         num-worker: 3
@@ -3315,7 +2994,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml"
       decode:
         num-worker: 1
@@ -3332,7 +3011,7 @@ dsr1-fp8-h100-dynamo-trt:
 #      ep: 16
 #      dp-attn: true
 #      additional-settings:
-#        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
+#        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
 #        - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml"
 #      decode:
 #        num-worker: 2
@@ -3347,7 +3026,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml"
       decode:
         num-worker: 1
@@ -3362,7 +3041,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -3376,7 +3055,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -3390,7 +3069,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml"
       decode:
         num-worker: 3
@@ -3404,7 +3083,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml"
       decode:
         num-worker: 2
@@ -3418,7 +3097,7 @@ dsr1-fp8-h100-dynamo-trt:
       ep: 16
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml"
       decode:
         num-worker: 1
@@ -3500,31 +3179,6 @@ minimaxm2.5-fp8-b200-vllm:
         - { tp: 2, conc-start: 4, conc-end: 512 }
         - { tp: 4, conc-start: 4, conc-end: 512 }

-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available.
-minimaxm2.5-fp8-b300-vllm:
-  image: vllm/vllm-openai:v0.19.0-cu130
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: b300
-  precision: fp8
-  framework: vllm
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 2, conc-start: 4, conc-end: 512 }
-        - { tp: 4, conc-start: 4, conc-end: 512 }
-        - { tp: 2, ep: 2, conc-start: 512, conc-end: 512 }
-        - { tp: 4, ep: 4, conc-start: 256, conc-end: 512 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 2, conc-start: 4, conc-end: 512 }
-        - { tp: 4, conc-start: 4, conc-end: 512 }
-
 minimaxm2.5-fp4-b200-vllm:
   image: vllm/vllm-openai:v0.19.0-cu130
   model: nvidia/MiniMax-M2.5-NVFP4
@@ -3554,38 +3208,6 @@ minimaxm2.5-fp4-b200-vllm:
         - { tp: 4, conc-start: 4, conc-end: 512 }
         - { tp: 8, conc-start: 4, conc-end: 4 }

-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
-minimaxm2.5-fp4-b300-vllm:
-  image: vllm/vllm-openai:v0.19.0-cu130
-  model: nvidia/MiniMax-M2.5-NVFP4
-  model-prefix: minimaxm2.5
-  runner: b300
-  precision: fp4
-  framework: vllm
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 1, conc-start: 4, conc-end: 4 }
-        - { tp: 2, conc-start: 4, conc-end: 512 }
-        - { tp: 2, ep: 2, conc-start: 128, conc-end: 256 }
-        - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512 }
-        - { tp: 4, conc-start: 4, conc-end: 512 }
-        - { tp: 4, ep: 4, conc-start: 32, conc-end: 128 }
-        - { tp: 8, conc-start: 4, conc-end: 4 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 1, conc-start: 4, conc-end: 32 }
-        - { tp: 1, conc-start: 256, conc-end: 512 }
-        - { tp: 2, conc-start: 4, conc-end: 512 }
-        - { tp: 2, ep: 2, conc-start: 128, conc-end: 512 }
-        - { tp: 4, conc-start: 4, conc-end: 512 }
-        - { tp: 8, conc-start: 4, conc-end: 4 }
-
 gptoss-fp4-h100-vllm:
   image: vllm/vllm-openai:v0.18.0
   model: openai/gpt-oss-120b
@@ -3850,7 +3472,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml"
       decode:
         num-worker: 1
@@ -3865,7 +3487,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml"
       decode:
         num-worker: 4
@@ -3880,7 +3502,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml"
       decode:
         num-worker: 1
@@ -3895,7 +3517,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml"
       decode:
         num-worker: 1
@@ -3910,7 +3532,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml"
       decode:
         num-worker: 5
@@ -3927,7 +3549,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml"
       decode:
         num-worker: 1
@@ -3941,7 +3563,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml"
       decode:
         num-worker: 1
@@ -3955,7 +3577,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml"
       decode:
         num-worker: 2
@@ -3969,7 +3591,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml"
       decode:
         num-worker: 4
@@ -3983,7 +3605,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
       decode:
         num-worker: 4
@@ -3997,7 +3619,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml"
       decode:
         num-worker: 1
@@ -4011,7 +3633,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml"
       decode:
         num-worker: 1
@@ -4031,7 +3653,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml"
       decode:
         num-worker: 4
@@ -4046,7 +3668,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml"
       decode:
         num-worker: 1
@@ -4061,7 +3683,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml"
       decode:
         num-worker: 1
@@ -4076,7 +3698,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml"
       decode:
         num-worker: 1
@@ -4091,7 +3713,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml"
       decode:
         num-worker: 1
@@ -4107,7 +3729,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml"
       decode:
         num-worker: 4
@@ -4121,7 +3743,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
+        # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
         - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
       decode:
         num-worker: 4
@@ -4135,7 +3757,7 @@ dsr1-fp4-gb200-dynamo-trt:
       ep: 4
       dp-attn: true
       additional-settings:
-        #
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -4149,7 +3771,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -4163,7 +3785,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -4177,7 +3799,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml" decode: num-worker: 1 @@ -4208,7 +3830,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml" decode: num-worker: 1 @@ -4223,7 +3845,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml" decode: num-worker: 1 @@ -4238,7 +3860,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" decode: num-worker: 1 @@ -4253,7 +3875,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml" decode: num-worker: 1 @@ -4268,7 +3890,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml" decode: num-worker: 3 @@ -4283,7 +3905,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml" decode: num-worker: 3 @@ -4298,7 +3920,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml" decode: num-worker: 3 @@ -4313,7 +3935,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml" decode: num-worker: 1 @@ -4327,7 +3949,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml" decode: num-worker: 1 @@ -4341,7 +3963,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml" decode: num-worker: 1 @@ -4355,7 +3977,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml" decode: num-worker: 1 @@ -4369,7 +3991,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml" decode: num-worker: 1 @@ -4383,7 +4005,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml" decode: num-worker: 3 @@ -4397,7 +4019,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml" decode: num-worker: 3 @@ -4416,7 +4038,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml" decode: num-worker: 1 @@ -4431,7 +4053,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml" decode: num-worker: 1 @@ -4446,7 +4068,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml" decode: num-worker: 1 @@ -4461,7 +4083,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml" decode: num-worker: 1 @@ -4476,7 +4098,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml" decode: num-worker: 1 @@ -4491,7 +4113,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml" decode: num-worker: 3 @@ -4506,7 +4128,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml" decode: num-worker: 3 @@ -4521,7 +4143,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" decode: num-worker: 1 @@ -4535,7 +4157,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" decode: num-worker: 1 @@ -4549,7 +4171,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml" decode: num-worker: 1 @@ -4563,7 +4185,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" decode: num-worker: 1 @@ -4577,7 +4199,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" decode: num-worker: 3 @@ -4591,7 +4213,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" decode: num-worker: 3 @@ -4605,7 +4227,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" decode: num-worker: 3 @@ -4635,7 +4257,7 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 1 dp-attn: false additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/low-latency.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/low-latency.yaml - "CONFIG_FILE=recipes/gb200-fp8/1k1k/low-latency.yaml" decode: num-worker: 1 @@ -4651,7 +4273,7 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/mid-curve.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/mid-curve.yaml - "CONFIG_FILE=recipes/gb200-fp8/1k1k/mid-curve.yaml" decode: num-worker: 1 @@ -4667,7 +4289,7 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/max-tpt.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/max-tpt.yaml - "CONFIG_FILE=recipes/gb200-fp8/1k1k/max-tpt.yaml" decode: num-worker: 1 @@ -4683,7 +4305,7 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/ultra-tpt.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/1k1k/ultra-tpt.yaml - "CONFIG_FILE=recipes/gb200-fp8/1k1k/ultra-tpt.yaml" decode: num-worker: 1 @@ -4702,7 +4324,7 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 1 dp-attn: false additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/low-latency.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/low-latency.yaml - "CONFIG_FILE=recipes/gb200-fp8/8k1k/low-latency.yaml" decode: num-worker: 1 @@ -4718,7 +4340,7 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/mid-curve.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/mid-curve.yaml - "CONFIG_FILE=recipes/gb200-fp8/8k1k/mid-curve.yaml" decode: num-worker: 1 @@ -4734,7 +4356,7 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/max_tpt.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb200-fp8/8k1k/max_tpt.yaml - "CONFIG_FILE=recipes/gb200-fp8/8k1k/max_tpt.yaml" decode: num-worker: 1 @@ -4763,7 +4385,7 @@ dsr1-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: false additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/low-latency.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/low-latency.yaml - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/low-latency.yaml" decode: num-worker: 4 @@ -4779,7 +4401,7 @@ dsr1-fp8-gb300-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/mid.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/mid.yaml - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/mid.yaml" decode: num-worker: 1 @@ -4795,7 +4417,7 @@ dsr1-fp8-gb300-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/max.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/1k1k/stp/max.yaml - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/max.yaml" decode: num-worker: 1 @@ -4814,7 +4436,7 @@ dsr1-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: false additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/low-latency.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/low-latency.yaml - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/low-latency.yaml" decode: num-worker: 1 @@ -4830,7 +4452,7 @@ dsr1-fp8-gb300-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/mid.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/mid.yaml - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/mid.yaml" decode: num-worker: 1 @@ -4846,7 +4468,7 @@ dsr1-fp8-gb300-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/max.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/gb300-fp8/8k1k/stp/max.yaml - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/max.yaml" decode: num-worker: 1 @@ -4990,7 +4612,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" decode: num-worker: 1 @@ -5005,7 +4627,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" decode: num-worker: 1 @@ -5020,7 +4642,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 4 @@ -5035,7 +4657,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 4 @@ -5050,7 +4672,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" decode: num-worker: 1 @@ -5065,7 +4687,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" decode: num-worker: 1 @@ -5080,7 +4702,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 @@ -5094,7 +4716,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 4 @@ -5108,7 +4730,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -5122,7 +4744,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -5136,7 +4758,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" decode: num-worker: 1 @@ -5150,7 +4772,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -5169,7 +4791,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 3 @@ -5184,7 +4806,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 4 @@ -5199,7 +4821,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" decode: num-worker: 4 @@ -5214,7 +4836,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -5229,7 +4851,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -5244,7 +4866,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" decode: num-worker: 1 @@ -5259,7 +4881,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" decode: num-worker: 1 @@ -5274,7 +4896,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" decode: num-worker: 1 @@ -5289,7 +4911,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -5303,7 +4925,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 @@ -5317,7 +4939,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml" decode: num-worker: 4 @@ -5331,7 +4953,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" decode: num-worker: 5 @@ -5345,7 +4967,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -5359,7 +4981,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -5373,7 +4995,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml" decode: num-worker: 3 @@ -5387,7 +5009,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 2 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml" decode: num-worker: 1 @@ -5531,7 +5153,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" decode: num-worker: 4 @@ -5546,7 +5168,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" decode: num-worker: 4 @@ -5561,7 +5183,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml" decode: num-worker: 1 @@ -5576,7 +5198,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml" decode: num-worker: 1 @@ -5591,7 +5213,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml" decode: num-worker: 1 @@ -5606,7 +5228,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml" decode: num-worker: 1 @@ -5621,7 +5243,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml" decode: num-worker: 2 @@ -5636,7 +5258,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" decode: num-worker: 4 @@ -5650,7 +5272,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" decode: num-worker: 4 @@ -5664,7 +5286,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml" decode: num-worker: 4 @@ -5678,7 +5300,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml" decode: num-worker: 1 @@ -5692,7 +5314,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml" decode: num-worker: 1 @@ -5706,7 +5328,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml" decode: num-worker: 2 @@ -5720,7 +5342,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml" decode: num-worker: 2 @@ -5739,7 +5361,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" decode: num-worker: 4 @@ -5754,7 +5376,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" decode: num-worker: 4 @@ -5769,7 +5391,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml" decode: num-worker: 1 @@ -5784,7 +5406,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml" decode: num-worker: 1 @@ -5799,7 +5421,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" decode: num-worker: 1 @@ -5814,7 +5436,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml" decode: num-worker: 1 @@ -5829,7 +5451,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" decode: num-worker: 4 @@ -5843,7 +5465,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" decode: num-worker: 4 @@ -5857,7 +5479,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" decode: num-worker: 4 @@ -5871,7 +5493,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" decode: num-worker: 1 @@ -5885,7 +5507,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" decode: num-worker: 1 @@ -5899,7 +5521,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" decode: num-worker: 1 @@ -5913,7 +5535,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" decode: num-worker: 1 @@ -6469,7 +6091,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[0]" + - "CONFIG_FILE=recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml" decode: num-worker: 5 tp: 8 @@ -6482,7 +6104,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[1]" + - "CONFIG_FILE=recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml" decode: num-worker: 6 tp: 8 @@ -6495,7 +6117,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - 
"CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[0]" + - "CONFIG_FILE=recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml" decode: num-worker: 1 tp: 8 @@ -6508,7 +6130,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[1]" + - "CONFIG_FILE=recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml" decode: num-worker: 2 tp: 8 @@ -6525,7 +6147,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[0]" + - "CONFIG_FILE=recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml" decode: num-worker: 1 tp: 8 @@ -6538,7 +6160,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[1]" + - "CONFIG_FILE=recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml" decode: num-worker: 5 tp: 8 @@ -6551,7 +6173,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[2]" + - "CONFIG_FILE=recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml" decode: num-worker: 5 tp: 8 @@ -6564,7 +6186,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 1 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_stp_tp4" + - "CONFIG_FILE=recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml" decode: num-worker: 1 tp: 8 @@ -6577,7 +6199,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_stp_maxtpt_7p2d" + - "CONFIG_FILE=recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml" decode: num-worker: 2 tp: 8 @@ -6605,7 +6227,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_lowlat[0]" + - "CONFIG_FILE=recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p1d.yaml" decode: num-worker: 1 tp: 8 @@ -6618,7 +6240,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_lowlat[1]" + - "CONFIG_FILE=recipes/b200-fp8/1k1k/stp/low-latency-tep8-1p3d.yaml" decode: num-worker: 3 tp: 8 @@ -6631,7 +6253,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[0]" + - "CONFIG_FILE=recipes/b200-fp8/1k1k/stp/max-tpt-dep8-1p5d.yaml" decode: num-worker: 5 tp: 8 @@ -6644,7 +6266,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[1]" + - "CONFIG_FILE=recipes/b200-fp8/1k1k/stp/max-tpt-dep8-2p5d.yaml" decode: num-worker: 5 tp: 8 @@ -6661,7 +6283,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_stp_lowlat_0.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_0.yaml" decode: num-worker: 3 @@ -6675,7 +6297,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_1.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_stp_lowlat_1.yaml - 
"CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_1.yaml" decode: num-worker: 4 @@ -6689,7 +6311,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_2.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_stp_lowlat_2.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_2.yaml" decode: num-worker: 6 @@ -6704,7 +6326,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml" decode: num-worker: 2 @@ -6718,7 +6340,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml" decode: num-worker: 1 @@ -6732,7 +6354,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml" decode: num-worker: 1 @@ -6746,7 +6368,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml" decode: num-worker: 1 @@ -6776,7 +6398,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 8 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[0]" + - "CONFIG_FILE=recipes/b200-fp8/1k1k/mtp/low-latency-tep8-1p1d.yaml" decode: num-worker: 1 tp: 8 @@ -6791,7 +6413,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 8 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[1]" + - "CONFIG_FILE=recipes/b200-fp8/1k1k/mtp/low-latency-tep8-1p3d.yaml" decode: num-worker: 3 tp: 8 @@ -6806,7 +6428,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[1]" + - "CONFIG_FILE=recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-1p5d.yaml" decode: num-worker: 5 tp: 8 @@ -6821,7 +6443,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[2]" + - "CONFIG_FILE=recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-2p5d.yaml" decode: num-worker: 5 tp: 8 @@ -6836,7 +6458,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:override_mtp_maxtpt_1p2d" + - "CONFIG_FILE=recipes/b200-fp8/1k1k/mtp/max-tpt-dep8-1p2d.yaml" decode: num-worker: 2 tp: 8 @@ -6854,7 +6476,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml + 
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml" decode: num-worker: 3 @@ -6869,7 +6491,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml" decode: num-worker: 4 @@ -6884,7 +6506,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml" decode: num-worker: 6 @@ -6900,7 +6522,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml" decode: num-worker: 2 @@ -6915,7 +6537,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml" decode: num-worker: 1 @@ -6930,7 +6552,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml" decode: num-worker: 1 @@ -6945,7 +6567,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: ep: 1 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml" decode: num-worker: 1 @@ -6974,8 +6596,8 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/1k1k/mtp/low-latency-dep4-1p-tep8-5d.yaml + - "CONFIG_FILE=recipes/b200-fp4/1k1k/mtp/low-latency-dep4-1p-tep8-5d.yaml" decode: num-worker: 5 tp: 8 @@ -6989,8 +6611,8 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/1k1k/mtp/low-latency-dep4-1p-tep8-6d.yaml + - "CONFIG_FILE=recipes/b200-fp4/1k1k/mtp/low-latency-dep4-1p-tep8-6d.yaml" decode: num-worker: 6 tp: 8 @@ 
-7004,8 +6626,8 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/1k1k/mtp/max-tpt-dep4-1p-dep8-1d.yaml + - "CONFIG_FILE=recipes/b200-fp4/1k1k/mtp/max-tpt-dep4-1p-dep8-1d.yaml" decode: num-worker: 1 tp: 8 @@ -7019,8 +6641,8 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/1k1k/mtp/max-tpt-dep4-1p-dep8-2d.yaml + - "CONFIG_FILE=recipes/b200-fp4/1k1k/mtp/max-tpt-dep4-1p-dep8-2d.yaml" decode: num-worker: 2 tp: 8 @@ -7040,8 +6662,8 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[0]" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/8k1k/mtp/low-latency-dep4-1p-tep8-1d.yaml + - "CONFIG_FILE=recipes/b200-fp4/8k1k/mtp/low-latency-dep4-1p-tep8-1d.yaml" decode: num-worker: 1 tp: 8 @@ -7055,8 +6677,8 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[1]" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/8k1k/mtp/low-latency-dep4-1p-tep8-5d.yaml + - "CONFIG_FILE=recipes/b200-fp4/8k1k/mtp/low-latency-dep4-1p-tep8-5d.yaml" decode: num-worker: 5 tp: 8 @@ -7070,8 +6692,8 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[2]" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/8k1k/mtp/low-latency-dep4-2p-tep8-5d.yaml + - "CONFIG_FILE=recipes/b200-fp4/8k1k/mtp/low-latency-dep4-2p-tep8-5d.yaml" decode: num-worker: 5 tp: 8 @@ -7085,230 +6707,14 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 1 dp-attn: false additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_mtp_tp4" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/b200-fp4/8k1k/mtp/low-latency-tp4-1p-tp8-1d.yaml + - "CONFIG_FILE=recipes/b200-fp4/8k1k/mtp/low-latency-tp4-1p-tp8-1d.yaml" decode: num-worker: 1 tp: 8 ep: 1 dp-attn: false -kimik2.5-fp4-gb200-dynamo-trt: - image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2 - model: nvidia/Kimi-K2.5-NVFP4 - model-prefix: kimik2.5 - runner: gb200 - precision: fp4 - framework: dynamo-trt - multinode: true - disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [ 4, 192, 360, 668 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 5, 15, 30, 55 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml" - decode: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [ 666 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [ 2253 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 4301, 6452 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [ 4301 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [ 4301 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [ 4 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 156 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [ 5, 15, 30, 60, 105 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml" - decode: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [ 333 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [ 615 ] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [ 2151 ] - prefill: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [ 2253 ] - prefill: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - kimik2.5-fp4-gb200-dynamo-vllm: image: vllm/vllm-openai:v0.18.0-cu130 model: nvidia/Kimi-K2.5-NVFP4 diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 561a3fbb8..d5a6cc1f4 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -116,13 +116,22 @@ jobs: # 
Cleanup SLURM resources if command -v squeue >/dev/null 2>&1; then - echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." - scancel --name="${{ runner.name }}" || true - while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do - squeue --name="${{ runner.name }}" - sleep 5 - done - fi + if [[ "${{ runner.name }}" == mi355x-amds* || "${{ runner.name }}" == mi325x-amd* || "${{ runner.name }}" == mi300x-amds* || "${{ runner.name }}" == gb200-nv* || "${{ runner.name }}" == gb300-nv* || "${{ runner.name }}" == h100-cw* || "${{ runner.name }}" == h200-cw* || "${{ runner.name }}" == b200-nb* || "${{ runner.name }}" == h200-nb* || "${{ runner.name }}" == h100-dgxc-slurm* || "${{ runner.name }}" == h200-dgxc-slurm* || "${{ runner.name }}" == b200-dgxc-slurm* || "${{ runner.name }}" == b300-nv* ]]; then + echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." + scancel --name="${{ runner.name }}" || true + while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do + squeue --name="${{ runner.name }}" + sleep 5 + done + else + echo "[Slurm] Cleaning up jobs for user: $USER ..." + scancel -u "$USER" || true + while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do + squeue -u "$USER" + sleep 5 + done + fi + fi - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: @@ -131,13 +140,6 @@ jobs: ref: ${{ inputs.ref || github.ref }} clean: false - - name: Cleanup stale eval outputs (pre-run) - if: ${{ inputs.run-eval || inputs.eval-only }} - run: | - rm -f meta_env.json || true - rm -f results*.json || true - rm -f sample*.jsonl || true - - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} diff --git a/benchmarks/single_node/qwen3.5_fp8_b300.sh b/benchmarks/single_node/qwen3.5_fp8_b300.sh deleted file mode 100644 index b87d25e91..000000000 --- a/benchmarks/single_node/qwen3.5_fp8_b300.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME \ - EP_SIZE - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -CONTEXT_LENGTH=$((ISL + OSL + 20)) -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" -fi - -# Start GPU monitoring (power, temperature, clocks every second) -start_gpu_monitor - -set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ ---trust-remote-code \ ---tensor-parallel-size=$TP --data-parallel-size=1 --expert-parallel-size=$EP_SIZE \ ---enable-symm-mem \ ---disable-radix-cache \ ---quantization fp8 \ ---kv-cache-dtype fp8_e4m3 \ ---mamba-ssm-dtype bfloat16 \ ---attention-backend trtllm_mha \ ---moe-runner-backend flashinfer_trtllm \ ---cuda-graph-max-bs $CONC \ ---max-running-requests $CONC \ ---max-prefill-tokens 16384 \ ---chunked-prefill-size 16384 \ ---mem-fraction-static 0.8 \ ---stream-interval 50 \ ---scheduler-recv-interval 10 \ ---tokenizer-worker-num 6 \ ---tokenizer-path $MODEL \ ---context-length $CONTEXT_LENGTH > $SERVER_LOG 2>&1 & - -SERVER_PID=$! 
- -# Wait for server to be ready -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_benchmark_serving \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --use-chat-template - -# After throughput, run evaluation only if RUN_EVAL is true -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -# Stop GPU monitoring -stop_gpu_monitor -set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ac0ef5d79..7ca73d687 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,13 +1,3 @@ -- config-keys: - - kimik2.5-fp4-gb200-dynamo-trt - description: - - "Add Kimi K2.5 NVFP4 GB200 disaggregated TRT-LLM benchmarks via Dynamo (14 STP configs)" - - "New framework: dynamo-trt (Dynamo frontend + TensorRT-LLM backend)" - - "Container: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" - - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" - - "Runner script updated to support kimik2.5 model prefix with dynamo-trt framework" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1026 - - config-keys: - qwen3.5-fp4-mi355x-sglang description: @@ -1332,7 +1322,7 @@ description: - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI300X and MI325X to achieve better performance" - "Use lmsysorg/sglang:v0.5.10-rocm720-mi30x" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1063 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/986 - config-keys: - glm5-fp4-b200-sglang @@ -1369,13 +1359,6 @@ - "TP2/TP4 seach space exploration for Qwen3.5 fp4 on SGL" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1022 -- config-keys: - - glm5-fp8-mi355x-sglang - description: - - "Upgrade GLM5 FP8 MI355X SGLang image to v0.5.10rc0-rocm720-mi35x-20260413" - - "Set --kv-cache-dtype fp8_e4m3 and --disable-radix-cache" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1023 - - config-keys: - qwen3.5-fp8-h200-sglang-mtp description: @@ -1405,236 +1388,3 @@ - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - "EAGLE speculative decoding with MTP, TP=4, concurrency 4-256 for 1k1k and 8k1k" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1035 - -- config-keys: - - qwen3.5-fp8-b300-sglang - description: - - "Add Qwen3.5-397B-A17B-FP8 B300 SGLang benchmark (non-MTP)" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "TP=4, concurrency 4-256 for 1k1k and 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1048 - -- config-keys: - - dsr1-fp4-b300-sglang - description: - - "Add DeepSeek-R1-0528 FP4 B300 SGLang benchmark (non-MTP)" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 does not have a B300-specific recipe, so this reuses the existing DSR1 FP4 B200 SGLang recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1049 - -- config-keys: - - dsr1-fp8-b300-sglang - description: - - "Add DeepSeek-R1-0528 FP8 B300 SGLang benchmark (non-MTP)" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 does not have a B300-specific recipe, so this 
reuses the existing DSR1 FP8 B200 SGLang recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1050 - -- config-keys: - - glm5-fp8-b300-sglang - description: - - "Add GLM-5 FP8 B300 SGLang benchmark" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1 does not have a B300-specific recipe, so this reuses the existing GLM5 FP8 B200 SGLang recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1051 - -- config-keys: - - glm5-fp4-b300-sglang - description: - - "Add GLM-5 FP4 (NVFP4) B300 SGLang benchmark" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5 does not have a B300-specific recipe, so this reuses the existing GLM-5 FP4 B200 SGLang recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1058 - -- config-keys: - - dsr1-fp8-b300-sglang-mtp - description: - - "Add DeepSeek-R1-0528 FP8 B300 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "EAGLE speculative decoding with MTP, TP=8, concurrency 4-512 for 1k1k and 8k1k" - - "At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 does not have a B300-specific recipe, so this reuses the existing DSR1 FP8 B200 SGLang MTP recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1059 - -- config-keys: - - minimaxm2.5-fp4-b300-vllm - description: - - "Add MiniMax-M2.5 FP4 (NVFP4) B300 vLLM benchmark" - - "Image: vllm/vllm-openai:v0.19.0-cu130" - - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP4 B200 vLLM recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1055 - -- config-keys: - - minimaxm2.5-fp8-b300-vllm - description: - - "Add MiniMax-M2.5 FP8 B300 vLLM benchmark" - - "Image: vllm/vllm-openai:v0.19.0-cu130" - - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html does not have a B300-specific recipe, so this reuses the existing MiniMax-M2.5 FP8 B200 vLLM recipe as-is" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1054 - -- config-keys: - - gptoss-fp4-mi300x-vllm - description: - - "Expand GPT-OSS 120B FP4 MI300X TP=1 concurrency from 64 to 256 for 1k1k" - - "Higher concurrency improves MoE weight amortization: 8552 total TPS at conc=256 vs 4016 at conc=64 (2.1x)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1053 - -- config-keys: - - dsr1-fp4-b200-dynamo-trt - - dsr1-fp8-b200-dynamo-trt - - dsr1-fp4-b200-dynamo-sglang - - dsr1-fp8-b200-dynamo-sglang - - dsr1-fp8-b200-dynamo-sglang-mtp - - dsr1-fp4-b200-dynamo-sglang-mtp - - dsr1-fp4-b300-dynamo-trt - - dsr1-fp8-b300-dynamo-trt - - dsr1-fp4-gb300-dynamo-trt - - dsr1-fp8-gb300-dynamo-trt - - dsr1-fp4-gb300-dynamo-sglang - - dsr1-fp8-gb300-dynamo-sglang - - dsr1-fp8-mi355x-sglang-disagg - - dsr1-fp8-mi355x-sglang-disagg-mtp - - dsr1-fp4-mi355x-sglang-disagg - - dsr1-fp4-mi355x-sglang-disagg-mtp - description: - - "Add multi-node lm-eval accuracy runs" - - "Eval picks the config with highest max eligible concurrency per (model, runner, framework, precision, spec-decoding, prefill-dp-attn, decode-dp-attn) group on 8k1k" - - "Eval concurrency set to the median eligible conc (>= MIN_EVAL_CONC=16) of the selected config to avoid OOM" - pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1000 - evals-only: true - -- config-keys: - - qwen3.5-fp4-b300-sglang - description: - - "Add Qwen3.5-397B-A17B NVFP4 B300 SGLang benchmark" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" - - "Follows the SGLang cookbook recipe at https://cookbook.sglang.io/autoregressive/Qwen/Qwen3.5 as of 2026-04-17" - - "Mirrors the B200 FP4 recipe with mem-fraction-static lowered to 0.8 and an extra TP2/EP2 search-space entry" - - "Configs: 1k1k and 8k1k, TP4/EP1 conc 4-128 + TP2/EP2 conc 4-128" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - qwen3.5-bf16-b300-sglang - description: - - "Add Qwen3.5-397B-A17B BF16 B300 SGLang benchmark" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "Model: Qwen/Qwen3.5-397B-A17B" - - "Mirrors the B200 BF16 recipe with an extra TP4/EP1 search-space entry alongside the existing TP8/EP1 sweep" - - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-64 + TP4/EP1 conc 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - qwen3.5-bf16-b200-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B BF16 B200 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e" - - "Model: Qwen/Qwen3.5-397B-A17B" - - "Mirrors the qwen3.5-bf16-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-64 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - qwen3.5-bf16-b300-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B BF16 B300 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "Model: Qwen/Qwen3.5-397B-A17B" - - "Mirrors the qwen3.5-bf16-b300-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-64 + TP4/EP1 conc 4-64, spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - qwen3.5-fp4-b300-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B NVFP4 B300 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" - - "Mirrors the qwen3.5-fp4-b300-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k and 8k1k, TP4/EP1 conc 4-128 + TP2/EP2 conc 4-128, spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - glm5-fp8-b300-sglang-mtp - description: - - "Add GLM-5 FP8 B300 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "Model: zai-org/GLM-5-FP8" - - "Mirrors the glm5-fp8-b300-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - qwen3.5-bf16-mi355x-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B BF16 MI355X SGLang MTP benchmark" - - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - - "Model: Qwen/Qwen3.5-397B-A17B" - - "Mirrors the qwen3.5-bf16-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with 
spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - glm5-fp8-b200-sglang-mtp - description: - - "Add GLM-5 FP8 B200 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448" - - "Model: zai-org/GLM-5-FP8" - - "Mirrors the glm5-fp8-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - - "Configs: 1k1k and 8k1k, TP=8/EP=1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - glm5-fp4-b300-sglang-mtp - description: - - "Add GLM-5 NVFP4 B300 SGLang MTP benchmark (draft)" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "Model: nvidia/GLM-5-NVFP4" - - "Follows the glm5-fp8-b300-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - qwen3.5-fp8-mi355x-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B FP8 MI355X SGLang MTP benchmark" - - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414" - - "Model: Qwen/Qwen3.5-397B-A17B-FP8" - - "Mirrors the qwen3.5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k (TP8/EP1, TP8/EP8, TP2/EP2) and 8k1k (TP2/EP2, TP4/EP1) with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - glm5-fp8-mi355x-sglang-mtp - description: - - "Add GLM-5 FP8 MI355X SGLang MTP benchmark" - - "Image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413" - - "Model: zai-org/GLM-5-FP8" - - "Mirrors the glm5-fp8-mi355x-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - - "Configs: 1k1k and 8k1k, TP=8 conc 4-64 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - qwen3.5-fp4-b200-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B NVFP4 B200 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6" - - "Model: nvidia/Qwen3.5-397B-A17B-NVFP4" - - "Mirrors the qwen3.5-fp4-b200-sglang non-MTP recipe and adds EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4)" - - "Configs: 1k1k and 8k1k, TP=4/EP=1 conc 4-128 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - glm5-fp4-b200-sglang-mtp - description: - - "Add GLM-5 NVFP4 B200 SGLang MTP benchmark (draft)" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "Model: nvidia/GLM-5-NVFP4" - - "Follows the glm5-fp8-b200-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX - -- config-keys: - - kimik2.5-fp4-b200-vllm - description: - - "Add kv-cache-dtype fp8, max-cudagraph-capture-size 2048, max-num-batched-tokens, and stream-interval 20 to server launch args" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1047 diff --git a/runners/launch_b300-nv.sh 
b/runners/launch_b300-nv.sh index b49391a3c..4bb623780 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -18,11 +18,11 @@ fi # The yaml files specify HuggingFace model IDs for portability, but we use # local paths to avoid repeated downloading on the shared B300 cluster. if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then - export MODEL_PATH="/data/models/dsr1-fp4" + export MODEL_PATH="/scratch/models/deepseek-r1-0528-nvfp4-v2" export SERVED_MODEL_NAME="deepseek-r1-fp4" export SRT_SLURM_MODEL_PREFIX="dsr1" elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then - export MODEL_PATH="/data/models/dsr1-fp8" + export MODEL_PATH="/scratch/models/deepseek-r1-0528" export SERVED_MODEL_NAME="deepseek-r1-fp8" export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" else @@ -37,9 +37,9 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" +git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 -git checkout sa-submission-q2-2026 +git checkout sa-submission-q1-2026 echo "Installing srtctl..." export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" @@ -66,7 +66,6 @@ srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $NGINX export ISL="$ISL" export OSL="$OSL" -export EVAL_ONLY="${EVAL_ONLY:-false}" # Create srtslurm.yaml for srtctl SRTCTL_ROOT="${GITHUB_WORKSPACE}/${SRT_REPO_DIR}" @@ -102,17 +101,7 @@ cat srtslurm.yaml echo "Running make setup..." make setup ARCH=x86_64 -# Export eval-related env vars for srt-slurm post-benchmark eval -export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" - echo "Submitting job with srtctl..." - -if [[ -z "$CONFIG_FILE" ]]; then - echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 - echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 - exit 1 -fi - # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "b300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) @@ -176,66 +165,45 @@ echo "Found logs directory: $LOGS_DIR" cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 
-if [[ "${EVAL_ONLY:-false}" != "true" ]]; then - # Find all result subdirectories - RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - - if [ -z "$RESULT_SUBDIRS" ]; then - echo "Warning: No result subdirectories found in $LOGS_DIR" - else - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Find all result JSON files - RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) - - for result_file in $RESULT_FILES; do - if [ -f "$result_file" ]; then - # Extract metadata from filename - # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') - gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') - ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') - gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') - - echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" - - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" - - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi - done - done - fi +# Find all result subdirectories +RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) - echo "All result files processed" +if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" else - echo "EVAL_ONLY=true: Skipping benchmark result collection" -fi + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") -# Collect eval results if eval was requested -if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then - EVAL_DIR="$LOGS_DIR/eval_results" - if [ -d "$EVAL_DIR" ]; then - echo "Extracting eval results from $EVAL_DIR" - shopt -s nullglob - for eval_file in "$EVAL_DIR"/*; do - [ -f "$eval_file" ] || continue - cp "$eval_file" "$GITHUB_WORKSPACE/" - echo "Copied eval artifact: $(basename "$eval_file")" + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied 
result file to: $WORKSPACE_RESULT_FILE" + fi done - shopt -u nullglob - else - echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" - fi + done fi +echo "All result files processed" + # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." @@ -249,19 +217,12 @@ find . -name '.nfs*' -delete 2>/dev/null || true else HF_HUB_CACHE_MOUNT="/scratch/models" - # Qwen3.5-397B-A17B-FP8 is pre-staged under /scratch/models on the B300 cluster, - # so point MODEL at the local copy. Other models fall through and use `hf download` - # against the mounted cache from their benchmark script. - if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then - export MODEL="/scratch/models/${MODEL#*/}" - fi + export MODEL="/scratch/models/${MODEL#*/}" SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') - # Pin to one of the known-good B300 nodes; others have hardware/network - # issues that cause benchmarks to hang or fail to start. - salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" + salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" From 3c2d0039bd94cb5534efbb850efc551ac4784386 Mon Sep 17 00:00:00 2001 From: William Chen <57119977+OCWC22@users.noreply.github.com> Date: Mon, 20 Apr 2026 15:43:05 -0700 Subject: [PATCH 09/18] =?UTF-8?q?chore(isb1):=20drop=20GPU-cell/CI=20edits?= =?UTF-8?q?=20from=20PR=20#1032=20=E2=80=94=20data+contract=20only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Second trim pass. Reverts 12 consortium-owned files to merge-base state and removes 1 net-new per-GPU recipe: - benchmarks/single_node/qwen3.5_{bf16,fp8}_mi{300x,325x,355x}.sh (reverted) - benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh (removed — preserved on fork branch isb1/agentic-benchmark-runners) - runners/launch_b300-nv.sh (reverted) - .github/configs/{amd,nvidia}-master.yaml (reverted) - .github/workflows/{benchmark-tmpl,pr-recipe-reminder}.yml (reverted) - perf-changelog.yaml (reverted) Rationale: per-GPU recipe cells and cross-cutting CI config are owned by AMD/NVIDIA contributors, not by a data-contribution PR. Matches Cam's cherry-pick-not-merge guidance and InferenceX consortium ownership model. Remaining PR scope: datasets/isb1/** + utils/** (replay contract + process_result ISB1 guard + tests) + top-level .gitattributes. 
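For anyone repeating this kind of trim, the operation reduces to checking the consortium-owned paths back out from the merge base and deleting the one net-new script. A sketch follows; the upstream ref origin/main is an assumption (not stated in this patch), while the paths come from the file list above and the diffstat below.

    # Sketch: reset consortium-owned files to the merge base and drop the
    # net-new recipe. origin/main is an assumed upstream ref.
    BASE=$(git merge-base HEAD origin/main)
    git checkout "$BASE" -- \
        .github/configs/amd-master.yaml \
        .github/configs/nvidia-master.yaml \
        .github/workflows/benchmark-tmpl.yml \
        .github/workflows/pr-recipe-reminder.yml \
        benchmarks/single_node/qwen3.5_bf16_mi300x.sh \
        benchmarks/single_node/qwen3.5_bf16_mi325x.sh \
        benchmarks/single_node/qwen3.5_bf16_mi355x.sh \
        benchmarks/single_node/qwen3.5_fp8_mi300x.sh \
        benchmarks/single_node/qwen3.5_fp8_mi325x.sh \
        benchmarks/single_node/qwen3.5_fp8_mi355x.sh \
        perf-changelog.yaml \
        runners/launch_b300-nv.sh
    git rm benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh

Checking paths out from the merge base (rather than git-reverting individual commits) keeps the branch history linear and leaves the retained datasets/isb1 and utils changes untouched.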
--- .github/configs/amd-master.yaml | 29 +++---- .github/configs/nvidia-master.yaml | 21 +---- .github/workflows/benchmark-tmpl.yml | 2 +- .github/workflows/pr-recipe-reminder.yml | 6 +- benchmarks/single_node/qwen3.5_bf16_mi300x.sh | 15 +--- benchmarks/single_node/qwen3.5_bf16_mi325x.sh | 15 +--- benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 13 +-- .../single_node/qwen3.5_fp8_b300_mtp.sh | 87 ------------------- benchmarks/single_node/qwen3.5_fp8_mi300x.sh | 15 +--- benchmarks/single_node/qwen3.5_fp8_mi325x.sh | 15 +--- benchmarks/single_node/qwen3.5_fp8_mi355x.sh | 13 +-- perf-changelog.yaml | 40 --------- runners/launch_b300-nv.sh | 25 ------ 13 files changed, 30 insertions(+), 266 deletions(-) delete mode 100644 benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a2c424c91..13d0e6146 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -114,7 +114,7 @@ dsr1-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 + image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260215 model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi355x @@ -125,14 +125,14 @@ qwen3.5-bf16-mi355x-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi300x-sglang: - image: lmsysorg/sglang:v0.5.10-rocm720-mi30x + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi300x @@ -150,7 +150,7 @@ qwen3.5-bf16-mi300x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi325x-sglang: - image: lmsysorg/sglang:v0.5.10-rocm720-mi30x + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi325x @@ -168,7 +168,7 @@ qwen3.5-bf16-mi325x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi325x-sglang: - image: lmsysorg/sglang:v0.5.10-rocm720-mi30x + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi325x @@ -186,7 +186,7 @@ qwen3.5-fp8-mi325x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 + image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x @@ -197,17 +197,14 @@ qwen3.5-fp8-mi355x-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 256 } - - { tp: 2, ep: 2, conc-start: 128, conc-end: 256 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } + - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp4-mi355x-sglang: - image: rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413 + image: lmsysorg/sglang:v0.5.10-rocm720-mi35x model: amd/Qwen3.5-397B-A17B-MXFP4 model-prefix: qwen3.5 runner: mi355x @@ -219,15 +216,15 @@ qwen3.5-fp4-mi355x-sglang: osl: 1024 search-space: - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 4, conc-start: 4, conc-end: 4 } - isl: 8192 osl: 1024 search-space: - { tp: 2, conc-start: 
4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 4, conc-start: 4, conc-end: 32 } qwen3.5-fp8-mi300x-sglang: - image: lmsysorg/sglang:v0.5.10-rocm720-mi30x + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi300x diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 484be6899..27ee51eef 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1826,7 +1826,7 @@ glm5-fp8-b200-sglang: - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } glm5-fp4-b200-sglang: - image: lmsysorg/sglang:v0.5.10.post1-cu130 + image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5 model: nvidia/GLM-5-NVFP4 model-prefix: glm5 runner: b200 @@ -1863,25 +1863,6 @@ qwen3.5-fp8-b200-sglang-mtp: search-space: - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - -qwen3.5-fp8-b300-sglang-mtp: - image: lmsysorg/sglang:v0.5.10.post1-cu130 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: b300 - precision: fp8 - framework: sglang - multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - kimik2.5-int4-b200-vllm: image: vllm/vllm-openai:v0.15.1 model: moonshotai/Kimi-K2.5 diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index d5a6cc1f4..05ab23ef8 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -116,7 +116,7 @@ jobs: # Cleanup SLURM resources if command -v squeue >/dev/null 2>&1; then - if [[ "${{ runner.name }}" == mi355x-amds* || "${{ runner.name }}" == mi325x-amd* || "${{ runner.name }}" == mi300x-amds* || "${{ runner.name }}" == gb200-nv* || "${{ runner.name }}" == gb300-nv* || "${{ runner.name }}" == h100-cw* || "${{ runner.name }}" == h200-cw* || "${{ runner.name }}" == b200-nb* || "${{ runner.name }}" == h200-nb* || "${{ runner.name }}" == h100-dgxc-slurm* || "${{ runner.name }}" == h200-dgxc-slurm* || "${{ runner.name }}" == b200-dgxc-slurm* || "${{ runner.name }}" == b300-nv* ]]; then + if [[ "${{ runner.name }}" == mi355x-amds* || "${{ runner.name }}" == mi325x-amd* || "${{ runner.name }}" == mi300x-amds* || "${{ runner.name }}" == gb200-nv* || "${{ runner.name }}" == gb300-nv* || "${{ runner.name }}" == h100-cw* || "${{ runner.name }}" == h200-cw* || "${{ runner.name }}" == b200-nb* || "${{ runner.name }}" == h200-nb* || "${{ runner.name }}" == h100-dgxc-slurm* || "${{ runner.name }}" == h200-dgxc-slurm* || "${{ runner.name }}" == b200-dgxc-slurm* ]]; then echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." scancel --name="${{ runner.name }}" || true while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do diff --git a/.github/workflows/pr-recipe-reminder.yml b/.github/workflows/pr-recipe-reminder.yml index a4d0a30a6..a8ca02743 100644 --- a/.github/workflows/pr-recipe-reminder.yml +++ b/.github/workflows/pr-recipe-reminder.yml @@ -40,10 +40,6 @@ jobs: If it is not, please create a PR first before we can merge your PR into the master branch. Let's ensure that the documentation is first class such that the entire ML community can benefit from your hard work! 
Thank you - https://github.com/vllm-project/recipes - - https://github.com/sgl-project/sgl-cookbook - - **PR authors are responsible for ensuring that after merging, all GitHub Action jobs fully pass.** A lot of the time, failures are just flakes and simply re-running the failed jobs will fix it. If re-running failed jobs is attempted, PR authors are responsible for ensuring it passes. See GitHub's docs on re-running failed jobs: https://docs.github.com/en/actions/how-tos/manage-workflow-runs/re-run-workflows-and-jobs#re-running-failed-jobs-in-a-workflow - - If additional help is needed, PR authors can reach out to core maintainers over Slack.`.replace(/^ /gm, ''); + - https://github.com/sgl-project/sgl-cookbook`.replace(/^ /gm, ''); await github.rest.issues.createComment({ owner, repo, issue_number, body }); diff --git a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh b/benchmarks/single_node/qwen3.5_bf16_mi300x.sh index f7c71963d..8aca9860a 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi300x.sh @@ -19,34 +19,25 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -CONTEXT_LENGTH=$((ISL + OSL + 20)) -MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor # following Andy Luo linkedin's recipe https://www.linkedin.com/feed/update/urn:li:activity:7429203734389280768/ python3 -m sglang.launch_server \ - --attention-backend aiter \ + --attention-backend triton \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ - --data-parallel-size 1 \ --trust-remote-code \ - --tokenizer-worker-num 6 \ - --enable-aiter-allreduce-fusion \ - --cuda-graph-max-bs $CONC \ - --disable-radix-cache \ - --max-prefill-tokens $MAX_PREFILL_TOKENS \ - --scheduler-recv-interval 30 \ - --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
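After this revert the MI300X launch block reads approximately as below, reconstructed from the hunk above; the same shape recurs in the mi325x and fp8 variants that follow. Note the behavioral side effect: with the else branch gone, --context-length is passed only when EVAL_ONLY=true, so throughput runs fall back to the model's default context window instead of the ISL+OSL+20 cap.

    # Post-revert launch shape for the MI300X/MI325X scripts, reconstructed
    # from the hunk above. MODEL, PORT, TP, and SERVER_LOG are set earlier
    # in the script. --context-length is now passed only in eval mode.
    EVAL_CONTEXT_ARGS=""
    if [ "${EVAL_ONLY}" = "true" ]; then
        setup_eval_context
        EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
    fi
    python3 -m sglang.launch_server \
        --attention-backend triton \
        --model-path $MODEL \
        --host=0.0.0.0 \
        --port $PORT \
        --tensor-parallel-size $TP \
        --trust-remote-code \
        --mem-fraction-static 0.8 \
        --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
    SERVER_PID=$!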
diff --git a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh b/benchmarks/single_node/qwen3.5_bf16_mi325x.sh index f7c71963d..8aca9860a 100644 --- a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi325x.sh @@ -19,34 +19,25 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -CONTEXT_LENGTH=$((ISL + OSL + 20)) -MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor # following Andy Luo linkedin's recipe https://www.linkedin.com/feed/update/urn:li:activity:7429203734389280768/ python3 -m sglang.launch_server \ - --attention-backend aiter \ + --attention-backend triton \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ - --data-parallel-size 1 \ --trust-remote-code \ - --tokenizer-worker-num 6 \ - --enable-aiter-allreduce-fusion \ - --cuda-graph-max-bs $CONC \ - --disable-radix-cache \ - --max-prefill-tokens $MAX_PREFILL_TOKENS \ - --scheduler-recv-interval 30 \ - --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh index 6d40e3e3f..701695def 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh @@ -9,8 +9,7 @@ check_env_vars \ ISL \ OSL \ RANDOM_RANGE_RATIO \ - RESULT_FILENAME \ - EP_SIZE + RESULT_FILENAME if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -20,14 +19,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -CONTEXT_LENGTH=$((ISL + OSL + 20)) -MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -38,14 +34,7 @@ python3 -m sglang.launch_server \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ - --ep-size $EP_SIZE \ --trust-remote-code \ - --tokenizer-worker-num 6 \ - --enable-aiter-allreduce-fusion \ - --cuda-graph-max-bs $CONC \ - --disable-radix-cache \ - --max-prefill-tokens $MAX_PREFILL_TOKENS \ - --scheduler-recv-interval 30 \ --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
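The MI355X reverts above also drop EP_SIZE from the required-variable list passed to check_env_vars, the shared fail-fast guard sourced from benchmark_lib.sh. A minimal sketch of that style of guard is below; the actual implementation in benchmark_lib.sh may differ.

    # Minimal sketch of a fail-fast env guard in the spirit of
    # check_env_vars; the real benchmark_lib.sh implementation may differ.
    check_env_vars() {
        local var missing=0
        for var in "$@"; do
            if [ -z "${!var:-}" ]; then
                echo "ERROR: required environment variable $var is unset" >&2
                missing=1
            fi
        done
        [ "$missing" -eq 0 ] || exit 1
    }

Dropping EP_SIZE from the list means the reverted scripts no longer refuse to start when it is unset, consistent with the launch command no longer passing --ep-size.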
diff --git a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh deleted file mode 100644 index a0c5f4828..000000000 --- a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME \ - EP_SIZE - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -nvidia-smi - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} - -CONTEXT_LENGTH=$((ISL + OSL + 20)) -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" -fi - -# Start GPU monitoring (power, temperature, clocks every second) -start_gpu_monitor - -set -x -SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ ---trust-remote-code \ ---tensor-parallel-size=$TP --data-parallel-size=1 --expert-parallel-size=$EP_SIZE \ ---enable-symm-mem \ ---disable-radix-cache \ ---quantization fp8 \ ---kv-cache-dtype fp8_e4m3 \ ---mamba-ssm-dtype bfloat16 \ ---attention-backend trtllm_mha \ ---moe-runner-backend flashinfer_trtllm \ ---cuda-graph-max-bs $CONC \ ---max-running-requests $CONC \ ---max-prefill-tokens 16384 \ ---chunked-prefill-size 16384 \ ---mem-fraction-static 0.8 \ ---stream-interval 50 \ ---scheduler-recv-interval 10 \ ---tokenizer-worker-num 6 \ ---tokenizer-path $MODEL \ ---speculative-algorithm EAGLE \ ---speculative-num-steps 3 \ ---speculative-eagle-topk 1 \ ---speculative-num-draft-tokens 4 \ ---context-length $CONTEXT_LENGTH > $SERVER_LOG 2>&1 & - -SERVER_PID=$! - -# Wait for server to be ready -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_benchmark_serving \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --use-chat-template - -# After throughput, run evaluation only if RUN_EVAL is true -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -# Stop GPU monitoring -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh b/benchmarks/single_node/qwen3.5_fp8_mi300x.sh index fe761d88d..00cc9cf91 100755 --- a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi300x.sh @@ -19,14 +19,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -CONTEXT_LENGTH=$((ISL + OSL + 20)) -MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -34,20 +31,14 @@ start_gpu_monitor # following AMD Andy linkedin's recipe # https://www.linkedin.com/feed/update/urn:li:activity:7429203734389280768/ python3 -m sglang.launch_server \ - --attention-backend aiter \ + --attention-backend triton \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ - --data-parallel-size 1 \ --trust-remote-code \ - --tokenizer-worker-num 6 \ - 
--enable-aiter-allreduce-fusion \ - --cuda-graph-max-bs $CONC \ - --disable-radix-cache \ - --max-prefill-tokens $MAX_PREFILL_TOKENS \ - --scheduler-recv-interval 30 \ - --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh b/benchmarks/single_node/qwen3.5_fp8_mi325x.sh index fe761d88d..00cc9cf91 100755 --- a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi325x.sh @@ -19,14 +19,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -CONTEXT_LENGTH=$((ISL + OSL + 20)) -MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -34,20 +31,14 @@ start_gpu_monitor # following AMD Andy linkedin's recipe # https://www.linkedin.com/feed/update/urn:li:activity:7429203734389280768/ python3 -m sglang.launch_server \ - --attention-backend aiter \ + --attention-backend triton \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ - --data-parallel-size 1 \ --trust-remote-code \ - --tokenizer-worker-num 6 \ - --enable-aiter-allreduce-fusion \ - --cuda-graph-max-bs $CONC \ - --disable-radix-cache \ - --max-prefill-tokens $MAX_PREFILL_TOKENS \ - --scheduler-recv-interval 30 \ - --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh index 6d40e3e3f..701695def 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh @@ -9,8 +9,7 @@ check_env_vars \ ISL \ OSL \ RANDOM_RANGE_RATIO \ - RESULT_FILENAME \ - EP_SIZE + RESULT_FILENAME if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -20,14 +19,11 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -CONTEXT_LENGTH=$((ISL + OSL + 20)) -MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -38,14 +34,7 @@ python3 -m sglang.launch_server \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ - --ep-size $EP_SIZE \ --trust-remote-code \ - --tokenizer-worker-num 6 \ - --enable-aiter-allreduce-fusion \ - --cuda-graph-max-bs $CONC \ - --disable-radix-cache \ - --max-prefill-tokens $MAX_PREFILL_TOKENS \ - --scheduler-recv-interval 30 \ --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
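The Qwen3.5 AMD hunks above all converge on the same simplified SGLang launch. For reference, the post-patch MI300X/MI325X invocation reduces to the sketch below (assembled from the `+` lines above, not a new script; `$EVAL_CONTEXT_ARGS` stays empty outside `EVAL_ONLY` runs, so SGLang falls back to the model's default context length):

```bash
# Post-patch launch shape shared by the MI300X/MI325X Qwen3.5 scripts (sketch).
# --context-length is only injected for EVAL_ONLY runs via $EVAL_CONTEXT_ARGS.
python3 -m sglang.launch_server \
  --attention-backend triton \
  --model-path $MODEL \
  --host=0.0.0.0 \
  --port $PORT \
  --tensor-parallel-size $TP \
  --trust-remote-code \
  --mem-fraction-static 0.8 \
  --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
```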
diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7ca73d687..746d0645d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,9 +1,3 @@ -- config-keys: - - qwen3.5-fp4-mi355x-sglang - description: - - "Update SGLang image from 'lmsysorg/sglang:v0.5.10-rocm720-mi35x' to 'rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413'" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1041 - - config-keys: - kimik2.5-int4-mi300x-vllm description: @@ -1314,16 +1308,6 @@ - "Configs: 1k1k (TP4 conc 4-128), 8k1k (TP4 conc 4-128)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/820 -- config-keys: - - qwen3.5-bf16-mi300x-sglang - - qwen3.5-bf16-mi325x-sglang - - qwen3.5-fp8-mi300x-sglang - - qwen3.5-fp8-mi325x-sglang - description: - - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI300X and MI325X to achieve better performance" - - "Use lmsysorg/sglang:v0.5.10-rocm720-mi30x" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/986 - - config-keys: - glm5-fp4-b200-sglang description: @@ -1364,27 +1348,3 @@ description: - "Enable SGLANG_ENABLE_SPEC_V2=1 for Qwen3.5 FP8 H200 SGLang MTP" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1017 - -- config-keys: - - qwen3.5-fp8-mi355x-sglang - - qwen3.5-bf16-mi355x-sglang - description: - - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI355X to achieve better performance" - - "Use lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 for BF16 benchmark" - - "Use lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 for FP8 benchmark" - - "Image includes upstream SGLang PRs: https://github.com/sgl-project/sglang/pull/21188, https://github.com/sgl-project/sglang/pull/21421, https://github.com/sgl-project/sglang/pull/20736" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1036 - -- config-keys: - - glm5-fp4-b200-sglang - description: - - "Update SGLang image from nightly-dev-cu13-20260328-a27651d5 to v0.5.10.post1-cu130" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1031 - -- config-keys: - - qwen3.5-fp8-b300-sglang-mtp - description: - - "Add Qwen3.5-397B-A17B-FP8 B300 SGLang MTP benchmark" - - "Image: lmsysorg/sglang:v0.5.10.post1-cu130" - - "EAGLE speculative decoding with MTP, TP=4, concurrency 4-256 for 1k1k and 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1035 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 4bb623780..68da9f2b7 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -6,8 +6,6 @@ SLURM_ACCOUNT="benchmark" set -x -if [[ "$IS_MULTINODE" == "true" ]]; then - # Validate framework if [[ $FRAMEWORK != "dynamo-sglang" && $FRAMEWORK != "dynamo-trt" ]]; then echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang" @@ -213,26 +211,3 @@ for i in 1 2 3 4 5; do sleep 10 done find . 
-name '.nfs*' -delete 2>/dev/null || true - -else - - HF_HUB_CACHE_MOUNT="/scratch/models" - export MODEL="/scratch/models/${MODEL#*/}" - SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') - SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') - - salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" - JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) - - srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" - - srun --jobid=$JOB_ID \ - --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ - --no-container-mount-home \ - --container-workdir=/workspace/ \ - --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b300${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh - -fi From e0d7506da3fe4571a8cf8fce46f324b0717f9bad Mon Sep 17 00:00:00 2001 From: William Chen <57119977+OCWC22@users.noreply.github.com> Date: Mon, 20 Apr 2026 21:36:07 -0700 Subject: [PATCH 10/18] feat(isb1): add kv-cache-tester shim + fix LFS precedence Provide a zero-dependency bridge that converts ISB1 multiturn and trace_replay bundles into Cam's kv-cache-tester trace JSON format (prefix-extending hash_ids for KV cache hit computation). Covers both bundle shapes, hydrates schema 0.2.0 prefix_ref sidecars, and ships 15 contract tests (0.61s). Also fixes a Git LFS attribute precedence bug where the inner .gitattributes silently overrode the root rule, making `git lfs pull --include` a no-op for datasets/isb1/exports/**/*.json. - tools/isb1_to_kvcache_tester.py (+771) - tools/test_isb1_to_kvcache_tester.py (+412) - datasets/isb1/.gitattributes: enable LFS filter on exports/**/*.json - datasets/isb1/README.md: how-to-consume, smoke-test, HF publication recipe Co-Authored-By: Claude Opus 4.7 --- datasets/isb1/.gitattributes | 3 +- datasets/isb1/README.md | 170 ++++++ tools/isb1_to_kvcache_tester.py | 771 +++++++++++++++++++++++++++ tools/test_isb1_to_kvcache_tester.py | 412 ++++++++++++++ 4 files changed, 1354 insertions(+), 2 deletions(-) create mode 100644 tools/isb1_to_kvcache_tester.py create mode 100644 tools/test_isb1_to_kvcache_tester.py diff --git a/datasets/isb1/.gitattributes b/datasets/isb1/.gitattributes index d7fa37c52..5998181c2 100644 --- a/datasets/isb1/.gitattributes +++ b/datasets/isb1/.gitattributes @@ -1,2 +1 @@ -exports/**/*.json linguist-generated=true -exports/**/*.json text eol=lf +exports/**/*.json filter=lfs diff=lfs merge=lfs -text linguist-generated=true diff --git a/datasets/isb1/README.md b/datasets/isb1/README.md index 1c9ca6e26..64d855e53 100644 --- a/datasets/isb1/README.md +++ b/datasets/isb1/README.md @@ -4,6 +4,8 @@ This directory is the InferenceX-side consumer package for ISB1 replay. InferenceX consumes committed file artifacts only: - replay export JSON bundles under `datasets/isb1/exports/` +- conversion to SemiAnalysis's `kv-cache-tester` format via + [`tools/isb1_to_kvcache_tester.py`](../../tools/isb1_to_kvcache_tester.py) - consumer configs in `.github/configs/isb1-*.yaml` - replay processing through `utils/bench_serving/benchmark_export_replay.py` - result normalization through `utils/process_result_isb1.py` @@ -75,6 +77,10 @@ metadata. 
 All export files are valid JSON and replay-hydratable via
 `utils/bench_serving/benchmark_export_replay.py`.
 
+All bundles can also be converted to SemiAnalysis's `kv-cache-tester`
+per-conversation trace format via [`tools/isb1_to_kvcache_tester.py`](../../tools/isb1_to_kvcache_tester.py);
+see [How to consume](#how-to-consume).
+
 ---
 
 ## Support-status vocabulary
@@ -113,6 +119,170 @@ Unsafe claims:
 
 ---
 
+## How to consume
+
+Two consumption paths are supported, both fed from the same committed bundles:
+
+1. **InferenceX-internal replay** — `utils/bench_serving/benchmark_export_replay.py`
+   directly, with `utils/process_result_isb1.py` for result normalization.
+2. **SemiAnalysis `kv-cache-tester`** — convert via
+   [`tools/isb1_to_kvcache_tester.py`](../../tools/isb1_to_kvcache_tester.py),
+   then feed the resulting directory to
+   [`trace_replay_tester.py --trace-directory`](https://github.com/callanjfox/kv-cache-tester)
+   (PR #993 submodule).
+
+Path 2 is documented here because it is the path that lets ISB1 traces plug
+into SemiAnalysis's existing Slurm benchmarking pipeline without the consumer
+having to read our replay code.
+
+### Step 1 — fetch the bundles (LFS)
+
+From a clone of InferenceX:
+
+```bash
+git lfs install
+git lfs pull --include='datasets/isb1/exports/**/*.json'
+```
+
+Or, if the bundles are published to Hugging Face (see
+[HF publication](#hf-publication)), download directly:
+
+```bash
+huggingface-cli download <org>/<dataset> \
+  --repo-type dataset \
+  --local-dir ./isb1_bundles
+```
+
+### Step 2 — convert one bundle to `kv-cache-tester` format
+
+```bash
+python tools/isb1_to_kvcache_tester.py \
+  --export-file datasets/isb1/exports/core/chat_8k1k_qwen3.5.json \
+  --output-dir traces_isb1/core_chat_qwen/ \
+  --runtime-stack-id standalone:vllm \
+  --hardware-profile-id nvidia:h200_sxm_141gb \
+  --canonical-model-id qwen3_5_397b_a17b \
+  --support-status supported
+```
+
+This writes one `<trace-prefix><id>.json` per cell that passes the
+filters (`isb1_<id>.json` with the default prefix), in the flat layout
+`trace_replay_tester.py --trace-directory` expects. The schema matches
+`kv-cache-tester@main` (`trace.id`, `requests[].t/in/out/hash_ids`),
+so `normalize_trace()` accepts it as-is.
+
+To convert every bundle in one shot:
+
+```bash
+python tools/isb1_to_kvcache_tester.py \
+  --export-root datasets/isb1/exports/ \
+  --output-dir traces_isb1/
+```
+
+### Step 3 — replay against a running vLLM / SGLang server
+
+Using PR #993's own recipes (e.g. `benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh`),
+set `TRACE_DIR` to the converted directory and let the existing Slurm wiring
+pick it up:
+
+```bash
+TRACE_DIR=$PWD/traces_isb1/core_chat_qwen \
+MODEL=Qwen/Qwen3.5-397B-A17B-FP8 \
+TP=8 USERS=8 OFFLOAD_MODE=off TOTAL_CPU_DRAM_GB=0 \
+RESULT_DIR=$PWD/results/isb1_smoke \
+bash experimental/multiturn/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh
+```
+
+Or, equivalently, invoke Cam's tester directly:
+
+```bash
+python $KV_CACHE_TESTER_DIR/trace_replay_tester.py \
+  --api-endpoint http://127.0.0.1:8888 \
+  --trace-directory $PWD/traces_isb1/core_chat_qwen \
+  --output-dir $PWD/results/isb1_smoke \
+  --start-users 2 --max-users 2 --test-duration 60
+```
+
+### Step 4 — verify the result
+
+```bash
+jq '.' results/isb1_smoke/results.json | head
+```
+
+Expected:
+
+- `trace_replay_tester.py` logs show `Loaded N traces (filtered from N)`.
+- Cache-hit rate reported during the run is non-zero for multi-turn bundles
+  (because each turn's `hash_ids` extends the previous turn's prefix).
+- Completed sessions ≥ 1; HTTP error count = 0.
+
+---
+
+## Smoke test
+
+The one-liner below is the binary go/no-go for "does this PR actually help
+SemiAnalysis":
+
+```bash
+# Assumes a vLLM OpenAI server is up on :8888 serving the model.
+python tools/isb1_to_kvcache_tester.py \
+  --export-file datasets/isb1/exports/core/chat_8k1k_qwen3.5.json \
+  --output-dir /tmp/isb1_proof/ \
+  --canonical-model-id qwen3_5_397b_a17b \
+&& python $KV_CACHE_TESTER_DIR/trace_replay_tester.py \
+  --api-endpoint http://127.0.0.1:8888 \
+  --trace-directory /tmp/isb1_proof/ \
+  --output-dir /tmp/isb1_proof/out \
+  --start-users 1 --max-users 1 --test-duration 30
+```
+
+Pass criteria:
+
+| Artifact | Threshold |
+|---|---|
+| Shim exit code | `0` |
+| `trace_replay_tester.py` exit code | `0` |
+| `Loaded N traces` (N) | `≥ 1` |
+| Completed sessions | `≥ 1` |
+| HTTP errors | `0` |
+
+Any failure of the above means the PR is not actually plumbed end-to-end for
+this bundle; reproduce the failure against Cam's `trace_replay_tester.py`
+directly before claiming compatibility.
+
+---
+
+## HF publication
+
+The `kv-cache-tester` Slurm recipes accept an HF dataset source via the
+`hf_<org>--<dataset>` prefix convention on `TRACE_DIR` — the wrapper `.sh`
+scripts download with `huggingface-cli` and point the tester at the local
+mirror.
+
+To publish an HF mirror of these bundles:
+
+1. Create a dataset repo (e.g. `semianalysisai/isb1-core-v0`).
+2. Mirror the directory layout of `datasets/isb1/exports/` exactly.
+   (Do not copy the inner `datasets/isb1/.gitattributes` — one top-level
+   LFS-only `.gitattributes` at the HF repo root is sufficient.)
+3. For each published bundle, run
+   `tools/isb1_to_kvcache_tester.py --export-root <mirror-dir> --output-dir <scratch-dir>`
+   locally to verify the conversion stays green at the new revision.
+4. Pin revisions by HF branch/tag matching the producer's
+   `schema_version` (e.g. `v0.2.0`).
+
+Once published, Cam's Slurm scripts can consume a bundle with no code change:
+
+```bash
+TRACE_DIR=hf_semianalysisai--isb1-core-v0 \
+bash experimental/multiturn/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh
+```
+
+(Where the `.sh` does `huggingface-cli download semianalysisai/isb1-core-v0`
+into a scratch dir, then runs the converter shim against it before
+invoking `trace_replay_tester.py`.)
+
+---
+
 ## Related docs
 
 - [`COEXISTENCE_WITH_KV_CACHE_TESTER.md`](COEXISTENCE_WITH_KV_CACHE_TESTER.md) —
diff --git a/tools/isb1_to_kvcache_tester.py b/tools/isb1_to_kvcache_tester.py
new file mode 100644
index 000000000..75fc63838
--- /dev/null
+++ b/tools/isb1_to_kvcache_tester.py
@@ -0,0 +1,771 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+"""Convert ISB1 replay bundles into `kv-cache-tester` trace files.
+
+Produces per-conversation JSON files compatible with
+`callanjfox/kv-cache-tester`'s `trace_replay_tester.py --trace-directory`
+input format (see upstream ``TraceManager.load_traces`` / ``normalize_trace``
+in that repo). Works on both ISB1 bundle adapters shipped by this PR:
+
+- ``inferencex_multiturn`` (direct ``session.turns[].messages`` shape)
+- ``inferencex_trace_replay`` (``events[].input_messages`` shape, with
+  optional schema ``0.2.0`` ``prefix_ref`` sidecar hydration)
+
+This shim is the ONLY glue between SemiAnalysis's trace-replay tester and our
+ISB1 bundles. It emits one ``<trace-prefix><id>.json`` file per conversation/
+cell (``isb1_<id>.json`` by default) and does not import or execute any
+benchmark harness. It has no third-party dependencies: standard library only.
+
+Schema compatibility
+--------------------
+
+The ``kv-cache-tester`` input schema (verified against
+``trace_replay_tester.py@main``):
+
+    {
+      "id": "<conversation-id>",       # str, required
+      "models": ["<model>"],           # list[str], optional
+      "block_size": 64,                # int, optional (default 64)
+      "hash_id_scope": "local",        # str, optional
+      "tool_tokens": 0,                # int, optional
+      "system_tokens": 0,              # int, optional
+      "requests": [
+        {
+          "t": 0.0,                    # float, arrival offset (s)
+          "type": "n",                 # "n" non-streaming | "s" streaming
+          "model": "<model>",
+          "in": <int>,                 # input tokens
+          "out": <int>,                # output tokens
+          "hash_ids": [1, 2, ..., N],  # prefix-reuse block sequence
+          "input_types": ["text"],
+          "output_types": ["text"],
+          "stop": "end_turn",
+          "think_time": <float>        # seconds before request
+        }, ...
+      ]
+    }
+
+The ``hash_ids`` field is the critical one. ``kv-cache-tester`` computes the
+cache hit rate by walking each turn's ``hash_ids`` and counting hits against
+the previous turn's set, stopping at the first miss. For multi-turn
+conversations where every turn builds on the previous history, the correct
+mapping is the monotonically extending prefix ``[1, 2, ..., ceil(in/block)]``
+— earlier blocks are reused across turns and the new user message + assistant
+turn extend the prefix.
+
+Usage
+-----
+
+Single bundle:
+
+    python tools/isb1_to_kvcache_tester.py \
+        --export-file datasets/isb1/exports/core/chat_8k1k_qwen3.5.json \
+        --output-dir traces_isb1/
+
+Whole directory tree (reproduces the ``traces_isb1/<band>/<bundle>/`` layout):
+
+    python tools/isb1_to_kvcache_tester.py \
+        --export-root datasets/isb1/exports/ \
+        --output-dir traces_isb1/
+
+Filters (subset by cell, matching ``benchmark_export_replay.py`` semantics):
+
+    python tools/isb1_to_kvcache_tester.py \
+        --export-file datasets/isb1/exports/core/chat_8k1k.json \
+        --output-dir traces_isb1/core_chat_qwen_h200/ \
+        --runtime-stack-id standalone:vllm \
+        --hardware-profile-id nvidia:h200_sxm_141gb \
+        --canonical-model-id qwen3_5_397b_a17b
+
+Smoke run against Cam's tester (after a vLLM OpenAI server is up on :8888):
+
+    python /path/to/kv-cache-tester/trace_replay_tester.py \
+        --api-endpoint http://127.0.0.1:8888 \
+        --trace-directory traces_isb1/core_chat_qwen_h200/ \
+        --output-dir /tmp/isb1_result/ \
+        --start-users 2 --max-users 2 --test-duration 60
+
+The shim is deterministic: the same input bundle, filters, and block_size
+produce byte-identical output trace files.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import math
+import sys
+from pathlib import Path
+from typing import Any, Iterable, Optional
+
+# -----------------------------------------------------------------------------
+# Constants
+# -----------------------------------------------------------------------------
+
+SUPPORTED_ADAPTERS = ("inferencex_multiturn", "inferencex_trace_replay")
+DEFAULT_BLOCK_SIZE = 64  # matches kv-cache-tester default
+DEFAULT_IMAGE_TOKENS = 512  # matches DEFAULT_IMAGE_TOKEN_ESTIMATE in our exporter
+FALLBACK_OUTPUT_TOKENS = 256  # used only if the bundle has no expected_output_tokens
+
+
+# -----------------------------------------------------------------------------
+# Token counting (stdlib-only, matches benchmark_export_replay._fallback_text_token_count)
+# -----------------------------------------------------------------------------
+
+def _fallback_text_token_count(text: str) -> int:
+    """Approximate token count (≈ 4 chars / token).
+ + This matches the fallback path in ``utils/bench_serving/benchmark_export_replay.py`` + and mirrors Cam's tester, which does its own tokenization at replay time + from synthetic content. The only thing this shim must emit accurately is + the *block count* (``ceil(in_tokens / block_size)``); a ~10-20% error in + the absolute token count still produces the correct prefix-reuse pattern + because block IDs are assigned in order. + """ + stripped = (text or "").strip() + if not stripped: + return 0 + return max(1, math.ceil(len(stripped) / 4)) + + +# ----------------------------------------------------------------------------- +# Message flattening (mirrors benchmark_export_replay._extract_message_text) +# ----------------------------------------------------------------------------- + +def _render_block_as_text(block: dict[str, Any]) -> str: + block_type = str(block.get("type", "text")) + text = (block.get("text") or "").strip() + if block_type == "text": + return text + if block_type == "code": + return f"[CODE]\n{text}" if text else "[CODE]" + if block_type == "log": + return f"[LOG]\n{text}" if text else "[LOG]" + if block_type == "document": + label = block.get("asset_path") or block.get("uri") or "" + if text and label: + return f"[DOCUMENT: {label}]\n{text}" + if text: + return f"[DOCUMENT]\n{text}" + return f"[DOCUMENT: {label}]" if label else "[DOCUMENT]" + if block_type == "table": + return f"[TABLE]\n{text}" if text else "[TABLE]" + if block_type == "image": + # images are approximated as fixed-cost tokens; no text to render + return "" + return text or "" + + +def _extract_message_text(message: dict[str, Any]) -> str: + if isinstance(message.get("content"), str): + return message["content"] + blocks = message.get("content_blocks") or [] + parts = [_render_block_as_text(b) for b in blocks if isinstance(b, dict)] + return "\n\n".join(p for p in parts if p) + + +def _count_image_tokens(message: dict[str, Any], image_token_estimate: int) -> int: + blocks = message.get("content_blocks") or [] + return sum( + image_token_estimate + for b in blocks + if isinstance(b, dict) and str(b.get("type")) == "image" + ) + + +def _count_message_tokens(message: dict[str, Any], image_token_estimate: int) -> int: + text_tokens = _fallback_text_token_count(_extract_message_text(message)) + return text_tokens + _count_image_tokens(message, image_token_estimate) + + +def _count_turn_input_tokens( + messages: list[dict[str, Any]], + image_token_estimate: int, +) -> int: + return sum(_count_message_tokens(m, image_token_estimate) for m in messages) + + +# ----------------------------------------------------------------------------- +# Prefix sidecar hydration (schema 0.2.0 inferencex_trace_replay bundles) +# ----------------------------------------------------------------------------- + +def _schema_version_at_least(observed: Any, required: str) -> bool: + if not isinstance(observed, str): + return False + try: + obs = tuple(int(x) for x in observed.split(".")) + req = tuple(int(x) for x in required.split(".")) + except ValueError: + return False + return obs >= req + + +def _load_prefix_artifact( + bundle_path: Path, + prefix_ref: str, + prefix_entry: dict[str, Any], +) -> dict[str, Any]: + rel_path = prefix_entry.get("path") + if not isinstance(rel_path, str) or not rel_path: + raise ValueError( + f"prefix_index[{prefix_ref!r}] missing 'path' in {bundle_path}" + ) + artifact_path = (bundle_path.parent / rel_path).resolve() + if not artifact_path.exists(): + raise FileNotFoundError( + f"prefix artifact 
not found: {artifact_path} (ref={prefix_ref!r})" + ) + raw = artifact_path.read_bytes() + + expected_sha = prefix_entry.get("sha256") + if isinstance(expected_sha, str) and expected_sha: + actual_sha = hashlib.sha256(raw).hexdigest() + if actual_sha.lower() != expected_sha.lower(): + raise ValueError( + "prefix artifact sha256 mismatch for " + f"{prefix_ref!r}: expected {expected_sha}, got {actual_sha}" + ) + + try: + return json.loads(raw.decode("utf-8")) + except Exception as exc: + raise ValueError( + f"prefix artifact {artifact_path} is not valid JSON: {exc}" + ) from exc + + +def _merge_prefix_into_cell(cell: dict[str, Any], prefix_payload: dict[str, Any]) -> None: + # Schema 0.2.0 bundles store events in the prefix artifact; merge back in-place. + # (Mirrors benchmark_export_replay._merge_prefix_into_trace_replay_cell.) + prefix_events = prefix_payload.get("events") or [] + cell.setdefault("events", []).extend(prefix_events) + # Also merge trace_metadata if the prefix carries extras; preserve cell priority. + p_meta = prefix_payload.get("trace_metadata") or {} + if isinstance(p_meta, dict) and p_meta: + merged = dict(p_meta) + merged.update(cell.get("trace_metadata") or {}) + cell["trace_metadata"] = merged + + +def _hydrate_trace_replay_payload(payload: dict[str, Any], bundle_path: Path) -> None: + if not _schema_version_at_least(payload.get("schema_version"), "0.2.0"): + return + export_cells = list(payload.get("exports", [])) + if not export_cells: + return + + has_prefix_ref = any(cell.get("prefix_ref") for cell in export_cells) + has_embedded_events = any("events" in cell for cell in export_cells) + if has_prefix_ref and has_embedded_events: + raise ValueError( + "mixed legacy/prefix-aware trace replay bundle unsupported in " + f"{bundle_path}; rows cannot mix embedded events with prefix_ref" + ) + if not has_prefix_ref: + return + + missing_prefix_ref = [c for c in export_cells if not c.get("prefix_ref")] + if missing_prefix_ref: + raise ValueError( + f"prefix-aware trace replay bundle missing prefix_ref in {bundle_path}" + ) + + raw_prefix_index = payload.get("prefix_index") + prefix_index = raw_prefix_index if isinstance(raw_prefix_index, dict) else {} + prefix_payloads: dict[str, dict[str, Any]] = {} + for prefix_ref in {str(cell["prefix_ref"]) for cell in export_cells}: + entry = prefix_index.get(prefix_ref) + if not isinstance(entry, dict): + raise ValueError(f"unknown prefix_ref {prefix_ref!r} in {bundle_path}") + prefix_payloads[prefix_ref] = _load_prefix_artifact( + bundle_path, prefix_ref, entry + ) + + for cell in export_cells: + _merge_prefix_into_cell(cell, prefix_payloads[str(cell["prefix_ref"])]) + + +# ----------------------------------------------------------------------------- +# ISB1 → kv-cache-tester per-turn mapping +# ----------------------------------------------------------------------------- + +def _hash_id_sequence(input_tokens: int, block_size: int) -> list[int]: + """Emit the canonical prefix-extending hash_id sequence. + + Each turn's ``hash_ids`` is ``[1, 2, ..., ceil(in/block_size)]``. Since + multi-turn conversations are strictly-extending prefixes (turn N+1 input + is turn N input + assistant response + new user message), the earlier + block IDs appear in every subsequent turn; ``kv-cache-tester``'s + hit-rate walker sees them as cache hits. 
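+
+    Illustrative doctest (sizes are arbitrary; outputs follow directly
+    from ``max(1, ceil(input_tokens / block_size))``)::
+
+        >>> _hash_id_sequence(130, 64)
+        [1, 2, 3]
+        >>> _hash_id_sequence(0, 64)
+        []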
+ """ + if input_tokens <= 0: + return [] + n_blocks = max(1, math.ceil(input_tokens / block_size)) + return list(range(1, n_blocks + 1)) + + +def _build_request_from_turn( + *, + turn_idx: int, + messages: list[dict[str, Any]], + expected_output_tokens: Optional[int], + wait_before_ms: int, + prior_offset_ms: int, + canonical_model_id: str, + image_token_estimate: int, + block_size: int, + fallback_output_tokens: int, +) -> tuple[dict[str, Any], int]: + input_tokens = _count_turn_input_tokens(messages, image_token_estimate) + out_tokens = int(expected_output_tokens) if expected_output_tokens else fallback_output_tokens + out_tokens = max(1, out_tokens) + + arrival_ms = prior_offset_ms + max(0, int(wait_before_ms)) + think_time_s = max(0.0, arrival_ms / 1000.0 if turn_idx == 0 else (arrival_ms - prior_offset_ms) / 1000.0) + + request = { + "t": round(arrival_ms / 1000.0, 3), + "type": "n", + "model": canonical_model_id, + "in": input_tokens, + "out": out_tokens, + "hash_ids": _hash_id_sequence(input_tokens, block_size), + "input_types": ["text"], + "output_types": ["text"], + "stop": "end_turn", + "think_time": round(think_time_s, 3), + } + return request, arrival_ms + + +def _iter_cells_from_multiturn(payload: dict[str, Any]) -> Iterable[dict[str, Any]]: + for cell in payload.get("exports") or []: + if not isinstance(cell, dict): + continue + yield cell + + +def _iter_cells_from_trace_replay(payload: dict[str, Any]) -> Iterable[dict[str, Any]]: + for cell in payload.get("exports") or []: + if not isinstance(cell, dict): + continue + yield cell + + +def _conversation_id(cell: dict[str, Any], adapter_id: str, fallback_idx: int) -> str: + if adapter_id == "inferencex_multiturn": + session = cell.get("session") or {} + cid = session.get("session_id") or cell.get("trace_id") + else: + meta = cell.get("trace_metadata") or {} + cid = meta.get("session_id") or cell.get("trace_id") + if not cid: + cid = f"cell_{fallback_idx:04d}" + return str(cid) + + +def _cell_turns_multiturn(cell: dict[str, Any]) -> list[dict[str, Any]]: + session = cell.get("session") or {} + out = [] + for turn in session.get("turns") or []: + if not isinstance(turn, dict): + continue + out.append({ + "messages": list(turn.get("messages") or []), + "expected_output_tokens": turn.get("expected_output_tokens"), + "wait_before_ms": int(turn.get("wait_before_ms") or 0), + }) + return out + + +def _cell_turns_trace_replay(cell: dict[str, Any]) -> list[dict[str, Any]]: + out = [] + prior_ms = 0 + for event in cell.get("events") or []: + if not isinstance(event, dict): + continue + offset_ms = int(event.get("arrival_time_offset_ms") or 0) + wait_ms = 0 if not out else max(0, offset_ms - prior_ms) + prior_ms = offset_ms + out.append({ + "messages": list(event.get("input_messages") or []), + "expected_output_tokens": event.get("target_output_tokens"), + "wait_before_ms": wait_ms, + }) + return out + + +def _build_trace( + *, + cell: dict[str, Any], + adapter_id: str, + fallback_idx: int, + block_size: int, + fallback_output_tokens: int, + image_token_estimate: int, + max_turns_per_conversation: Optional[int], +) -> Optional[dict[str, Any]]: + conversation_id = _conversation_id(cell, adapter_id, fallback_idx) + canonical_model_id = str(cell.get("canonical_model_id") or "unknown") + + if adapter_id == "inferencex_multiturn": + turns = _cell_turns_multiturn(cell) + else: + turns = _cell_turns_trace_replay(cell) + + if not turns: + return None + + if max_turns_per_conversation is not None: + turns = turns[: max(1, 
int(max_turns_per_conversation))] + + requests = [] + cumulative_ms = 0 + for turn_idx, turn in enumerate(turns): + request, cumulative_ms = _build_request_from_turn( + turn_idx=turn_idx, + messages=turn["messages"], + expected_output_tokens=turn.get("expected_output_tokens"), + wait_before_ms=turn.get("wait_before_ms", 0), + prior_offset_ms=cumulative_ms, + canonical_model_id=canonical_model_id, + image_token_estimate=image_token_estimate, + block_size=block_size, + fallback_output_tokens=fallback_output_tokens, + ) + requests.append(request) + + total_in = sum(r["in"] for r in requests) + total_out = sum(r["out"] for r in requests) + + return { + "id": conversation_id, + "models": [canonical_model_id], + "block_size": int(block_size), + "hash_id_scope": "local", + "tool_tokens": 0, + "system_tokens": 0, + "requests": requests, + "totals": { + "parent_tokens": {"input": total_in, "output": total_out}, + "subagent_tokens": {"input": 0, "output": 0}, + "combined_tokens": {"input": total_in, "output": total_out}, + "subagent_count": 0, + }, + # ISB1 passthrough tags — kv-cache-tester ignores unknown keys. + "isb1": { + "trace_id": cell.get("trace_id"), + "runtime_stack_id": cell.get("runtime_stack_id"), + "hardware_profile_id": cell.get("hardware_profile_id"), + "canonical_model_id": canonical_model_id, + "support_status": cell.get("support_status"), + "benchmark_certification_status": cell.get( + "benchmark_certification_status" + ), + "context_band": cell.get("context_band"), + "adapter_id": adapter_id, + }, + } + + +# ----------------------------------------------------------------------------- +# Filter logic (subset of benchmark_export_replay.load_replay_sessions) +# ----------------------------------------------------------------------------- + +def _matches(value: Any, allowed: set[str] | None) -> bool: + if allowed is None: + return True + return str(value or "") in allowed + + +def _csv_set(raw: Optional[str]) -> set[str] | None: + if raw is None: + return None + return {v.strip() for v in raw.split(",") if v.strip()} or None + + +# ----------------------------------------------------------------------------- +# Bundle processing +# ----------------------------------------------------------------------------- + +def _load_bundle(export_file: Path) -> dict[str, Any]: + try: + payload = json.loads(export_file.read_text()) + except Exception as exc: + raise ValueError(f"failed to read ISB1 bundle {export_file}: {exc}") from exc + adapter_id = str(payload.get("adapter_id") or "unknown") + if adapter_id not in SUPPORTED_ADAPTERS: + raise ValueError( + f"unsupported ISB1 adapter {adapter_id!r} in {export_file}. " + f"Expected one of {SUPPORTED_ADAPTERS}. " + "This shim converts ISB1 replay bundles only; " + "raw model traces from other pipelines are out of scope." 
+ ) + if adapter_id == "inferencex_trace_replay": + _hydrate_trace_replay_payload(payload, export_file) + return payload + + +def _convert_bundle( + *, + export_file: Path, + output_dir: Path, + runtime_stack_ids: set[str] | None, + hardware_profile_ids: set[str] | None, + canonical_model_ids: set[str] | None, + trace_ids: set[str] | None, + support_statuses: set[str] | None, + block_size: int, + fallback_output_tokens: int, + image_token_estimate: int, + max_conversations: Optional[int], + max_turns_per_conversation: Optional[int], + trace_prefix: str, +) -> tuple[int, int]: + """Returns (written_count, skipped_count).""" + payload = _load_bundle(export_file) + adapter_id = str(payload["adapter_id"]) + + iterator = ( + _iter_cells_from_multiturn(payload) + if adapter_id == "inferencex_multiturn" + else _iter_cells_from_trace_replay(payload) + ) + + output_dir.mkdir(parents=True, exist_ok=True) + + written = 0 + skipped = 0 + emitted_ids: set[str] = set() + for idx, cell in enumerate(iterator): + if not _matches(cell.get("runtime_stack_id"), runtime_stack_ids): + skipped += 1 + continue + if not _matches(cell.get("hardware_profile_id"), hardware_profile_ids): + skipped += 1 + continue + if not _matches(cell.get("canonical_model_id"), canonical_model_ids): + skipped += 1 + continue + if not _matches(cell.get("trace_id"), trace_ids): + skipped += 1 + continue + if not _matches(cell.get("support_status"), support_statuses): + skipped += 1 + continue + + trace = _build_trace( + cell=cell, + adapter_id=adapter_id, + fallback_idx=idx, + block_size=block_size, + fallback_output_tokens=fallback_output_tokens, + image_token_estimate=image_token_estimate, + max_turns_per_conversation=max_turns_per_conversation, + ) + if trace is None: + skipped += 1 + continue + + # Make the filename unique per cell, stable across runs. + base_id = str(trace["id"]) + safe_id = "".join(c if c.isalnum() or c in "-_." else "_" for c in base_id) + if safe_id in emitted_ids: + safe_id = f"{safe_id}_{idx:04d}" + emitted_ids.add(safe_id) + + fname = f"{trace_prefix}{safe_id}.json" + out_path = output_dir / fname + out_path.write_text(json.dumps(trace, indent=2, sort_keys=False)) + written += 1 + + if max_conversations is not None and written >= int(max_conversations): + break + + return written, skipped + + +def _iter_export_files(root: Path) -> Iterable[Path]: + for path in sorted(root.rglob("*.json")): + # Skip manifests/READMEs (anything that isn't an adapter bundle). + if path.name in ("manifest.json", "manifest_qwen3.5.json"): + continue + if "/prefixes/" in path.as_posix(): + continue + try: + with path.open() as fh: + first_chunk = fh.read(4096) + except Exception: + continue + if '"adapter_id"' not in first_chunk: + continue + yield path + + +# ----------------------------------------------------------------------------- +# CLI +# ----------------------------------------------------------------------------- + +def _parse_args(argv: list[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser( + prog="isb1_to_kvcache_tester", + description=( + "Convert ISB1 replay bundles into kv-cache-tester-compatible " + "per-conversation JSON files." + ), + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + src = parser.add_mutually_exclusive_group(required=True) + src.add_argument( + "--export-file", + type=Path, + help="Single ISB1 bundle JSON (e.g. 
datasets/isb1/exports/core/chat_8k1k.json).", + ) + src.add_argument( + "--export-root", + type=Path, + help=( + "Directory tree under which every adapter bundle is converted " + "(e.g. datasets/isb1/exports/). Mirrors subpaths under --output-dir." + ), + ) + + parser.add_argument( + "--output-dir", + type=Path, + required=True, + help="Destination directory for trace_*.json files.", + ) + parser.add_argument( + "--trace-prefix", + type=str, + default="isb1_", + help="Filename prefix for emitted trace files (default: 'isb1_').", + ) + parser.add_argument( + "--block-size", + type=int, + default=DEFAULT_BLOCK_SIZE, + help=( + f"Hash block size in tokens (default: {DEFAULT_BLOCK_SIZE}, " + "matches kv-cache-tester default)." + ), + ) + parser.add_argument( + "--image-token-estimate", + type=int, + default=DEFAULT_IMAGE_TOKENS, + help=f"Tokens per image block (default: {DEFAULT_IMAGE_TOKENS}).", + ) + parser.add_argument( + "--fallback-output-tokens", + type=int, + default=FALLBACK_OUTPUT_TOKENS, + help=( + f"Output tokens to emit when a turn has no expected_output_tokens " + f"(default: {FALLBACK_OUTPUT_TOKENS})." + ), + ) + + parser.add_argument("--runtime-stack-id", + help="CSV of runtime_stack_ids to include (e.g. standalone:vllm,standalone:sglang).") + parser.add_argument("--hardware-profile-id", + help="CSV of hardware_profile_ids to include.") + parser.add_argument("--canonical-model-id", + help="CSV of canonical_model_ids to include.") + parser.add_argument("--trace-id", + help="CSV of trace_ids to include.") + parser.add_argument("--support-status", + help="CSV of support_status values to include (e.g. supported,reviewed_preview).") + + parser.add_argument("--max-conversations", type=int, default=None, + help="Stop after writing N conversations per bundle.") + parser.add_argument("--max-turns-per-conversation", type=int, default=None, + help="Truncate each conversation after N turns.") + + parser.add_argument("--quiet", action="store_true", help="Suppress per-bundle progress.") + + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = _parse_args(argv if argv is not None else sys.argv[1:]) + + if args.block_size <= 0: + print(f"ERROR: --block-size must be positive (got {args.block_size})", file=sys.stderr) + return 2 + + runtime_stack_ids = _csv_set(args.runtime_stack_id) + hardware_profile_ids = _csv_set(args.hardware_profile_id) + canonical_model_ids = _csv_set(args.canonical_model_id) + trace_ids = _csv_set(args.trace_id) + support_statuses = _csv_set(args.support_status) + + bundles: list[tuple[Path, Path]] = [] + if args.export_file: + bundles.append((args.export_file.resolve(), args.output_dir.resolve())) + else: + root = args.export_root.resolve() + if not root.is_dir(): + print(f"ERROR: --export-root not a directory: {root}", file=sys.stderr) + return 2 + for f in _iter_export_files(root): + rel = f.relative_to(root).parent + out = args.output_dir.resolve() / rel / f.stem + bundles.append((f, out)) + + if not bundles: + print("ERROR: no ISB1 bundles found", file=sys.stderr) + return 2 + + total_written = 0 + total_skipped = 0 + errors = 0 + for export_file, output_dir in bundles: + try: + written, skipped = _convert_bundle( + export_file=export_file, + output_dir=output_dir, + runtime_stack_ids=runtime_stack_ids, + hardware_profile_ids=hardware_profile_ids, + canonical_model_ids=canonical_model_ids, + trace_ids=trace_ids, + support_statuses=support_statuses, + block_size=args.block_size, + 
fallback_output_tokens=args.fallback_output_tokens, + image_token_estimate=args.image_token_estimate, + max_conversations=args.max_conversations, + max_turns_per_conversation=args.max_turns_per_conversation, + trace_prefix=args.trace_prefix, + ) + except (ValueError, FileNotFoundError) as exc: + print(f"ERROR: {export_file}: {exc}", file=sys.stderr) + errors += 1 + continue + + total_written += written + total_skipped += skipped + if not args.quiet: + print( + f"ok {export_file}: wrote {written} trace(s) to {output_dir} " + f"(skipped {skipped} cell(s) by filter)" + ) + + if not args.quiet: + print( + f"\ndone: {total_written} trace file(s) written across " + f"{len(bundles)} bundle(s); {total_skipped} cell(s) skipped; " + f"{errors} bundle(s) errored" + ) + + if total_written == 0: + print("ERROR: no traces written — check --runtime-stack-id / --hardware-profile-id filters", + file=sys.stderr) + return 1 + + return 0 if errors == 0 else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/test_isb1_to_kvcache_tester.py b/tools/test_isb1_to_kvcache_tester.py new file mode 100644 index 000000000..35aba515d --- /dev/null +++ b/tools/test_isb1_to_kvcache_tester.py @@ -0,0 +1,412 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Contract tests for ``tools/isb1_to_kvcache_tester.py``. + +These tests lock the bytes-level output schema so ISB1→kv-cache-tester +conversion can't silently drift from what +``callanjfox/kv-cache-tester``'s ``normalize_trace()`` expects. + +We re-implement the minimal ``normalize_trace`` logic inline so tests do not +pull in ``transformers`` / ``numpy`` / ``openai`` just to exercise the +conversion shim. The actual upstream function is +`trace_replay_tester.py::normalize_trace` (see kv-cache-tester repo). +""" + +from __future__ import annotations + +import json +import math +import subprocess +import sys +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[1] +SHIM = REPO_ROOT / "tools" / "isb1_to_kvcache_tester.py" + + +# --------------------------------------------------------------------------- +# Local mirror of kv-cache-tester::normalize_trace (sans tokenizer deps). +# Keep in sync with upstream manually — these tests flag drift. 
+# --------------------------------------------------------------------------- + + +def _normalize_request(req: dict, base_time: float = 0.0) -> dict: + return { + "timestamp": base_time + req.get("t", 0.0), + "type": {"s": "streaming", "n": "non_streaming"}.get( + req.get("type", ""), req.get("type", "streaming") + ), + "input_tokens": req.get("in", 0), + "output_tokens": req.get("out", 0), + "hash_ids": req.get("hash_ids", []), + "stop_reason": req.get("stop", ""), + "model": req.get("model", ""), + } + + +def _normalize_trace(trace: dict) -> dict: + raw = trace.get("requests", []) + parent = [r for r in raw if r.get("type") != "subagent"] + requests = [_normalize_request(r) for r in parent] + + total_input = sum(r["input_tokens"] for r in requests) + + cache_hits = 0 + total_blocks = 0 + for i, req in enumerate(requests): + hash_ids = req["hash_ids"] + if i > 0 and hash_ids: + prev = set(requests[i - 1]["hash_ids"]) + for h in hash_ids: + total_blocks += 1 + if h in prev: + cache_hits += 1 + else: + break + elif hash_ids: + total_blocks += len(hash_ids) + + return { + "metadata": { + "conversation_id": trace.get("id", "unknown"), + "models": trace.get("models", []), + "block_size": trace.get("block_size", 64), + "hash_id_scope": trace.get("hash_id_scope", "per_context"), + "tool_tokens": trace.get("tool_tokens", 0), + "system_tokens": trace.get("system_tokens", 0), + "request_count": len(requests), + "total_input_tokens": total_input, + "cache_hit_rate": cache_hits / total_blocks if total_blocks else 0.0, + }, + "requests": requests, + } + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +def _multiturn_bundle() -> dict: + """Minimal ``inferencex_multiturn`` fixture — 1 cell, 2 turns.""" + return { + "adapter_id": "inferencex_multiturn", + "schema_version": "0.1.0", + "exports": [ + { + "trace_id": "core_chat_8k1k_001", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:h200_sxm_141gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "support_status": "supported", + "benchmark_certification_status": "dataset_replay_verified", + "context_band": "core_8k", + "session": { + "session_id": "sess_001", + "turns": [ + { + "turn_idx": 0, + "messages": [ + {"role": "system", "content": "sys" * 16}, + {"role": "user", "content": "u" * 3200}, + ], + "expected_output_tokens": 128, + "wait_before_ms": 0, + }, + { + "turn_idx": 1, + "messages": [ + {"role": "system", "content": "sys" * 16}, + {"role": "user", "content": "u" * 3200}, + {"role": "assistant", "content": "a" * 400}, + {"role": "user", "content": "f" * 400}, + ], + "expected_output_tokens": 96, + "wait_before_ms": 1500, + }, + ], + }, + } + ], + } + + +def _trace_replay_bundle() -> dict: + """Minimal ``inferencex_trace_replay`` fixture (schema 0.1, no prefix_ref).""" + return { + "adapter_id": "inferencex_trace_replay", + "schema_version": "0.1.0", + "exports": [ + { + "trace_id": "ext_64k_001", + "runtime_stack_id": "standalone:sglang", + "hardware_profile_id": "nvidia:h200_sxm_141gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "support_status": "supported", + "benchmark_certification_status": "dataset_replay_verified", + "context_band": "extension_64k", + "trace_metadata": {"session_id": "sess_tr_001"}, + "events": [ + { + "arrival_time_offset_ms": 0, + "input_messages": [ + {"role": "system", "content": "s" * 64}, + {"role": "user", "content": "x" * 12000}, + ], + 
"target_output_tokens": 100, + }, + { + "arrival_time_offset_ms": 4000, + "input_messages": [ + {"role": "system", "content": "s" * 64}, + {"role": "user", "content": "x" * 12000}, + {"role": "assistant", "content": "r" * 400}, + {"role": "user", "content": "follow" * 10}, + ], + "target_output_tokens": 150, + }, + ], + } + ], + } + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _run_shim( + tmp_path: Path, + bundle: dict, + *, + extra_args: list[str] | None = None, +) -> Path: + bundle_path = tmp_path / "bundle.json" + bundle_path.write_text(json.dumps(bundle)) + out_dir = tmp_path / "out" + cmd = [ + sys.executable, + str(SHIM), + "--export-file", + str(bundle_path), + "--output-dir", + str(out_dir), + "--quiet", + ] + if extra_args: + cmd.extend(extra_args) + result = subprocess.run(cmd, capture_output=True, text=True) + assert result.returncode == 0, ( + f"shim failed: stdout={result.stdout!r} stderr={result.stderr!r}" + ) + return out_dir + + +def _load_single_trace(out_dir: Path) -> dict: + files = sorted(out_dir.glob("*.json")) + assert len(files) == 1, f"expected 1 trace file, got {len(files)}: {files}" + return json.loads(files[0].read_text()) + + +# --------------------------------------------------------------------------- +# Contract tests +# --------------------------------------------------------------------------- + + +class TestMultiturnBundle: + def test_top_level_schema(self, tmp_path: Path) -> None: + out = _run_shim(tmp_path, _multiturn_bundle()) + trace = _load_single_trace(out) + + # Cam's tester reads these exact fields (see trace_replay_tester.py::normalize_trace). + assert trace["id"] == "sess_001" + assert trace["models"] == ["qwen3_5_397b_a17b"] + assert trace["block_size"] == 64 + assert trace["hash_id_scope"] == "local" + assert trace["tool_tokens"] == 0 + assert trace["system_tokens"] == 0 + assert isinstance(trace["requests"], list) + assert len(trace["requests"]) == 2 + + def test_per_request_schema(self, tmp_path: Path) -> None: + out = _run_shim(tmp_path, _multiturn_bundle()) + trace = _load_single_trace(out) + + required_keys = {"t", "type", "model", "in", "out", "hash_ids"} + for req in trace["requests"]: + missing = required_keys - req.keys() + assert not missing, f"missing required request keys: {missing}" + assert req["type"] in ("n", "s") + assert isinstance(req["in"], int) and req["in"] > 0 + assert isinstance(req["out"], int) and req["out"] > 0 + assert isinstance(req["hash_ids"], list) and req["hash_ids"] + + def test_hash_ids_are_prefix_extending(self, tmp_path: Path) -> None: + """hash_ids[i+1] must start with hash_ids[i] for cache hit rate > 0.""" + out = _run_shim(tmp_path, _multiturn_bundle()) + trace = _load_single_trace(out) + reqs = trace["requests"] + + for i in range(1, len(reqs)): + prev_hash_ids = reqs[i - 1]["hash_ids"] + curr_hash_ids = reqs[i]["hash_ids"] + assert len(curr_hash_ids) >= len(prev_hash_ids), ( + f"turn {i} has fewer hash_ids than turn {i-1}" + ) + assert curr_hash_ids[: len(prev_hash_ids)] == prev_hash_ids, ( + f"turn {i} hash_ids do not start with turn {i-1} prefix — " + "this breaks kv-cache-tester's cache-hit walker" + ) + + def test_block_size_mapping(self, tmp_path: Path) -> None: + out = _run_shim(tmp_path, _multiturn_bundle(), extra_args=["--block-size", "32"]) + trace = _load_single_trace(out) + assert trace["block_size"] == 32 + for req in trace["requests"]: + expected_blocks = 
max(1, math.ceil(req["in"] / 32)) + assert req["hash_ids"] == list(range(1, expected_blocks + 1)) + + def test_arrival_timing_monotonic(self, tmp_path: Path) -> None: + out = _run_shim(tmp_path, _multiturn_bundle()) + trace = _load_single_trace(out) + times = [r["t"] for r in trace["requests"]] + assert times == sorted(times), "request arrival times must be monotonic" + # second turn wait_before_ms=1500 → t>=1.5 + assert trace["requests"][1]["t"] >= 1.5 + + def test_isb1_passthrough_tags(self, tmp_path: Path) -> None: + out = _run_shim(tmp_path, _multiturn_bundle()) + trace = _load_single_trace(out) + assert "isb1" in trace + tags = trace["isb1"] + assert tags["adapter_id"] == "inferencex_multiturn" + assert tags["runtime_stack_id"] == "standalone:vllm" + assert tags["hardware_profile_id"] == "nvidia:h200_sxm_141gb" + assert tags["canonical_model_id"] == "qwen3_5_397b_a17b" + assert tags["support_status"] == "supported" + assert tags["benchmark_certification_status"] == "dataset_replay_verified" + assert tags["context_band"] == "core_8k" + + def test_normalize_trace_compatibility(self, tmp_path: Path) -> None: + """The emitted trace must round-trip through normalize_trace cleanly.""" + out = _run_shim(tmp_path, _multiturn_bundle()) + trace = _load_single_trace(out) + normalized = _normalize_trace(trace) + + md = normalized["metadata"] + assert md["conversation_id"] == "sess_001" + assert md["request_count"] == 2 + assert md["total_input_tokens"] > 0 + # Two turns, second extends first's prefix → cache hit rate > 0. + assert md["cache_hit_rate"] > 0.0 + + +class TestTraceReplayBundle: + def test_reads_events(self, tmp_path: Path) -> None: + out = _run_shim(tmp_path, _trace_replay_bundle()) + trace = _load_single_trace(out) + assert trace["id"] == "sess_tr_001" + assert len(trace["requests"]) == 2 + + def test_arrival_offsets(self, tmp_path: Path) -> None: + out = _run_shim(tmp_path, _trace_replay_bundle()) + trace = _load_single_trace(out) + # second event arrival_time_offset_ms=4000 → t=4.0 + assert trace["requests"][0]["t"] == 0.0 + assert trace["requests"][1]["t"] == 4.0 + + def test_prefix_reuse_dominates(self, tmp_path: Path) -> None: + out = _run_shim(tmp_path, _trace_replay_bundle()) + trace = _load_single_trace(out) + normalized = _normalize_trace(trace) + # 12k-token prefix shared across both events → cache hit rate ≈ 49-50%. + assert 0.4 < normalized["metadata"]["cache_hit_rate"] < 0.6 + + +class TestFilters: + def test_runtime_stack_filter_excludes_non_match(self, tmp_path: Path) -> None: + bundle_path = tmp_path / "bundle.json" + bundle_path.write_text(json.dumps(_multiturn_bundle())) + out_dir = tmp_path / "out" + + # Non-matching runtime filter should exit non-zero with "no traces". 
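+        # (The shim's main() returns 1 and prints "no traces written" to
+        # stderr when every cell is filtered out.)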
+ result = subprocess.run( + [ + sys.executable, + str(SHIM), + "--export-file", + str(bundle_path), + "--output-dir", + str(out_dir), + "--runtime-stack-id", + "standalone:trtllm", + "--quiet", + ], + capture_output=True, + text=True, + ) + assert result.returncode != 0 + assert "no traces written" in result.stderr.lower() + + def test_support_status_filter_allows_match(self, tmp_path: Path) -> None: + out = _run_shim( + tmp_path, + _multiturn_bundle(), + extra_args=["--support-status", "supported,reviewed_preview"], + ) + assert sorted(out.glob("*.json")) + + def test_max_turns_truncates(self, tmp_path: Path) -> None: + out = _run_shim( + tmp_path, + _multiturn_bundle(), + extra_args=["--max-turns-per-conversation", "1"], + ) + trace = _load_single_trace(out) + assert len(trace["requests"]) == 1 + + +class TestErrorHandling: + def test_rejects_unknown_adapter(self, tmp_path: Path) -> None: + bundle = {"adapter_id": "something_else", "exports": []} + bundle_path = tmp_path / "bundle.json" + bundle_path.write_text(json.dumps(bundle)) + out_dir = tmp_path / "out" + result = subprocess.run( + [ + sys.executable, + str(SHIM), + "--export-file", + str(bundle_path), + "--output-dir", + str(out_dir), + "--quiet", + ], + capture_output=True, + text=True, + ) + assert result.returncode != 0 + assert "unsupported isb1 adapter" in result.stderr.lower() + + def test_rejects_nonpositive_block_size(self, tmp_path: Path) -> None: + bundle_path = tmp_path / "bundle.json" + bundle_path.write_text(json.dumps(_multiturn_bundle())) + result = subprocess.run( + [ + sys.executable, + str(SHIM), + "--export-file", + str(bundle_path), + "--output-dir", + str(tmp_path / "out"), + "--block-size", + "0", + "--quiet", + ], + capture_output=True, + text=True, + ) + assert result.returncode == 2 + assert "--block-size must be positive" in result.stderr From d0e199e7a5da4febc6fd0f14196dc16a3da13e37 Mon Sep 17 00:00:00 2001 From: William Chen <57119977+OCWC22@users.noreply.github.com> Date: Mon, 20 Apr 2026 21:40:43 -0700 Subject: [PATCH 11/18] =?UTF-8?q?refactor(isb1):=20drop=20homegrown=20repl?= =?UTF-8?q?ay=20harness=20=E2=80=94=20defer=20to=20callanjfox/kv-cache-tes?= =?UTF-8?q?ter=20(PR=20#993)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #1032 keeps only data + the conversion shim; Cam's kv-cache-tester submodule on PR #993 owns replay. 
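For reviewers, the retained end-to-end path is the two-step flow already
documented in datasets/isb1/README.md (sketch; `$KV_CACHE_TESTER_DIR` is
assumed to point at a checkout of the PR #993 submodule):

```bash
# Convert a committed bundle, then replay it with Cam's tester.
python tools/isb1_to_kvcache_tester.py \
  --export-file datasets/isb1/exports/core/chat_8k1k_qwen3.5.json \
  --output-dir /tmp/isb1_traces/
python $KV_CACHE_TESTER_DIR/trace_replay_tester.py \
  --api-endpoint http://127.0.0.1:8888 \
  --trace-directory /tmp/isb1_traces/ \
  --output-dir /tmp/isb1_traces/out \
  --start-users 1 --max-users 1 --test-duration 30
```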
Delete: - utils/bench_serving/benchmark_export_replay.py - utils/process_result_isb1.py - utils/test_benchmark_export_replay.py - utils/test_process_result_isb1.py Revert to upstream/main: - utils/process_result.py - utils/test_process_result.py Co-Authored-By: Claude Opus 4.7 --- datasets/isb1/README.md | 26 +- tools/isb1_to_kvcache_tester.py | 12 +- .../bench_serving/benchmark_export_replay.py | 1684 ----------------- utils/process_result.py | 17 - utils/process_result_isb1.py | 490 ----- utils/test_benchmark_export_replay.py | 947 --------- utils/test_process_result.py | 27 - utils/test_process_result_isb1.py | 1006 ---------- 8 files changed, 10 insertions(+), 4199 deletions(-) delete mode 100644 utils/bench_serving/benchmark_export_replay.py delete mode 100644 utils/process_result_isb1.py delete mode 100644 utils/test_benchmark_export_replay.py delete mode 100644 utils/test_process_result_isb1.py diff --git a/datasets/isb1/README.md b/datasets/isb1/README.md index 64d855e53..46a4fbbdc 100644 --- a/datasets/isb1/README.md +++ b/datasets/isb1/README.md @@ -4,11 +4,8 @@ This directory is the InferenceX-side consumer package for ISB1 replay. InferenceX consumes committed file artifacts only: - replay export JSON bundles under `datasets/isb1/exports/` -- conversion to SemiAnalysis's `kv-cache-tester` format via - [`tools/isb1_to_kvcache_tester.py`](../../tools/isb1_to_kvcache_tester.py) +- replay via [`tools/isb1_to_kvcache_tester.py`](../../tools/isb1_to_kvcache_tester.py) + Cam's [`kv-cache-tester`](https://github.com/callanjfox/kv-cache-tester) - consumer configs in `.github/configs/isb1-*.yaml` -- replay processing through `utils/bench_serving/benchmark_export_replay.py` -- result normalization through `utils/process_result_isb1.py` InferenceX does **not** import external runtime code and does **not** make live-serving claims from export-file existence alone. @@ -74,12 +71,7 @@ metadata. | `preview/long_context_500k/` | 4 + 2 manifests | 500K chat/code × {gptoss, qwen3.5} | | `preview/long_context_1m/` | 2 + 1 manifest | 1M chat/code × qwen3.5 | -All export files are valid JSON and replay-hydratable via -`utils/bench_serving/benchmark_export_replay.py`. - -All bundles can also be converted to SemiAnalysis's `kv-cache-tester` -per-conversation trace format via [`tools/isb1_to_kvcache_tester.py`](../../tools/isb1_to_kvcache_tester.py); -see [How to consume](#how-to-consume). +All export files are valid JSON and can be consumed via [`tools/isb1_to_kvcache_tester.py`](../../tools/isb1_to_kvcache_tester.py) + Cam's [`kv-cache-tester`](https://github.com/callanjfox/kv-cache-tester); see [How to consume](#how-to-consume). --- @@ -121,19 +113,9 @@ Unsafe claims: ## How to consume -Two consumption paths are supported, both fed from the same committed bundles: - -1. **InferenceX-internal replay** — `utils/bench_serving/benchmark_export_replay.py` - directly, with `utils/process_result_isb1.py` for result normalization. -2. **SemiAnalysis `kv-cache-tester`** — convert via - [`tools/isb1_to_kvcache_tester.py`](../../tools/isb1_to_kvcache_tester.py), - then feed the resulting directory to - [`trace_replay_tester.py --trace-directory`](https://github.com/callanjfox/kv-cache-tester) - (PR #993 submodule). +The supported consumption path is [`tools/isb1_to_kvcache_tester.py`](../../tools/isb1_to_kvcache_tester.py) + Cam's [`kv-cache-tester`](https://github.com/callanjfox/kv-cache-tester) `trace_replay_tester.py --trace-directory` flow. 
-Path 2 is documented here because it is the path that lets ISB1 traces plug -into SemiAnalysis's existing Slurm benchmarking pipeline without the consumer -having to read our replay code. +This is the path that lets ISB1 traces plug into SemiAnalysis's existing Slurm benchmarking pipeline without the consumer having to read any InferenceX-local replay harness. ### Step 1 — fetch the bundles (LFS) diff --git a/tools/isb1_to_kvcache_tester.py b/tools/isb1_to_kvcache_tester.py index 75fc63838..b454e04b6 100644 --- a/tools/isb1_to_kvcache_tester.py +++ b/tools/isb1_to_kvcache_tester.py @@ -68,7 +68,7 @@ --export-root datasets/isb1/exports/ \ --output-dir traces_isb1/ -Filters (subset by cell, matching ``benchmark_export_replay.py`` semantics): +Filters (subset by cell, matching the legacy replay-loader semantics): python tools/isb1_to_kvcache_tester.py \ --export-file datasets/isb1/exports/core/chat_8k1k.json \ @@ -110,13 +110,13 @@ # ----------------------------------------------------------------------------- -# Token counting (stdlib-only, matches benchmark_export_replay._fallback_text_token_count) +# Token counting (stdlib-only, matches the legacy replay-harness fallback token count) # ----------------------------------------------------------------------------- def _fallback_text_token_count(text: str) -> int: """Approximate token count (≈ 4 chars / token). - This matches the fallback path in ``utils/bench_serving/benchmark_export_replay.py`` + This matches the fallback path in the legacy replay harness and mirrors Cam's tester, which does its own tokenization at replay time from synthetic content. The only thing this shim must emit accurately is the *block count* (``ceil(in_tokens / block_size)``); a ~10-20% error in @@ -130,7 +130,7 @@ def _fallback_text_token_count(text: str) -> int: # ----------------------------------------------------------------------------- -# Message flattening (mirrors benchmark_export_replay._extract_message_text) +# Message flattening (mirrors the legacy replay-harness message extraction) # ----------------------------------------------------------------------------- def _render_block_as_text(block: dict[str, Any]) -> str: @@ -237,7 +237,7 @@ def _load_prefix_artifact( def _merge_prefix_into_cell(cell: dict[str, Any], prefix_payload: dict[str, Any]) -> None: # Schema 0.2.0 bundles store events in the prefix artifact; merge back in-place. - # (Mirrors benchmark_export_replay._merge_prefix_into_trace_replay_cell.) + # (Mirrors the legacy replay-harness prefix merge behavior.) prefix_events = prefix_payload.get("events") or [] cell.setdefault("events", []).extend(prefix_events) # Also merge trace_metadata if the prefix carries extras; preserve cell priority. @@ -470,7 +470,7 @@ def _build_trace( # ----------------------------------------------------------------------------- -# Filter logic (subset of benchmark_export_replay.load_replay_sessions) +# Filter logic (subset of the legacy replay session-loader semantics) # ----------------------------------------------------------------------------- def _matches(value: Any, allowed: set[str] | None) -> bool: diff --git a/utils/bench_serving/benchmark_export_replay.py b/utils/bench_serving/benchmark_export_replay.py deleted file mode 100644 index 0febd47e9..000000000 --- a/utils/bench_serving/benchmark_export_replay.py +++ /dev/null @@ -1,1684 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -r"""Replay ISB1 export sessions against OpenAI-compatible inference servers. 
- -Supported export formats: - - ``inferencex_multiturn`` (direct-ingest session turns) - - ``inferencex_trace_replay`` (event-based trace replay) - -Supported request modes: - - ``chat``: send full message history to ``/v1/chat/completions`` - - ``completions``: project the message history into a single tagged prompt - and send it to ``/v1/completions`` - - ``auto``: prefer chat for standalone vLLM/SGLang cells and completions - for TRT / Dynamo projection cells -""" - -from __future__ import annotations - -import argparse -import asyncio -import hashlib -import json -import math -import os -import random -import sys -import time -import warnings -from collections import OrderedDict -from dataclasses import dataclass, field -from datetime import datetime -from pathlib import Path -from typing import Any, Callable, Optional - -import aiohttp -import numpy as np -from tqdm.asyncio import tqdm - -try: - from vllm.utils import FlexibleArgumentParser -except ImportError: - from argparse import ArgumentParser as FlexibleArgumentParser - - -AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60, sock_read=5 * 60) -DEFAULT_IMAGE_TOKEN_ESTIMATE = 2048 -DEFAULT_FALLBACK_OUTPUT_LEN = 256 -CHAT_NATIVE_RUNTIMES = {"standalone:vllm", "standalone:sglang"} -COMPLETIONS_PREFERRED_RUNTIMES = { - "standalone:trt_llm", - "dynamo:vllm", - "dynamo:sglang", - "dynamo:trt_llm", -} -ROLE_LABELS = { - "system": "SYSTEM", - "user": "USER", - "assistant": "ASSISTANT", - "tool": "TOOL", - "retrieval": "RETRIEVAL", - "execution": "EXECUTION", -} -MODULE_DIR = Path(__file__).resolve().parent -if str(MODULE_DIR) not in sys.path: - sys.path.insert(0, str(MODULE_DIR)) - -TRACE_REPLAY_PREFIX_FIELDS = ( - "events", - "trace_metadata", - "workload_family", - "task_class", - "workload_profile", - "kv_mode", - "coding_profile", - "benchmark_surface", - "benchmark_modifiers", - "workload_shape", - "long_context_contract", - "coding_profile_detail", - "system_expectations", - "reasoning_profile", - "history_visibility", - "context_band", - "adapter_execution_class", -) -_PREFIX_ARTIFACT_CACHE_MAX = 8 -_PREFIX_ARTIFACT_CACHE: OrderedDict[tuple[str, str], dict[str, Any]] = OrderedDict() - - -def _schema_version_tuple(raw_version: Any) -> tuple[int, int, int]: - if raw_version is None: - return (0, 1, 0) - - parts = str(raw_version).split(".") - values: list[int] = [] - for part in parts[:3]: - try: - values.append(int(part)) - except ValueError: - return (0, 0, 0) - while len(values) < 3: - values.append(0) - return tuple(values[:3]) - - -def _schema_version_at_least(raw_version: Any, minimum_version: str) -> bool: - return _schema_version_tuple(raw_version) >= _schema_version_tuple(minimum_version) - - -def _remember_prefix_artifact( - cache_key: tuple[str, str], - prefix_payload: dict[str, Any], -) -> None: - _PREFIX_ARTIFACT_CACHE[cache_key] = prefix_payload - _PREFIX_ARTIFACT_CACHE.move_to_end(cache_key) - while len(_PREFIX_ARTIFACT_CACHE) > _PREFIX_ARTIFACT_CACHE_MAX: - _PREFIX_ARTIFACT_CACHE.popitem(last=False) - - - -def _load_prefix_artifact( - bundle_path: Path, - prefix_ref: str, - prefix_entry: dict[str, Any], -) -> dict[str, Any]: - cache_key = (str(bundle_path), prefix_ref) - cached = _PREFIX_ARTIFACT_CACHE.get(cache_key) - if cached is not None: - _PREFIX_ARTIFACT_CACHE.move_to_end(cache_key) - return cached - - prefix_path = bundle_path.parent / str(prefix_entry.get("relative_path", "")) - raw_prefix = prefix_path.read_bytes() - declared_sha = str(prefix_entry.get("sha256", "")) - actual_sha = 
hashlib.sha256(raw_prefix).hexdigest() - if actual_sha != declared_sha: - detail = { - "bundle_path": str(bundle_path), - "prefix_ref": prefix_ref, - "declared_sha": declared_sha, - "actual_sha": actual_sha, - } - raise ValueError(f"Prefix artifact SHA-256 mismatch: {detail}") - - prefix_payload = json.loads(raw_prefix) - _remember_prefix_artifact(cache_key, prefix_payload) - return prefix_payload - - - -def _merge_prefix_into_trace_replay_cell( - cell: dict[str, Any], - prefix_payload: dict[str, Any], -) -> None: - for field in TRACE_REPLAY_PREFIX_FIELDS: - if field not in cell and field in prefix_payload: - cell[field] = prefix_payload[field] - - prefix_overrides = cell.get("prefix_overrides") - if isinstance(prefix_overrides, dict): - cell.update(prefix_overrides) - - - -def _hydrate_trace_replay_export_payload( - payload: dict[str, Any], - bundle_path: Path, -) -> None: - if not _schema_version_at_least(payload.get("schema_version"), "0.2.0"): - return - - export_cells = list(payload.get("exports", [])) - if not export_cells: - return - - bundle_path_str = str(bundle_path) - has_prefix_ref = any(cell.get("prefix_ref") for cell in export_cells) - has_embedded_events = any("events" in cell for cell in export_cells) - if has_prefix_ref and has_embedded_events: - raise ValueError( - "Mixed legacy/prefix-aware trace replay bundle unsupported in " - f"{bundle_path_str}; rows cannot mix embedded events with prefix_ref." - ) - - missing_prefix_ref = [cell for cell in export_cells if not cell.get("prefix_ref")] - if missing_prefix_ref: - raise ValueError( - f"Prefix-aware trace replay bundle missing prefix_ref in {bundle_path_str}" - ) - - raw_prefix_index = payload.get("prefix_index") - prefix_index = raw_prefix_index if isinstance(raw_prefix_index, dict) else {} - prefix_payloads: dict[str, dict[str, Any]] = {} - for prefix_ref in {str(cell["prefix_ref"]) for cell in export_cells}: - prefix_entry = prefix_index.get(prefix_ref) - if not isinstance(prefix_entry, dict): - raise ValueError(f"unknown prefix_ref {prefix_ref!r} in {bundle_path_str}") - prefix_payloads[prefix_ref] = _load_prefix_artifact( - bundle_path=bundle_path, - prefix_ref=prefix_ref, - prefix_entry=prefix_entry, - ) - - for cell in export_cells: - _merge_prefix_into_trace_replay_cell( - cell, - prefix_payloads[str(cell["prefix_ref"])], - ) - - -@dataclass -class TurnResult: - turn_idx: int - context_len: int - output_len: int - ttft: float = 0.0 - tpot: float = 0.0 - e2el: float = 0.0 - itl: list[float] = field(default_factory=list) - success: bool = True - error: str = "" - request_mode: str = "chat" - actual_context_len: int = 0 - - -@dataclass -class SessionResult: - session_id: str - turns: list[TurnResult] = field(default_factory=list) - total_input_tokens: int = 0 - total_actual_input_tokens: int = 0 - total_output_tokens: int = 0 - total_duration: float = 0.0 - - -@dataclass -class ReplayTurn: - turn_idx: int - turn_id: Any - output_len: int - wait_before_s: float - context_len: int - actual_context_len: int - chat_messages: list[dict[str, Any]] - completion_prompt: str - - -@dataclass -class ReplaySession: - session_id: str - trace_id: str - runtime_stack_id: str - hardware_profile_id: str - canonical_model_id: str - support_status: str - benchmark_certification_status: str - request_mode: str - adapter_id: str - turns: list[ReplayTurn] - - -def _csv_values(raw: Optional[str]) -> set[str] | None: - if raw is None: - return None - values = {item.strip() for item in raw.split(",") if item.strip()} - return values or None 
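Aside: `_csv_values` above and `_matches_filter` just below together implement the CLI's comma-separated filter semantics — `None` means "no filter / allow everything", and empty or all-whitespace input collapses to `None` rather than an empty set. A self-contained illustration (helper bodies copied from this file; the filter values are made up):

```python
from typing import Optional

def _csv_values(raw: Optional[str]) -> set[str] | None:
    if raw is None:
        return None
    values = {item.strip() for item in raw.split(",") if item.strip()}
    return values or None

def _matches_filter(value: str, allowed: set[str] | None) -> bool:
    return allowed is None or value in allowed

# Stray whitespace and trailing commas are dropped during parsing.
assert _csv_values("standalone:vllm, standalone:sglang ,") == {
    "standalone:vllm",
    "standalone:sglang",
}
# An all-whitespace filter collapses to None, i.e. "no filter".
assert _csv_values(" , ") is None
# None matches everything; a set requires exact membership.
assert _matches_filter("standalone:trt_llm", None)
assert not _matches_filter("standalone:trt_llm", {"standalone:vllm"})
```
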
- - -def _matches_filter(value: str, allowed: set[str] | None) -> bool: - return allowed is None or value in allowed - - -def _fallback_text_token_count(text: str) -> int: - stripped = (text or "").strip() - if not stripped: - return 0 - return max(1, math.ceil(len(stripped) / 4)) - - -def build_text_token_counter( - tokenizer_id: Optional[str], - tokenizer_mode: str = "auto", - trust_remote_code: bool = False, -) -> Callable[[str], int]: - if not tokenizer_id: - return _fallback_text_token_count - - try: - from backend_request_func import get_tokenizer - - tokenizer = get_tokenizer( - tokenizer_id, - tokenizer_mode=tokenizer_mode, - trust_remote_code=trust_remote_code, - ) - except Exception as exc: - warnings.warn( - "Falling back to approximate token counting because tokenizer load " - f"failed for {tokenizer_id!r}: {exc}", - stacklevel=2, - ) - return _fallback_text_token_count - - def _count(text: str) -> int: - return len(tokenizer.encode(text or "", add_special_tokens=False)) - - return _count - - -def _render_block_as_text(block: dict[str, Any]) -> str: - block_type = str(block.get("type", "text")) - text = (block.get("text") or "").strip() - if block_type == "text": - return text - if block_type == "code": - return f"[CODE]\n{text}" if text else "[CODE]" - if block_type == "log": - return f"[LOG]\n{text}" if text else "[LOG]" - if block_type == "document": - label = block.get("asset_path") or block.get("uri") or "" - if text and label: - return f"[DOCUMENT: {label}]\n{text}" - if text: - return f"[DOCUMENT]\n{text}" - return f"[DOCUMENT: {label}]" if label else "[DOCUMENT]" - if block_type == "table": - return f"[TABLE]\n{text}" if text else "[TABLE]" - if block_type == "image": - label = block.get("uri") or block.get("asset_path") or text or "image" - return f"[IMAGE: {label}]" - return text or f"[{block_type.upper()}]" - - -def _extract_message_text(message: dict[str, Any]) -> str: - if isinstance(message.get("content"), str): - body = message["content"] - elif isinstance(message.get("content"), list): - parts: list[str] = [] - for part in message["content"]: - part_type = str(part.get("type", "text")) - if part_type == "text": - parts.append((part.get("text") or "").strip()) - elif part_type == "image_url": - url = "" - if isinstance(part.get("image_url"), dict): - url = part["image_url"].get("url") or "" - parts.append(f"[IMAGE: {url or 'image'}]") - body = "\n\n".join(item for item in parts if item) - else: - content_blocks = message.get("content_blocks") or [] - body = "\n\n".join( - filter(None, (_render_block_as_text(block) for block in content_blocks)) - ) - - role = str(message.get("role", "user")) - if role in {"tool", "retrieval", "execution"}: - prefix = f"[{ROLE_LABELS.get(role, role.upper())} RESULT]" - return f"{prefix}\n{body}" if body else prefix - return body - - -def _message_to_chat_payload(message: dict[str, Any]) -> dict[str, Any]: - role = str(message.get("role", "user")) - projected_role = role if role in {"system", "user", "assistant"} else "user" - content_blocks = message.get("content_blocks") or [] - - if not content_blocks: - return {"role": projected_role, "content": _extract_message_text(message)} - - parts: list[dict[str, Any]] = [] - if role not in {"system", "user", "assistant"}: - parts.append( - { - "type": "text", - "text": f"[{ROLE_LABELS.get(role, role.upper())} RESULT]", - } - ) - - for block in content_blocks: - block_type = str(block.get("type", "text")) - if block_type == "image" and block.get("uri"): - parts.append( - { - "type": 
"image_url", - "image_url": {"url": block["uri"]}, - } - ) - continue - - text = _render_block_as_text(block) - if text: - parts.append({"type": "text", "text": text}) - - if not parts: - return {"role": projected_role, "content": ""} - if len(parts) == 1 and parts[0]["type"] == "text": - return {"role": projected_role, "content": parts[0]["text"]} - return {"role": projected_role, "content": parts} - - -def _message_token_estimate( - message: dict[str, Any], - count_text_tokens: Callable[[str], int], - image_token_estimate: int, -) -> int: - content_blocks = message.get("content_blocks") or [] - if not content_blocks: - return count_text_tokens(_extract_message_text(message)) - - total = 0 - role = str(message.get("role", "user")) - if role in {"tool", "retrieval", "execution"}: - total += count_text_tokens(f"[{ROLE_LABELS.get(role, role.upper())} RESULT]") - - for block in content_blocks: - block_type = str(block.get("type", "text")) - if block_type == "image": - total += int( - block.get("asset_token_count") - or block.get("metadata", {}).get("token_count") - or image_token_estimate - ) - continue - if block.get("asset_token_count") and block.get("asset_path"): - total += int(block["asset_token_count"]) - continue - total += count_text_tokens(_render_block_as_text(block)) - return total - - -def _chat_payload_token_count( - chat_messages: list[dict[str, Any]], - count_text_tokens: Callable[[str], int], -) -> int: - """Count tokens in the rendered chat payload that will actually be sent over HTTP.""" - total = 0 - for msg in chat_messages: - content = msg.get("content", "") - if isinstance(content, str): - total += count_text_tokens(content) - elif isinstance(content, list): - for part in content: - if part.get("type") == "text": - total += count_text_tokens(part.get("text", "")) - elif part.get("type") == "image_url": - total += DEFAULT_IMAGE_TOKEN_ESTIMATE - return total - - -def _messages_to_completion_prompt(messages: list[dict[str, Any]]) -> str: - prompt_parts: list[str] = [] - for message in messages: - role = ROLE_LABELS.get(str(message.get("role", "user")), "USER") - body = _extract_message_text(message).strip() - prompt_parts.append(f"{role}:\n{body}" if body else f"{role}:") - prompt_parts.append("ASSISTANT:\n") - return "\n\n".join(prompt_parts) - - -def resolve_request_mode(runtime_stack_id: str, requested_mode: str) -> str: - if requested_mode != "auto": - return requested_mode - if runtime_stack_id in CHAT_NATIVE_RUNTIMES: - return "chat" - if runtime_stack_id in COMPLETIONS_PREFERRED_RUNTIMES: - return "completions" - return "chat" - - -def _parse_prometheus_sample(line: str) -> tuple[str, float] | None: - """Parse a Prometheus sample line into ``(metric_name, value)``.""" - raw_line = line.strip() - if not raw_line or raw_line.startswith("#"): - return None - - try: - metric_with_labels, raw_value = raw_line.rsplit(maxsplit=1) - metric_name = metric_with_labels.split("{", 1)[0] - return metric_name, float(raw_value) - except (TypeError, ValueError): - return None - - -def _resolve_output_len( - raw_output_len: Any, - fallback_output_len: int, - output_len_cap: Optional[int], -) -> int: - try: - output_len = int(raw_output_len) - except (TypeError, ValueError): - output_len = fallback_output_len - if output_len <= 0: - output_len = fallback_output_len - if output_len_cap is not None: - output_len = min(output_len, output_len_cap) - return output_len - - -def _build_turn_from_messages( - turn_idx: int, - turn_id: Any, - messages: list[dict[str, Any]], - output_len: int, - 
wait_before_s: float, - request_mode: str, - count_text_tokens: Callable[[str], int], - image_token_estimate: int, -) -> ReplayTurn: - chat_messages = [_message_to_chat_payload(message) for message in messages] - completion_prompt = _messages_to_completion_prompt(messages) - if request_mode == "chat": - context_len = sum( - _message_token_estimate(message, count_text_tokens, image_token_estimate) - for message in messages - ) - actual_context_len = _chat_payload_token_count(chat_messages, count_text_tokens) - else: - context_len = count_text_tokens(completion_prompt) - actual_context_len = context_len # completions mode already uses rendered text - return ReplayTurn( - turn_idx=turn_idx, - turn_id=turn_id, - output_len=output_len, - wait_before_s=wait_before_s, - context_len=context_len, - actual_context_len=actual_context_len, - chat_messages=chat_messages, - completion_prompt=completion_prompt, - ) - - -def _build_session_from_multiturn_cell( - cell: dict[str, Any], - request_mode: str, - count_text_tokens: Callable[[str], int], - image_token_estimate: int, - ignore_waits: bool, - fallback_output_len: int, - output_len_cap: Optional[int], - max_turns_per_session: Optional[int], -) -> ReplaySession: - session = cell["session"] - turns: list[ReplayTurn] = [] - for raw_turn in session.get("turns", []): - turns.append( - _build_turn_from_messages( - turn_idx=int(raw_turn.get("turn_idx", len(turns))), - turn_id=raw_turn.get("turn_id"), - messages=list(raw_turn.get("messages", [])), - output_len=_resolve_output_len( - raw_turn.get("expected_output_tokens"), - fallback_output_len, - output_len_cap, - ), - wait_before_s=0.0 - if ignore_waits - else float(raw_turn.get("wait_before_ms", 0)) / 1000.0, - request_mode=request_mode, - count_text_tokens=count_text_tokens, - image_token_estimate=image_token_estimate, - ) - ) - if max_turns_per_session is not None and len(turns) >= max_turns_per_session: - break - - return ReplaySession( - session_id=str(session.get("session_id", cell["trace_id"])), - trace_id=str(cell["trace_id"]), - runtime_stack_id=str(cell["runtime_stack_id"]), - hardware_profile_id=str(cell["hardware_profile_id"]), - canonical_model_id=str(cell["canonical_model_id"]), - support_status=str(cell.get("support_status", "unknown")), - benchmark_certification_status=str( - cell.get("benchmark_certification_status", "unknown") - ), - request_mode=request_mode, - adapter_id="inferencex_multiturn", - turns=turns, - ) - - -def _build_session_from_trace_replay_cell( - cell: dict[str, Any], - request_mode: str, - count_text_tokens: Callable[[str], int], - image_token_estimate: int, - ignore_waits: bool, - fallback_output_len: int, - output_len_cap: Optional[int], - max_turns_per_session: Optional[int], -) -> ReplaySession: - turns: list[ReplayTurn] = [] - prior_offset_ms = 0 - for index, event in enumerate(cell.get("events", [])): - offset_ms = int(event.get("arrival_time_offset_ms", 0) or 0) - wait_before_ms = 0 if index == 0 else max(0, offset_ms - prior_offset_ms) - prior_offset_ms = offset_ms - turns.append( - _build_turn_from_messages( - turn_idx=index, - turn_id=event.get("turn_id"), - messages=list(event.get("input_messages", [])), - output_len=_resolve_output_len( - event.get("target_output_tokens"), - fallback_output_len, - output_len_cap, - ), - wait_before_s=0.0 if ignore_waits else wait_before_ms / 1000.0, - request_mode=request_mode, - count_text_tokens=count_text_tokens, - image_token_estimate=image_token_estimate, - ) - ) - if max_turns_per_session is not None and len(turns) >= 
max_turns_per_session: - break - - return ReplaySession( - session_id=str(cell.get("trace_metadata", {}).get("session_id", cell["trace_id"])), - trace_id=str(cell["trace_id"]), - runtime_stack_id=str(cell["runtime_stack_id"]), - hardware_profile_id=str(cell["hardware_profile_id"]), - canonical_model_id=str(cell["canonical_model_id"]), - support_status=str(cell.get("support_status", "unknown")), - benchmark_certification_status=str( - cell.get("benchmark_certification_status", "unknown") - ), - request_mode=request_mode, - adapter_id="inferencex_trace_replay", - turns=turns, - ) - - -def load_replay_sessions( - export_file: str, - count_text_tokens: Callable[[str], int], - runtime_stack_ids: set[str] | None = None, - hardware_profile_ids: set[str] | None = None, - canonical_model_ids: set[str] | None = None, - trace_ids: set[str] | None = None, - support_statuses: set[str] | None = None, - request_mode: str = "auto", - image_token_estimate: int = DEFAULT_IMAGE_TOKEN_ESTIMATE, - ignore_waits: bool = False, - fallback_output_len: int = DEFAULT_FALLBACK_OUTPUT_LEN, - output_len_cap: Optional[int] = None, - session_offset: int = 0, - max_sessions: Optional[int] = None, - max_turns_per_session: Optional[int] = None, - shuffle_sessions: bool = False, - seed: int = 0, - allow_mixed_selection: bool = False, -) -> tuple[list[ReplaySession], dict[str, Any]]: - bundle_path = Path(export_file).resolve() - payload = json.loads(bundle_path.read_text()) - adapter_id = str(payload.get("adapter_id", "unknown")) - if adapter_id == "inferencex_trace_replay": - _hydrate_trace_replay_export_payload(payload, bundle_path) - export_cells = list(payload.get("exports", [])) - if adapter_id not in {"inferencex_multiturn", "inferencex_trace_replay"}: - raise ValueError( - f"Unsupported export adapter {adapter_id!r}. Expected " - "'inferencex_multiturn' or 'inferencex_trace_replay'." - ) - - selected_cells = [ - cell - for cell in export_cells - if _matches_filter(str(cell.get("runtime_stack_id", "")), runtime_stack_ids) - and _matches_filter(str(cell.get("hardware_profile_id", "")), hardware_profile_ids) - and _matches_filter(str(cell.get("canonical_model_id", "")), canonical_model_ids) - and _matches_filter(str(cell.get("trace_id", "")), trace_ids) - and _matches_filter(str(cell.get("support_status", "")), support_statuses) - ] - if not selected_cells: - raise ValueError( - "No export cells matched the requested filters. " - "Check runtime_stack_id / hardware_profile_id / canonical_model_id / " - "trace_id / support_status." - ) - - if shuffle_sessions: - random.Random(seed).shuffle(selected_cells) - - if session_offset: - selected_cells = selected_cells[session_offset:] - if max_sessions is not None: - selected_cells = selected_cells[:max_sessions] - if not selected_cells: - raise ValueError("Selection became empty after applying session_offset/max_sessions.") - - uniqueness = { - "runtime_stack_id": sorted({str(cell["runtime_stack_id"]) for cell in selected_cells}), - "hardware_profile_id": sorted({str(cell["hardware_profile_id"]) for cell in selected_cells}), - "canonical_model_id": sorted({str(cell["canonical_model_id"]) for cell in selected_cells}), - } - if not allow_mixed_selection: - mixed_fields = [field for field, values in uniqueness.items() if len(values) > 1] - if mixed_fields: - details = ", ".join(f"{field}={uniqueness[field]}" for field in mixed_fields) - raise ValueError( - "Selected export cells span multiple target server identities; " - f"filter more narrowly or pass --allow-mixed-selection. 
Mixed fields: {details}" - ) - - sessions: list[ReplaySession] = [] - for cell in selected_cells: - resolved_mode = resolve_request_mode(str(cell["runtime_stack_id"]), request_mode) - if adapter_id == "inferencex_multiturn": - sessions.append( - _build_session_from_multiturn_cell( - cell=cell, - request_mode=resolved_mode, - count_text_tokens=count_text_tokens, - image_token_estimate=image_token_estimate, - ignore_waits=ignore_waits, - fallback_output_len=fallback_output_len, - output_len_cap=output_len_cap, - max_turns_per_session=max_turns_per_session, - ) - ) - else: - sessions.append( - _build_session_from_trace_replay_cell( - cell=cell, - request_mode=resolved_mode, - count_text_tokens=count_text_tokens, - image_token_estimate=image_token_estimate, - ignore_waits=ignore_waits, - fallback_output_len=fallback_output_len, - output_len_cap=output_len_cap, - max_turns_per_session=max_turns_per_session, - ) - ) - - selection_metadata = { - "adapter_id": adapter_id, - "export_file": str(export_file), - "selected_sessions": len(sessions), - "trace_ids": [session.trace_id for session in sessions], - "runtime_stack_ids": sorted({session.runtime_stack_id for session in sessions}), - "hardware_profile_ids": sorted({session.hardware_profile_id for session in sessions}), - "canonical_model_ids": sorted({session.canonical_model_id for session in sessions}), - "support_statuses": sorted({session.support_status for session in sessions}), - "support_status_counts": { - status: sum(1 for session in sessions if session.support_status == status) - for status in sorted({session.support_status for session in sessions}) - }, - "benchmark_certification_statuses": sorted( - {session.benchmark_certification_status for session in sessions} - ), - "benchmark_certification_status_counts": { - status: sum( - 1 - for session in sessions - if session.benchmark_certification_status == status - ) - for status in sorted( - {session.benchmark_certification_status for session in sessions} - ) - }, - "request_mode_mix": { - mode: sum(1 for session in sessions if session.request_mode == mode) - for mode in sorted({session.request_mode for session in sessions}) - }, - } - return sessions, selection_metadata - - -async def _iter_sse_lines( - response: aiohttp.ClientResponse, -): - """Yield individual SSE data payloads from a streaming response. - - Buffers partial lines across TCP chunks and splits multi-line chunks. - Handles the common case where multiple ``data: {...}`` frames arrive - in a single TCP read, or a single frame is split across reads. 
- """ - buffer = b"" - async for chunk in response.content: - buffer += chunk - while b"\n" in buffer: - line, buffer = buffer.split(b"\n", 1) - line = line.strip() - if not line: - continue - decoded = line.decode("utf-8") - if decoded.startswith(":"): - continue # SSE comment / keep-alive - if decoded.startswith("data: "): - payload_str = decoded[6:].strip() - elif decoded.startswith("data:"): - payload_str = decoded[5:].strip() - else: - continue - if payload_str == "[DONE]": - return - yield payload_str - # Flush remaining buffer - remaining = buffer.strip() - if remaining: - decoded = remaining.decode("utf-8") - for prefix in ("data: ", "data:"): - if decoded.startswith(prefix): - payload_str = decoded[len(prefix):].strip() - if payload_str and payload_str != "[DONE]": - yield payload_str - break - - -async def _stream_chat_request( - api_url: str, - payload: dict[str, Any], - headers: dict[str, str], - context_len: int, - count_text_tokens: Callable[[str], int], - request_mode: str, -) -> tuple[TurnResult, int]: - turn = TurnResult( - turn_idx=-1, - context_len=context_len, - output_len=0, - success=False, - request_mode=request_mode, - ) - generated_text = "" - ttft = 0.0 - st = time.perf_counter() - most_recent_timestamp = st - - async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: - async with session.post(url=api_url, json=payload, headers=headers) as response: - if response.status != 200: - error_text = (await response.text()).strip() - turn.error = f"HTTP {response.status}: {error_text or response.reason}" - return turn, response.status - - async for sse_payload in _iter_sse_lines(response): - data = json.loads(sse_payload) - if choices := data.get("choices"): - delta = choices[0].get("delta", {}) - content = delta.get("content") - if isinstance(content, list): - content = "".join( - part.get("text", "") - for part in content - if isinstance(part, dict) and part.get("type") == "text" - ) - if content: - timestamp = time.perf_counter() - if ttft == 0.0: - ttft = timestamp - st - turn.ttft = ttft - else: - turn.itl.append(timestamp - most_recent_timestamp) - most_recent_timestamp = timestamp - generated_text += content - elif usage := data.get("usage"): - turn.output_len = int(usage.get("completion_tokens") or 0) - - turn.e2el = max(0.0, most_recent_timestamp - st) - turn.success = True - if turn.output_len == 0 and generated_text: - turn.output_len = count_text_tokens(generated_text) - if turn.output_len > 1: - turn.tpot = (turn.e2el - turn.ttft) / (turn.output_len - 1) - return turn, 200 - - -async def _send_chat_turn( - chat_messages: list[dict[str, Any]], - model_id: str, - model_name: Optional[str], - api_url: str, - output_len: int, - context_len: int, - count_text_tokens: Callable[[str], int], - ignore_eos: bool = False, -) -> TurnResult: - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', 'EMPTY')}", - } - payload_base = { - "model": model_name or model_id, - "messages": chat_messages, - "temperature": 0.0, - "stream": True, - "stream_options": {"include_usage": True}, - } - if ignore_eos: - payload_base["ignore_eos"] = True - - errors: list[str] = [] - for max_tokens_key in ("max_completion_tokens", "max_tokens"): - payload = {**payload_base, max_tokens_key: output_len} - turn, status = await _stream_chat_request( - api_url=api_url, - payload=payload, - headers=headers, - context_len=context_len, - count_text_tokens=count_text_tokens, - request_mode="chat", - ) - if 
turn.success: - return turn - errors.append(turn.error) - if status not in {400, 404, 422}: - break - - return TurnResult( - turn_idx=-1, - context_len=context_len, - output_len=0, - success=False, - error=" | ".join(error for error in errors if error), - request_mode="chat", - ) - - -async def _send_completion_turn( - prompt: str, - model_id: str, - model_name: Optional[str], - api_url: str, - output_len: int, - context_len: int, - count_text_tokens: Callable[[str], int], - ignore_eos: bool = False, -) -> TurnResult: - payload = { - "model": model_name or model_id, - "prompt": prompt, - "temperature": 0.0, - "max_tokens": output_len, - "stream": True, - "stream_options": {"include_usage": True}, - } - if ignore_eos: - payload["ignore_eos"] = True - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', 'EMPTY')}", - } - - turn = TurnResult( - turn_idx=-1, - context_len=context_len, - output_len=0, - success=False, - request_mode="completions", - ) - generated_text = "" - ttft = 0.0 - st = time.perf_counter() - most_recent_timestamp = st - - try: - async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: - async with session.post(url=api_url, json=payload, headers=headers) as response: - if response.status != 200: - error_text = (await response.text()).strip() - turn.error = f"HTTP {response.status}: {error_text or response.reason}" - return turn - - async for sse_payload in _iter_sse_lines(response): - data = json.loads(sse_payload) - if choices := data.get("choices"): - choice = choices[0] - content = choice.get("text") - if content is None: - delta = choice.get("delta", {}) - content = delta.get("content") - if isinstance(content, list): - content = "".join( - part.get("text", "") - for part in content - if isinstance(part, dict) and part.get("type") == "text" - ) - if content: - timestamp = time.perf_counter() - if ttft == 0.0: - ttft = timestamp - st - turn.ttft = ttft - else: - turn.itl.append(timestamp - most_recent_timestamp) - most_recent_timestamp = timestamp - generated_text += content - elif usage := data.get("usage"): - turn.output_len = int(usage.get("completion_tokens") or 0) - except Exception as exc: - turn.error = str(exc) - return turn - - turn.e2el = max(0.0, most_recent_timestamp - st) - turn.success = True - if turn.output_len == 0 and generated_text: - turn.output_len = count_text_tokens(generated_text) - if turn.output_len > 1: - turn.tpot = (turn.e2el - turn.ttft) / (turn.output_len - 1) - return turn - - -async def poll_server_metrics(api_url: str, interval: float = 2.0) -> list[dict[str, float]]: - """Poll ``/metrics`` periodically to capture KV / cache status.""" - import urllib.parse - - parsed = urllib.parse.urlparse(api_url) - metrics_url = f"{parsed.scheme}://{parsed.netloc}/metrics" - metrics_history: list[dict[str, float]] = [] - - try: - async with aiohttp.ClientSession(trust_env=True) as session: - while True: - try: - async with session.get(metrics_url, timeout=aiohttp.ClientTimeout(total=5.0)) as response: - if response.status == 200: - text = await response.text() - snapshot: dict[str, float] = {} - for line in text.split("\n"): - parsed_line = _parse_prometheus_sample(line) - if parsed_line is None: - continue - metric_name, metric_value = parsed_line - if metric_name == "vllm:gpu_cache_usage_perc": - snapshot["vllm_gpu_cache_usage"] = metric_value - elif metric_name == "vllm:cpu_cache_usage_perc": - snapshot["vllm_cpu_cache_usage"] = metric_value - elif metric_name 
== "sglang:cache_hit_rate": - snapshot["sglang_cache_hit_rate"] = metric_value - elif metric_name == "sglang:kv_cache_usage": - snapshot["sglang_kv_cache_usage"] = metric_value - elif metric_name == "sglang:token_usage": - snapshot["sglang_token_usage"] = metric_value - elif metric_name == "vllm:num_preemptions_total": - snapshot["vllm_preemptions_total"] = metric_value - elif metric_name == "vllm:num_requests_running": - snapshot["vllm_requests_running"] = metric_value - elif metric_name == "vllm:num_requests_waiting": - snapshot["vllm_requests_waiting"] = metric_value - if snapshot: - metrics_history.append(snapshot) - except Exception: - pass - await asyncio.sleep(interval) - except asyncio.CancelledError: - pass - - return metrics_history - - -def _percentile(values: list[float], percentile: float) -> float: - if not values: - return 0.0 - return float(np.percentile(values, percentile)) - - -def calculate_multiturn_metrics( - session_results: list[SessionResult], - max_turns: int, - selected_percentiles: list[float], -) -> dict[str, Any]: - ms = 1000.0 - per_turn: dict[str, dict[str, Any]] = {} - - for turn_index in range(max_turns): - ttfts: list[float] = [] - tpots: list[float] = [] - e2els: list[float] = [] - context_lens: list[int] = [] - actual_context_lens: list[int] = [] - output_lens: list[int] = [] - successes = 0 - for session in session_results: - if turn_index < len(session.turns): - turn = session.turns[turn_index] - if turn.success: - ttfts.append(turn.ttft) - tpots.append(turn.tpot) - e2els.append(turn.e2el) - context_lens.append(turn.context_len) - actual_context_lens.append(turn.actual_context_len) - output_lens.append(turn.output_len) - successes += 1 - - key = f"turn_{turn_index + 1}" - metrics: dict[str, Any] = { - "completed": successes, - "mean_context_len": float(np.mean(context_lens)) if context_lens else 0.0, - "mean_actual_context_len": float(np.mean(actual_context_lens)) if actual_context_lens else 0.0, - "mean_output_len": float(np.mean(output_lens)) if output_lens else 0.0, - } - for label, values in (("ttft", ttfts), ("tpot", tpots), ("e2el", e2els)): - metrics[f"mean_{label}_ms"] = float(np.mean(values)) * ms if values else 0.0 - metrics[f"median_{label}_ms"] = float(np.median(values)) * ms if values else 0.0 - metrics[f"std_{label}_ms"] = float(np.std(values)) * ms if values else 0.0 - for percentile in selected_percentiles: - percentile_label = str(int(percentile)) if int(percentile) == percentile else str(percentile) - metrics[f"p{percentile_label}_{label}_ms"] = _percentile(values, percentile) * ms - per_turn[key] = metrics - - all_ttfts: list[float] = [] - all_tpots: list[float] = [] - all_e2els: list[float] = [] - total_input = 0 - total_actual_input = 0 - total_output = 0 - completed_sessions = 0 - total_wall = 0.0 - max_actual_context_per_turn = 0 - - for session in session_results: - if session.turns and all(turn.success for turn in session.turns): - completed_sessions += 1 - total_input += session.total_input_tokens - total_actual_input += session.total_actual_input_tokens - total_output += session.total_output_tokens - total_wall = max(total_wall, session.total_duration) - for turn in session.turns: - if turn.success: - all_ttfts.append(turn.ttft) - all_tpots.append(turn.tpot) - all_e2els.append(turn.e2el) - if turn.actual_context_len > max_actual_context_per_turn: - max_actual_context_per_turn = turn.actual_context_len - - aggregate: dict[str, Any] = { - "completed_sessions": completed_sessions, - "total_sessions": len(session_results), - 
"total_input_tokens": total_input, - "total_actual_input_tokens": total_actual_input, - "max_actual_context_len_per_turn": max_actual_context_per_turn, - "total_output_tokens": total_output, - "total_wall_time_s": total_wall, - "session_throughput_sps": completed_sessions / total_wall if total_wall > 0 else 0.0, - "output_throughput_tps": total_output / total_wall if total_wall > 0 else 0.0, - "total_token_throughput_tps": (total_input + total_output) / total_wall if total_wall > 0 else 0.0, - } - for label, values in (("ttft", all_ttfts), ("tpot", all_tpots), ("e2el", all_e2els)): - aggregate[f"mean_{label}_ms"] = float(np.mean(values)) * ms if values else 0.0 - aggregate[f"median_{label}_ms"] = float(np.median(values)) * ms if values else 0.0 - aggregate[f"std_{label}_ms"] = float(np.std(values)) * ms if values else 0.0 - for percentile in selected_percentiles: - percentile_label = str(int(percentile)) if int(percentile) == percentile else str(percentile) - aggregate[f"p{percentile_label}_{label}_ms"] = _percentile(values, percentile) * ms - - return {"per_turn_metrics": per_turn, "aggregate_metrics": aggregate} - - -async def _run_replay_session( - session: ReplaySession, - model_id: str, - model_name: Optional[str], - chat_api_url: str, - completion_api_url: str, - count_text_tokens: Callable[[str], int], - pbar: Optional[tqdm], - ignore_eos: bool, -) -> SessionResult: - result = SessionResult(session_id=session.session_id) - start = time.perf_counter() - - for replay_turn in session.turns: - if replay_turn.wait_before_s > 0: - await asyncio.sleep(replay_turn.wait_before_s) - - if session.request_mode == "chat": - turn_result = await _send_chat_turn( - chat_messages=replay_turn.chat_messages, - model_id=model_id, - model_name=model_name, - api_url=chat_api_url, - output_len=replay_turn.output_len, - context_len=replay_turn.context_len, - count_text_tokens=count_text_tokens, - ignore_eos=ignore_eos, - ) - else: - turn_result = await _send_completion_turn( - prompt=replay_turn.completion_prompt, - model_id=model_id, - model_name=model_name, - api_url=completion_api_url, - output_len=replay_turn.output_len, - context_len=replay_turn.context_len, - count_text_tokens=count_text_tokens, - ignore_eos=ignore_eos, - ) - - turn_result.turn_idx = replay_turn.turn_idx - turn_result.actual_context_len = replay_turn.actual_context_len - result.turns.append(turn_result) - if turn_result.success: - result.total_input_tokens += turn_result.context_len - result.total_actual_input_tokens += turn_result.actual_context_len - result.total_output_tokens += turn_result.output_len - if pbar is not None: - pbar.update(1) - - result.total_duration = time.perf_counter() - start - return result - - -async def _run_warmup_sessions( - sessions: list[ReplaySession], - model_id: str, - model_name: Optional[str], - chat_api_url: str, - completion_api_url: str, - count_text_tokens: Callable[[str], int], - num_warmup_sessions: int, - ignore_eos: bool, -) -> None: - if num_warmup_sessions <= 0 or not sessions: - return - - print(f"Running {num_warmup_sessions} warmup session(s) (results discarded) ...") - warmup_jobs: list[asyncio.Task[SessionResult]] = [] - for index in range(num_warmup_sessions): - source = sessions[index % len(sessions)] - warmup_turns = [ - ReplayTurn( - turn_idx=turn.turn_idx, - turn_id=turn.turn_id, - output_len=turn.output_len, - wait_before_s=0.0, - context_len=turn.context_len, - actual_context_len=turn.actual_context_len, - chat_messages=turn.chat_messages, - 
completion_prompt=turn.completion_prompt, - ) - for turn in source.turns[: min(2, len(source.turns))] - ] - warmup_jobs.append( - asyncio.create_task( - _run_replay_session( - session=ReplaySession( - session_id=f"warmup-{index}", - trace_id=source.trace_id, - runtime_stack_id=source.runtime_stack_id, - hardware_profile_id=source.hardware_profile_id, - canonical_model_id=source.canonical_model_id, - support_status=source.support_status, - benchmark_certification_status=source.benchmark_certification_status, - request_mode=source.request_mode, - adapter_id=source.adapter_id, - turns=warmup_turns, - ), - model_id=model_id, - model_name=model_name, - chat_api_url=chat_api_url, - completion_api_url=completion_api_url, - count_text_tokens=count_text_tokens, - pbar=None, - ignore_eos=ignore_eos, - ) - ) - ) - - results = await asyncio.gather(*warmup_jobs, return_exceptions=True) - succeeded = sum( - 1 - for result in results - if isinstance(result, SessionResult) and any(turn.success for turn in result.turns) - ) - failed = num_warmup_sessions - succeeded - if failed: - print( - f" ⚠️ {failed}/{num_warmup_sessions} warmup session(s) failed. " - "Check the server endpoint and selected export cell." - ) - else: - print(f" ✅ {succeeded} warmup session(s) completed successfully.") - print() - - -async def run_export_replay_benchmark( - sessions: list[ReplaySession], - selection_metadata: dict[str, Any], - model_id: str, - model_name: Optional[str], - chat_api_url: str, - completion_api_url: str, - count_text_tokens: Callable[[str], int], - max_concurrency: int, - selected_percentiles: list[float], - disable_tqdm: bool, - num_warmup_sessions: int = 1, - ignore_eos: bool = False, -) -> dict[str, Any]: - if not sessions: - raise ValueError("No replay sessions were selected.") - - max_turns = max(len(session.turns) for session in sessions) - total_turns = sum(len(session.turns) for session in sessions) - - print("============================================================") - print(" Export Replay Selection") - print("============================================================") - print(f" Adapter: {selection_metadata['adapter_id']}") - print(f" Sessions selected: {selection_metadata['selected_sessions']}") - print(f" Runtime stack(s): {', '.join(selection_metadata['runtime_stack_ids'])}") - print(f" Hardware profile(s): {', '.join(selection_metadata['hardware_profile_ids'])}") - print(f" Canonical model(s): {', '.join(selection_metadata['canonical_model_ids'])}") - print( - " Support status(es): " - f"{', '.join(selection_metadata['support_statuses'])}" - ) - print( - " Certification status: " - f"{', '.join(selection_metadata['benchmark_certification_statuses'])}" - ) - print(f" Request mode mix: {selection_metadata['request_mode_mix']}") - print(f" Total turns: {total_turns}") - print("============================================================") - print() - - await _run_warmup_sessions( - sessions=sessions, - model_id=model_id, - model_name=model_name, - chat_api_url=chat_api_url, - completion_api_url=completion_api_url, - count_text_tokens=count_text_tokens, - num_warmup_sessions=num_warmup_sessions, - ignore_eos=ignore_eos, - ) - - pbar = None if disable_tqdm else tqdm(total=total_turns, desc="turns") - semaphore = asyncio.Semaphore(max_concurrency) - - async def _limited_run(session: ReplaySession) -> SessionResult: - async with semaphore: - return await _run_replay_session( - session=session, - model_id=model_id, - model_name=model_name, - chat_api_url=chat_api_url, - 
completion_api_url=completion_api_url, - count_text_tokens=count_text_tokens, - pbar=pbar, - ignore_eos=ignore_eos, - ) - - print( - f"Starting export replay benchmark: {len(sessions)} sessions, " - f"max_turns={max_turns}, max_concurrency={max_concurrency}" - ) - benchmark_start = time.perf_counter() - metrics_task = asyncio.create_task(poll_server_metrics(chat_api_url, interval=2.0)) - jobs = [asyncio.create_task(_limited_run(session)) for session in sessions] - session_results = await asyncio.gather(*jobs) - benchmark_duration = time.perf_counter() - benchmark_start - - metrics_task.cancel() - try: - server_metrics = await metrics_task - except asyncio.CancelledError: - server_metrics = [] - - if pbar is not None: - pbar.close() - - metrics = calculate_multiturn_metrics( - session_results=session_results, - max_turns=max_turns, - selected_percentiles=selected_percentiles, - ) - aggregate = metrics["aggregate_metrics"] - per_turn = metrics["per_turn_metrics"] - - cache_usage_avg = 0.0 - cache_hit_rate_avg = 0.0 - gpu_cache_usage_avg = 0.0 - gpu_cache_usage_peak = 0.0 - cpu_cache_usage_avg = 0.0 - cpu_cache_usage_peak = 0.0 - gpu_cache_metric_name: str | None = None - cpu_cache_metric_name: str | None = None - observability_status = "no_cache_metrics" - cpu_samples: list[float] = [] - kv_offload_observed = False - if server_metrics: - vllm_gpu_samples = [ - item["vllm_gpu_cache_usage"] - for item in server_metrics - if "vllm_gpu_cache_usage" in item - ] - sglang_gpu_samples: list[float] = [] - saw_sglang_kv_metric = False - saw_sglang_token_metric = False - for item in server_metrics: - if "sglang_kv_cache_usage" in item: - sglang_gpu_samples.append(item["sglang_kv_cache_usage"]) - saw_sglang_kv_metric = True - elif "sglang_token_usage" in item: - sglang_gpu_samples.append(item["sglang_token_usage"]) - saw_sglang_token_metric = True - - if saw_sglang_kv_metric: - gpu_cache_metric_name = "sglang:kv_cache_usage" - elif saw_sglang_token_metric: - gpu_cache_metric_name = "sglang:token_usage" - - if vllm_gpu_samples: - gpu_samples = vllm_gpu_samples - gpu_cache_metric_name = "vllm:gpu_cache_usage_perc" - else: - gpu_samples = sglang_gpu_samples - - cpu_samples = [ - item["vllm_cpu_cache_usage"] - for item in server_metrics - if "vllm_cpu_cache_usage" in item - ] - if cpu_samples: - cpu_cache_metric_name = "vllm:cpu_cache_usage_perc" - cache_hit_samples = [ - item["sglang_cache_hit_rate"] - for item in server_metrics - if "sglang_cache_hit_rate" in item - ] - - if gpu_samples: - gpu_cache_usage_avg = float(np.mean(gpu_samples)) - gpu_cache_usage_peak = float(np.max(gpu_samples)) - cache_usage_avg = gpu_cache_usage_avg - if cpu_samples: - cpu_cache_usage_avg = float(np.mean(cpu_samples)) - cpu_cache_usage_peak = float(np.max(cpu_samples)) - kv_offload_observed = any(sample > 0.0 for sample in cpu_samples) - if cache_hit_samples: - cache_hit_rate_avg = float(np.mean(cache_hit_samples)) - if cpu_samples: - observability_status = "direct_cpu_cache_metric" - elif gpu_samples or cache_hit_samples: - observability_status = "indirect_without_cpu_cache_metric" - - print() - print("{s:{c}^{n}}".format(s=" Export Replay Benchmark Result ", n=60, c="=")) - print(f" {'Completed sessions:':<35} {aggregate['completed_sessions']}/{aggregate['total_sessions']}") - print(f" {'Benchmark duration (s):':<35} {benchmark_duration:.2f}") - print(f" {'Total input tokens (estimated):':<35} {aggregate['total_input_tokens']}") - print(f" {'Total input tokens (actual sent):':<35} {aggregate['total_actual_input_tokens']}") 
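# Note on the two input-token totals printed above (annotation, not original
# file content): "estimated" sums per-message estimates, including declared
# asset_token_count values and the fixed image token estimate, while
# "actual sent" re-counts the rendered chat payload that actually went over
# HTTP (_chat_payload_token_count). A large gap between the two typically
# indicates assets that were declared but not fully inlined into messages.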
- print(f" {'Max actual context/turn:':<35} {aggregate['max_actual_context_len_per_turn']}") - print(f" {'Total output tokens:':<35} {aggregate['total_output_tokens']}") - print(f" {'Session throughput (sessions/s):':<35} {aggregate['session_throughput_sps']:.2f}") - print(f" {'Output throughput (tok/s):':<35} {aggregate['output_throughput_tps']:.2f}") - print(f" {'Total throughput (tok/s):':<35} {aggregate['total_token_throughput_tps']:.2f}") - if server_metrics: - print() - print(f" {'Server KV Cache Usage (avg):':<35} {cache_usage_avg:.1%}") - if cpu_cache_metric_name: - print(f" {'Server CPU Cache Usage (avg):':<35} {cpu_cache_usage_avg:.1%}") - if cache_hit_rate_avg > 0: - print(f" {'Prefix Cache Hit Rate (avg):':<35} {cache_hit_rate_avg:.1%}") - if observability_status == "indirect_without_cpu_cache_metric": - print( - f" {'Offload observability:':<35} " - "indirect only (no direct CPU cache metric)" - ) - print() - print("{s:{c}^{n}}".format(s=" Per-Turn TTFT Progression ", n=60, c="-")) - print(f" {'Turn':<8} {'Est Ctx':<10} {'Act Ctx':<10} {'Mean TTFT':<14} {'P99 TTFT':<14} {'Mean E2EL':<14}") - print(f" {'─'*8} {'─'*10} {'─'*10} {'─'*14} {'─'*14} {'─'*14}") - for turn_index in range(max_turns): - key = f"turn_{turn_index + 1}" - if key not in per_turn: - continue - turn_metrics = per_turn[key] - print( - f" {turn_index + 1:<8} " - f"{turn_metrics['mean_context_len']:<10.0f} " - f"{turn_metrics.get('mean_actual_context_len', 0.0):<10.0f} " - f"{turn_metrics['mean_ttft_ms']:<14.1f} " - f"{turn_metrics.get('p99_ttft_ms', 0.0):<14.1f} " - f"{turn_metrics['mean_e2el_ms']:<14.1f}" - ) - print("=" * 60) - - return { - "mode": "export_replay", - "adapter_id": selection_metadata["adapter_id"], - "selection": selection_metadata, - "duration": benchmark_duration, - "num_sessions": len(sessions), - "max_turns": max_turns, - "max_concurrency": max_concurrency, - "num_warmup_sessions": num_warmup_sessions, - "server_metrics_summary": { - "cache_usage_avg": cache_usage_avg, - "cache_hit_rate_avg": cache_hit_rate_avg, - "gpu_cache_usage_avg": gpu_cache_usage_avg, - "gpu_cache_usage_peak": gpu_cache_usage_peak, - "gpu_cache_metric_name": gpu_cache_metric_name, - "cpu_cache_usage_avg": cpu_cache_usage_avg, - "cpu_cache_usage_peak": cpu_cache_usage_peak, - "cpu_cache_metric_name": cpu_cache_metric_name, - "cpu_cache_metric_available": bool(cpu_samples), - "observability_status": observability_status, - # Observability-only signal; not a certification or quality claim. 
- "kv_offload_observed": kv_offload_observed, - "samples": len(server_metrics), - "preemption_count": int( - max( - (item.get("vllm_preemptions_total", 0.0) for item in server_metrics), - default=0.0, - ) - ) if server_metrics else 0, - "peak_requests_running": float( - max( - (item.get("vllm_requests_running", 0.0) for item in server_metrics), - default=0.0, - ) - ) if server_metrics else 0.0, - "peak_requests_waiting": float( - max( - (item.get("vllm_requests_waiting", 0.0) for item in server_metrics), - default=0.0, - ) - ) if server_metrics else 0.0, - }, - "depth_telemetry": { - "total_estimated_input_tokens": aggregate["total_input_tokens"], - "total_actual_input_tokens": aggregate["total_actual_input_tokens"], - "max_actual_context_len_per_turn": aggregate["max_actual_context_len_per_turn"], - }, - **metrics, - } - - -def main(args: argparse.Namespace) -> None: - random.seed(args.seed) - np.random.seed(args.seed) - - base_url = args.base_url or f"http://{args.host}:{args.port}" - base_url = base_url.rstrip("/") - chat_api_url = args.chat_api_url or f"{base_url}{args.chat_endpoint}" - completion_api_url = args.completion_api_url or f"{base_url}{args.completion_endpoint}" - - tokenizer_id = None if args.skip_tokenizer_load else (args.tokenizer or args.model) - count_text_tokens = build_text_token_counter( - tokenizer_id=tokenizer_id, - tokenizer_mode=args.tokenizer_mode, - trust_remote_code=args.trust_remote_code, - ) - sessions, selection_metadata = load_replay_sessions( - export_file=args.export_file, - count_text_tokens=count_text_tokens, - runtime_stack_ids=_csv_values(args.runtime_stack_id), - hardware_profile_ids=_csv_values(args.hardware_profile_id), - canonical_model_ids=_csv_values(args.canonical_model_id), - trace_ids=_csv_values(args.trace_id), - support_statuses=_csv_values(args.support_status), - request_mode=args.request_mode, - image_token_estimate=args.image_token_estimate, - ignore_waits=args.ignore_waits, - fallback_output_len=args.fallback_output_len, - output_len_cap=args.max_output_len, - session_offset=args.session_offset, - max_sessions=args.max_sessions, - max_turns_per_session=args.max_turns_per_session, - shuffle_sessions=args.shuffle_sessions, - seed=args.seed, - allow_mixed_selection=args.allow_mixed_selection, - ) - - result = asyncio.run( - run_export_replay_benchmark( - sessions=sessions, - selection_metadata=selection_metadata, - model_id=args.model, - model_name=args.served_model_name, - chat_api_url=chat_api_url, - completion_api_url=completion_api_url, - count_text_tokens=count_text_tokens, - max_concurrency=args.max_concurrency, - selected_percentiles=[float(item) for item in args.metric_percentiles.split(",")], - disable_tqdm=args.disable_tqdm, - num_warmup_sessions=args.num_warmup_sessions, - ignore_eos=args.ignore_eos, - ) - ) - - if args.save_result: - result_json: dict[str, Any] = { - "date": datetime.now().strftime("%Y%m%d-%H%M%S"), - "model_id": args.model, - } - if tokenizer_id is not None: - result_json["tokenizer_id"] = tokenizer_id - if args.metadata: - for item in args.metadata: - if "=" in item: - key, value = item.split("=", 1) - result_json[key.strip()] = value.strip() - result_json = {**result_json, **result} - - file_name = args.result_filename or f"export-replay-{Path(args.export_file).stem}.json" - if args.result_dir: - os.makedirs(args.result_dir, exist_ok=True) - file_name = os.path.join(args.result_dir, file_name) - - with open(file_name, "w", encoding="utf-8") as handle: - json.dump(result_json, handle, indent=2) - 
print(f"\nResults saved to {file_name}") - - -if __name__ == "__main__": - parser = FlexibleArgumentParser( - description=( - "Replay ISB1 export sessions against an OpenAI-compatible server. " - "Supports chat-completions replay for standalone vLLM/SGLang and " - "prompt-projected completions replay for TRT / Dynamo-style cells." - ) - ) - - parser.add_argument("--export-file", type=str, required=True, - help="Path to an inferencex_multiturn or inferencex_trace_replay export JSON") - parser.add_argument("--base-url", type=str, default=None, - help="Server base URL, e.g. http://0.0.0.0:8000") - parser.add_argument("--host", type=str, default="127.0.0.1") - parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--chat-endpoint", type=str, default="/v1/chat/completions") - parser.add_argument("--completion-endpoint", type=str, default="/v1/completions") - parser.add_argument("--chat-api-url", type=str, default=None, - help="Override the full chat endpoint URL") - parser.add_argument("--completion-api-url", type=str, default=None, - help="Override the full completions endpoint URL") - - parser.add_argument("--model", type=str, required=True, - help="Model identifier sent to the target server") - parser.add_argument("--served-model-name", type=str, default=None, - help="Served model name if different from --model") - parser.add_argument("--tokenizer", type=str, default=None, - help="Tokenizer name/path if different from --model") - parser.add_argument("--tokenizer-mode", type=str, default="auto", - choices=["auto", "slow", "mistral", "custom"]) - parser.add_argument("--trust-remote-code", action="store_true") - parser.add_argument("--skip-tokenizer-load", action="store_true", - help="Use approximate token counting instead of loading a tokenizer") - - parser.add_argument("--runtime-stack-id", type=str, default=None, - help="Comma-separated runtime_stack_id filter(s)") - parser.add_argument("--hardware-profile-id", type=str, default=None, - help="Comma-separated hardware_profile_id filter(s)") - parser.add_argument("--canonical-model-id", type=str, default=None, - help="Comma-separated canonical_model_id filter(s)") - parser.add_argument("--trace-id", type=str, default=None, - help="Comma-separated trace_id filter(s)") - parser.add_argument("--support-status", type=str, default=None, - help="Comma-separated support_status filter(s)") - parser.add_argument("--request-mode", type=str, default="auto", - choices=["auto", "chat", "completions"]) - parser.add_argument("--allow-mixed-selection", action="store_true", - help="Allow multiple runtime/model/hardware identities in one run") - parser.add_argument("--shuffle-sessions", action="store_true") - parser.add_argument("--session-offset", type=int, default=0) - parser.add_argument("--max-sessions", type=int, default=None) - parser.add_argument("--max-turns-per-session", type=int, default=None) - parser.add_argument("--ignore-waits", action="store_true", - help="Ignore export wait_before/arrival-time gaps") - parser.add_argument("--fallback-output-len", type=int, default=DEFAULT_FALLBACK_OUTPUT_LEN, - help="Fallback output length when export metadata is missing") - parser.add_argument("--max-output-len", type=int, default=None, - help="Optional cap applied to each exported target output length") - parser.add_argument("--image-token-estimate", type=int, default=DEFAULT_IMAGE_TOKEN_ESTIMATE, - help="Approximate token cost for image blocks when no explicit token count exists") - - parser.add_argument("--max-concurrency", type=int, 
default=8, - help="Maximum concurrently active replay sessions") - parser.add_argument("--num-warmup-sessions", type=int, default=1, - help="Warmup sessions to prime KV/prefix cache before measurement") - parser.add_argument("--ignore-eos", action="store_true") - - parser.add_argument("--save-result", action="store_true") - parser.add_argument("--result-dir", type=str, default=None) - parser.add_argument("--result-filename", type=str, default=None) - parser.add_argument("--metadata", metavar="KEY=VALUE", nargs="*") - parser.add_argument("--metric-percentiles", type=str, default="90,99,99.9") - - parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--disable-tqdm", action="store_true") - - main(parser.parse_args()) diff --git a/utils/process_result.py b/utils/process_result.py index e680239d1..0a84a1f18 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -4,15 +4,6 @@ from pathlib import Path -def fail_if_isb1_replay_requested(): - """Guard against sending ISB1 replay results through the throughput processor.""" - if os.environ.get('BENCHMARK_TYPE') == 'isb1_replay': - raise SystemExit( - 'process_result.py does not support ISB1 replay results. ' - 'Use utils/process_result_isb1.py instead.' - ) - - def get_required_env_vars(required_vars): """Load and validate required environment variables.""" env_values = {} @@ -31,8 +22,6 @@ def get_required_env_vars(required_vars): return env_values -fail_if_isb1_replay_requested() - # Base required env vars base_env = get_required_env_vars([ 'RUNNER_TYPE', 'FRAMEWORK', 'PRECISION', 'SPEC_DECODING', @@ -53,12 +42,6 @@ def get_required_env_vars(required_vars): with open(f'{result_filename}.json') as f: bmk_result = json.load(f) -if 'aggregate_metrics' in bmk_result and 'total_token_throughput_tps' in bmk_result['aggregate_metrics']: - raise SystemExit( - 'Detected an ISB1 replay-style result payload in process_result.py. ' - 'Use utils/process_result_isb1.py instead.' 
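# Shape-based routing sketch: the check being removed here rejected replay
# payloads by structure rather than by env var. An equivalent predicate
# (names here are illustrative, not part of the patch) would be:
#   def looks_like_isb1_replay(payload: dict) -> bool:
#       agg = payload.get("aggregate_metrics", {})
#       return "total_token_throughput_tps" in agg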
- ) - data = { 'hw': hw, 'conc': int(bmk_result['max_concurrency']), diff --git a/utils/process_result_isb1.py b/utils/process_result_isb1.py deleted file mode 100644 index 7f338ab2c..000000000 --- a/utils/process_result_isb1.py +++ /dev/null @@ -1,490 +0,0 @@ -import json -import os -import re -import sys -from pathlib import Path -from typing import Any, Optional, Tuple - -ISB1_RUNNABLE_CERTIFICATION_STATUSES = ["dataset_replay_verified"] - - -def get_required_env_vars(required_vars): - """Load and validate required environment variables.""" - env_values = {} - missing_env_vars = [] - - for var_name in required_vars: - value = os.environ.get(var_name) - if value is None: - missing_env_vars.append(var_name) - env_values[var_name] = value - - if missing_env_vars: - raise EnvironmentError( - f"Missing required environment variables: {', '.join(missing_env_vars)}" - ) - - return env_values - - -def parse_export_shape(export_file: str) -> Tuple[int, int, Optional[str], str, dict[str, Any]]: - """Derive ISL/OSL plus export lane/surface and preview metadata from the export path/file.""" - export_path = Path(export_file) - match = re.search(r"(?P<isl>\d+)k(?P<osl>\d+)k", export_path.stem) - - isl = int(os.environ.get("ISL", "0") or 0) - osl = int(os.environ.get("OSL", "0") or 0) - surface = export_path.stem - metadata: dict[str, Any] = {} - - if match: - isl = int(match.group("isl")) * 1024 - osl = int(match.group("osl")) * 1024 - surface = export_path.stem[: match.start()].rstrip("_-") or export_path.stem - - lane = None - if "exports" in export_path.parts: - exports_idx = export_path.parts.index("exports") - if exports_idx + 1 < len(export_path.parts): - lane = export_path.parts[exports_idx + 1] - if lane == "preview" and exports_idx + 2 < len(export_path.parts): - lane = f"preview/{export_path.parts[exports_idx + 2]}" - - try: - payload = json.loads(export_path.read_text()) - except (FileNotFoundError, json.JSONDecodeError): - payload = None - - if payload is not None: - served_shape = payload.get("served_shape") or {} - isl = int(served_shape.get("isl", isl) or isl) - osl = int(served_shape.get("osl", osl) or osl) - surface = payload.get("surface") or payload.get("adapter_surface") or surface - - context_bands = sorted( - { - cell.get("context_band") - for cell in payload.get("exports", []) - if cell.get("context_band") - } - ) - metadata = { - "adapter_id": payload.get("adapter_id"), - "bundle_id": payload.get("bundle_id"), - "profile_id": payload.get("profile_id"), - "duration_tier": payload.get("duration_tier"), - "context_bands": context_bands, - "adapter_support_status": payload.get("adapter_support_status"), - "profile_tier": payload.get("tier"), - } - producer_handoff = payload.get("producer_handoff_metadata") or {} - if producer_handoff: - metadata["producer_handoff_class"] = producer_handoff.get("class") - metadata["producer_claim_boundary"] = producer_handoff.get("claim_boundary") - - # Extract producer KV expectations from first export cell trace_metadata - first_cell = (payload.get("exports") or [{}])[0] if payload.get("exports") else {} - trace_metadata = first_cell.get("trace_metadata", {}) - if trace_metadata: - metadata["producer_estimated_kv_bytes_peak"] = trace_metadata.get("estimated_kv_bytes_peak") - pressure_profile = trace_metadata.get("context_pressure_profile", {}) - metadata["producer_expected_offload_mode"] = ( - pressure_profile.get("expected_offload_mode") - or trace_metadata.get("expected_offload_mode") - ) - - return isl, osl, lane, surface, metadata - - -def 
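# Worked example for parse_export_shape above (a sketch; values illustrative).
# The stem "chat_131k1k" matches the <isl>k<osl>k pattern, so:
#   isl = 131 * 1024 = 134144, osl = 1 * 1024 = 1024
#   surface = "chat"; lane comes from the path segment after "exports".
import re
m = re.search(r"(?P<isl>\d+)k(?P<osl>\d+)k", "chat_131k1k")
assert (int(m.group("isl")) * 1024, int(m.group("osl")) * 1024) == (134144, 1024)
assert "chat_131k1k"[: m.start()].rstrip("_-") == "chat"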
validate_support_status_selection( - expected_support_status: Optional[str], selection: dict[str, Any] -) -> None: - """Ensure processed ISB1 output is labeled with the tier actually selected by the harness.""" - if not expected_support_status: - return - - selected_statuses = selection.get("support_statuses") or [] - if not selected_statuses: - raise ValueError( - "ISB1 replay result is missing selection.support_statuses; " - "cannot certify the processed support tier." - ) - - unique_statuses = sorted(set(selected_statuses)) - if unique_statuses != [expected_support_status]: - raise ValueError( - "ISB1 replay result support-status mismatch: " - f"workflow requested '{expected_support_status}' but harness selected {unique_statuses}." - ) - - -def validate_certification_selection(selection: dict[str, Any]) -> None: - """Ensure processed ISB1 output carries the expected runnable certification.""" - selected_statuses = selection.get("benchmark_certification_statuses") or [] - if not selected_statuses: - raise ValueError( - "ISB1 replay result is missing selection.benchmark_certification_statuses; " - "cannot certify the processed replay result." - ) - - unique_statuses = sorted(set(selected_statuses)) - if unique_statuses != ISB1_RUNNABLE_CERTIFICATION_STATUSES: - raise ValueError( - "ISB1 replay result benchmark-certification mismatch: " - "current consumer lanes require " - f"{ISB1_RUNNABLE_CERTIFICATION_STATUSES}, but harness selected {unique_statuses}." - ) - - -def build_context_pressure_signal( - context_pressure_class: str, - kv_offload_observed: bool, - peak_cpu_cache_usage: float, - cpu_cache_metric_available: bool, - depth_coverage_ratio: Optional[float] = None, - max_actual_context_len: Optional[int] = None, -) -> dict[str, Any]: - """Emit a machine-readable status for preview-lane context-pressure validation.""" - if context_pressure_class == "standard": - status = "not_applicable" - reason = "standard_context" - requires_log_review = False - elif depth_coverage_ratio is not None and depth_coverage_ratio < 0.1: - status = "depth_mismatch" - reason = "configured_depth_not_exercised" - requires_log_review = True - elif not cpu_cache_metric_available: - status = "observability_gap" - reason = "no_direct_cpu_cache_metric" - requires_log_review = True - elif not kv_offload_observed and peak_cpu_cache_usage == 0.0: - status = "suspicious" - reason = "high_context_without_cpu_cache_usage" - requires_log_review = True - else: - status = "ok" - reason = "cpu_cache_signal_present" - requires_log_review = False - - result = { - "status": status, - "reason": reason, - "requires_log_review": requires_log_review, - "cpu_cache_metric_available": cpu_cache_metric_available, - } - if depth_coverage_ratio is not None: - result["depth_coverage_ratio"] = round(depth_coverage_ratio, 4) - if max_actual_context_len is not None: - result["max_actual_context_len"] = max_actual_context_len - return result - - -def build_runtime_overrides(replay_result: dict[str, Any]) -> dict[str, Optional[str]]: - """Return a stable runtime-overrides payload for aggregated ISB1 results.""" - override_mapping = { - "vllm_cpu_offload_gb": "VLLM_CPU_OFFLOAD_GB", - "vllm_swap_space_gb": "VLLM_SWAP_SPACE_GB", - "sglang_mem_fraction_override": "SGLANG_MEM_FRACTION_OVERRIDE", - "sglang_chunked_prefill_override": "SGLANG_CHUNKED_PREFILL_OVERRIDE", - } - runtime_overrides: dict[str, Optional[str]] = {} - - for result_key, env_var in override_mapping.items(): - value = replay_result.get(result_key) - if value in (None, ""): - 
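# Status ladder for build_context_pressure_signal above, in evaluation order
# (first match wins); the inputs below are illustrative:
#   standard class                        -> "not_applicable"
#   depth_coverage_ratio < 0.1            -> "depth_mismatch"
#   no CPU cache metric                   -> "observability_gap"
#   no offload and peak CPU cache == 0.0  -> "suspicious"
#   otherwise                             -> "ok"
sig = build_context_pressure_signal(
    context_pressure_class="extended_500k",
    kv_offload_observed=False,
    peak_cpu_cache_usage=0.0,
    cpu_cache_metric_available=True,
    depth_coverage_ratio=0.6,
)
assert sig["status"] == "suspicious" and sig["requires_log_review"] is True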
value = os.environ.get(env_var) - runtime_overrides[result_key] = value if value not in (None, "") else None - - return runtime_overrides - - -def build_artifact_stems(result_filename: str) -> dict[str, str]: - """Return artifact names emitted by benchmark-isb1-tmpl.yml for this result stem.""" - return { - "processed": f"isb1_{result_filename}", - "raw_replay": f"replay_{result_filename}", - "server_logs": f"server_logs_{result_filename}", - "gpu_metrics": f"gpu_metrics_{result_filename}", - } - - -def build_dispatch_ref() -> Optional[str]: - """Return the best available workflow dispatch ref for traceability.""" - for env_var in ("DISPATCH_REF", "INPUT_REF", "GITHUB_REF"): - value = os.environ.get(env_var) - if value not in (None, ""): - return value - return None - - -base_env = get_required_env_vars( - [ - "RUNNER_TYPE", - "FRAMEWORK", - "PRECISION", - "RESULT_FILENAME", - "MODEL_PREFIX", - "IMAGE", - "TP", - "EP_SIZE", - "DP_ATTENTION", - "BENCHMARK_TYPE", - "EXPORT_FILE", - "RUNTIME_STACK_ID", - "HARDWARE_PROFILE_ID", - "CANONICAL_MODEL_ID", - "REQUEST_MODE", - "MAX_CONCURRENCY", - ] -) - -result_filename = base_env["RESULT_FILENAME"] -with open(f"{result_filename}.json") as f: - replay_result = json.load(f) - -aggregate = replay_result["aggregate_metrics"] -tp_size = int(base_env["TP"]) -ep_size = int(base_env["EP_SIZE"]) -validate_support_status_selection( - os.environ.get("SUPPORT_STATUS") or None, - replay_result.get("selection", {}), -) -validate_certification_selection(replay_result.get("selection", {})) -isl, osl, export_lane, benchmark_surface, export_metadata = parse_export_shape( - base_env["EXPORT_FILE"] -) - -total_tput = float(aggregate["total_token_throughput_tps"]) -output_tput = float(aggregate["output_throughput_tps"]) - -server_metrics_summary = replay_result.get("server_metrics_summary", {}) -cpu_cache_metric_available_raw = server_metrics_summary.get("cpu_cache_metric_available") -cpu_cache_metric_available = bool(cpu_cache_metric_available_raw) -if cpu_cache_metric_available_raw is None: - # Backward-compatibility shim for older replay outputs that predate the - # explicit availability field. Presence of the metric name/fields is a - # better signal than the sampled value because a real metric can be present - # and legitimately report 0.0. 
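# Shim decision table (sketch): older payloads lack cpu_cache_metric_available,
# so presence of the metric *name* or of either sampled field implies the
# metric existed, even when the sampled value is legitimately 0.0:
#   {"cpu_cache_metric_name": "vllm:cpu_cache_usage_perc"} -> True
#   {"cpu_cache_usage_peak": 0.0}                          -> True
#   {}                                                     -> False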
- cpu_cache_metric_available = bool(server_metrics_summary.get("cpu_cache_metric_name")) or any( - metric_name in server_metrics_summary - for metric_name in ("cpu_cache_usage_avg", "cpu_cache_usage_peak") - ) - -data = { - "hw": base_env["RUNNER_TYPE"], - "conc": int(replay_result.get("max_concurrency", base_env["MAX_CONCURRENCY"])), - "image": base_env["IMAGE"], - "model": replay_result["model_id"], - "infmax_model_prefix": base_env["MODEL_PREFIX"], - "framework": base_env["FRAMEWORK"], - "precision": base_env["PRECISION"], - "spec_decoding": os.environ.get("SPEC_DECODING", "none"), - "disagg": False, - "isl": isl, - "osl": osl, - "is_multinode": False, - "tp": tp_size, - "ep": ep_size, - "dp_attention": base_env["DP_ATTENTION"], - "tput_per_gpu": total_tput / tp_size, - "output_tput_per_gpu": output_tput / tp_size, - "input_tput_per_gpu": (total_tput - output_tput) / tp_size, - "benchmark_type": base_env["BENCHMARK_TYPE"], - "result_filename": result_filename, - "artifact_stems": build_artifact_stems(result_filename), - "dispatch_ref": build_dispatch_ref(), - "export_file": base_env["EXPORT_FILE"], - "export_lane": export_lane, - "benchmark_surface": benchmark_surface, - "adapter_id": export_metadata.get("adapter_id"), - "bundle_id": export_metadata.get("bundle_id"), - "profile_id": export_metadata.get("profile_id"), - "duration_tier": export_metadata.get("duration_tier"), - "context_bands": export_metadata.get("context_bands", []), - "adapter_support_status": export_metadata.get("adapter_support_status"), - "profile_tier": export_metadata.get("profile_tier"), - "producer_handoff_class": export_metadata.get("producer_handoff_class"), - "producer_claim_boundary": export_metadata.get("producer_claim_boundary"), - "runtime_stack_id": base_env["RUNTIME_STACK_ID"], - "hardware_profile_id": base_env["HARDWARE_PROFILE_ID"], - "canonical_model_id": base_env["CANONICAL_MODEL_ID"], - "support_status": os.environ.get("SUPPORT_STATUS") or None, - "benchmark_certification_status": replay_result.get("selection", {}).get( - "benchmark_certification_statuses", [None] - )[0], - "request_mode": base_env["REQUEST_MODE"], - "workload_type": os.environ.get("WORKLOAD_TYPE") or benchmark_surface, - "benchmark_duration_s": ( - float(os.environ["BENCHMARK_DURATION_S"]) - if os.environ.get("BENCHMARK_DURATION_S") not in (None, "") - else None - ), - "campaign_class": ( - "kv_stress" - if base_env["BENCHMARK_TYPE"] == "isb1_kv_stress" - else "replay" - ), - "harness_request_mode": replay_result.get("harness_request_mode", "auto"), - "mode": replay_result.get("mode"), - "selection": replay_result.get("selection", {}), - "aggregate_metrics": aggregate, - "per_turn_metrics": replay_result.get("per_turn_metrics", {}), - "server_metrics_summary": server_metrics_summary, - "cache_observability_status": server_metrics_summary.get("observability_status"), - "gpu_cache_metric_name": server_metrics_summary.get("gpu_cache_metric_name"), - "cpu_cache_metric_name": server_metrics_summary.get("cpu_cache_metric_name"), - "cpu_cache_metric_available": cpu_cache_metric_available, - "kv_offload_observed": bool(server_metrics_summary.get("kv_offload_observed", False)), - "peak_gpu_cache_usage": float(server_metrics_summary.get("gpu_cache_usage_peak", 0.0)), - "peak_cpu_cache_usage": float(server_metrics_summary.get("cpu_cache_usage_peak", 0.0)), - "session_throughput_sps": float(aggregate.get("session_throughput_sps", 0.0)), - "completed_sessions": int(aggregate.get("completed_sessions", 0)), - "total_sessions": 
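# Per-GPU normalization used above (sketch, with illustrative numbers):
#   total = 650.0 tps, output = 150.0 tps, TP = 8
#   tput_per_gpu        = 650.0 / 8          = 81.25
#   output_tput_per_gpu = 150.0 / 8          = 18.75
#   input_tput_per_gpu  = (650.0 - 150.0) / 8 = 62.5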
int(aggregate.get("total_sessions", 0)), - "num_sessions": replay_result.get("num_sessions"), - "max_turns": replay_result.get("max_turns"), - "num_warmup_sessions": replay_result.get( - "num_warmup_sessions", int(os.environ.get("NUM_WARMUP_SESSIONS", "0") or 0) - ), - "max_model_len": ( - int(os.environ["MAX_MODEL_LEN"]) - if os.environ.get("MAX_MODEL_LEN") not in (None, "") - else None - ), - "max_sessions": ( - int(os.environ["MAX_SESSIONS"]) - if os.environ.get("MAX_SESSIONS") not in (None, "") - else None - ), - "max_turns_per_session": ( - int(os.environ["MAX_TURNS_PER_SESSION"]) - if os.environ.get("MAX_TURNS_PER_SESSION") not in (None, "") - else None - ), - "max_output_len": ( - int(os.environ["MAX_OUTPUT_LEN"]) - if os.environ.get("MAX_OUTPUT_LEN") not in (None, "") - else None - ), - "ignore_waits": os.environ.get("IGNORE_WAITS", "false").lower() == "true", - "ignore_eos": os.environ.get("IGNORE_EOS", "false").lower() == "true", - "offload_mode": os.environ.get("OFFLOAD_MODE") or None, - "kv_cache_dtype": os.environ.get("KV_CACHE_DTYPE") or None, - "disable_prefix_caching": os.environ.get("DISABLE_PREFIX_CACHING", "false").lower() == "true", - "runtime_overrides": build_runtime_overrides(replay_result), -} - -effective_max_context_depth = data["max_model_len"] or (isl + osl + 200) -data["effective_max_context_depth"] = effective_max_context_depth -if effective_max_context_depth > 600000: - data["context_pressure_class"] = "extended_1m" -elif effective_max_context_depth > 200000: - data["context_pressure_class"] = "extended_500k" -else: - data["context_pressure_class"] = "standard" - -# Depth telemetry: actual vs configured context depth -depth_telemetry = replay_result.get("depth_telemetry", {}) -max_actual_context_len = int(depth_telemetry.get("max_actual_context_len_per_turn") or 0) or None -total_actual_input_tokens = int(depth_telemetry.get("total_actual_input_tokens") or 0) or None -depth_coverage_ratio = None -if max_actual_context_len and effective_max_context_depth > 0: - depth_coverage_ratio = max_actual_context_len / effective_max_context_depth - -data["total_actual_input_tokens"] = total_actual_input_tokens -data["max_actual_context_len_per_turn"] = max_actual_context_len -data["depth_coverage_ratio"] = round(depth_coverage_ratio, 4) if depth_coverage_ratio is not None else None -data["depth_gap_tokens"] = ( - effective_max_context_depth - max_actual_context_len - if max_actual_context_len is not None else None -) - -# Depth coverage classification -if depth_coverage_ratio is not None: - if depth_coverage_ratio >= 0.9: - data["depth_coverage_class"] = "full" - elif depth_coverage_ratio >= 0.5: - data["depth_coverage_class"] = "partial" - elif depth_coverage_ratio >= 0.1: - data["depth_coverage_class"] = "bounded_preview" - else: - data["depth_coverage_class"] = "configuration_only" -else: - data["depth_coverage_class"] = None - -# Producer expectation comparison -producer_estimated_kv_bytes_peak = export_metadata.get("producer_estimated_kv_bytes_peak") -producer_expected_offload_mode = export_metadata.get("producer_expected_offload_mode") -data["producer_estimated_kv_bytes_peak"] = producer_estimated_kv_bytes_peak -data["producer_expected_offload_mode"] = producer_expected_offload_mode - -offload_mode_match = None -if producer_expected_offload_mode and data["context_pressure_class"] != "standard": - if producer_expected_offload_mode in ("hard_offload", "soft_offload"): - offload_mode_match = data["kv_offload_observed"] - elif producer_expected_offload_mode == "none": 
- offload_mode_match = True -data["producer_expectation_validation"] = { - "offload_mode_match": offload_mode_match, - "kv_bytes_validation": "not_available", - "depth_exercised": bool(depth_coverage_ratio and depth_coverage_ratio >= 0.5), -} - -# Preemption count from server metrics -data["preemption_count"] = int( - server_metrics_summary.get("preemption_count") - or replay_result.get("preemption_count") - or 0 -) - -context_pressure_signal = build_context_pressure_signal( - context_pressure_class=data["context_pressure_class"], - kv_offload_observed=data["kv_offload_observed"], - peak_cpu_cache_usage=data["peak_cpu_cache_usage"], - cpu_cache_metric_available=data["cpu_cache_metric_available"], - depth_coverage_ratio=depth_coverage_ratio, - max_actual_context_len=max_actual_context_len, -) -data["context_pressure_signal"] = context_pressure_signal -data["context_pressure_suspicious"] = context_pressure_signal["status"] == "suspicious" - -if data["context_pressure_suspicious"]: - print( - "WARNING: Preview lane at " - f"max-model-len={effective_max_context_depth} saw no CPU cache usage. " - "The server may have silently capped context or failed to activate KV offload. " - "Check server.log for OOM or context truncation.", - file=sys.stderr, - ) -elif context_pressure_signal["status"] == "depth_mismatch": - print( - "WARNING: Preview lane at " - f"max-model-len={effective_max_context_depth} had max actual context of " - f"{max_actual_context_len} tokens (depth_coverage_ratio=" - f"{depth_coverage_ratio:.4f}). The server was configured for " - f"{data['context_pressure_class'].replace('extended_', '')} but requests only exercised " - f"{max_actual_context_len} tokens. This is expected for file-backed replay previews; " - "it does not prove KV pressure at the configured depth.", - file=sys.stderr, - ) -elif context_pressure_signal["status"] == "observability_gap": - print( - "WARNING: Preview lane at " - f"max-model-len={effective_max_context_depth} lacks a direct CPU cache metric " - "for this framework. 
Inspect server.log and operator tuning notes before " - "treating the run as credible long-context evidence.", - file=sys.stderr, - ) - -for key, value in aggregate.items(): - if key.endswith("_ms"): - data[key.replace("_ms", "")] = float(value) / 1000.0 - if "tpot" in key: - metric_value = float(value) - data[key.replace("_ms", "").replace("tpot", "intvty")] = ( - 1000.0 / metric_value if metric_value > 0 else 0.0 - ) - -print(json.dumps(data, indent=2)) - -with open(f"agg_{result_filename}.json", "w") as f: - json.dump(data, f, indent=2) diff --git a/utils/test_benchmark_export_replay.py b/utils/test_benchmark_export_replay.py deleted file mode 100644 index 3c168fa65..000000000 --- a/utils/test_benchmark_export_replay.py +++ /dev/null @@ -1,947 +0,0 @@ -import asyncio -import hashlib -import json -from pathlib import Path - -import pytest -from aiohttp import web - -from bench_serving.benchmark_export_replay import ( - load_replay_sessions, - run_export_replay_benchmark, -) - - -def _count_tokens(text: str) -> int: - return max(1, len((text or "").split())) if text else 0 - - -def _multiturn_payload(runtime_stack_id: str = "standalone:sglang") -> dict: - return { - "adapter_id": "inferencex_multiturn", - "exports": [ - { - "trace_id": "trace-chat-1", - "runtime_stack_id": runtime_stack_id, - "hardware_profile_id": "nvidia:h200_sxm_141gb", - "canonical_model_id": "qwen3_30b_a3b", - "support_status": "supported", - "benchmark_certification_status": "dataset_replay_verified", - "session": { - "session_id": "session-chat-1", - "turns": [ - { - "turn_idx": 0, - "turn_id": 0, - "messages": [ - { - "role": "user", - "content_blocks": [ - {"type": "text", "text": "Investigate the flaky test."} - ], - } - ], - "expected_output_tokens": 8, - "wait_before_ms": 0, - }, - { - "turn_idx": 1, - "turn_id": 1, - "messages": [ - { - "role": "user", - "content_blocks": [ - {"type": "text", "text": "Investigate the flaky test."} - ], - }, - { - "role": "assistant", - "content_blocks": [ - {"type": "text", "text": "I found a race in the setup."} - ], - }, - { - "role": "tool", - "content_blocks": [ - {"type": "log", "text": "pytest -k flaky_test -> failed"} - ], - }, - ], - "expected_output_tokens": 6, - "wait_before_ms": 10, - }, - ], - }, - } - ], - } - - -def _trace_replay_payload(runtime_stack_id: str = "standalone:trt_llm") -> dict: - return { - "adapter_id": "inferencex_trace_replay", - "exports": [ - { - "trace_id": "trace-replay-1", - "runtime_stack_id": runtime_stack_id, - "hardware_profile_id": "nvidia:b200_sxm_180gb", - "canonical_model_id": "gpt_oss_120b", - "support_status": "supported", - "benchmark_certification_status": "dataset_replay_verified", - "trace_metadata": {"session_id": "session-replay-1"}, - "events": [ - { - "turn_id": 0, - "arrival_time_offset_ms": 0, - "input_messages": [ - { - "role": "user", - "content_blocks": [ - {"type": "text", "text": "Summarize the incident report."} - ], - } - ], - "target_output_tokens": 7, - }, - { - "turn_id": 1, - "arrival_time_offset_ms": 25, - "input_messages": [ - { - "role": "user", - "content_blocks": [ - {"type": "text", "text": "Summarize the incident report."} - ], - }, - { - "role": "assistant", - "content_blocks": [ - {"type": "text", "text": "The outage started after deploy."} - ], - }, - ], - "target_output_tokens": 5, - }, - ], - } - ], - } - - -def _write_json_and_sha(path: Path, payload: dict) -> str: - text = json.dumps(payload) - path.write_text(text) - return hashlib.sha256(text.encode()).hexdigest() - - -def 
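# Metric post-processing sketch (mirrors the loop above): every *_ms field is
# re-emitted in seconds, and each tpot ("time per output token") also yields an
# interactivity figure in tokens/second. Illustrative values:
assert 1100.0 / 1000.0 == 1.1   # median_e2el_ms -> median_e2el (seconds)
assert 1000.0 / 25.0 == 40.0    # median_tpot_ms -> median_intvty (tokens/s)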
_trace_replay_prefix_payload() -> dict: - base_cell = _trace_replay_payload()["exports"][0] - return { - "events": base_cell["events"], - "trace_metadata": base_cell["trace_metadata"], - "workload_family": "coding", - "task_class": "incident_debugging", - "workload_profile": "code_assistant", - "kv_mode": "shared_prefix", - "coding_profile": "bugfix", - "benchmark_surface": "code", - "benchmark_modifiers": ["prefix_aware"], - "workload_shape": {"turn_count": 2}, - "long_context_contract": {"band": "extension_131k"}, - "coding_profile_detail": {"language": "python"}, - "system_expectations": {"tools_allowed": True}, - "reasoning_profile": "standard", - "history_visibility": "full", - "context_band": "lc2_32k_64k", - "adapter_execution_class": "trace_replay_projection", - } - - -def _prefix_aware_trace_replay_payload( - tmp_path: Path, - *, - prefix_ref: str = "prefix-1", - runtime_stack_id: str = "standalone:trt_llm", -) -> tuple[dict, Path, dict]: - prefix_payload = _trace_replay_prefix_payload() - sidecar_path = tmp_path / "prefixes" / f"{prefix_ref}.json" - sidecar_path.parent.mkdir(parents=True, exist_ok=True) - sidecar_sha = _write_json_and_sha(sidecar_path, prefix_payload) - - payload = { - "schema_version": "0.2.0", - "adapter_id": "inferencex_trace_replay", - "prefix_index": { - prefix_ref: { - "relative_path": f"prefixes/{prefix_ref}.json", - "sha256": sidecar_sha, - } - }, - "exports": [ - { - "trace_id": "trace-replay-1", - "runtime_stack_id": runtime_stack_id, - "hardware_profile_id": "nvidia:b200_sxm_180gb", - "canonical_model_id": "gpt_oss_120b", - "support_status": "supported", - "benchmark_certification_status": "dataset_replay_verified", - "prefix_ref": prefix_ref, - } - ], - } - return payload, sidecar_path, prefix_payload - - -async def _start_mock_server( - sse_mode: str = "normal", - metrics_text: str | None = None, -) -> tuple[web.AppRunner, str]: - """Start a mock OpenAI-compatible server. - - sse_mode controls how SSE frames are written to the wire: - - "normal": one data frame per write (default) - - "multiline": multiple data frames packed into a single write - - "split": a single data frame split across two writes - """ - - async def _stream_response(request: web.Request, chunks: list[dict]) -> web.StreamResponse: - response = web.StreamResponse( - status=200, - headers={"Content-Type": "text/event-stream"}, - ) - await response.prepare(request) - - if sse_mode == "multiline": - # Pack ALL data frames into a single TCP write - blob = b"" - for chunk in chunks: - blob += f"data: {json.dumps(chunk)}\n\n".encode() - blob += b"data: [DONE]\n\n" - await response.write(blob) - elif sse_mode == "split": - # Split the first frame across two writes - for idx, chunk in enumerate(chunks): - frame = f"data: {json.dumps(chunk)}\n\n".encode() - if idx == 0: - mid = len(frame) // 2 - await response.write(frame[:mid]) - await asyncio.sleep(0.005) - await response.write(frame[mid:]) - else: - await response.write(frame) - await asyncio.sleep(0.005) - await response.write(b"data: [DONE]\n\n") - else: - for chunk in chunks: - await response.write(f"data: {json.dumps(chunk)}\n\n".encode()) - await asyncio.sleep(0.005) - await response.write(b"data: [DONE]\n\n") - - await response.write_eof() - return response - - async def chat_handler(request: web.Request) -> web.StreamResponse: - payload = await request.json() - # Verify the fallback from max_completion_tokens -> max_tokens. 
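# Sidecar integrity sketch: prefix bundles pin each sidecar to a sha256 in
# prefix_index, and the loader is expected to reject mismatches (see the
# declared_sha test further down). Standalone tamper-check illustration:
import hashlib, json

blob = json.dumps({"events": []})
declared_sha = hashlib.sha256(blob.encode()).hexdigest()
tampered = blob.replace("[]", "[{}]")
assert hashlib.sha256(tampered.encode()).hexdigest() != declared_sha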
- if "max_completion_tokens" in payload: - return web.json_response({"error": "unsupported field"}, status=400) - assert payload["messages"] - return await _stream_response( - request, - [ - {"choices": [{"delta": {"content": "patched"}}]}, - {"usage": {"completion_tokens": 2}}, - ], - ) - - async def completions_handler(request: web.Request) -> web.StreamResponse: - payload = await request.json() - assert payload["prompt"].startswith("USER:") - return await _stream_response( - request, - [ - {"choices": [{"text": "resolved"}]}, - {"usage": {"completion_tokens": 2}}, - ], - ) - - async def metrics_handler(_: web.Request) -> web.Response: - return web.Response( - text=metrics_text - or ( - "vllm:gpu_cache_usage_perc 0.42\n" - "vllm:cpu_cache_usage_perc 0.25\n" - "sglang:cache_hit_rate 0.8\n" - ) - ) - - app = web.Application() - app.router.add_post("/v1/chat/completions", chat_handler) - app.router.add_post("/v1/completions", completions_handler) - app.router.add_get("/metrics", metrics_handler) - - runner = web.AppRunner(app) - await runner.setup() - site = web.TCPSite(runner, host="127.0.0.1", port=0) - await site.start() - sockets = getattr(site, "_server").sockets - port = sockets[0].getsockname()[1] - return runner, f"http://127.0.0.1:{port}" - - -def test_load_replay_sessions_multiturn_chat(tmp_path: Path) -> None: - export_file = tmp_path / "multiturn.json" - export_file.write_text(json.dumps(_multiturn_payload())) - - sessions, selection = load_replay_sessions( - export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:sglang"}, - hardware_profile_ids={"nvidia:h200_sxm_141gb"}, - canonical_model_ids={"qwen3_30b_a3b"}, - request_mode="auto", - ignore_waits=False, - ) - - assert len(sessions) == 1 - assert sessions[0].request_mode == "chat" - assert sessions[0].turns[1].wait_before_s == 0.01 - assert selection["support_statuses"] == ["supported"] - assert selection["support_status_counts"] == {"supported": 1} - assert selection["benchmark_certification_statuses"] == ["dataset_replay_verified"] - assert selection["benchmark_certification_status_counts"] == { - "dataset_replay_verified": 1 - } - assert selection["request_mode_mix"] == {"chat": 1} - - -def test_load_replay_sessions_trace_replay_auto_uses_completions(tmp_path: Path) -> None: - export_file = tmp_path / "trace_replay.json" - export_file.write_text(json.dumps(_trace_replay_payload())) - - sessions, selection = load_replay_sessions( - export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:trt_llm"}, - hardware_profile_ids={"nvidia:b200_sxm_180gb"}, - canonical_model_ids={"gpt_oss_120b"}, - request_mode="auto", - ) - - assert len(sessions) == 1 - assert sessions[0].request_mode == "completions" - assert sessions[0].turns[1].wait_before_s == 0.025 - assert sessions[0].turns[0].completion_prompt.startswith("USER:") - assert selection["support_statuses"] == ["supported"] - assert selection["benchmark_certification_statuses"] == ["dataset_replay_verified"] - assert selection["request_mode_mix"] == {"completions": 1} - - -def test_load_replay_sessions_support_status_filter(tmp_path: Path) -> None: - payload = _multiturn_payload() - payload["exports"].append( - { - **payload["exports"][0], - "trace_id": "trace-chat-preview", - "support_status": "reviewed_preview", - } - ) - export_file = tmp_path / "multiturn_mixed_status.json" - export_file.write_text(json.dumps(payload)) - - sessions, selection = load_replay_sessions( - 
export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:sglang"}, - hardware_profile_ids={"nvidia:h200_sxm_141gb"}, - canonical_model_ids={"qwen3_30b_a3b"}, - support_statuses={"supported"}, - request_mode="auto", - ignore_waits=False, - ) - - assert [session.trace_id for session in sessions] == ["trace-chat-1"] - assert selection["support_statuses"] == ["supported"] - assert selection["support_status_counts"] == {"supported": 1} - assert selection["benchmark_certification_statuses"] == ["dataset_replay_verified"] - - -def test_run_export_replay_benchmark_chat(tmp_path: Path) -> None: - export_file = tmp_path / "multiturn.json" - export_file.write_text(json.dumps(_multiturn_payload())) - - sessions, selection = load_replay_sessions( - export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:sglang"}, - hardware_profile_ids={"nvidia:h200_sxm_141gb"}, - canonical_model_ids={"qwen3_30b_a3b"}, - request_mode="chat", - ignore_waits=True, - ) - - async def _run() -> dict: - runner, base_url = await _start_mock_server() - try: - return await run_export_replay_benchmark( - sessions=sessions, - selection_metadata=selection, - model_id="Qwen/Qwen3-30B-A3B", - model_name=None, - chat_api_url=f"{base_url}/v1/chat/completions", - completion_api_url=f"{base_url}/v1/completions", - count_text_tokens=_count_tokens, - max_concurrency=1, - selected_percentiles=[99], - disable_tqdm=True, - num_warmup_sessions=1, - ) - finally: - await runner.cleanup() - - result = asyncio.run(_run()) - assert result["aggregate_metrics"]["completed_sessions"] == 1 - assert result["selection"]["request_mode_mix"] == {"chat": 1} - assert result["server_metrics_summary"]["samples"] >= 0 - assert result["server_metrics_summary"]["gpu_cache_usage_peak"] == 0.42 - assert result["server_metrics_summary"]["cpu_cache_usage_peak"] == 0.25 - assert result["server_metrics_summary"]["gpu_cache_metric_name"] == "vllm:gpu_cache_usage_perc" - assert result["server_metrics_summary"]["cpu_cache_metric_name"] == "vllm:cpu_cache_usage_perc" - assert result["server_metrics_summary"]["cpu_cache_metric_available"] is True - assert result["server_metrics_summary"]["observability_status"] == "direct_cpu_cache_metric" - assert result["server_metrics_summary"]["kv_offload_observed"] is True - - -def test_run_export_replay_benchmark_completions(tmp_path: Path) -> None: - export_file = tmp_path / "trace_replay.json" - export_file.write_text(json.dumps(_trace_replay_payload())) - - sessions, selection = load_replay_sessions( - export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:trt_llm"}, - hardware_profile_ids={"nvidia:b200_sxm_180gb"}, - canonical_model_ids={"gpt_oss_120b"}, - request_mode="completions", - ignore_waits=True, - ) - - async def _run() -> dict: - runner, base_url = await _start_mock_server() - try: - return await run_export_replay_benchmark( - sessions=sessions, - selection_metadata=selection, - model_id="gpt-oss-120b", - model_name=None, - chat_api_url=f"{base_url}/v1/chat/completions", - completion_api_url=f"{base_url}/v1/completions", - count_text_tokens=_count_tokens, - max_concurrency=1, - selected_percentiles=[99], - disable_tqdm=True, - num_warmup_sessions=0, - ) - finally: - await runner.cleanup() - - result = asyncio.run(_run()) - assert result["aggregate_metrics"]["completed_sessions"] == 1 - assert result["selection"]["request_mode_mix"] == {"completions": 1} - - -def 
test_run_export_replay_benchmark_sglang_token_usage_metrics(tmp_path: Path) -> None: - export_file = tmp_path / "multiturn_sglang_metrics.json" - export_file.write_text(json.dumps(_multiturn_payload(runtime_stack_id="standalone:sglang"))) - - sessions, selection = load_replay_sessions( - export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:sglang"}, - hardware_profile_ids={"nvidia:h200_sxm_141gb"}, - canonical_model_ids={"qwen3_30b_a3b"}, - request_mode="chat", - ignore_waits=True, - ) - - async def _run() -> dict: - runner, base_url = await _start_mock_server( - metrics_text=( - 'sglang:token_usage{model_name="Qwen/Qwen3-30B-A3B"} 0.61\n' - 'sglang:cache_hit_rate{model_name="Qwen/Qwen3-30B-A3B"} 0.8\n' - ) - ) - try: - return await run_export_replay_benchmark( - sessions=sessions, - selection_metadata=selection, - model_id="Qwen/Qwen3-30B-A3B", - model_name=None, - chat_api_url=f"{base_url}/v1/chat/completions", - completion_api_url=f"{base_url}/v1/completions", - count_text_tokens=_count_tokens, - max_concurrency=1, - selected_percentiles=[99], - disable_tqdm=True, - num_warmup_sessions=0, - ) - finally: - await runner.cleanup() - - result = asyncio.run(_run()) - summary = result["server_metrics_summary"] - assert result["aggregate_metrics"]["completed_sessions"] == 1 - assert summary["samples"] >= 0 - assert summary["gpu_cache_usage_peak"] == 0.61 - assert summary["gpu_cache_metric_name"] == "sglang:token_usage" - assert summary["cpu_cache_metric_name"] is None - assert summary["cpu_cache_metric_available"] is False - assert summary["cache_hit_rate_avg"] == 0.8 - assert summary["observability_status"] == "indirect_without_cpu_cache_metric" - assert summary["kv_offload_observed"] is False - - -def test_sse_multiline_chunks(tmp_path: Path) -> None: - """Verify replay works when the server packs multiple SSE frames into one TCP write.""" - export_file = tmp_path / "multiturn.json" - export_file.write_text(json.dumps(_multiturn_payload())) - - sessions, selection = load_replay_sessions( - export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:sglang"}, - hardware_profile_ids={"nvidia:h200_sxm_141gb"}, - canonical_model_ids={"qwen3_30b_a3b"}, - request_mode="chat", - ignore_waits=True, - ) - - async def _run() -> dict: - runner, base_url = await _start_mock_server(sse_mode="multiline") - try: - return await run_export_replay_benchmark( - sessions=sessions, - selection_metadata=selection, - model_id="Qwen/Qwen3-30B-A3B", - model_name=None, - chat_api_url=f"{base_url}/v1/chat/completions", - completion_api_url=f"{base_url}/v1/completions", - count_text_tokens=_count_tokens, - max_concurrency=1, - selected_percentiles=[99], - disable_tqdm=True, - num_warmup_sessions=0, - ) - finally: - await runner.cleanup() - - result = asyncio.run(_run()) - assert result["aggregate_metrics"]["completed_sessions"] == 1 - - -def test_sse_split_across_chunks(tmp_path: Path) -> None: - """Verify replay works when a single SSE frame is split across TCP writes.""" - export_file = tmp_path / "multiturn.json" - export_file.write_text(json.dumps(_multiturn_payload())) - - sessions, selection = load_replay_sessions( - export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:sglang"}, - hardware_profile_ids={"nvidia:h200_sxm_141gb"}, - canonical_model_ids={"qwen3_30b_a3b"}, - request_mode="chat", - ignore_waits=True, - ) - - async def _run() -> dict: - runner, base_url = await 
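# Prometheus text-format sketch: the sglang metrics above carry label sets, so
# a scraper has to strip the {...} labels before matching metric names.
# Minimal line parser (illustrative, not the harness's implementation):
import re
line = 'sglang:token_usage{model_name="Qwen/Qwen3-30B-A3B"} 0.61'
m = re.match(r'^(?P<name>[^{\s]+)(?:\{[^}]*\})?\s+(?P<value>[0-9.eE+-]+)$', line)
assert m and m.group("name") == "sglang:token_usage"
assert float(m.group("value")) == 0.61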
_start_mock_server(sse_mode="split") - try: - return await run_export_replay_benchmark( - sessions=sessions, - selection_metadata=selection, - model_id="Qwen/Qwen3-30B-A3B", - model_name=None, - chat_api_url=f"{base_url}/v1/chat/completions", - completion_api_url=f"{base_url}/v1/completions", - count_text_tokens=_count_tokens, - max_concurrency=1, - selected_percentiles=[99], - disable_tqdm=True, - num_warmup_sessions=0, - ) - finally: - await runner.cleanup() - - result = asyncio.run(_run()) - assert result["aggregate_metrics"]["completed_sessions"] == 1 - - -def test_empty_content_no_phantom_itl(tmp_path: Path) -> None: - """Verify that SSE chunks with empty/null content don't inflate ITL counts.""" - export_file = tmp_path / "multiturn.json" - # Use a single-turn export to isolate ITL counting - single_turn_payload = { - "adapter_id": "inferencex_multiturn", - "exports": [ - { - "trace_id": "trace-itl-1", - "runtime_stack_id": "standalone:sglang", - "hardware_profile_id": "nvidia:h200_sxm_141gb", - "canonical_model_id": "qwen3_30b_a3b", - "support_status": "supported", - "session": { - "session_id": "session-itl-1", - "turns": [ - { - "turn_idx": 0, - "turn_id": 0, - "messages": [ - { - "role": "user", - "content_blocks": [ - {"type": "text", "text": "Hello"} - ], - } - ], - "expected_output_tokens": 4, - "wait_before_ms": 0, - }, - ], - }, - } - ], - } - export_file.write_text(json.dumps(single_turn_payload)) - - sessions, selection = load_replay_sessions( - export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:sglang"}, - hardware_profile_ids={"nvidia:h200_sxm_141gb"}, - canonical_model_ids={"qwen3_30b_a3b"}, - request_mode="chat", - ignore_waits=True, - ) - - async def _run() -> dict: - # Custom server that sends empty-content chunks between real ones - async def _chat_with_empty(request: web.Request) -> web.StreamResponse: - payload = await request.json() - if "max_completion_tokens" in payload: - return web.json_response({"error": "unsupported"}, status=400) - - response = web.StreamResponse( - status=200, - headers={"Content-Type": "text/event-stream"}, - ) - await response.prepare(request) - # Frame 1: real content - await response.write( - f'data: {{"choices": [{{"delta": {{"content": "hello"}}}}]}}\n\n'.encode() - ) - await asyncio.sleep(0.005) - # Frame 2: empty content (should not generate ITL entry) - await response.write( - f'data: {{"choices": [{{"delta": {{"content": ""}}}}]}}\n\n'.encode() - ) - await asyncio.sleep(0.005) - # Frame 3: null content (should not generate ITL entry) - await response.write( - f'data: {{"choices": [{{"delta": {{}}}}]}}\n\n'.encode() - ) - await asyncio.sleep(0.005) - # Frame 4: real content - await response.write( - f'data: {{"choices": [{{"delta": {{"content": " world"}}}}]}}\n\n'.encode() - ) - await asyncio.sleep(0.005) - # Usage frame - await response.write( - f'data: {{"usage": {{"completion_tokens": 2}}}}\n\n'.encode() - ) - await response.write(b"data: [DONE]\n\n") - await response.write_eof() - return response - - app = web.Application() - app.router.add_post("/v1/chat/completions", _chat_with_empty) - app.router.add_get("/metrics", lambda _: web.Response(text="")) - - runner = web.AppRunner(app) - await runner.setup() - site = web.TCPSite(runner, host="127.0.0.1", port=0) - await site.start() - sockets = getattr(site, "_server").sockets - port = sockets[0].getsockname()[1] - base_url = f"http://127.0.0.1:{port}" - - try: - return await run_export_replay_benchmark( - sessions=sessions, - 
selection_metadata=selection, - model_id="Qwen/Qwen3-30B-A3B", - model_name=None, - chat_api_url=f"{base_url}/v1/chat/completions", - completion_api_url=f"{base_url}/v1/completions", - count_text_tokens=_count_tokens, - max_concurrency=1, - selected_percentiles=[99], - disable_tqdm=True, - num_warmup_sessions=0, - ) - finally: - await runner.cleanup() - - result = asyncio.run(_run()) - agg = result["aggregate_metrics"] - assert agg["completed_sessions"] == 1 - # With 2 real content chunks, ITL should have exactly 1 entry - # (first content = TTFT, second content = 1 ITL). Empty/null chunks - # must not inflate this count. - turn_metrics = result["per_turn_metrics"]["turn_1"] - assert turn_metrics["completed"] == 1 - - -def test_actual_context_len_for_file_backed_assets(tmp_path: Path) -> None: - """Verify that actual_context_len counts rendered payload tokens, not asset metadata.""" - payload = { - "adapter_id": "inferencex_trace_replay", - "exports": [ - { - "trace_id": "test-asset-trace", - "runtime_stack_id": "standalone:vllm", - "hardware_profile_id": "nvidia:h200_sxm_141gb", - "canonical_model_id": "gpt_oss_120b", - "support_status": "reviewed_preview", - "benchmark_certification_status": "dataset_replay_verified", - "context_band": "xlc2_384k_512k", - "trace_metadata": { - "session_id": "test-session", - "estimated_kv_bytes_peak": 27000000000, - "expected_offload_mode": "soft_offload", - }, - "events": [ - { - "event_id": "evt-0", - "trace_id": "test-asset-trace", - "session_id": "test-session", - "turn_id": 0, - "arrival_time_offset_ms": 0, - "input_messages": [ - { - "role": "user", - "content_blocks": [ - {"type": "text", "text": "Analyze this codebase"}, - { - "type": "table", - "text": None, - "asset_path": "synthetic_v0/context_assets/big_file.md", - "asset_token_count": 500000, - "asset_byte_count": 2500000, - }, - ], - } - ], - "output": {"output_token_count": 100}, - } - ], - } - ], - } - export_file = tmp_path / "asset_test.json" - export_file.write_text(json.dumps(payload)) - - sessions, _ = load_replay_sessions( - export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:vllm"}, - hardware_profile_ids={"nvidia:h200_sxm_141gb"}, - canonical_model_ids={"gpt_oss_120b"}, - request_mode="chat", - ignore_waits=True, - ) - - assert len(sessions) == 1 - turn = sessions[0].turns[0] - - # Estimated context_len should include the 500k asset_token_count - assert turn.context_len >= 500000 - - # Actual context_len should be much smaller — just the rendered text - # "[TABLE]" is ~1 token + "Analyze this codebase" is ~3 tokens - assert turn.actual_context_len < 100 - assert turn.actual_context_len > 0 - - # The gap proves the measurement works - assert turn.context_len > turn.actual_context_len * 100 - - -def test_depth_telemetry_in_benchmark_result(tmp_path: Path) -> None: - """Verify depth_telemetry block is emitted in benchmark results.""" - export_file = tmp_path / "multiturn.json" - export_file.write_text(json.dumps(_multiturn_payload())) - - sessions, selection = load_replay_sessions( - export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:sglang"}, - hardware_profile_ids={"nvidia:h200_sxm_141gb"}, - canonical_model_ids={"qwen3_30b_a3b"}, - request_mode="chat", - ignore_waits=True, - ) - - async def _run() -> dict: - runner, base_url = await _start_mock_server() - try: - return await run_export_replay_benchmark( - sessions=sessions, - selection_metadata=selection, - model_id="Qwen/Qwen3-30B-A3B", - 
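# Estimated vs actual context length (sketch of the asset test above):
#   estimated: rendered text tokens + asset_token_count metadata (500_000 here)
#   actual:    tokens of the rendered payload only ("Analyze this codebase"
#              plus a "[TABLE]" placeholder), a handful of tokens
# The two diverging by >100x is the signal that file-backed assets are counted
# honestly for capacity planning but never inflate what was actually sent.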
model_name=None, - chat_api_url=f"{base_url}/v1/chat/completions", - completion_api_url=f"{base_url}/v1/completions", - count_text_tokens=_count_tokens, - max_concurrency=1, - selected_percentiles=[99], - disable_tqdm=True, - num_warmup_sessions=0, - ) - finally: - await runner.cleanup() - - result = asyncio.run(_run()) - - # depth_telemetry block must exist - assert "depth_telemetry" in result - dt = result["depth_telemetry"] - assert "total_estimated_input_tokens" in dt - assert "total_actual_input_tokens" in dt - assert "max_actual_context_len_per_turn" in dt - assert dt["total_actual_input_tokens"] > 0 - assert dt["max_actual_context_len_per_turn"] > 0 - - # Aggregate metrics must also carry actual input tokens - agg = result["aggregate_metrics"] - assert "total_actual_input_tokens" in agg - assert "max_actual_context_len_per_turn" in agg - - # Per-turn metrics should have actual context length - for turn_key, turn_metrics in result["per_turn_metrics"].items(): - assert "mean_actual_context_len" in turn_metrics - - -def test_load_replay_sessions_prefix_hydrates_v020_bundle(tmp_path: Path) -> None: - payload, _, prefix_payload = _prefix_aware_trace_replay_payload(tmp_path) - payload["exports"].append( - { - **payload["exports"][0], - "canonical_model_id": "glm_5", - } - ) - export_file = tmp_path / "trace_replay_v020.json" - export_file.write_text(json.dumps(payload)) - - sessions, selection = load_replay_sessions( - export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:trt_llm"}, - hardware_profile_ids={"nvidia:b200_sxm_180gb"}, - canonical_model_ids={"gpt_oss_120b"}, - request_mode="auto", - ) - - assert len(sessions) == 1 - assert sessions[0].session_id == prefix_payload["trace_metadata"]["session_id"] - assert sessions[0].request_mode == "completions" - assert sessions[0].turns[1].wait_before_s == 0.025 - assert sessions[0].turns[0].completion_prompt.startswith("USER:") - assert selection["canonical_model_ids"] == ["gpt_oss_120b"] - assert selection["request_mode_mix"] == {"completions": 1} - - -def test_load_replay_sessions_prefix_sha_mismatch_raises(tmp_path: Path) -> None: - payload, _, _ = _prefix_aware_trace_replay_payload(tmp_path) - payload["prefix_index"]["prefix-1"]["sha256"] = "deadbeef" - export_file = tmp_path / "trace_replay_bad_sha.json" - export_file.write_text(json.dumps(payload)) - - with pytest.raises(ValueError, match="declared_sha"): - load_replay_sessions( - export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:trt_llm"}, - hardware_profile_ids={"nvidia:b200_sxm_180gb"}, - canonical_model_ids={"gpt_oss_120b"}, - ) - - -def test_load_replay_sessions_unknown_prefix_ref_raises(tmp_path: Path) -> None: - payload, _, _ = _prefix_aware_trace_replay_payload(tmp_path) - payload["exports"][0]["prefix_ref"] = "missing-prefix" - export_file = tmp_path / "trace_replay_unknown_prefix.json" - export_file.write_text(json.dumps(payload)) - - with pytest.raises(ValueError, match="unknown prefix_ref"): - load_replay_sessions( - export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:trt_llm"}, - hardware_profile_ids={"nvidia:b200_sxm_180gb"}, - canonical_model_ids={"gpt_oss_120b"}, - ) - - -def test_load_replay_sessions_legacy_v010_skips_prefix_hydrator(tmp_path: Path) -> None: - payload = _trace_replay_payload() - payload["prefix_index"] = { - "unused-prefix": { - "relative_path": "prefixes/unused-prefix.json", - "sha256": "not-used", - } - } - 
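# Bundle-shape dispatch sketch (what the surrounding tests pin down):
#   schema_version 0.2.0 + per-export prefix_ref -> hydrate from prefix_index
#     (sha256-verified sidecars; unknown refs and sha mismatches raise)
#   legacy 0.1.0 rows with embedded "events"     -> prefix_index is ignored
#   a bundle mixing both row shapes              -> rejected outright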
export_file = tmp_path / "trace_replay_legacy.json" - export_file.write_text(json.dumps(payload)) - - sessions, selection = load_replay_sessions( - export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:trt_llm"}, - hardware_profile_ids={"nvidia:b200_sxm_180gb"}, - canonical_model_ids={"gpt_oss_120b"}, - request_mode="auto", - ) - - assert len(sessions) == 1 - assert sessions[0].session_id == "session-replay-1" - assert selection["request_mode_mix"] == {"completions": 1} - - -def test_load_replay_sessions_rejects_mixed_prefix_and_embedded_events_bundle( - tmp_path: Path, -) -> None: - payload, _, _ = _prefix_aware_trace_replay_payload(tmp_path) - payload["exports"].append( - { - "trace_id": "trace-replay-legacy-row", - "runtime_stack_id": "standalone:sglang", - "hardware_profile_id": "nvidia:h200_sxm_141gb", - "canonical_model_id": "qwen3_30b_a3b", - "support_status": "supported", - "benchmark_certification_status": "dataset_replay_verified", - "events": _trace_replay_payload(runtime_stack_id="standalone:sglang")["exports"][0]["events"], - "trace_metadata": {"session_id": "legacy-session"}, - } - ) - export_file = tmp_path / "trace_replay_mixed_bundle.json" - export_file.write_text(json.dumps(payload)) - - with pytest.raises(ValueError, match="Mixed legacy/prefix-aware"): - load_replay_sessions( - export_file=str(export_file), - count_text_tokens=_count_tokens, - runtime_stack_ids={"standalone:trt_llm"}, - hardware_profile_ids={"nvidia:b200_sxm_180gb"}, - canonical_model_ids={"gpt_oss_120b"}, - ) diff --git a/utils/test_process_result.py b/utils/test_process_result.py index 8bc51d593..2a6389a78 100644 --- a/utils/test_process_result.py +++ b/utils/test_process_result.py @@ -47,7 +47,6 @@ def base_env_vars(): "OSL": "1024", "DISAGG": "false", "MODEL_PREFIX": "dsr1", - "IMAGE": "lmsysorg/sglang:v0.4.6.post5-cu126", } @@ -300,32 +299,6 @@ def test_missing_result_file(self, tmp_path, single_node_env_vars): assert result.returncode != 0 - def test_isb1_replay_env_guard(self, tmp_path, sample_benchmark_result, single_node_env_vars): - """ISB1 replay runs should fail fast with a helpful processor redirect.""" - env = single_node_env_vars.copy() - env["BENCHMARK_TYPE"] = "isb1_replay" - - result = run_script(tmp_path, env, sample_benchmark_result) - - assert result.returncode != 0 - assert "Use utils/process_result_isb1.py instead" in result.stderr - - def test_isb1_replay_payload_guard(self, tmp_path, single_node_env_vars): - """Replay-shaped payloads should be rejected even without BENCHMARK_TYPE set.""" - replay_like_result = { - "model_id": "test-model", - "max_concurrency": 4, - "aggregate_metrics": { - "total_token_throughput_tps": 1000.0, - "output_throughput_tps": 800.0, - }, - } - - result = run_script(tmp_path, single_node_env_vars, replay_like_result) - - assert result.returncode != 0 - assert "Detected an ISB1 replay-style result payload" in result.stderr - # ============================================================================= # Test latency and throughput calculations diff --git a/utils/test_process_result_isb1.py b/utils/test_process_result_isb1.py deleted file mode 100644 index f2a4f06fb..000000000 --- a/utils/test_process_result_isb1.py +++ /dev/null @@ -1,1006 +0,0 @@ -import json -import subprocess -import sys -from pathlib import Path - -import pytest - -SCRIPT_PATH = Path(__file__).parent / "process_result_isb1.py" - - -def write_export_fixture(tmp_path: Path, relative_path: str, payload: dict) -> str: - export_path = tmp_path 
/ relative_path - export_path.parent.mkdir(parents=True, exist_ok=True) - export_path.write_text(json.dumps(payload)) - return str(export_path.relative_to(tmp_path)) - - -@pytest.fixture -def sample_replay_result(): - return { - "model_id": "deepseek-ai/DeepSeek-R1-0528", - "mode": "export_replay", - "max_concurrency": 8, - "num_sessions": 2, - "max_turns": 4, - "num_warmup_sessions": 1, - "harness_request_mode": "auto", - "selection": { - "adapter_id": "inferencex_multiturn", - "selected_sessions": 2, - "runtime_stack_ids": ["vllm-0.8.5-h200"], - "hardware_profile_ids": ["h200-8gpu"], - "canonical_model_ids": ["deepseek-r1-0528"], - "support_statuses": ["supported"], - "support_status_counts": {"supported": 2}, - "benchmark_certification_statuses": ["dataset_replay_verified"], - "benchmark_certification_status_counts": { - "dataset_replay_verified": 2 - }, - "request_mode_mix": {"chat": 2}, - }, - "server_metrics_summary": { - "cache_usage_avg": 0.45, - "cache_hit_rate_avg": 0.15, - "gpu_cache_usage_avg": 0.45, - "gpu_cache_usage_peak": 0.78, - "gpu_cache_metric_name": "vllm:gpu_cache_usage_perc", - "cpu_cache_usage_avg": 0.12, - "cpu_cache_usage_peak": 0.31, - "cpu_cache_metric_name": "vllm:cpu_cache_usage_perc", - "cpu_cache_metric_available": True, - "observability_status": "direct_cpu_cache_metric", - "kv_offload_observed": True, - "samples": 5, - }, - "per_turn_metrics": { - "turn_1": { - "completed": 2, - "mean_context_len": 8192.0, - "mean_ttft_ms": 180.0, - "p99_ttft_ms": 300.0, - "mean_e2el_ms": 1000.0, - } - }, - "aggregate_metrics": { - "completed_sessions": 2, - "total_sessions": 2, - "total_input_tokens": 1000, - "total_output_tokens": 300, - "total_wall_time_s": 2.0, - "session_throughput_sps": 1.0, - "output_throughput_tps": 150.0, - "total_token_throughput_tps": 650.0, - "mean_ttft_ms": 200.0, - "median_ttft_ms": 180.0, - "p99_ttft_ms": 500.0, - "mean_tpot_ms": 20.0, - "median_tpot_ms": 25.0, - "p99_tpot_ms": 50.0, - "mean_e2el_ms": 1200.0, - "median_e2el_ms": 1100.0, - "p99_e2el_ms": 2000.0, - }, - } - - -@pytest.fixture -def base_env(): - return { - "RUNNER_TYPE": "h200-cw-1", - "FRAMEWORK": "vllm", - "PRECISION": "fp8", - "RESULT_FILENAME": "isb1_result", - "MODEL_PREFIX": "dsr1", - "IMAGE": "vllm/vllm-openai:v0.8.5", - "TP": "8", - "EP_SIZE": "1", - "DP_ATTENTION": "false", - "BENCHMARK_TYPE": "isb1_replay", - "EXPORT_FILE": "datasets/isb1/exports/core/chat_8k1k.json", - "RUNTIME_STACK_ID": "vllm-0.8.5-h200", - "HARDWARE_PROFILE_ID": "h200-8gpu", - "CANONICAL_MODEL_ID": "deepseek-r1-0528", - "SUPPORT_STATUS": "supported", - "REQUEST_MODE": "multi-turn", - "MAX_CONCURRENCY": "8", - "SPEC_DECODING": "none", - "IGNORE_WAITS": "true", - "GITHUB_REF": "refs/heads/test-isb1-traceability", - } - - -def run_script(tmp_path, env, replay_result, result_filename="isb1_result"): - result_file = tmp_path / f"{result_filename}.json" - result_file.write_text(json.dumps(replay_result)) - - env = env.copy() - env["RESULT_FILENAME"] = result_filename - - return subprocess.run( - [sys.executable, str(SCRIPT_PATH)], - cwd=tmp_path, - env=env, - capture_output=True, - text=True, - ) - - -def assert_traceability_fields( - output_data: dict, result_filename: str, dispatch_ref: str = "refs/heads/test-isb1-traceability" -): - assert output_data["result_filename"] == result_filename - assert output_data["artifact_stems"] == { - "processed": f"isb1_{result_filename}", - "raw_replay": f"replay_{result_filename}", - "server_logs": f"server_logs_{result_filename}", - "gpu_metrics": 
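# Fan-out convention checked here (sketch): one result stem yields four
# artifact names, so stem "isb1_result" maps to
#   processed    -> isb1_isb1_result (the prefix doubles when the stem
#                                     already starts with isb1_)
#   raw_replay   -> replay_isb1_result
#   server_logs  -> server_logs_isb1_result
#   gpu_metrics  -> gpu_metrics_isb1_result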
f"gpu_metrics_{result_filename}", - } - assert output_data["dispatch_ref"] == dispatch_ref - - -def test_isb1_replay_processing(tmp_path, sample_replay_result, base_env): - export_file = write_export_fixture( - tmp_path, - "datasets/isb1/exports/core/chat_8k1k.json", - { - "adapter_id": "inferencex_multiturn", - "bundle_id": "bundle-core-chat", - "surface": "chat", - "exports": [ - { - "trace_id": "trace-1", - "runtime_stack_id": "vllm-0.8.5-h200", - "hardware_profile_id": "h200-8gpu", - "canonical_model_id": "deepseek-r1-0528", - "support_status": "supported", - } - ], - }, - ) - env = base_env.copy() - env["EXPORT_FILE"] = export_file - - result = run_script(tmp_path, env, sample_replay_result) - assert result.returncode == 0, f"Script failed: {result.stderr}" - - output_data = json.loads(result.stdout) - - assert output_data["benchmark_type"] == "isb1_replay" - assert output_data["request_mode"] == "multi-turn" - assert output_data["harness_request_mode"] == "auto" - assert output_data["isl"] == 8192 - assert output_data["osl"] == 1024 - assert output_data["export_lane"] == "core" - assert output_data["benchmark_surface"] == "chat" - assert output_data["support_status"] == "supported" - assert output_data["benchmark_certification_status"] == "dataset_replay_verified" - assert output_data["effective_max_context_depth"] == 8192 + 1024 + 200 - assert output_data["context_pressure_class"] == "standard" - assert output_data["context_pressure_signal"]["status"] == "not_applicable" - assert output_data["context_pressure_suspicious"] is False - assert output_data["completed_sessions"] == 2 - assert output_data["session_throughput_sps"] == pytest.approx(1.0) - assert output_data["tput_per_gpu"] == pytest.approx(650.0 / 8) - assert output_data["output_tput_per_gpu"] == pytest.approx(150.0 / 8) - assert output_data["input_tput_per_gpu"] == pytest.approx((650.0 - 150.0) / 8) - assert output_data["median_ttft"] == pytest.approx(0.18) - assert output_data["median_intvty"] == pytest.approx(40.0) - assert output_data["median_e2el"] == pytest.approx(1.1) - assert output_data["kv_offload_observed"] is True - assert output_data["peak_gpu_cache_usage"] == pytest.approx(0.78) - assert output_data["peak_cpu_cache_usage"] == pytest.approx(0.31) - assert output_data["selection"]["request_mode_mix"] == {"chat": 2} - assert output_data["selection"]["support_status_counts"] == {"supported": 2} - assert output_data["per_turn_metrics"]["turn_1"]["completed"] == 2 - assert output_data["runtime_overrides"] == { - "vllm_cpu_offload_gb": None, - "vllm_swap_space_gb": None, - "sglang_mem_fraction_override": None, - "sglang_chunked_prefill_override": None, - } - assert_traceability_fields(output_data, "isb1_result") - - output_file = tmp_path / "agg_isb1_result.json" - assert output_file.exists() - persisted_output = json.loads(output_file.read_text()) - assert_traceability_fields(persisted_output, "isb1_result") - - -def test_offload_mode_env_propagation(tmp_path, sample_replay_result, base_env): - export_file = write_export_fixture( - tmp_path, - "datasets/isb1/exports/core/chat_8k1k.json", - { - "adapter_id": "inferencex_multiturn", - "surface": "chat", - "exports": [ - { - "trace_id": "trace-1", - "runtime_stack_id": "vllm-0.8.5-h200", - "hardware_profile_id": "h200-8gpu", - "canonical_model_id": "deepseek-r1-0528", - "support_status": "supported", - } - ], - }, - ) - env = base_env.copy() - env["EXPORT_FILE"] = export_file - env["OFFLOAD_MODE"] = "noprefix" - env["KV_CACHE_DTYPE"] = "fp8" - 
env["DISABLE_PREFIX_CACHING"] = "true" - - result = run_script(tmp_path, env, sample_replay_result, result_filename="isb1_offload_env") - assert result.returncode == 0, f"Script failed: {result.stderr}" - - output_data = json.loads(result.stdout) - assert output_data["offload_mode"] == "noprefix" - assert output_data["kv_cache_dtype"] == "fp8" - assert output_data["disable_prefix_caching"] is True - - -def test_support_status_mismatch_fails(tmp_path, sample_replay_result, base_env): - export_file = write_export_fixture( - tmp_path, - "datasets/isb1/exports/core/chat_8k1k.json", - { - "adapter_id": "inferencex_multiturn", - "surface": "chat", - "exports": [ - { - "trace_id": "trace-1", - "runtime_stack_id": "vllm-0.8.5-h200", - "hardware_profile_id": "h200-8gpu", - "canonical_model_id": "deepseek-r1-0528", - "support_status": "supported", - } - ], - }, - ) - replay_result = { - **sample_replay_result, - "selection": { - **sample_replay_result["selection"], - "support_statuses": ["supported"], - "support_status_counts": {"supported": 2}, - }, - } - env = base_env.copy() - env["EXPORT_FILE"] = export_file - env["SUPPORT_STATUS"] = "reviewed_preview" - - result = run_script(tmp_path, env, replay_result, result_filename="isb1_mismatch") - assert result.returncode != 0 - assert "support-status mismatch" in result.stderr - - -def test_certification_status_mismatch_fails(tmp_path, sample_replay_result, base_env): - export_file = write_export_fixture( - tmp_path, - "datasets/isb1/exports/core/chat_8k1k.json", - { - "adapter_id": "inferencex_multiturn", - "surface": "chat", - "exports": [ - { - "trace_id": "trace-1", - "runtime_stack_id": "vllm-0.8.5-h200", - "hardware_profile_id": "h200-8gpu", - "canonical_model_id": "deepseek-r1-0528", - "support_status": "supported", - "benchmark_certification_status": "dataset_replay_verified", - } - ], - }, - ) - replay_result = { - **sample_replay_result, - "selection": { - **sample_replay_result["selection"], - "benchmark_certification_statuses": ["pending_review"], - "benchmark_certification_status_counts": {"pending_review": 2}, - }, - } - env = base_env.copy() - env["EXPORT_FILE"] = export_file - - result = run_script(tmp_path, env, replay_result, result_filename="isb1_cert_mismatch") - assert result.returncode != 0 - assert "benchmark-certification mismatch" in result.stderr - - -def test_missing_required_env_vars_fails(tmp_path, sample_replay_result): - result_file = tmp_path / "isb1_result.json" - result_file.write_text(json.dumps(sample_replay_result)) - - result = subprocess.run( - [sys.executable, str(SCRIPT_PATH)], - cwd=tmp_path, - env={"PATH": "/usr/bin", "RESULT_FILENAME": "isb1_result"}, - capture_output=True, - text=True, - ) - - assert result.returncode != 0 - assert "Missing required environment variables" in result.stderr - - -def test_dispatch_ref_prefers_explicit_override(tmp_path, sample_replay_result, base_env): - export_file = write_export_fixture( - tmp_path, - "datasets/isb1/exports/core/chat_8k1k.json", - { - "adapter_id": "inferencex_multiturn", - "bundle_id": "bundle-core-chat", - "surface": "chat", - "exports": [ - { - "trace_id": "trace-1", - "runtime_stack_id": "vllm-0.8.5-h200", - "hardware_profile_id": "h200-8gpu", - "canonical_model_id": "deepseek-r1-0528", - "support_status": "supported", - } - ], - }, - ) - env = base_env.copy() - env["EXPORT_FILE"] = export_file - env["DISPATCH_REF"] = "refs/tags/isb1-dispatch-override" - - result = run_script(tmp_path, env, sample_replay_result, result_filename="isb1_dispatch_override") - 
assert result.returncode == 0, f"Script failed: {result.stderr}" - - output_data = json.loads(result.stdout) - assert_traceability_fields( - output_data, - "isb1_dispatch_override", - dispatch_ref="refs/tags/isb1-dispatch-override", - ) - - -def test_preview_offload_core_processing(tmp_path, sample_replay_result, base_env): - preview_export = ( - write_export_fixture( - tmp_path, - "datasets/isb1/exports/preview/offload_core/" - "inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json", - { - "adapter_id": "inferencex_multiturn", - "profile_id": "chat_hopper_blackwell_offload_core_v1", - "duration_tier": "smoke", - "adapter_surface": "chat", - "tier": "reviewed_preview", - "adapter_support_status": "reviewed_preview", - "exports": [ - { - "context_band": "lc1_8k_16k", - }, - { - "context_band": "lc3_96k_128k", - }, - ], - "producer_handoff_metadata": { - "class": "phase_2_offload_core_preview", - "claim_boundary": "Not blanket certification.", - }, - }, - ) - ) - - env = base_env.copy() - env["EXPORT_FILE"] = preview_export - env["SUPPORT_STATUS"] = "reviewed_preview" - env["MAX_MODEL_LEN"] = "131272" - replay_result = { - **sample_replay_result, - "selection": { - **sample_replay_result["selection"], - "support_statuses": ["reviewed_preview"], - "support_status_counts": {"reviewed_preview": 2}, - }, - } - - result = run_script(tmp_path, env, replay_result, result_filename="isb1_preview") - assert result.returncode == 0, f"Script failed: {result.stderr}" - - output_data = json.loads(result.stdout) - assert output_data["export_lane"] == "preview/offload_core" - assert output_data["benchmark_surface"] == "chat" - assert output_data["profile_id"] == "chat_hopper_blackwell_offload_core_v1" - assert output_data["duration_tier"] == "smoke" - assert output_data["context_bands"] == ["lc1_8k_16k", "lc3_96k_128k"] - assert output_data["producer_handoff_class"] == "phase_2_offload_core_preview" - assert output_data["support_status"] == "reviewed_preview" - assert output_data["isl"] == 0 - assert output_data["osl"] == 0 - assert_traceability_fields(output_data, "isb1_preview") - - -def test_qwen_500k_preview_processing_preserves_served_shape_and_context_band( - tmp_path, sample_replay_result, base_env -): - preview_export = write_export_fixture( - tmp_path, - "datasets/isb1/exports/preview/long_context_500k/" - "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json", - { - "adapter_id": "inferencex_trace_replay", - "bundle_id": "isb1_preview_long_context_500k_vllm_code_xlc2_qwen3_5", - "profile_id": "coding_qwen3.5_xlc2_500k_preview_v1", - "duration_tier": "standard", - "surface": "code", - "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, - "tier": "reviewed_preview", - "adapter_support_status": "reviewed_preview", - "producer_handoff_metadata": { - "class": "bounded_500k_class", - "claim_boundary": "Replay-derived 500k preview only.", - }, - "exports": [ - { - "context_band": "xlc2_384k_512k", - "support_status": "reviewed_preview", - "benchmark_certification_status": "dataset_replay_verified", - "runtime_stack_id": "standalone:vllm", - "hardware_profile_id": "nvidia:b200_sxm_180gb", - "canonical_model_id": "qwen3_5_397b_a17b", - "kv_mode": "offload_cliff", - }, - { - "context_band": "xlc2_384k_512k", - "support_status": "reviewed_preview", - "benchmark_certification_status": "dataset_replay_verified", - "runtime_stack_id": "standalone:vllm", - "hardware_profile_id": "nvidia:h100_sxm_80gb", - "canonical_model_id": "qwen3_5_397b_a17b", - 
"kv_mode": "offload_cliff", - }, - { - "context_band": "xlc2_384k_512k", - "support_status": "reviewed_preview", - "benchmark_certification_status": "dataset_replay_verified", - "runtime_stack_id": "standalone:vllm", - "hardware_profile_id": "nvidia:h200_sxm_141gb", - "canonical_model_id": "qwen3_5_397b_a17b", - "kv_mode": "offload_cliff", - }, - ], - }, - ) - - env = base_env.copy() - env.update( - { - "RUNNER_TYPE": "b200-cw-1", - "FRAMEWORK": "vllm", - "MODEL_PREFIX": "qwen3.5", - "IMAGE": "vllm/vllm-openai:v0.8.5", - "EXPORT_FILE": preview_export, - "RUNTIME_STACK_ID": "standalone:vllm", - "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", - "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", - "SUPPORT_STATUS": "reviewed_preview", - "MAX_MODEL_LEN": "524288", - "VLLM_CPU_OFFLOAD_GB": "120", - "VLLM_SWAP_SPACE_GB": "24", - } - ) - replay_result = { - **sample_replay_result, - "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", - "vllm_cpu_offload_gb": "128", - "vllm_swap_space_gb": "32", - "selection": { - **sample_replay_result["selection"], - "runtime_stack_ids": ["standalone:vllm"], - "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], - "canonical_model_ids": ["qwen3_5_397b_a17b"], - "support_statuses": ["reviewed_preview"], - "support_status_counts": {"reviewed_preview": 3}, - "request_mode_mix": {"code": 3}, - }, - } - - result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k") - assert result.returncode == 0, f"Script failed: {result.stderr}" - - output_data = json.loads(result.stdout) - assert output_data["export_lane"] == "preview/long_context_500k" - assert output_data["benchmark_surface"] == "code" - assert output_data["profile_id"] == "coding_qwen3.5_xlc2_500k_preview_v1" - assert output_data["context_bands"] == ["xlc2_384k_512k"] - assert output_data["producer_handoff_class"] == "bounded_500k_class" - assert output_data["support_status"] == "reviewed_preview" - assert output_data["benchmark_certification_status"] == "dataset_replay_verified" - assert output_data["isl"] == 131072 - assert output_data["osl"] == 1024 - assert output_data["max_model_len"] == 524288 - assert output_data["effective_max_context_depth"] == 524288 - assert output_data["context_pressure_class"] == "extended_500k" - assert output_data["context_pressure_signal"]["status"] == "ok" - assert output_data["context_pressure_suspicious"] is False - assert output_data["kv_offload_observed"] is True - assert output_data["runtime_overrides"] == { - "vllm_cpu_offload_gb": "128", - "vllm_swap_space_gb": "32", - "sglang_mem_fraction_override": None, - "sglang_chunked_prefill_override": None, - } - assert_traceability_fields(output_data, "isb1_qwen_500k") - - -def test_qwen_1m_preview_processing_preserves_8k_served_shape_and_offload_metadata( - tmp_path, sample_replay_result, base_env -): - preview_export = write_export_fixture( - tmp_path, - "datasets/isb1/exports/preview/long_context_1m/" - "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json", - { - "adapter_id": "inferencex_trace_replay", - "bundle_id": "isb1_preview_long_context_1m_vllm_code_ulc2_qwen3_5", - "profile_id": "coding_qwen3.5_ulc2_1m_preview_v1", - "duration_tier": "standard", - "surface": "code", - "served_shape": {"shape_family": "8k1k", "isl": 8192, "osl": 1024}, - "tier": "reviewed_preview", - "adapter_support_status": "reviewed_preview", - "producer_handoff_metadata": { - "class": "bounded_1m_class", - "claim_boundary": "Manual 1M preview only.", - }, - "exports": [ - { - "context_band": "ulc2_1m_plus", - "support_status": 
"reviewed_preview", - "benchmark_certification_status": "dataset_replay_verified", - "runtime_stack_id": "standalone:vllm", - "hardware_profile_id": "nvidia:b200_sxm_180gb", - "canonical_model_id": "qwen3_5_397b_a17b", - "kv_mode": "offload_cliff", - } - ], - }, - ) - - env = base_env.copy() - env.update( - { - "RUNNER_TYPE": "b200-cw-1", - "FRAMEWORK": "vllm", - "MODEL_PREFIX": "qwen3.5", - "IMAGE": "vllm/vllm-openai:v0.8.5", - "EXPORT_FILE": preview_export, - "RUNTIME_STACK_ID": "standalone:vllm", - "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", - "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", - "SUPPORT_STATUS": "reviewed_preview", - "MAX_MODEL_LEN": "1048576", - "MAX_SESSIONS": "1", - "MAX_TURNS_PER_SESSION": "3", - } - ) - replay_result = { - **sample_replay_result, - "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", - "selection": { - **sample_replay_result["selection"], - "runtime_stack_ids": ["standalone:vllm"], - "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], - "canonical_model_ids": ["qwen3_5_397b_a17b"], - "support_statuses": ["reviewed_preview"], - "support_status_counts": {"reviewed_preview": 1}, - "request_mode_mix": {"code": 1}, - }, - } - - result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_1m") - assert result.returncode == 0, f"Script failed: {result.stderr}" - - output_data = json.loads(result.stdout) - assert output_data["export_lane"] == "preview/long_context_1m" - assert output_data["benchmark_surface"] == "code" - assert output_data["profile_id"] == "coding_qwen3.5_ulc2_1m_preview_v1" - assert output_data["context_bands"] == ["ulc2_1m_plus"] - assert output_data["producer_handoff_class"] == "bounded_1m_class" - assert output_data["support_status"] == "reviewed_preview" - assert output_data["benchmark_certification_status"] == "dataset_replay_verified" - assert output_data["isl"] == 8192 - assert output_data["osl"] == 1024 - assert output_data["max_model_len"] == 1048576 - assert output_data["effective_max_context_depth"] == 1048576 - assert output_data["context_pressure_class"] == "extended_1m" - assert output_data["context_pressure_signal"]["status"] == "ok" - assert output_data["context_pressure_suspicious"] is False - assert output_data["max_sessions"] == 1 - assert output_data["max_turns_per_session"] == 3 - assert output_data["kv_offload_observed"] is True - assert_traceability_fields(output_data, "isb1_qwen_1m") - - -def test_context_pressure_warning_on_high_context_without_cpu_cache( - tmp_path, sample_replay_result, base_env -): - preview_export = write_export_fixture( - tmp_path, - "datasets/isb1/exports/preview/long_context_500k/" - "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json", - { - "adapter_id": "inferencex_trace_replay", - "bundle_id": "isb1_preview_long_context_500k_vllm_code_xlc2_qwen3_5", - "profile_id": "coding_qwen3.5_xlc2_500k_preview_v1", - "duration_tier": "standard", - "surface": "code", - "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, - "tier": "reviewed_preview", - "adapter_support_status": "reviewed_preview", - "exports": [ - { - "context_band": "xlc2_384k_512k", - "support_status": "reviewed_preview", - "benchmark_certification_status": "dataset_replay_verified", - "runtime_stack_id": "standalone:vllm", - "hardware_profile_id": "nvidia:b200_sxm_180gb", - "canonical_model_id": "qwen3_5_397b_a17b", - "kv_mode": "offload_cliff", - } - ], - }, - ) - - env = base_env.copy() - env.update( - { - "RUNNER_TYPE": "b200-cw-1", - "FRAMEWORK": "vllm", - "MODEL_PREFIX": "qwen3.5", - 
"IMAGE": "vllm/vllm-openai:v0.8.5", - "EXPORT_FILE": preview_export, - "RUNTIME_STACK_ID": "standalone:vllm", - "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", - "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", - "SUPPORT_STATUS": "reviewed_preview", - "MAX_MODEL_LEN": "524288", - } - ) - replay_result = { - **sample_replay_result, - "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", - "selection": { - **sample_replay_result["selection"], - "runtime_stack_ids": ["standalone:vllm"], - "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], - "canonical_model_ids": ["qwen3_5_397b_a17b"], - "support_statuses": ["reviewed_preview"], - "support_status_counts": {"reviewed_preview": 1}, - "request_mode_mix": {"code": 1}, - }, - "server_metrics_summary": { - "cache_usage_avg": 0.45, - "cache_hit_rate_avg": 0.15, - "gpu_cache_usage_avg": 0.45, - "gpu_cache_usage_peak": 0.91, - "gpu_cache_metric_name": "vllm:gpu_cache_usage_perc", - "cpu_cache_usage_avg": 0.0, - "cpu_cache_usage_peak": 0.0, - "cpu_cache_metric_name": "vllm:cpu_cache_usage_perc", - "cpu_cache_metric_available": True, - "observability_status": "direct_cpu_cache_metric", - "kv_offload_observed": False, - "samples": 5, - }, - } - - result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k_warn") - assert result.returncode == 0, f"Script failed: {result.stderr}" - assert "saw no CPU cache usage" in result.stderr - - output_data = json.loads(result.stdout) - assert output_data["context_pressure_signal"]["status"] == "suspicious" - assert output_data["context_pressure_suspicious"] is True - assert_traceability_fields(output_data, "isb1_qwen_500k_warn") - - -def test_context_pressure_signal_marks_sglang_observability_gap( - tmp_path, sample_replay_result, base_env -): - preview_export = write_export_fixture( - tmp_path, - "datasets/isb1/exports/preview/long_context_500k/" - "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json", - { - "adapter_id": "inferencex_trace_replay", - "bundle_id": "isb1_preview_long_context_500k_sglang_code_xlc2_qwen3_5", - "profile_id": "coding_qwen3.5_xlc2_500k_preview_v1", - "duration_tier": "standard", - "surface": "code", - "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, - "tier": "reviewed_preview", - "adapter_support_status": "reviewed_preview", - "exports": [ - { - "context_band": "xlc2_384k_512k", - "support_status": "reviewed_preview", - "benchmark_certification_status": "dataset_replay_verified", - "runtime_stack_id": "standalone:sglang", - "hardware_profile_id": "nvidia:b200_sxm_180gb", - "canonical_model_id": "qwen3_5_397b_a17b", - "kv_mode": "offload_cliff", - } - ], - }, - ) - - env = base_env.copy() - env.update( - { - "RUNNER_TYPE": "b200-cw-1", - "FRAMEWORK": "sglang", - "MODEL_PREFIX": "qwen3.5", - "IMAGE": "lmsysorg/sglang:v0.5.9-cu130", - "EXPORT_FILE": preview_export, - "RUNTIME_STACK_ID": "standalone:sglang", - "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", - "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", - "SUPPORT_STATUS": "reviewed_preview", - "MAX_MODEL_LEN": "524288", - "SGLANG_MEM_FRACTION_OVERRIDE": "0.77", - "SGLANG_CHUNKED_PREFILL_OVERRIDE": "65536", - } - ) - replay_result = { - **sample_replay_result, - "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", - "selection": { - **sample_replay_result["selection"], - "runtime_stack_ids": ["standalone:sglang"], - "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], - "canonical_model_ids": ["qwen3_5_397b_a17b"], - "support_statuses": ["reviewed_preview"], - "support_status_counts": {"reviewed_preview": 
1}, - "request_mode_mix": {"code": 1}, - }, - "server_metrics_summary": { - "cache_usage_avg": 0.52, - "cache_hit_rate_avg": 0.23, - "gpu_cache_usage_avg": 0.52, - "gpu_cache_usage_peak": 0.88, - "gpu_cache_metric_name": "sglang:token_usage", - "cpu_cache_usage_avg": 0.0, - "cpu_cache_usage_peak": 0.0, - "cpu_cache_metric_name": None, - "cpu_cache_metric_available": False, - "observability_status": "indirect_without_cpu_cache_metric", - "kv_offload_observed": False, - "samples": 5, - }, - } - - result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k_sglang") - assert result.returncode == 0, f"Script failed: {result.stderr}" - assert "lacks a direct CPU cache metric" in result.stderr - - output_data = json.loads(result.stdout) - assert output_data["context_pressure_signal"]["status"] == "observability_gap" - assert output_data["context_pressure_signal"]["requires_log_review"] is True - assert output_data["context_pressure_suspicious"] is False - assert output_data["runtime_overrides"] == { - "vllm_cpu_offload_gb": None, - "vllm_swap_space_gb": None, - "sglang_mem_fraction_override": "0.77", - "sglang_chunked_prefill_override": "65536", - } - assert_traceability_fields(output_data, "isb1_qwen_500k_sglang") - - -def test_depth_coverage_ratio_for_500k_preview(tmp_path, base_env, sample_replay_result): - """Verify depth coverage ratio and class for a 500k preview with 131k actual tokens.""" - export_payload = { - "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, - "surface": "code", - "exports": [ - { - "runtime_stack_id": "standalone:vllm", - "hardware_profile_id": "h200-8gpu", - "canonical_model_id": "qwen3_5_397b_a17b", - "support_status": "reviewed_preview", - "benchmark_certification_status": "dataset_replay_verified", - "context_band": "xlc2_384k_512k", - "trace_metadata": { - "estimated_kv_bytes_peak": 27294647296, - "context_pressure_profile": { - "expected_offload_mode": "soft_offload", - }, - "expected_offload_mode": "soft_offload", - }, - } - ], - } - export_file = write_export_fixture( - tmp_path, "datasets/isb1/exports/preview/long_context_500k/test_500k.json", export_payload - ) - - env = base_env.copy() - env["EXPORT_FILE"] = export_file - env["MODEL_PREFIX"] = "qwen3.5" - env["CANONICAL_MODEL_ID"] = "qwen3_5_397b_a17b" - env["SUPPORT_STATUS"] = "reviewed_preview" - env["MAX_MODEL_LEN"] = "524288" - env["FRAMEWORK"] = "vllm" - - replay_result = sample_replay_result.copy() - replay_result["selection"] = { - **replay_result["selection"], - "support_statuses": ["reviewed_preview"], - } - replay_result["server_metrics_summary"] = { - "gpu_cache_usage_avg": 0.35, - "gpu_cache_usage_peak": 0.42, - "cpu_cache_usage_avg": 0.15, - "cpu_cache_usage_peak": 0.25, - "cpu_cache_metric_available": True, - "observability_status": "direct_cpu_cache_metric", - "kv_offload_observed": True, - "samples": 10, - } - replay_result["depth_telemetry"] = { - "total_estimated_input_tokens": 500000, - "total_actual_input_tokens": 131072, - "max_actual_context_len_per_turn": 131072, - } - - result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k_depth") - assert result.returncode == 0, f"Script failed: {result.stderr}" - - output_data = json.loads(result.stdout) - - # Depth coverage ratio: 131072 / 524288 ≈ 0.25 - assert output_data["depth_coverage_ratio"] is not None - assert 0.24 < output_data["depth_coverage_ratio"] < 0.26 - assert output_data["depth_coverage_class"] == "bounded_preview" - assert 
output_data["max_actual_context_len_per_turn"] == 131072 - assert output_data["depth_gap_tokens"] == 524288 - 131072 - - # Producer expectation validation - assert output_data["producer_estimated_kv_bytes_peak"] == 27294647296 - assert output_data["producer_expected_offload_mode"] == "soft_offload" - assert output_data["producer_expectation_validation"]["offload_mode_match"] is True - assert output_data["producer_expectation_validation"]["depth_exercised"] is False - - # Preemption count - assert output_data["preemption_count"] == 0 - - -def test_depth_mismatch_warning_for_configuration_only(tmp_path, base_env, sample_replay_result): - """Verify depth_mismatch status when actual context is <10% of configured.""" - export_payload = { - "served_shape": {"shape_family": "8k1k", "isl": 8192, "osl": 1024}, - "surface": "code", - "exports": [ - { - "runtime_stack_id": "standalone:vllm", - "hardware_profile_id": "h200-8gpu", - "canonical_model_id": "qwen3_5_397b_a17b", - "support_status": "reviewed_preview", - "benchmark_certification_status": "dataset_replay_verified", - "context_band": "ulc2_1m_plus", - "trace_metadata": { - "estimated_kv_bytes_peak": 39500000000, - "expected_offload_mode": "hard_offload", - }, - } - ], - } - export_file = write_export_fixture( - tmp_path, "datasets/isb1/exports/preview/long_context_1m/test_1m.json", export_payload - ) - - env = base_env.copy() - env["EXPORT_FILE"] = export_file - env["MODEL_PREFIX"] = "qwen3.5" - env["CANONICAL_MODEL_ID"] = "qwen3_5_397b_a17b" - env["SUPPORT_STATUS"] = "reviewed_preview" - env["MAX_MODEL_LEN"] = "1048576" - env["FRAMEWORK"] = "vllm" - - replay_result = sample_replay_result.copy() - replay_result["selection"] = { - **replay_result["selection"], - "support_statuses": ["reviewed_preview"], - } - replay_result["server_metrics_summary"] = { - "gpu_cache_usage_avg": 0.10, - "gpu_cache_usage_peak": 0.15, - "cpu_cache_usage_avg": 0.05, - "cpu_cache_usage_peak": 0.10, - "cpu_cache_metric_available": True, - "observability_status": "direct_cpu_cache_metric", - "kv_offload_observed": True, - "samples": 5, - } - # 1M preview sends only 8k actual tokens - replay_result["depth_telemetry"] = { - "total_estimated_input_tokens": 1600000, - "total_actual_input_tokens": 8192, - "max_actual_context_len_per_turn": 8192, - } - - result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_1m_depth") - assert result.returncode == 0, f"Script failed: {result.stderr}" - - output_data = json.loads(result.stdout) - - # 8192 / 1048576 ≈ 0.0078 — less than 0.1 threshold - assert output_data["depth_coverage_ratio"] < 0.01 - assert output_data["depth_coverage_class"] == "configuration_only" - assert output_data["context_pressure_signal"]["status"] == "depth_mismatch" - assert output_data["context_pressure_signal"]["reason"] == "configured_depth_not_exercised" - assert "depth_coverage_ratio" in output_data["context_pressure_signal"] - assert "configured for" in result.stderr - - -def test_producer_expectation_offload_mismatch(tmp_path, base_env, sample_replay_result): - """Verify producer expectation validation when offload is expected but not observed.""" - export_payload = { - "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, - "surface": "code", - "exports": [ - { - "runtime_stack_id": "standalone:vllm", - "hardware_profile_id": "h200-8gpu", - "canonical_model_id": "gpt_oss_120b", - "support_status": "reviewed_preview", - "benchmark_certification_status": "dataset_replay_verified", - "context_band": "xlc2_384k_512k", 
- "trace_metadata": { - "estimated_kv_bytes_peak": 27000000000, - "context_pressure_profile": { - "expected_offload_mode": "hard_offload", - }, - }, - } - ], - } - export_file = write_export_fixture( - tmp_path, "datasets/isb1/exports/preview/long_context_500k/test_mismatch.json", export_payload - ) - - env = base_env.copy() - env["EXPORT_FILE"] = export_file - env["MODEL_PREFIX"] = "gptoss" - env["CANONICAL_MODEL_ID"] = "gpt_oss_120b" - env["SUPPORT_STATUS"] = "reviewed_preview" - env["MAX_MODEL_LEN"] = "524288" - - replay_result = sample_replay_result.copy() - replay_result["selection"] = { - **replay_result["selection"], - "support_statuses": ["reviewed_preview"], - } - replay_result["server_metrics_summary"] = { - "gpu_cache_usage_avg": 0.50, - "gpu_cache_usage_peak": 0.60, - "cpu_cache_usage_avg": 0.0, - "cpu_cache_usage_peak": 0.0, - "cpu_cache_metric_available": True, - "observability_status": "direct_cpu_cache_metric", - "kv_offload_observed": False, - "samples": 10, - } - replay_result["depth_telemetry"] = { - "total_estimated_input_tokens": 400000, - "total_actual_input_tokens": 131072, - "max_actual_context_len_per_turn": 131072, - } - - result = run_script(tmp_path, env, replay_result, result_filename="isb1_mismatch") - assert result.returncode == 0, f"Script failed: {result.stderr}" - - output_data = json.loads(result.stdout) - - # Producer expected hard_offload, but kv_offload_observed is False - assert output_data["producer_expectation_validation"]["offload_mode_match"] is False - assert output_data["producer_expected_offload_mode"] == "hard_offload" - assert output_data["kv_offload_observed"] is False From d53bd3b6c8bfb8b295f84840053b0530123fcb51 Mon Sep 17 00:00:00 2001 From: William Chen <57119977+OCWC22@users.noreply.github.com> Date: Mon, 20 Apr 2026 21:49:26 -0700 Subject: [PATCH 12/18] data(isb1): ship 179 pre-converted kv-cache-tester trace JSONs Fold Track A into PR 1032. Consumers now point Cam's trace_replay_tester.py directly at datasets/isb1/converted/ with no conversion step: python $KV_CACHE_TESTER_DIR/trace_replay_tester.py --trace-directory datasets/isb1/converted/ --tokenizer Qwen/Qwen2.5-Coder-32B-Instruct --block-size 64 179 traces across 23 bundles span 6 context scales (8k/32k/64k/131k/500k/1M) and multi-model coverage (Kimi K2.5, DSR1, GPT-OSS, Qwen3.5). 
Co-Authored-By: Claude Opus 4.7 --- datasets/isb1/.gitattributes | 1 + datasets/isb1/README.md | 17 +++++++++++++++++ .../isb1_sess_chat_lc3_contract_review_01.json | 3 +++ ...1_sess_chat_lc3_contract_review_01_0013.json | 3 +++ ...1_sess_chat_lc3_contract_review_01_0014.json | 3 +++ ...1_sess_chat_lc3_contract_review_01_0015.json | 3 +++ ...1_sess_chat_lc3_contract_review_01_0016.json | 3 +++ ...1_sess_chat_lc3_contract_review_01_0017.json | 3 +++ ...1_sess_chat_lc3_contract_review_01_0018.json | 3 +++ ...1_sess_chat_lc3_contract_review_01_0019.json | 3 +++ ...1_sess_chat_lc3_contract_review_01_0020.json | 3 +++ ...1_sess_chat_lc3_contract_review_01_0021.json | 3 +++ ...1_sess_chat_lc3_contract_review_01_0022.json | 3 +++ ...1_sess_chat_lc3_contract_review_01_0023.json | 3 +++ .../isb1_sess_tool_free_memory_resume_001.json | 3 +++ ...1_sess_tool_free_memory_resume_001_0001.json | 3 +++ ...1_sess_tool_free_memory_resume_001_0002.json | 3 +++ ...1_sess_tool_free_memory_resume_001_0003.json | 3 +++ ...1_sess_tool_free_memory_resume_001_0004.json | 3 +++ ...1_sess_tool_free_memory_resume_001_0005.json | 3 +++ ...1_sess_tool_free_memory_resume_001_0006.json | 3 +++ ...1_sess_tool_free_memory_resume_001_0007.json | 3 +++ ...1_sess_tool_free_memory_resume_001_0008.json | 3 +++ ...1_sess_tool_free_memory_resume_001_0009.json | 3 +++ ...1_sess_tool_free_memory_resume_001_0010.json | 3 +++ ...1_sess_tool_free_memory_resume_001_0011.json | 3 +++ .../isb1_sess_chat_lc3_contract_review_01.json | 3 +++ ...1_sess_chat_lc3_contract_review_01_0001.json | 3 +++ ...1_sess_chat_lc3_contract_review_01_0002.json | 3 +++ ...1_sess_code_ca1_agent_benchmark_plan_01.json | 3 +++ ...s_code_ca1_agent_benchmark_plan_01_0025.json | 3 +++ ...s_code_ca1_agent_benchmark_plan_01_0026.json | 3 +++ ...s_code_ca1_agent_benchmark_plan_01_0027.json | 3 +++ ...s_code_ca1_agent_benchmark_plan_01_0028.json | 3 +++ ...s_code_ca1_agent_benchmark_plan_01_0029.json | 3 +++ ...s_code_ca1_agent_benchmark_plan_01_0030.json | 3 +++ ...s_code_ca1_agent_benchmark_plan_01_0031.json | 3 +++ ...s_code_ca1_agent_benchmark_plan_01_0032.json | 3 +++ ...s_code_ca1_agent_benchmark_plan_01_0033.json | 3 +++ ...s_code_ca1_agent_benchmark_plan_01_0034.json | 3 +++ ...s_code_ca1_agent_benchmark_plan_01_0035.json | 3 +++ .../isb1_sess_debug_repair_repo_001.json | 3 +++ .../isb1_sess_debug_repair_repo_001_0001.json | 3 +++ .../isb1_sess_debug_repair_repo_001_0002.json | 3 +++ .../isb1_sess_debug_repair_repo_001_0003.json | 3 +++ .../isb1_sess_debug_repair_repo_001_0004.json | 3 +++ .../isb1_sess_debug_repair_repo_001_0005.json | 3 +++ .../isb1_sess_debug_repair_repo_001_0006.json | 3 +++ .../isb1_sess_debug_repair_repo_001_0007.json | 3 +++ .../isb1_sess_debug_repair_repo_001_0008.json | 3 +++ .../isb1_sess_debug_repair_repo_001_0009.json | 3 +++ .../isb1_sess_debug_repair_repo_001_0010.json | 3 +++ .../isb1_sess_debug_repair_repo_001_0011.json | 3 +++ .../code_8k1k/isb1_sess_offload_cliff_9982.json | 3 +++ .../isb1_sess_offload_cliff_9982_0013.json | 3 +++ .../isb1_sess_offload_cliff_9982_0014.json | 3 +++ .../isb1_sess_offload_cliff_9982_0015.json | 3 +++ .../isb1_sess_offload_cliff_9982_0016.json | 3 +++ .../isb1_sess_offload_cliff_9982_0017.json | 3 +++ .../isb1_sess_offload_cliff_9982_0018.json | 3 +++ .../isb1_sess_offload_cliff_9982_0019.json | 3 +++ .../isb1_sess_offload_cliff_9982_0020.json | 3 +++ .../isb1_sess_offload_cliff_9982_0021.json | 3 +++ .../isb1_sess_offload_cliff_9982_0022.json | 3 +++ .../isb1_sess_offload_cliff_9982_0023.json | 3 
+++ ...1_sess_code_ca1_agent_benchmark_plan_01.json | 3 +++ ...s_code_ca1_agent_benchmark_plan_01_0001.json | 3 +++ ...s_code_ca1_agent_benchmark_plan_01_0002.json | 3 +++ .../isb1_sess_xlc1_text_resume_bridge_01.json | 3 +++ ...b1_sess_xlc1_text_resume_bridge_01_0001.json | 3 +++ ...b1_sess_xlc1_text_resume_bridge_01_0002.json | 3 +++ ...b1_sess_xlc1_text_resume_bridge_01_0003.json | 3 +++ ...b1_sess_xlc1_text_resume_bridge_01_0004.json | 3 +++ ...b1_sess_xlc1_text_resume_bridge_01_0005.json | 3 +++ ...b1_sess_xlc1_text_resume_bridge_01_0006.json | 3 +++ ...b1_sess_xlc1_text_resume_bridge_01_0007.json | 3 +++ ...b1_sess_xlc1_text_resume_bridge_01_0008.json | 3 +++ ...b1_sess_xlc1_text_resume_bridge_01_0009.json | 3 +++ ...b1_sess_xlc1_text_resume_bridge_01_0010.json | 3 +++ ...b1_sess_xlc1_text_resume_bridge_01_0011.json | 3 +++ .../isb1_sess_xlc1_text_resume_bridge_01.json | 3 +++ ...b1_sess_xlc1_text_resume_bridge_01_0001.json | 3 +++ ...b1_sess_xlc1_text_resume_bridge_01_0002.json | 3 +++ .../isb1_sess_xlc1_text_resume_bridge_01.json | 3 +++ ...b1_sess_xlc1_text_resume_bridge_01_0001.json | 3 +++ ...b1_sess_xlc1_text_resume_bridge_01_0002.json | 3 +++ ..._cache_xlc1_text_shared_prefix_swarm_01.json | 3 +++ ..._cache_xlc1_text_shared_prefix_swarm_01.json | 3 +++ .../isb1_sess_chat_lc2_resume_reasoning_01.json | 3 +++ ..._sess_chat_lc2_resume_reasoning_01_0001.json | 3 +++ ..._sess_chat_lc2_resume_reasoning_01_0002.json | 3 +++ ..._sess_chat_lc2_resume_reasoning_01_0003.json | 3 +++ ..._sess_chat_lc2_resume_reasoning_01_0004.json | 3 +++ ..._sess_chat_lc2_resume_reasoning_01_0005.json | 3 +++ ..._sess_chat_lc2_resume_reasoning_01_0006.json | 3 +++ ..._sess_chat_lc2_resume_reasoning_01_0007.json | 3 +++ ..._sess_chat_lc2_resume_reasoning_01_0008.json | 3 +++ ..._sess_chat_lc2_resume_reasoning_01_0009.json | 3 +++ ..._sess_chat_lc2_resume_reasoning_01_0010.json | 3 +++ ..._sess_chat_lc2_resume_reasoning_01_0011.json | 3 +++ .../isb1_sess_chat_lc2_resume_reasoning_01.json | 3 +++ ..._sess_chat_lc2_resume_reasoning_01_0001.json | 3 +++ ..._sess_chat_lc2_resume_reasoning_01_0002.json | 3 +++ .../code_32k1k/isb1_sess_2c2a96a7.json | 3 +++ .../code_32k1k/isb1_sess_2c2a96a7_0001.json | 3 +++ .../code_32k1k/isb1_sess_2c2a96a7_0002.json | 3 +++ .../code_32k1k/isb1_sess_2c2a96a7_0003.json | 3 +++ .../code_32k1k/isb1_sess_2c2a96a7_0004.json | 3 +++ .../code_32k1k/isb1_sess_2c2a96a7_0005.json | 3 +++ .../code_32k1k/isb1_sess_2c2a96a7_0006.json | 3 +++ .../code_32k1k/isb1_sess_2c2a96a7_0007.json | 3 +++ .../code_32k1k/isb1_sess_2c2a96a7_0008.json | 3 +++ .../code_32k1k/isb1_sess_2c2a96a7_0009.json | 3 +++ .../code_32k1k/isb1_sess_2c2a96a7_0010.json | 3 +++ .../code_32k1k/isb1_sess_2c2a96a7_0011.json | 3 +++ .../isb1_sess_doc_comp_fanout_01.json | 3 +++ .../isb1_sess_doc_comp_fanout_01_0013.json | 3 +++ .../isb1_sess_doc_comp_fanout_01_0014.json | 3 +++ .../isb1_sess_doc_comp_fanout_01_0015.json | 3 +++ .../isb1_sess_doc_comp_fanout_01_0016.json | 3 +++ .../isb1_sess_doc_comp_fanout_01_0017.json | 3 +++ .../isb1_sess_doc_comp_fanout_01_0018.json | 3 +++ .../isb1_sess_doc_comp_fanout_01_0019.json | 3 +++ .../isb1_sess_doc_comp_fanout_01_0020.json | 3 +++ .../isb1_sess_doc_comp_fanout_01_0021.json | 3 +++ .../isb1_sess_doc_comp_fanout_01_0022.json | 3 +++ .../isb1_sess_doc_comp_fanout_01_0023.json | 3 +++ .../code_32k1k_qwen3.5/isb1_sess_2c2a96a7.json | 3 +++ .../isb1_sess_2c2a96a7_0001.json | 3 +++ .../isb1_sess_2c2a96a7_0002.json | 3 +++ .../isb1_sess_doc_comp_fanout_01.json | 3 +++ 
.../isb1_sess_doc_comp_fanout_01_0004.json | 3 +++ .../isb1_sess_doc_comp_fanout_01_0005.json | 3 +++ ...sb1_sess_chat_lc3_multi_day_strategy_01.json | 3 +++ ...ess_chat_lc3_multi_day_strategy_01_0001.json | 3 +++ ...ess_chat_lc3_multi_day_strategy_01_0002.json | 3 +++ ...ess_chat_lc3_multi_day_strategy_01_0003.json | 3 +++ ...ess_chat_lc3_multi_day_strategy_01_0004.json | 3 +++ ...ess_chat_lc3_multi_day_strategy_01_0005.json | 3 +++ ...ess_chat_lc3_multi_day_strategy_01_0006.json | 3 +++ ...ess_chat_lc3_multi_day_strategy_01_0007.json | 3 +++ ...ess_chat_lc3_multi_day_strategy_01_0008.json | 3 +++ ...ess_chat_lc3_multi_day_strategy_01_0009.json | 3 +++ ...ess_chat_lc3_multi_day_strategy_01_0010.json | 3 +++ ...ess_chat_lc3_multi_day_strategy_01_0011.json | 3 +++ ...sb1_sess_chat_lc3_multi_day_strategy_01.json | 3 +++ ...ess_chat_lc3_multi_day_strategy_01_0001.json | 3 +++ ...ess_chat_lc3_multi_day_strategy_01_0002.json | 3 +++ .../code_64k1k/isb1_sess_optimizer_01.json | 3 +++ .../code_64k1k/isb1_sess_optimizer_01_0001.json | 3 +++ .../code_64k1k/isb1_sess_optimizer_01_0002.json | 3 +++ .../code_64k1k/isb1_sess_optimizer_01_0003.json | 3 +++ .../code_64k1k/isb1_sess_optimizer_01_0004.json | 3 +++ .../code_64k1k/isb1_sess_optimizer_01_0005.json | 3 +++ .../code_64k1k/isb1_sess_optimizer_01_0006.json | 3 +++ .../code_64k1k/isb1_sess_optimizer_01_0007.json | 3 +++ .../code_64k1k/isb1_sess_optimizer_01_0008.json | 3 +++ .../code_64k1k/isb1_sess_optimizer_01_0009.json | 3 +++ .../code_64k1k/isb1_sess_optimizer_01_0010.json | 3 +++ .../code_64k1k/isb1_sess_optimizer_01_0011.json | 3 +++ .../isb1_sess_optimizer_01.json | 3 +++ .../isb1_sess_optimizer_01_0001.json | 3 +++ .../isb1_sess_optimizer_01_0002.json | 3 +++ ...b1_hb_depth_cache_ulc2_offload_cliff_01.json | 3 +++ ..._depth_cache_ulc2_offload_cliff_01_0001.json | 3 +++ ..._depth_cache_ulc2_offload_cliff_01_0002.json | 3 +++ ...b1_hb_depth_cache_ulc2_offload_cliff_01.json | 3 +++ ..._depth_cache_ulc2_offload_cliff_01_0001.json | 3 +++ ..._depth_cache_ulc2_offload_cliff_01_0002.json | 3 +++ ...sess_cache_xlc2_hot_cold_session_mix_01.json | 3 +++ ...cache_xlc2_hot_cold_session_mix_01_0001.json | 3 +++ ...cache_xlc2_hot_cold_session_mix_01_0002.json | 3 +++ ...epth_cache_xlc2_hot_cold_session_mix_01.json | 3 +++ ...cache_xlc2_hot_cold_session_mix_01_0001.json | 3 +++ ...cache_xlc2_hot_cold_session_mix_01_0002.json | 3 +++ ...sess_cache_xlc2_hot_cold_session_mix_01.json | 3 +++ ...cache_xlc2_hot_cold_session_mix_01_0001.json | 3 +++ ...cache_xlc2_hot_cold_session_mix_01_0002.json | 3 +++ ...epth_cache_xlc2_hot_cold_session_mix_01.json | 3 +++ ...cache_xlc2_hot_cold_session_mix_01_0001.json | 3 +++ ...cache_xlc2_hot_cold_session_mix_01_0002.json | 3 +++ 181 files changed, 555 insertions(+) create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0013.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0014.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0015.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0016.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0017.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0018.json create mode 100644 
datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0019.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0020.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0021.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0022.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0023.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0001.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0002.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0003.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0004.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0005.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0006.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0007.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0008.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0009.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0010.json create mode 100644 datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0011.json create mode 100644 datasets/isb1/converted/core/chat_8k1k_qwen3.5/isb1_sess_chat_lc3_contract_review_01.json create mode 100644 datasets/isb1/converted/core/chat_8k1k_qwen3.5/isb1_sess_chat_lc3_contract_review_01_0001.json create mode 100644 datasets/isb1/converted/core/chat_8k1k_qwen3.5/isb1_sess_chat_lc3_contract_review_01_0002.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0025.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0026.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0027.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0028.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0029.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0030.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0031.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0032.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0033.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0034.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0035.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001.json create mode 
100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0001.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0002.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0003.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0004.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0005.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0006.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0007.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0008.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0009.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0010.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0011.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0013.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0014.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0015.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0016.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0017.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0018.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0019.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0020.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0021.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0022.json create mode 100644 datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0023.json create mode 100644 datasets/isb1/converted/core/code_8k1k_qwen3.5/isb1_sess_code_ca1_agent_benchmark_plan_01.json create mode 100644 datasets/isb1/converted/core/code_8k1k_qwen3.5/isb1_sess_code_ca1_agent_benchmark_plan_01_0001.json create mode 100644 datasets/isb1/converted/core/code_8k1k_qwen3.5/isb1_sess_code_ca1_agent_benchmark_plan_01_0002.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0001.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0002.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0003.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0004.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0005.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0006.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0007.json create mode 100644 
datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0008.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0009.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0010.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0011.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k_dsr1/isb1_sess_xlc1_text_resume_bridge_01.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k_dsr1/isb1_sess_xlc1_text_resume_bridge_01_0001.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k_dsr1/isb1_sess_xlc1_text_resume_bridge_01_0002.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k_qwen3.5/isb1_sess_xlc1_text_resume_bridge_01.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k_qwen3.5/isb1_sess_xlc1_text_resume_bridge_01_0001.json create mode 100644 datasets/isb1/converted/extension_131k/chat_131k1k_qwen3.5/isb1_sess_xlc1_text_resume_bridge_01_0002.json create mode 100644 datasets/isb1/converted/extension_131k/code_131k1k/isb1_sess_cache_xlc1_text_shared_prefix_swarm_01.json create mode 100644 datasets/isb1/converted/extension_131k/code_131k1k_qwen3.5/isb1_hb_depth_cache_xlc1_text_shared_prefix_swarm_01.json create mode 100644 datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01.json create mode 100644 datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0001.json create mode 100644 datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0002.json create mode 100644 datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0003.json create mode 100644 datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0004.json create mode 100644 datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0005.json create mode 100644 datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0006.json create mode 100644 datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0007.json create mode 100644 datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0008.json create mode 100644 datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0009.json create mode 100644 datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0010.json create mode 100644 datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0011.json create mode 100644 datasets/isb1/converted/extension_32k/chat_32k1k_qwen3.5/isb1_sess_chat_lc2_resume_reasoning_01.json create mode 100644 datasets/isb1/converted/extension_32k/chat_32k1k_qwen3.5/isb1_sess_chat_lc2_resume_reasoning_01_0001.json create mode 100644 datasets/isb1/converted/extension_32k/chat_32k1k_qwen3.5/isb1_sess_chat_lc2_resume_reasoning_01_0002.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0001.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0002.json create mode 100644 
datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0003.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0004.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0005.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0006.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0007.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0008.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0009.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0010.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0011.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0013.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0014.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0015.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0016.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0017.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0018.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0019.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0020.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0021.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0022.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0023.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_2c2a96a7.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_2c2a96a7_0001.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_2c2a96a7_0002.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_doc_comp_fanout_01.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_doc_comp_fanout_01_0004.json create mode 100644 datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_doc_comp_fanout_01_0005.json create mode 100644 datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01.json create mode 100644 datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0001.json create mode 100644 datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0002.json create mode 100644 datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0003.json create mode 100644 datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0004.json create mode 100644 datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0005.json create mode 100644 
datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0006.json create mode 100644 datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0007.json create mode 100644 datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0008.json create mode 100644 datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0009.json create mode 100644 datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0010.json create mode 100644 datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0011.json create mode 100644 datasets/isb1/converted/extension_64k/chat_64k1k_qwen3.5/isb1_sess_chat_lc3_multi_day_strategy_01.json create mode 100644 datasets/isb1/converted/extension_64k/chat_64k1k_qwen3.5/isb1_sess_chat_lc3_multi_day_strategy_01_0001.json create mode 100644 datasets/isb1/converted/extension_64k/chat_64k1k_qwen3.5/isb1_sess_chat_lc3_multi_day_strategy_01_0002.json create mode 100644 datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01.json create mode 100644 datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0001.json create mode 100644 datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0002.json create mode 100644 datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0003.json create mode 100644 datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0004.json create mode 100644 datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0005.json create mode 100644 datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0006.json create mode 100644 datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0007.json create mode 100644 datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0008.json create mode 100644 datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0009.json create mode 100644 datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0010.json create mode 100644 datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0011.json create mode 100644 datasets/isb1/converted/extension_64k/code_64k1k_qwen3.5/isb1_sess_optimizer_01.json create mode 100644 datasets/isb1/converted/extension_64k/code_64k1k_qwen3.5/isb1_sess_optimizer_01_0001.json create mode 100644 datasets/isb1/converted/extension_64k/code_64k1k_qwen3.5/isb1_sess_optimizer_01_0002.json create mode 100644 datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01.json create mode 100644 datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0001.json create mode 100644 datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0002.json create mode 100644 datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01.json create mode 100644 datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0001.json create mode 100644 
datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0002.json create mode 100644 datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01.json create mode 100644 datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0001.json create mode 100644 datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0002.json create mode 100644 datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01.json create mode 100644 datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0001.json create mode 100644 datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0002.json create mode 100644 datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01.json create mode 100644 datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0001.json create mode 100644 datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0002.json create mode 100644 datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01.json create mode 100644 datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0001.json create mode 100644 datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0002.json diff --git a/datasets/isb1/.gitattributes b/datasets/isb1/.gitattributes index 5998181c2..006356f24 100644 --- a/datasets/isb1/.gitattributes +++ b/datasets/isb1/.gitattributes @@ -1 +1,2 @@ exports/**/*.json filter=lfs diff=lfs merge=lfs -text linguist-generated=true +converted/**/*.json filter=lfs diff=lfs merge=lfs -text linguist-generated=true diff --git a/datasets/isb1/README.md b/datasets/isb1/README.md index 46a4fbbdc..9c39451d9 100644 --- a/datasets/isb1/README.md +++ b/datasets/isb1/README.md @@ -160,6 +160,23 @@ python tools/isb1_to_kvcache_tester.py \ --output-dir traces_isb1/ ``` +### Pre-converted sidecar + +This repo carries a pre-converted mirror at: + +- `datasets/isb1/converted/` + +Feed that mirror directly to `kv-cache-tester` with: + +```bash +python trace_replay_tester.py --trace-directory datasets/isb1/converted/ --tokenizer Qwen/Qwen2.5-Coder-32B-Instruct --block-size 64 +``` + +Mapping convention: + +- one trace file per ISB1 conversation/event +- each trace filename is prefixed with the source bundle id + ### Step 3 — replay against a running vLLM / SGLang server Using PR #993's own recipes (e.g. 
 
 ### Step 3 — replay against a running vLLM / SGLang server
 Using PR #993's own recipes (e.g. `benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh`),
diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01.json new file mode 100644 index 000000000..289035999 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7410a365bd5603bacd4aa99a362c2ac3bf2ad4d050d0d7856d06cd217736f1ab +size 2359 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0013.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0013.json new file mode 100644 index 000000000..81af45fd2 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0013.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abe2fc1546b3b472234e631bc65c1704fffb6a41dafa65953e4f8eeacb70c9fa +size 2293 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0014.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0014.json new file mode 100644 index 000000000..4ccb09be9 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0014.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53309ad6e29880d35fa4a73d77747929119fc9c775d5c68f89497e7929bd190f +size 2335 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0015.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0015.json new file mode 100644 index 000000000..dbc37f825 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0015.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51be1bba71f5d3129a6f3e1ff879296c3ed440fb7637e1fbcca00b38488f2f0d +size 2342 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0016.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0016.json new file mode 100644 index 000000000..d95f27534 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0016.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae6118fd3424465ab45b1402d07f5ff9ea8b3919086fb03bf88d0a9130ac9fbf +size 2360 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0017.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0017.json new file mode 100644 index 000000000..4f9ccaff3 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0017.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2ef537f9cd9db8e4cbb0b9960b35e7e4e3274f11831a4da52577fc1056b5337 +size 2294 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0018.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0018.json new file mode 100644 index 000000000..30aa92a74 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0018.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b430d16b147afbaa0f5a60291456e83db5e997a941632cb5540b6d5b453ee3bd +size 2336 diff --git
a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0019.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0019.json new file mode 100644 index 000000000..7629e156b --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0019.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c54ddeb6a9c56a9ea50b6e82fe4807cf1d11452dade164b05810dcec53b7595b +size 2343 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0020.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0020.json new file mode 100644 index 000000000..ee9615fd8 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0020.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0c916c5054d4e6cb99af4432297b20fb0c12cd6375cbec1b928dce7f2d6d741 +size 2360 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0021.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0021.json new file mode 100644 index 000000000..5b7edbd28 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0021.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ae8aa565ef2627e37b602c520967c49a6b516e05538e5d6603b7c8c96c89820 +size 2294 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0022.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0022.json new file mode 100644 index 000000000..de43c48a5 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0022.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aadbb587997e22fb256fa251ad18a1e424c855f21f5f78483bdabe63e5c2bc31 +size 2336 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0023.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0023.json new file mode 100644 index 000000000..bb5cf37aa --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_chat_lc3_contract_review_01_0023.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e435fa2ce6fbd9a18d9a7ce107abeff46abde60fb7789ff435ab55e48ba34ddf +size 2343 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001.json new file mode 100644 index 000000000..f3c0a78b6 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aab2871b3c45fff6f62cc9d8878b1d2ae8ecb39d2057ea6dbecba8d7da495f74 +size 8445 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0001.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0001.json new file mode 100644 index 000000000..82e834897 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cf0ec347a86e01b6d90f5d149f26bc97b3d10984fd19815bc5db641d2745d23 +size 8258 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0002.json 
b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0002.json new file mode 100644 index 000000000..2f1f8efaf --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89dd56c6e1b04caeebb0a392fe7f5c9f79fa2fa95dc50569bb649a6092266327 +size 8377 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0003.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0003.json new file mode 100644 index 000000000..e8af57f2e --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0003.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9c254daadad8c567aed44fb75b4e53b61b38816fe710a6d1e3ac7acef01aeab +size 8377 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0004.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0004.json new file mode 100644 index 000000000..5170f3057 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0004.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c517f0cb668eb570266b1dc9e6246872e22299f6e050315535019b08015ab202 +size 8446 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0005.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0005.json new file mode 100644 index 000000000..deb4e4bb5 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0005.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:497e35e3d78278b79047795ceccbd0ece37cbfcd21c7d3898cc70320eb924060 +size 8259 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0006.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0006.json new file mode 100644 index 000000000..6b694b51f --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0006.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c684232e36da2520cc9df65e69729da1cfc95f344897cf25616508b8b626908e +size 8378 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0007.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0007.json new file mode 100644 index 000000000..3d18a57dd --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0007.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56cf51825e79420f6081ff7005528ac7944ef29e3dedcbd9c37d49c3140d1673 +size 8378 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0008.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0008.json new file mode 100644 index 000000000..4404f7f07 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0008.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca32aafab78ee132141a572e7dce5d137dc3d94370bac3efe610ec46132f46d6 +size 8446 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0009.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0009.json new file mode 100644 
index 000000000..86896b84a --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0009.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84b12ab40701ebdc53f4da237c71107736252c3f25bdae88b028eda02673c308 +size 8259 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0010.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0010.json new file mode 100644 index 000000000..e247daa1d --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0010.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1393b46efdb0679a6e7c2decfc216cdbe05082118597b53d72e514aed5fc2826 +size 8378 diff --git a/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0011.json b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0011.json new file mode 100644 index 000000000..fab470726 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k/isb1_sess_tool_free_memory_resume_001_0011.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dc31d4126d5eb6e54e9af4ba554cc5b71038de0b5e406b4f9b0a84e38f82ee5 +size 8378 diff --git a/datasets/isb1/converted/core/chat_8k1k_qwen3.5/isb1_sess_chat_lc3_contract_review_01.json b/datasets/isb1/converted/core/chat_8k1k_qwen3.5/isb1_sess_chat_lc3_contract_review_01.json new file mode 100644 index 000000000..c22f2fb06 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k_qwen3.5/isb1_sess_chat_lc3_contract_review_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caa872caa9d33985a19954712466b6d46fe982aa9bee2a8bbb1b850be9ce2b30 +size 2372 diff --git a/datasets/isb1/converted/core/chat_8k1k_qwen3.5/isb1_sess_chat_lc3_contract_review_01_0001.json b/datasets/isb1/converted/core/chat_8k1k_qwen3.5/isb1_sess_chat_lc3_contract_review_01_0001.json new file mode 100644 index 000000000..41d93eb2e --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k_qwen3.5/isb1_sess_chat_lc3_contract_review_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7620c05724a6b02392b5b95277fac6f28a0dea2edceeec5c28ddfcf19dccf1e7 +size 2373 diff --git a/datasets/isb1/converted/core/chat_8k1k_qwen3.5/isb1_sess_chat_lc3_contract_review_01_0002.json b/datasets/isb1/converted/core/chat_8k1k_qwen3.5/isb1_sess_chat_lc3_contract_review_01_0002.json new file mode 100644 index 000000000..536cd8776 --- /dev/null +++ b/datasets/isb1/converted/core/chat_8k1k_qwen3.5/isb1_sess_chat_lc3_contract_review_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05c597f0ad3f46aad040beceac30921c0a774eddd0c225b6ecb4736cc85e7304 +size 2373 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01.json new file mode 100644 index 000000000..b2d78fc85 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:228fdf2065df12f63481c4440db4c67c2616bda22ab2711025360baf29f6fb1f +size 2401 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0025.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0025.json new file mode 100644 index 000000000..8b38fca7d --- /dev/null 
+++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0025.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c373636a41b2387979e5b9727c34cb10becf29158ac57298bd257a3e33043dd +size 2335 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0026.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0026.json new file mode 100644 index 000000000..f77cb30fe --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0026.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:becdbb0fb66beea439832ca828d447b8898c922b2bcd9f6df8791da7d520958d +size 2377 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0027.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0027.json new file mode 100644 index 000000000..49ebc3f9e --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0027.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1f9bfaebcdedf0623da29d4782fc26a1d76307c3b39dbd3e3fbbe3672b1aeb4 +size 2377 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0028.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0028.json new file mode 100644 index 000000000..2115ed500 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0028.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3e6a9b7b6a3c186179d9734dc3c9eff99bcce8a42fb3c62319551c1cfe01e4b +size 2402 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0029.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0029.json new file mode 100644 index 000000000..b26c2ccb9 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0029.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99c14475b0023dfeb1ae12a23930601e7ecae719c6bcd22afd69fb94b9c27e90 +size 2336 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0030.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0030.json new file mode 100644 index 000000000..ce08beecd --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0030.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b616707f813b632f41b6a2dbfb7a8fa77758cecf8f62d6b7b7b192c9f0b0e57d +size 2378 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0031.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0031.json new file mode 100644 index 000000000..ecc7db393 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0031.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1c1f9168bd06050007b11231eae3a4ce2a091d3b64bf34fed05447c55efc836 +size 2378 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0032.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0032.json new file mode 100644 index 000000000..1a645728d --- /dev/null +++ 
b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0032.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:529dd8da2cb64862f8065283a1d2a0b2f75635931da935ce9db2482cf33bdc9c +size 2402 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0033.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0033.json new file mode 100644 index 000000000..f25844e95 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0033.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1dcefa12491d06065823f209a2770046ad8ba8641506d7bfceaa674aeecd97e +size 2336 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0034.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0034.json new file mode 100644 index 000000000..c8f36e610 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0034.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7a6a137188bfb0fa12b8569de694d6446a7dd52f4481ce4e14ca3b5b0e550ff +size 2378 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0035.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0035.json new file mode 100644 index 000000000..2553d6608 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_code_ca1_agent_benchmark_plan_01_0035.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7f313bea274207cc269c564f5800ca1b942430f127628e7127797deeec6d424 +size 2378 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001.json new file mode 100644 index 000000000..dfdb91bb7 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e68dffcda05a8eab2e94918ae18eeb9b5a0fd2ea5ddac42490d214a0547f31bf +size 2784 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0001.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0001.json new file mode 100644 index 000000000..7c3dab242 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a37c6699eb2f233157f6e8d26287aa8e0d6d98581cc36e7235771deca8ea3b92 +size 2707 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0002.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0002.json new file mode 100644 index 000000000..2683d82da --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c4f8492fe921da1b3a515f6acc4fe4266214b2571e8fd9cf236f027eb469d0c +size 2756 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0003.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0003.json new file mode 100644 index 000000000..8a970849d --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0003.json @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:006ba691e4754f142ffde833e4d49c2b8038e414aeeaf1ea962ee0c0470d56d9 +size 2756 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0004.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0004.json new file mode 100644 index 000000000..adb92e285 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0004.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3baca5e56f471e05e34bd32b76520a94ddf9c5ef9b23dae6fbf7e2565f4f5ea +size 2785 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0005.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0005.json new file mode 100644 index 000000000..ed3894147 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0005.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7818e9a45227acfe746ecce353cb2d395b8f03f03f9365306df2f8c4a2fce6f2 +size 2708 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0006.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0006.json new file mode 100644 index 000000000..c44120a5e --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0006.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:802d7b4d051b3c02f7219edad18540ef339735b35f55cebecd324784bc13f6ae +size 2757 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0007.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0007.json new file mode 100644 index 000000000..c1b6ab46b --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0007.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:115595c48ca72295cb99deeff81bb6c9b25c877c0a7d72b59d05cb4995b6c276 +size 2757 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0008.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0008.json new file mode 100644 index 000000000..829e1e445 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0008.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c19c8304459bed86d8c901b785edcff750451337868d0d0e373d819cdbb69660 +size 2785 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0009.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0009.json new file mode 100644 index 000000000..ba2d03d5d --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0009.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94d4e3bfd6913cade38921bc52e7447445d957e1304cd133914fd50931142993 +size 2708 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0010.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0010.json new file mode 100644 index 000000000..e6594d3c6 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0010.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:851ba58a552a39ea9cf5123ce7e642bf00c3d7756b65aafdc6aec9a730799c84 +size 2757 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0011.json 
b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0011.json new file mode 100644 index 000000000..b2bd898e4 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_debug_repair_repo_001_0011.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b00d4d905ea1161df0007acc8de86913173090352d9725a38f6a8d2b0fc21321 +size 2757 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982.json new file mode 100644 index 000000000..3c7d72a08 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da7bced17c233960a2d61274ea342a2bfcddda9763feaee9c8de77abb865ce78 +size 3270 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0013.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0013.json new file mode 100644 index 000000000..fd98b87ac --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0013.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b86d7b3773d965c200f734e10aaeb0d5a58ccce907bc6d12a88141eda7e36263 +size 3182 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0014.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0014.json new file mode 100644 index 000000000..7876c3d0b --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0014.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:435e61feb29c8527973126324c81aef085c3e27ca3a9570a8b08766c4b468e4d +size 3238 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0015.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0015.json new file mode 100644 index 000000000..4a8db8812 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0015.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b904ef36516f004c8e7e6bc082c4cb6fb1958a7f354a781a2fa67705bb0baacd +size 3238 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0016.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0016.json new file mode 100644 index 000000000..a2533de83 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0016.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:021ac97814c5c44835b76d548f6ea0db809cfb26e918fa11c110cf5d70c67e7c +size 3271 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0017.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0017.json new file mode 100644 index 000000000..b42712a6c --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0017.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ceb660d330dff8500047fb45000c890a1e0a922dd24d4d0d5e669f31438e29db +size 3183 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0018.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0018.json new file mode 100644 index 000000000..448693c0f --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0018.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de0dbfd8c652ea9c5e4686f19b7226c67e4632e00055ed941e91ae01d53e2f56 +size 3239 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0019.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0019.json new file mode 100644 index 000000000..60d6c72a6 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0019.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bee5128892e928afd957490a9d47e7dada73c296bc3c9f146d4c03ea9161a6b +size 3239 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0020.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0020.json new file mode 100644 index 000000000..cb66bdc3f --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0020.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56405fb996608d6373fb500d03d1f9617a7916bcbd7bbabf967e6b7db4ab6cec +size 3271 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0021.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0021.json new file mode 100644 index 000000000..4c79b8f57 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0021.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4447403d1c4d08cc334a458241bb85ea0eab4d29236a450bf3bad0bd4353565 +size 3183 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0022.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0022.json new file mode 100644 index 000000000..baad5558f --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0022.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6165657ee626fb9a507d9b39418187654411119d1434c39af71c5fc3f035f63 +size 3239 diff --git a/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0023.json b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0023.json new file mode 100644 index 000000000..f647a5fa8 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k/isb1_sess_offload_cliff_9982_0023.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3b9fb5f30984d54379a4f8e72ab3c42f720a3c4399656868c20a716838e3164 +size 3239 diff --git a/datasets/isb1/converted/core/code_8k1k_qwen3.5/isb1_sess_code_ca1_agent_benchmark_plan_01.json b/datasets/isb1/converted/core/code_8k1k_qwen3.5/isb1_sess_code_ca1_agent_benchmark_plan_01.json new file mode 100644 index 000000000..166fdc3d7 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k_qwen3.5/isb1_sess_code_ca1_agent_benchmark_plan_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:022ac4792b506df3b42bd605fc796ca38c3114c91dcf0b3492c3f70a917071a2 +size 2407 diff --git a/datasets/isb1/converted/core/code_8k1k_qwen3.5/isb1_sess_code_ca1_agent_benchmark_plan_01_0001.json b/datasets/isb1/converted/core/code_8k1k_qwen3.5/isb1_sess_code_ca1_agent_benchmark_plan_01_0001.json new file mode 100644 index 000000000..e629502c4 --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k_qwen3.5/isb1_sess_code_ca1_agent_benchmark_plan_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b0b285c9dad4335c18945ab55e31d7bc0dda2d2109014903719c42aec4d39c5 +size 2408 diff --git 
a/datasets/isb1/converted/core/code_8k1k_qwen3.5/isb1_sess_code_ca1_agent_benchmark_plan_01_0002.json b/datasets/isb1/converted/core/code_8k1k_qwen3.5/isb1_sess_code_ca1_agent_benchmark_plan_01_0002.json new file mode 100644 index 000000000..59faaebdc --- /dev/null +++ b/datasets/isb1/converted/core/code_8k1k_qwen3.5/isb1_sess_code_ca1_agent_benchmark_plan_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d28f64eb09f50b8bf9e6e7f0291db3bc1c08db7b76cccfdbc0dcc114b360e04 +size 2408 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01.json b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01.json new file mode 100644 index 000000000..323c64b7c --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bd56f01e5090d703d0526c11e96e9cbff3cf1c097c486211ece05acfbd3f6b1 +size 4471 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0001.json b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0001.json new file mode 100644 index 000000000..9c6173124 --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f45f7941444d1a3c566163063418333ec3e62eb3cbd9bd029547f7f6f1c0a76 +size 4350 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0002.json b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0002.json new file mode 100644 index 000000000..793bd64c9 --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edd8c5b09579a9415d7d414a3896c3c4269df8099bf8a23f49c5eeb7103dd59c +size 4432 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0003.json b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0003.json new file mode 100644 index 000000000..931e7b086 --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0003.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81b99077228bcb9c152b252688ef57f146273cc004abdca230e93dd630f2b9ee +size 4427 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0004.json b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0004.json new file mode 100644 index 000000000..e53fa1dde --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0004.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26ab461192902708ec092d408e93416c7c0af125ba283aa61e7c2e0f137599a3 +size 4472 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0005.json b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0005.json new file mode 100644 index 000000000..831a7c9df --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0005.json @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:fd50f79726ace6b1c7da24c2ad958c4de3de7ab75e030ae0b9c3c9644e6c056d +size 4351 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0006.json b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0006.json new file mode 100644 index 000000000..2faf8ca27 --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0006.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcc6eac63fe7778dc95f47b6d765897907fd47093aee224ba22ad6297bb0fc8f +size 4433 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0007.json b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0007.json new file mode 100644 index 000000000..586d05e6e --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0007.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dcbeacbe9c7cf8415da9141a5fa870d87c5b75b84b0721d91de68aeebe38509 +size 4428 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0008.json b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0008.json new file mode 100644 index 000000000..2edfcb63d --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0008.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a3a7389ad3bc2647502821926867d13dfdd2e8100b228392198d5c0e2fd9351 +size 4472 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0009.json b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0009.json new file mode 100644 index 000000000..944e818d4 --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0009.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec67230bd0ddc6525b348730ab869f6fbf87a839b64bbfdedbef50f3d52574d7 +size 4351 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0010.json b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0010.json new file mode 100644 index 000000000..b19f75e4f --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0010.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f401813df485734fcd7bda36aa17a8b81087005180c99c719f65e3c2e4112c0d +size 4433 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0011.json b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0011.json new file mode 100644 index 000000000..d859d3153 --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k/isb1_sess_xlc1_text_resume_bridge_01_0011.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d177f9357a51503adbd8af9a410ebf8d5e56ccd9429fede9735a70e2f4c7d6e0 +size 4428 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k_dsr1/isb1_sess_xlc1_text_resume_bridge_01.json b/datasets/isb1/converted/extension_131k/chat_131k1k_dsr1/isb1_sess_xlc1_text_resume_bridge_01.json new file mode 100644 index 000000000..323c64b7c --- /dev/null +++ 
b/datasets/isb1/converted/extension_131k/chat_131k1k_dsr1/isb1_sess_xlc1_text_resume_bridge_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bd56f01e5090d703d0526c11e96e9cbff3cf1c097c486211ece05acfbd3f6b1 +size 4471 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k_dsr1/isb1_sess_xlc1_text_resume_bridge_01_0001.json b/datasets/isb1/converted/extension_131k/chat_131k1k_dsr1/isb1_sess_xlc1_text_resume_bridge_01_0001.json new file mode 100644 index 000000000..e53fa1dde --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k_dsr1/isb1_sess_xlc1_text_resume_bridge_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26ab461192902708ec092d408e93416c7c0af125ba283aa61e7c2e0f137599a3 +size 4472 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k_dsr1/isb1_sess_xlc1_text_resume_bridge_01_0002.json b/datasets/isb1/converted/extension_131k/chat_131k1k_dsr1/isb1_sess_xlc1_text_resume_bridge_01_0002.json new file mode 100644 index 000000000..2edfcb63d --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k_dsr1/isb1_sess_xlc1_text_resume_bridge_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a3a7389ad3bc2647502821926867d13dfdd2e8100b228392198d5c0e2fd9351 +size 4472 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k_qwen3.5/isb1_sess_xlc1_text_resume_bridge_01.json b/datasets/isb1/converted/extension_131k/chat_131k1k_qwen3.5/isb1_sess_xlc1_text_resume_bridge_01.json new file mode 100644 index 000000000..1ce2fa5c1 --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k_qwen3.5/isb1_sess_xlc1_text_resume_bridge_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:261ea0938d4768ea20dec329c076174d0f8a44a052e2f02acc9ac6361c8292fa +size 4487 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k_qwen3.5/isb1_sess_xlc1_text_resume_bridge_01_0001.json b/datasets/isb1/converted/extension_131k/chat_131k1k_qwen3.5/isb1_sess_xlc1_text_resume_bridge_01_0001.json new file mode 100644 index 000000000..cb27e8f77 --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k_qwen3.5/isb1_sess_xlc1_text_resume_bridge_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1788d56651677b95e4acc0ebc88a2218054aa373ca22390d569ea248f92d673d +size 4488 diff --git a/datasets/isb1/converted/extension_131k/chat_131k1k_qwen3.5/isb1_sess_xlc1_text_resume_bridge_01_0002.json b/datasets/isb1/converted/extension_131k/chat_131k1k_qwen3.5/isb1_sess_xlc1_text_resume_bridge_01_0002.json new file mode 100644 index 000000000..479e9b01e --- /dev/null +++ b/datasets/isb1/converted/extension_131k/chat_131k1k_qwen3.5/isb1_sess_xlc1_text_resume_bridge_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca8a90183c998f8abbb8504daa48c89bf66dee898cceeba40a6c8ec9ef9b965c +size 4488 diff --git a/datasets/isb1/converted/extension_131k/code_131k1k/isb1_sess_cache_xlc1_text_shared_prefix_swarm_01.json b/datasets/isb1/converted/extension_131k/code_131k1k/isb1_sess_cache_xlc1_text_shared_prefix_swarm_01.json new file mode 100644 index 000000000..88cc7cfa8 --- /dev/null +++ b/datasets/isb1/converted/extension_131k/code_131k1k/isb1_sess_cache_xlc1_text_shared_prefix_swarm_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbd915ed136e1d16788da57d18708969b408d445c408183c65a59520b891d2b6 +size 263363 diff --git 
a/datasets/isb1/converted/extension_131k/code_131k1k_qwen3.5/isb1_hb_depth_cache_xlc1_text_shared_prefix_swarm_01.json b/datasets/isb1/converted/extension_131k/code_131k1k_qwen3.5/isb1_hb_depth_cache_xlc1_text_shared_prefix_swarm_01.json new file mode 100644 index 000000000..18c269447 --- /dev/null +++ b/datasets/isb1/converted/extension_131k/code_131k1k_qwen3.5/isb1_hb_depth_cache_xlc1_text_shared_prefix_swarm_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aae25baf94a0f8d1c5b4ca49fa684fa1844366b2a2bc0ade25b162f5ce8c773 +size 263379 diff --git a/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01.json b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01.json new file mode 100644 index 000000000..1df87abf4 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81111f0fc9153920bbcdaec6e23f7fe4b539adfc68ddd211bbbc5f97f6f61bbf +size 5146 diff --git a/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0001.json b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0001.json new file mode 100644 index 000000000..27bf25cb7 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c8202aa5e0aad871827d94babcbb4f5d2b9a13fedb9119f4b4f5e6803ea0ed3 +size 5025 diff --git a/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0002.json b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0002.json new file mode 100644 index 000000000..dbf28066d --- /dev/null +++ b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfeb547b1f838f821d06043538f1ca358086fe719fdedf7579fe153da0d29019 +size 5102 diff --git a/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0003.json b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0003.json new file mode 100644 index 000000000..1f4519732 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0003.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e37be49b80578e31798c447baa3a5016baf6f10639abd7b7cfe0220af203ad18 +size 5109 diff --git a/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0004.json b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0004.json new file mode 100644 index 000000000..aadc25c8e --- /dev/null +++ b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0004.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3f0b45305cb2a3328dc44b0ee32e202f09b063a6295391e990e2942ce4ad44d +size 5147 diff --git a/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0005.json b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0005.json new file mode 100644 index 000000000..57e9ab3dd --- /dev/null +++ 
b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0005.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bbafbac43e8a55570ff4e736708be054f18a10ef2a620b2eed47bdc35fe20a6 +size 5026 diff --git a/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0006.json b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0006.json new file mode 100644 index 000000000..2dc12c3a6 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0006.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06040e76b3f9b4b33f6bb8bb6ad850fab7a9d646dca1cbb5ff87afb60583e9d7 +size 5103 diff --git a/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0007.json b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0007.json new file mode 100644 index 000000000..e63e2e8fe --- /dev/null +++ b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0007.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cedb5842f6eaf58cdd062e97f95125dfdec76000df9685bb4a50907d6f8963d +size 5110 diff --git a/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0008.json b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0008.json new file mode 100644 index 000000000..c1c65b310 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0008.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ae80514ebac0758d3b7092aaefd792c6fcc659cdbc1eb7567fd15abc833fa4d +size 5147 diff --git a/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0009.json b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0009.json new file mode 100644 index 000000000..9ed6d3bc8 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0009.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f4b5750450b5f0d666cb6892ecc365aa2e673ee11ab68882789dbc31c97f807 +size 5026 diff --git a/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0010.json b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0010.json new file mode 100644 index 000000000..cde215170 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0010.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dde875b3d31b7aec6d06166dcf938208f00bbdc26a47702228dd796911db6906 +size 5103 diff --git a/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0011.json b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0011.json new file mode 100644 index 000000000..b51664086 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/chat_32k1k/isb1_sess_chat_lc2_resume_reasoning_01_0011.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed5cb7cf3299c29aeef48f1903dc3c66dd8caef5833f737ff4528ed1fd28801d +size 5110 diff --git a/datasets/isb1/converted/extension_32k/chat_32k1k_qwen3.5/isb1_sess_chat_lc2_resume_reasoning_01.json 
b/datasets/isb1/converted/extension_32k/chat_32k1k_qwen3.5/isb1_sess_chat_lc2_resume_reasoning_01.json new file mode 100644 index 000000000..a79b872b3 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/chat_32k1k_qwen3.5/isb1_sess_chat_lc2_resume_reasoning_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cfbc883664bfa60fa0c1ff7895c83b31d5e0551279544ce38db9e04e75cabe5 +size 5164 diff --git a/datasets/isb1/converted/extension_32k/chat_32k1k_qwen3.5/isb1_sess_chat_lc2_resume_reasoning_01_0001.json b/datasets/isb1/converted/extension_32k/chat_32k1k_qwen3.5/isb1_sess_chat_lc2_resume_reasoning_01_0001.json new file mode 100644 index 000000000..6e32fc5be --- /dev/null +++ b/datasets/isb1/converted/extension_32k/chat_32k1k_qwen3.5/isb1_sess_chat_lc2_resume_reasoning_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdbada52e4b8b50a488e77a9102866244b56620042f3c5d238817b80efe203b8 +size 5165 diff --git a/datasets/isb1/converted/extension_32k/chat_32k1k_qwen3.5/isb1_sess_chat_lc2_resume_reasoning_01_0002.json b/datasets/isb1/converted/extension_32k/chat_32k1k_qwen3.5/isb1_sess_chat_lc2_resume_reasoning_01_0002.json new file mode 100644 index 000000000..a8cd8c71b --- /dev/null +++ b/datasets/isb1/converted/extension_32k/chat_32k1k_qwen3.5/isb1_sess_chat_lc2_resume_reasoning_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3318dfcb10f5d990770616cf5d0f0538b4bf8da1d567f88dfdd5bea168112409 +size 5165 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7.json new file mode 100644 index 000000000..3dcf84566 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:898dbe2c1a03f87a522cf33e310d710ac8223ed1f94b1ee8c7a9100fda9431b6 +size 2804 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0001.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0001.json new file mode 100644 index 000000000..08441a345 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aea5d8909200b74ecec71b915ee017ee9710f618fbe3a69e7f54abbbc8c100b +size 2727 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0002.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0002.json new file mode 100644 index 000000000..260690e2b --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1294b6545ed7b10372c6ef19cb3e8e0328d94bda306f8186362533ee40802c2 +size 2776 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0003.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0003.json new file mode 100644 index 000000000..af2479a77 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0003.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:998fc4cdb3d991960ad7858d5231a921fefadd5bd02b2b306a472c81e2119c8d +size 2776 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0004.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0004.json new file mode 100644 
index 000000000..ca8a964a5 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0004.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92f214b18f52865b839ac93ceecbacee733a7071b84a7e571298eec5a6d50c85 +size 2805 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0005.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0005.json new file mode 100644 index 000000000..d479dd22b --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0005.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4adf2f4c220a7bc9416b513b5163a51e092dbc2417393ed67d426735a6fce922 +size 2728 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0006.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0006.json new file mode 100644 index 000000000..c9c098182 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0006.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b59c4809a87806e867d8e220204ca209ca529cbc7d2d5a652798034128d1a3d +size 2777 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0007.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0007.json new file mode 100644 index 000000000..4e27df91c --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0007.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e643421e2790c88f1bbc604ded96735f9091aa8bd3171aa69d277c557636256e +size 2777 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0008.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0008.json new file mode 100644 index 000000000..ff3c101f7 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0008.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a112a6c624844e5bef52cf1cb7523bd701b1529d6881ddcaed845eb3bc1d28c +size 2805 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0009.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0009.json new file mode 100644 index 000000000..7addccd14 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0009.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aae820940a72609eb6d89fd2bff155f951a2f3d6d2f3e246ee66795a7c7497ec +size 2728 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0010.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0010.json new file mode 100644 index 000000000..0ff8b11f8 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0010.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68e796dc6e7d585511eb80b4b5ebe9d75c26f02f1806f5a65900d9e8844f2464 +size 2777 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0011.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0011.json new file mode 100644 index 000000000..48c92905b --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_2c2a96a7_0011.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f9a6774bac4f0c84077cf790e98f234ca8af29557b0de8375dec82d92ca7b69 +size 2777 diff --git 
a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01.json new file mode 100644 index 000000000..bf05252e9 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ede46fa67491a65f6eaf65c2831c5db29fa1d97e547c6ff5f8dfda4a8ee3ab +size 2327 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0013.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0013.json new file mode 100644 index 000000000..9ac87542e --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0013.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bd5748d6bbe1887f1e15805c926d5298d4f2e3c8c63d7eaa70bf33db78ed4e7 +size 2261 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0014.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0014.json new file mode 100644 index 000000000..6a18175c4 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0014.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6ca628bf59a22aca1980873f31e2aa9ea708e44a34bb719a17a85d4f9e75b23 +size 2303 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0015.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0015.json new file mode 100644 index 000000000..0e54cb5ea --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0015.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:037e492903c768d94e11464a4214bf1b131ed76d4b0193b3604df7a8b6479a12 +size 2303 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0016.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0016.json new file mode 100644 index 000000000..cf9c9f602 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0016.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42eb70b9fa39336c31220328e7db575f173af44e37884be2fb7b5e62a57c22ed +size 2328 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0017.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0017.json new file mode 100644 index 000000000..1b2d9bf04 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0017.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bb4ce1e1a68b68ce379ad09da2c09a57b05ce64a7bdfbc5e47defae8418da7d +size 2262 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0018.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0018.json new file mode 100644 index 000000000..8539ee4cb --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0018.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30d39c8419bebc2ca5ffa7ca2056d07b1e114d8fc76961e5cf3fe4aeeb07f5f3 +size 2304 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0019.json 
b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0019.json new file mode 100644 index 000000000..8c176a52f --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0019.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46d4f89c5e481a5b93136053ced19de76d23ea8ad3bf195865cb7bcb62c777af +size 2304 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0020.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0020.json new file mode 100644 index 000000000..c8356ca18 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0020.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddd27528ed68ab47637903d86c011a3a0074690966a8d0f8f35e56371f553581 +size 2328 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0021.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0021.json new file mode 100644 index 000000000..5580594fe --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0021.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9eb0005d487df58324b70d717923be60a89c5e5db89026a58f5aee7854244d5 +size 2262 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0022.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0022.json new file mode 100644 index 000000000..b2c99d2d5 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0022.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cca1e74d84c28eec8161eab5fcd475f96bc0f76556adaa42f43403ad0ea9a65a +size 2304 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0023.json b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0023.json new file mode 100644 index 000000000..08ad2f5b9 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k/isb1_sess_doc_comp_fanout_01_0023.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b392ebeb6c0b86f2ef2e6fd1275e61d298c526d5dd32e3ba97b09c99531e3abf +size 2304 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_2c2a96a7.json b/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_2c2a96a7.json new file mode 100644 index 000000000..65c643380 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_2c2a96a7.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f373c4011208b2d97a1e6a0a86accfdd904ebc833f46c5e5b200cc264c57f39 +size 2811 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_2c2a96a7_0001.json b/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_2c2a96a7_0001.json new file mode 100644 index 000000000..b73906f19 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_2c2a96a7_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c5879b6e1996c18fe78e80c2e9495ad84dad2e85aa2dcc307d90a6fdef416f7 +size 2812 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_2c2a96a7_0002.json b/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_2c2a96a7_0002.json new file mode 100644 index 
000000000..1735d3bdd --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_2c2a96a7_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c926e04d21d2648b6ccc196b14182e06a4593452c0d6c1bec4e37c7d98994dc9 +size 2812 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_doc_comp_fanout_01.json b/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_doc_comp_fanout_01.json new file mode 100644 index 000000000..a81c959a0 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_doc_comp_fanout_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3efb687c57a2246be7c98f1d502da42c1b565c0368b4292f476b9c2a78c91ba5 +size 2333 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_doc_comp_fanout_01_0004.json b/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_doc_comp_fanout_01_0004.json new file mode 100644 index 000000000..ca01016a2 --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_doc_comp_fanout_01_0004.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1cb0ba5bc2093d1525b09ae13e39b8db1a4fa436025cf389f2320765dee1afe +size 2334 diff --git a/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_doc_comp_fanout_01_0005.json b/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_doc_comp_fanout_01_0005.json new file mode 100644 index 000000000..8703faadd --- /dev/null +++ b/datasets/isb1/converted/extension_32k/code_32k1k_qwen3.5/isb1_sess_doc_comp_fanout_01_0005.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93c6a77ff4c9f1b5ad86f9fedb63790372850f11f7b9e8d420dfa140c084b2ed +size 2334 diff --git a/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01.json b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01.json new file mode 100644 index 000000000..a14eb48da --- /dev/null +++ b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:392dcd8298eeeca109a4c349fbcbd085c87bcd5aa32a5a06c593b8e3f4072f35 +size 7684 diff --git a/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0001.json b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0001.json new file mode 100644 index 000000000..6b3233162 --- /dev/null +++ b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2f601f4fa6ac44b82555eac3a88396b98c3dd700c0a157743a799e0f65653cd +size 7519 diff --git a/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0002.json b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0002.json new file mode 100644 index 000000000..73e206072 --- /dev/null +++ b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a75a018eb65ec3e3d3074e588c70ade1fb465ee21806a0b26c89c7508e5f6fab +size 7624 diff --git a/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0003.json 
b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0003.json new file mode 100644 index 000000000..240ad8dcc --- /dev/null +++ b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0003.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d8fd9574a23e0d71976e13023fc9073c281fe4a58109bffdd421c2cb6d53ecf +size 7631 diff --git a/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0004.json b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0004.json new file mode 100644 index 000000000..32fd5bded --- /dev/null +++ b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0004.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0d7c11a14ab5754a53860d4c587a086926d69826ef1db2c103c033357fb53d4 +size 7685 diff --git a/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0005.json b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0005.json new file mode 100644 index 000000000..ff71c156d --- /dev/null +++ b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0005.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec0b81adb90c381e352c0a14f5c384a483f7b4bac68df8b2a0d606a88af1ae9f +size 7520 diff --git a/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0006.json b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0006.json new file mode 100644 index 000000000..c7699b66a --- /dev/null +++ b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0006.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23388ddf4ce42292403bc1c21a0009b3ed6fe7457bc726ad46e0691c63f49fe6 +size 7625 diff --git a/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0007.json b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0007.json new file mode 100644 index 000000000..f1920117c --- /dev/null +++ b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0007.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef9f3dc90c8a293f2001cc01d3659b7b45b0c37225020f179b0599488d8344a7 +size 7632 diff --git a/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0008.json b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0008.json new file mode 100644 index 000000000..991e10709 --- /dev/null +++ b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0008.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39166450bd02636042142071bae92b87d3791d7d1ff3963484e4bf49a27bd1ff +size 7685 diff --git a/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0009.json b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0009.json new file mode 100644 index 000000000..c0a75c91b --- /dev/null +++ b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0009.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d323c45d881962162631d95c5dd529c74f82f3bfd6689dd1295b613b3c50de35 +size 7520 diff --git a/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0010.json b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0010.json new file mode 100644 index 000000000..758cac801 --- /dev/null +++ b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0010.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94674bf6b28a393a8c5a8da3d5479191d5c8735d6623d9708bba7e899a1fe3c6 +size 7625 diff --git a/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0011.json b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0011.json new file mode 100644 index 000000000..44aa5e7fb --- /dev/null +++ b/datasets/isb1/converted/extension_64k/chat_64k1k/isb1_sess_chat_lc3_multi_day_strategy_01_0011.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17bdda94141044a3ea36ff67cf038366ed8ea0845ffcc918f6b47ac75c21c7d3 +size 7632 diff --git a/datasets/isb1/converted/extension_64k/chat_64k1k_qwen3.5/isb1_sess_chat_lc3_multi_day_strategy_01.json b/datasets/isb1/converted/extension_64k/chat_64k1k_qwen3.5/isb1_sess_chat_lc3_multi_day_strategy_01.json new file mode 100644 index 000000000..17e78bdae --- /dev/null +++ b/datasets/isb1/converted/extension_64k/chat_64k1k_qwen3.5/isb1_sess_chat_lc3_multi_day_strategy_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2be8596b4e2ebbd20afdbbb03ef7914f575924a98eee84d0acd5c35fbfae22d +size 7706 diff --git a/datasets/isb1/converted/extension_64k/chat_64k1k_qwen3.5/isb1_sess_chat_lc3_multi_day_strategy_01_0001.json b/datasets/isb1/converted/extension_64k/chat_64k1k_qwen3.5/isb1_sess_chat_lc3_multi_day_strategy_01_0001.json new file mode 100644 index 000000000..7cc9d4d51 --- /dev/null +++ b/datasets/isb1/converted/extension_64k/chat_64k1k_qwen3.5/isb1_sess_chat_lc3_multi_day_strategy_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:177b67dbf79b22bbd0760e063ef5569f72d9d9385a1d88c007ab4f5679c286e4 +size 7707 diff --git a/datasets/isb1/converted/extension_64k/chat_64k1k_qwen3.5/isb1_sess_chat_lc3_multi_day_strategy_01_0002.json b/datasets/isb1/converted/extension_64k/chat_64k1k_qwen3.5/isb1_sess_chat_lc3_multi_day_strategy_01_0002.json new file mode 100644 index 000000000..eee60953c --- /dev/null +++ b/datasets/isb1/converted/extension_64k/chat_64k1k_qwen3.5/isb1_sess_chat_lc3_multi_day_strategy_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86ab968d40f31e4780c53775a7d51454217abf24040e3d8cd0709727badf5db6 +size 7707 diff --git a/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01.json b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01.json new file mode 100644 index 000000000..bf9718acb --- /dev/null +++ b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15e4d7e653c3f4ee4c422b8923b82580a5134be6550819d484eb5f5e0d51f457 +size 2746 diff --git a/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0001.json b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0001.json new file mode 100644 index 000000000..55e026e7c --- /dev/null +++ 
b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:278c37d6ce1fd114ab5e6bf0abce101db23dd8b7b32b237952bbadc40f92a487 +size 2669 diff --git a/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0002.json b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0002.json new file mode 100644 index 000000000..ad27fa0dd --- /dev/null +++ b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbeb3201f01eafeea2933baeec39072dddfd0b5a08812e9321da2dd5e823df31 +size 2718 diff --git a/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0003.json b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0003.json new file mode 100644 index 000000000..dbb7d2fee --- /dev/null +++ b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0003.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd61a8e0a839f576224db4308d131e2cd4da5d834fe2a728cdb954d7665f9172 +size 2725 diff --git a/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0004.json b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0004.json new file mode 100644 index 000000000..993d6845e --- /dev/null +++ b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0004.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fca2e437912b0f209119fb524b8896c782d4149e5237436e550ba661784a782 +size 2747 diff --git a/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0005.json b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0005.json new file mode 100644 index 000000000..061cb6600 --- /dev/null +++ b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0005.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66f9bd4e2c84710766698e604e887383ae692176bdc08ffec80e210cc6b28e39 +size 2670 diff --git a/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0006.json b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0006.json new file mode 100644 index 000000000..b8bf217ea --- /dev/null +++ b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0006.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f641c5bc45b0be1938fb6067baa59f1cfe8687d09da1fe78e1dfde09d07784 +size 2719 diff --git a/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0007.json b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0007.json new file mode 100644 index 000000000..a22ff65e8 --- /dev/null +++ b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0007.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:764b5d116a9ab258dbb3f39455823e04728171df2078edd8d13ef10d57ae2ebe +size 2726 diff --git a/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0008.json b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0008.json new file mode 100644 index 000000000..38109b7ab --- /dev/null +++ b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0008.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:73e9a919db6baad72001517ee6335641e9d03abbb24a03616ae388cd943ca1d0 +size 2747 diff --git a/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0009.json b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0009.json new file mode 100644 index 000000000..c256ddd8d --- /dev/null +++ b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0009.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:126da8b3fed24e28358a905b3051b0e5f345f0f24ba96e06c1edebf55903d0bb +size 2670 diff --git a/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0010.json b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0010.json new file mode 100644 index 000000000..654a0368f --- /dev/null +++ b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0010.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a8b21b503bd9bea0e8315b8658392b67b7b661fb41ebb1273764ba8fdf999f9 +size 2719 diff --git a/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0011.json b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0011.json new file mode 100644 index 000000000..5b491122d --- /dev/null +++ b/datasets/isb1/converted/extension_64k/code_64k1k/isb1_sess_optimizer_01_0011.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7fb7638ce1fee522641a81199668fa15c34bb544c5daa8ba6b40c1e7198b51b +size 2726 diff --git a/datasets/isb1/converted/extension_64k/code_64k1k_qwen3.5/isb1_sess_optimizer_01.json b/datasets/isb1/converted/extension_64k/code_64k1k_qwen3.5/isb1_sess_optimizer_01.json new file mode 100644 index 000000000..4e5c9b718 --- /dev/null +++ b/datasets/isb1/converted/extension_64k/code_64k1k_qwen3.5/isb1_sess_optimizer_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edfe48c31bb12a455d929a2abcdd15a4ccb865918c5fbd2b2809e7967fdecfad +size 2760 diff --git a/datasets/isb1/converted/extension_64k/code_64k1k_qwen3.5/isb1_sess_optimizer_01_0001.json b/datasets/isb1/converted/extension_64k/code_64k1k_qwen3.5/isb1_sess_optimizer_01_0001.json new file mode 100644 index 000000000..a02eca75f --- /dev/null +++ b/datasets/isb1/converted/extension_64k/code_64k1k_qwen3.5/isb1_sess_optimizer_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c00b21670c05c9406b0b74883f8fe5586fb1c979ec550b174721631b0150b8d1 +size 2761 diff --git a/datasets/isb1/converted/extension_64k/code_64k1k_qwen3.5/isb1_sess_optimizer_01_0002.json b/datasets/isb1/converted/extension_64k/code_64k1k_qwen3.5/isb1_sess_optimizer_01_0002.json new file mode 100644 index 000000000..cbf22a9df --- /dev/null +++ b/datasets/isb1/converted/extension_64k/code_64k1k_qwen3.5/isb1_sess_optimizer_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe1f712db16fb8a7dd2186bd053f728179de328009c563f4683bcb2151a417d2 +size 2761 diff --git a/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01.json b/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01.json new file mode 100644 index 000000000..c4c0876cf --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01.json @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d451812144b3b944d058b8e7ede4fb7398f54bbbf8cf8b2f431985ee06fad118 +size 3299 diff --git a/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0001.json b/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0001.json new file mode 100644 index 000000000..03a2dfb81 --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73df71fde60d50ee90c8042d52489c06af17c37537ff94b0dc172b3287929fb2 +size 3298 diff --git a/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0002.json b/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0002.json new file mode 100644 index 000000000..db0d40be0 --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14f779829ac901f6ad41b269866f52dd77b42ecc6bda879af888e45169b61d26 +size 3299 diff --git a/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01.json b/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01.json new file mode 100644 index 000000000..44bfb70d2 --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:511843bdadda58bb006e1829df648ade5c9740fd7c745a2a108f15a9cf820482 +size 2222198 diff --git a/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0001.json b/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0001.json new file mode 100644 index 000000000..447093133 --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07072bc586a42f1b625b70f5fec9187b73b1c93cc4d6051466299ba57a80d3e0 +size 2222197 diff --git a/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0002.json b/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0002.json new file mode 100644 index 000000000..74e3303ff --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1/isb1_hb_depth_cache_ulc2_offload_cliff_01_0002.json @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d12dcf7565874dfb729b12d069fbbcefdff2d0a0cfb280f553dc63e71111cc1d +size 2222198 diff --git a/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01.json b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01.json new file mode 100644 index 000000000..dde417aec --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af6bbdedbcceb93c1f384fb00d18d506365709299ba6dfb149c6d638d659c982 +size 2234 diff --git a/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0001.json b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0001.json new file mode 100644 index 000000000..5072474af --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df24fdcda9a63909d62b78cbf9cf84882240ba25e3654970311baf3d6a89e6b9 +size 2235 diff --git a/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0002.json b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0002.json new file mode 100644 index 000000000..948b19e5d --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f3158216c0d1572c28d1a003eebd889bbb0c56c0d4c29769ae983bfe37b83ea +size 2235 diff --git a/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01.json b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01.json new file mode 100644 index 000000000..172bfd7b8 --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a89f7153acbef8a83f78601ed655f5d3a0907feb30f18e2b4994b96cd703de94 +size 2269 diff --git a/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0001.json b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0001.json new file mode 100644 index 000000000..47dc53b26 --- /dev/null +++ 
b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:638194fafecfd1183f4e3787727c8bce03bded3d6237492c9db2fbcbe9d8226e +size 2268 diff --git a/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0002.json b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0002.json new file mode 100644 index 000000000..be419291d --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ec7a67c68bdb06918f0dd05d0fd5ed7c46ad3fc3600a65ec5278d569f6588bb +size 2269 diff --git a/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01.json b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01.json new file mode 100644 index 000000000..dde417aec --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af6bbdedbcceb93c1f384fb00d18d506365709299ba6dfb149c6d638d659c982 +size 2234 diff --git a/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0001.json b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0001.json new file mode 100644 index 000000000..5072474af --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df24fdcda9a63909d62b78cbf9cf84882240ba25e3654970311baf3d6a89e6b9 +size 2235 diff --git a/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0002.json b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0002.json new file mode 100644 index 000000000..948b19e5d --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1/isb1_sess_cache_xlc2_hot_cold_session_mix_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f3158216c0d1572c28d1a003eebd889bbb0c56c0d4c29769ae983bfe37b83ea +size 2235 diff --git a/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01.json 
b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01.json new file mode 100644 index 000000000..fdea31cc4 --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1ef4a8272516a3adca7280055169382737ad70e3653201f92dfc892c91f92f5 +size 587823 diff --git a/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0001.json b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0001.json new file mode 100644 index 000000000..e83480d06 --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0001.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bbd11d756040feaf746e7835569ea62665c4eba011dc4e3db81c1c125672666 +size 587822 diff --git a/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0002.json b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0002.json new file mode 100644 index 000000000..dd2f2ac3a --- /dev/null +++ b/datasets/isb1/converted/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1/isb1_hb_depth_cache_xlc2_hot_cold_session_mix_01_0002.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4580b229e9622a0f50e1c4b9280d4bcfd9a1481fa57b8ffd154e9c35401d4ed0 +size 587823 From fd73c8a01859565b6f61584822d5c57ba579df2a Mon Sep 17 00:00:00 2001 From: William Chen <57119977+OCWC22@users.noreply.github.com> Date: Mon, 20 Apr 2026 22:32:56 -0700 Subject: [PATCH 13/18] feat(isb1): add drop-in sweep config for kv-cache-tester (PR #993) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Schema-parity sibling of .github/configs/multiturn-agentic-trace.yaml with 16 ISB1 sweep cells across H200/B200/MI355X/H100 × multi-scale workloads (8k/32k/131k/500k-preview/1M-preview) × multi-model (Qwen3.5, DSR1). Follows Cam's exact tp / users / offload / ep schema. Consumers either merge these top-level keys into multiturn-agentic-trace.yaml or extend the sweep loader to glob multiturn-agentic-trace*.yaml. Co-Authored-By: Claude Opus 4.7 --- .../configs/multiturn-agentic-trace-isb1.yaml | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 .github/configs/multiturn-agentic-trace-isb1.yaml diff --git a/.github/configs/multiturn-agentic-trace-isb1.yaml b/.github/configs/multiturn-agentic-trace-isb1.yaml new file mode 100644 index 000000000..9921d3bc4 --- /dev/null +++ b/.github/configs/multiturn-agentic-trace-isb1.yaml @@ -0,0 +1,54 @@ +# ISB1 sweep cells for Cam's kv-cache-tester replay flow. +# Schema mirrors .github/configs/multiturn-agentic-trace.yaml. +# Merge these top-level keys into that file (or extend the sweep workflow +# to glob .github/configs/multiturn-agentic-trace*.yaml) to include ISB1 sweeps. 
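+#
+# A minimal sketch of the glob option (an assumed loader shape, not code that
+# ships in this PR): the sweep workflow's config-loading step could replace
+# its single-file read with
+#
+#   import glob, yaml
+#   cells = {}
+#   for path in sorted(glob.glob(".github/configs/multiturn-agentic-trace*.yaml")):
+#       with open(path) as fh:
+#           cells.update(yaml.safe_load(fh) or {})
+#
+# which picks up this file alongside the original with no manual merge step.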
+# 8k code cells map to datasets/isb1/converted/core/code_8k1k/.
+# 32k chat cells map to datasets/isb1/converted/extension_32k/chat_32k1k*/.
+# 131k code/chat cells map to datasets/isb1/converted/extension_131k/*_131k1k*/.
+# 500k preview cells map to datasets/isb1/converted/preview/long_context_500k/.
+# 1m preview cells map to datasets/isb1/converted/preview/long_context_1m/.
+# Expected TRACE_DIR is either datasets/isb1/converted/ or one of those subdirs.
+
+h200-fp8-qwen3-isb1-code-8k:
+  tp2: {users: [2, 4, 8, 16, 32, 64, 128], offload: ["on", "off"]}
+  tp4: {users: [2, 4, 8, 16, 32, 64, 128], offload: ["on", "off"]}
+
+h200-fp8-qwen3-isb1-chat-32k:
+  tp2: {users: [1, 2, 4, 8, 16, 32], offload: ["on", "off"]}
+  tp4: {users: [1, 2, 4, 8, 16, 32, 64], offload: ["on", "off"]}
+
+h200-fp8-qwen3-isb1-code-131k:
+  tp4: {users: [1, 2, 4, 8], offload: ["on", "off"]}
+  tp8: {users: [1, 2, 4, 8, 16], offload: ["on", "off"]}
+
+b200-fp4-dsr1-isb1-code-8k:
+  tp4: {ep: 4, users: [4, 8, 16, 32, 64, 128, 256], offload: ["on", "off"]}
+  tp8: {ep: 8, users: [8, 16, 32, 64, 128, 256, 512], offload: ["on", "off"]}
+
+b200-fp4-dsr1-isb1-chat-32k:
+  tp4: {ep: 4, users: [1, 2, 4, 8, 16, 32, 64], offload: ["on", "off"]}
+  tp8: {ep: 8, users: [1, 2, 4, 8, 16, 32, 64, 128], offload: ["on", "off"]}
+
+b200-fp4-dsr1-isb1-code-131k:
+  tp8: {ep: 8, users: [1, 2, 4, 8, 16], offload: ["on", "off"]}
+
+b200-fp4-qwen3-isb1-chat-500k-preview:
+  tp4: {users: [1, 2, 4], offload: ["on", "off"]}
+  tp8: {users: [1, 2, 4, 8], offload: ["on", "off"]}
+
+b200-fp4-qwen3-isb1-chat-1m-preview:
+  tp8: {users: [1, 2], offload: ["on", "off"]}
+
+mi355x-fp8-qwen3-isb1-code-8k:
+  tp2: {users: [2, 4, 8, 16, 32, 64], offload: ["on", "off"]}
+  tp4: {users: [2, 4, 8, 16, 32, 64, 128], offload: ["on", "off"]}
+
+mi355x-fp8-qwen3-isb1-chat-32k:
+  tp4: {users: [1, 2, 4, 8, 16, 32], offload: ["on", "off"]}
+
+h100-fp8-qwen3-isb1-code-8k-lmcache:
+  tp2: {users: [1, 2, 4, 8, 16, 32], offload: ["on", "off"]}
+  tp4: {users: [1, 2, 4, 8, 16, 32, 64], offload: ["on", "off"]}
+
+h200-fp8-qwen3-isb1-debug:
+  tp2: {users: [2], offload: ["off"]}

From 119a037d29c11a5fb196247afc7a69c9265aa29e Mon Sep 17 00:00:00 2001
From: William Chen <57119977+OCWC22@users.noreply.github.com>
Date: Mon, 20 Apr 2026 22:35:30 -0700
Subject: [PATCH 14/18] feat(isb1): add kv-cache-tester trace schema validator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

tools/validate_kvcache_tester_trace.py — stdlib-only CLI that validates any
trace JSON against Cam's kv-cache-tester schema: required keys, block_size
consistency, prefix-extending hash_ids, per-request fields. Runs against
single files or directories; exit code 1 on any failure. Catches schema
drift before submissions reach the sweep.

Co-Authored-By: Claude Opus 4.7
---
 datasets/isb1/README.md                |   4 +
 tools/validate_kvcache_tester_trace.py | 369 +++++++++++++++++++++++++
 2 files changed, 373 insertions(+)
 create mode 100644 tools/validate_kvcache_tester_trace.py

diff --git a/datasets/isb1/README.md b/datasets/isb1/README.md
index 9c39451d9..aa3c4ef0e 100644
--- a/datasets/isb1/README.md
+++ b/datasets/isb1/README.md
@@ -248,6 +248,10 @@ Any failure of the above means the PR is not
 actually plumbed end-to-end for this bundle and should be reproduced
 against Cam's `trace_replay_tester.py` before being claimed as compatible.
+### Validate before publishing + +Before publishing or mirroring `datasets/isb1/converted/`, run `python3 tools/validate_kvcache_tester_trace.py datasets/isb1/converted/` to catch schema drift early: missing required keys, invalid `block_size`, and broken prefix-extending `hash_ids` that would otherwise fail inside Cam's replay sweep. + --- ## HF publication diff --git a/tools/validate_kvcache_tester_trace.py b/tools/validate_kvcache_tester_trace.py new file mode 100644 index 000000000..850da4991 --- /dev/null +++ b/tools/validate_kvcache_tester_trace.py @@ -0,0 +1,369 @@ +#!/usr/bin/env python3 +"""Validate kv-cache-tester trace JSON files. + +Stdlib-only validator for the compact trace schema consumed by +`trace_replay_tester.py` / `normalize_trace()` in Cam's kv-cache-tester. +Supports validating a single JSON file or recursively walking a directory of +trace files. +""" + +from __future__ import annotations + +import argparse +import json +import math +import sys +from pathlib import Path +from typing import Any + +VALID_HASH_ID_SCOPES = {"local", "global"} +MANIFEST_FILENAMES = {"manifest.json"} +CHECK = "✓" +CROSS = "✗" +WARN = "!" + + +def _is_int(value: Any) -> bool: + return isinstance(value, int) and not isinstance(value, bool) + + +def _is_number(value: Any) -> bool: + return (isinstance(value, int) or isinstance(value, float)) and not isinstance(value, bool) + + +def _add_issue(bucket: list[str], message: str, max_issues: int) -> None: + if len(bucket) < max_issues: + bucket.append(message) + + +def _validate_string_list(value: Any, field_name: str, errors: list[str], max_issues: int) -> list[str] | None: + if not isinstance(value, list) or not value: + _add_issue(errors, f"{field_name} must be a non-empty list[str]", max_issues) + return None + for idx, item in enumerate(value): + if not isinstance(item, str): + _add_issue(errors, f"{field_name}[{idx}] must be str, got {type(item).__name__}", max_issues) + return value if len(errors) < max_issues else None + + +def _validate_flat_hash_ids( + hash_ids: list[Any], + *, + input_tokens: int, + block_size: int, + scope: str | None, + errors: list[str], + warnings: list[str], + max_issues: int, +) -> None: + expected_len = math.ceil(input_tokens / block_size) if input_tokens > 0 else 0 + if len(hash_ids) != expected_len: + _add_issue( + errors, + f"hash_ids length = {len(hash_ids)}, expected ceil(in={input_tokens} / block_size={block_size}) = {expected_len}", + max_issues, + ) + + if scope is None: + _add_issue( + warnings, + "hash_id_scope missing; cannot strictly validate flat hash_ids semantics", + max_issues, + ) + + for idx, value in enumerate(hash_ids): + if not _is_int(value): + _add_issue(errors, f"hash_ids[{idx}] must be int, got {type(value).__name__}", max_issues) + continue + if value <= 0: + _add_issue(errors, f"hash_ids[{idx}] = {value}, expected positive int", max_issues) + if scope == "local": + expected = idx + 1 + if value != expected: + _add_issue( + errors, + f"hash_ids[{idx}] = {value}, expected {expected} (prefix must extend by 1)", + max_issues, + ) + + +def _validate_nested_hash_ids( + hash_ids: list[Any], + *, + scope: str | None, + errors: list[str], + warnings: list[str], + max_issues: int, +) -> None: + if scope is None: + _add_issue( + warnings, + "hash_id_scope missing; cannot strictly validate nested hash_ids semantics", + max_issues, + ) + for outer_idx, group in enumerate(hash_ids): + if not isinstance(group, list): + _add_issue(errors, f"hash_ids[{outer_idx}] must be list[int], got 
{type(group).__name__}", max_issues) + continue + for inner_idx, value in enumerate(group): + if not _is_int(value): + _add_issue( + errors, + f"hash_ids[{outer_idx}][{inner_idx}] must be int, got {type(value).__name__}", + max_issues, + ) + continue + if value <= 0: + _add_issue(errors, f"hash_ids[{outer_idx}][{inner_idx}] = {value}, expected positive int", max_issues) + if scope == "local": + expected = inner_idx + 1 + if value != expected: + _add_issue( + errors, + f"hash_ids[{outer_idx}][{inner_idx}] = {value}, expected {expected} (prefix must extend by 1)", + max_issues, + ) + + +def _validate_request( + req: Any, + *, + request_idx: int, + block_size: int, + scope: str | None, + errors: list[str], + warnings: list[str], + max_issues: int, +) -> None: + prefix = f"requests[{request_idx}]" + if not isinstance(req, dict): + _add_issue(errors, f"{prefix} must be object, got {type(req).__name__}", max_issues) + return + + req_type = req.get("type") + if not isinstance(req_type, str): + _add_issue(errors, f"{prefix}.type must be str", max_issues) + + if req_type == "subagent": + return + + t_value = req.get("t") + if not _is_number(t_value): + _add_issue(errors, f"{prefix}.t must be float >= 0", max_issues) + elif float(t_value) < 0: + _add_issue(errors, f"{prefix}.t = {t_value}, expected >= 0", max_issues) + + input_tokens = req.get("in") + if not _is_int(input_tokens): + _add_issue(errors, f"{prefix}.in must be int >= 0", max_issues) + input_tokens = 0 + elif input_tokens < 0: + _add_issue(errors, f"{prefix}.in = {input_tokens}, expected >= 0", max_issues) + + output_tokens = req.get("out") + if not _is_int(output_tokens): + _add_issue(errors, f"{prefix}.out must be int >= 0", max_issues) + elif output_tokens < 0: + _add_issue(errors, f"{prefix}.out = {output_tokens}, expected >= 0", max_issues) + + hash_ids = req.get("hash_ids") + if not isinstance(hash_ids, list): + _add_issue(errors, f"{prefix}.hash_ids must be list[int] or list[list[int]]", max_issues) + else: + is_nested = bool(hash_ids) and all(isinstance(item, list) for item in hash_ids) + is_flat = not hash_ids or all(not isinstance(item, list) for item in hash_ids) + if is_nested: + _validate_nested_hash_ids( + hash_ids, + scope=scope, + errors=errors, + warnings=warnings, + max_issues=max_issues, + ) + elif is_flat: + _validate_flat_hash_ids( + hash_ids, + input_tokens=input_tokens, + block_size=block_size, + scope=scope, + errors=errors, + warnings=warnings, + max_issues=max_issues, + ) + else: + _add_issue(errors, f"{prefix}.hash_ids must not mix flat and nested entries", max_issues) + + optional_string_fields = ("model", "stop") + for field_name in optional_string_fields: + if field_name in req and not isinstance(req[field_name], str): + _add_issue(errors, f"{prefix}.{field_name} must be str", max_issues) + + optional_list_fields = ("input_types", "output_types") + for field_name in optional_list_fields: + if field_name in req: + value = req[field_name] + if not isinstance(value, list): + _add_issue(errors, f"{prefix}.{field_name} must be list[str]", max_issues) + continue + for idx, item in enumerate(value): + if not isinstance(item, str): + _add_issue(errors, f"{prefix}.{field_name}[{idx}] must be str", max_issues) + + optional_number_fields = ("api_time", "think_time") + for field_name in optional_number_fields: + if field_name in req: + value = req[field_name] + if not _is_number(value): + _add_issue(errors, f"{prefix}.{field_name} must be float", max_issues) + + +def validate_trace(trace: Any, *, max_issues: int) -> 
tuple[list[str], list[str]]: + errors: list[str] = [] + warnings: list[str] = [] + + if not isinstance(trace, dict): + return [f"top-level JSON must be object, got {type(trace).__name__}"], warnings + + trace_id = trace.get("id") + if not isinstance(trace_id, str): + _add_issue(errors, "id must be str", max_issues) + + _validate_string_list(trace.get("models"), "models", errors, max_issues) + + block_size = trace.get("block_size") + if not _is_int(block_size): + _add_issue(errors, "block_size must be int > 0", max_issues) + block_size = 1 + elif block_size <= 0: + _add_issue(errors, f"block_size = {block_size}, expected > 0", max_issues) + + requests = trace.get("requests") + if not isinstance(requests, list) or not requests: + _add_issue(errors, "requests must be a non-empty list", max_issues) + requests = [] + + scope = trace.get("hash_id_scope") + if scope is not None and scope not in VALID_HASH_ID_SCOPES: + _add_issue( + errors, + f"hash_id_scope = {scope!r}, expected one of {sorted(VALID_HASH_ID_SCOPES)}", + max_issues, + ) + scope = None + + for field_name in ("tool_tokens", "system_tokens"): + if field_name in trace: + value = trace[field_name] + if not _is_int(value): + _add_issue(errors, f"{field_name} must be int >= 0", max_issues) + elif value < 0: + _add_issue(errors, f"{field_name} = {value}, expected >= 0", max_issues) + + for idx, req in enumerate(requests): + if len(errors) >= max_issues and len(warnings) >= max_issues: + break + _validate_request( + req, + request_idx=idx, + block_size=block_size, + scope=scope, + errors=errors, + warnings=warnings, + max_issues=max_issues, + ) + + return errors, warnings + + +def iter_trace_files(path: Path) -> list[Path]: + if path.is_file(): + return [path] + if path.is_dir(): + files = [] + for candidate in sorted(path.rglob("*.json")): + if candidate.name in MANIFEST_FILENAMES: + continue + if candidate.is_file(): + files.append(candidate) + return files + raise FileNotFoundError(f"Path not found: {path}") + + +def parse_args(argv: list[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser( + prog="validate_kvcache_tester_trace.py", + description="Validate kv-cache-tester trace files or directories.", + ) + parser.add_argument("path", metavar="PATH", help="file or directory (recursive glob *.json when directory)") + parser.add_argument("--quiet", action="store_true", help="only print final summary") + parser.add_argument( + "--strict", + action="store_true", + help="treat warnings as errors (e.g. 
hash_ids scope missing)", + ) + parser.add_argument( + "--max-errors-per-file", + type=int, + default=5, + help="maximum errors reported per file (default: 5)", + ) + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = parse_args(argv or sys.argv[1:]) + if args.max_errors_per_file <= 0: + print("--max-errors-per-file must be > 0", file=sys.stderr) + return 2 + + path = Path(args.path) + try: + files = iter_trace_files(path) + except FileNotFoundError as exc: + print(str(exc), file=sys.stderr) + return 2 + + if not files: + print("No trace JSON files found", file=sys.stderr) + return 2 + + valid_count = 0 + failed_count = 0 + + for file_path in files: + try: + trace = json.loads(file_path.read_text()) + except Exception as exc: + errors = [f"invalid JSON: {exc}"] + warnings: list[str] = [] + else: + errors, warnings = validate_trace(trace, max_issues=args.max_errors_per_file) + + effective_errors = list(errors) + if args.strict: + effective_errors.extend(warnings) + + if effective_errors: + failed_count += 1 + if not args.quiet: + print(f"{CROSS} {file_path}") + for issue in effective_errors[: args.max_errors_per_file]: + print(f" {issue}") + else: + valid_count += 1 + if warnings and not args.quiet: + print(f"{WARN} {file_path}") + for warning in warnings[: args.max_errors_per_file]: + print(f" {warning}") + + if failed_count == 0: + print(f"{CHECK} {valid_count} files valid | 0 failed") + return 0 + + plural = "files" if failed_count != 1 else "file" + print(f"{failed_count} {plural} failed validation") + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) From 5208886ebe299fac8191bb8cdb99593018837c5d Mon Sep 17 00:00:00 2001 From: William Chen <57119977+OCWC22@users.noreply.github.com> Date: Mon, 20 Apr 2026 22:37:36 -0700 Subject: [PATCH 15/18] =?UTF-8?q?data(isb1):=20ship=20converted/manifest.j?= =?UTF-8?q?son=20=E2=80=94=20179-trace=20catalog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Auto-generated index with per-trace metadata: scale band, workload family, model family, token totals, and approximate cache hit rate (computed via Cam's normalize_trace walker). Enables sweep configs to filter or select trace subsets by metadata without loading every file. Co-Authored-By: Claude Opus 4.7 --- datasets/isb1/converted/manifest.json | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 datasets/isb1/converted/manifest.json diff --git a/datasets/isb1/converted/manifest.json b/datasets/isb1/converted/manifest.json new file mode 100644 index 000000000..2c9886e13 --- /dev/null +++ b/datasets/isb1/converted/manifest.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9694ac3d13014235c5e61f4c7cff955b9d924cad373695ed373896e22439c55c +size 93447 From 962634ee7ea208e7790fa9d65eb1c96c1cfbdcab Mon Sep 17 00:00:00 2001 From: William Chen <57119977+OCWC22@users.noreply.github.com> Date: Mon, 20 Apr 2026 22:39:14 -0700 Subject: [PATCH 16/18] docs(isb1): HF publication recipe for kv-cache-tester hf_-- path datasets/isb1/HF_PUBLISH.md walks through publishing datasets/isb1/converted/ to Hugging Face at semianalysisai/isb1-cc-traces so Cam's trace_replay scripts can load ISB1 via TRACE_DIR=hf_semianalysisai--isb1-cc-traces with zero changes to his shell scripts (hf_-- handling at benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh lines 54-58). Includes dataset card template, upload command, versioning recipe, and post-upload verification. 
Co-Authored-By: Claude Opus 4.7
---
 datasets/isb1/HF_PUBLISH.md | 124 ++++++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 datasets/isb1/HF_PUBLISH.md

diff --git a/datasets/isb1/HF_PUBLISH.md b/datasets/isb1/HF_PUBLISH.md
new file mode 100644
index 000000000..a4f4f391f
--- /dev/null
+++ b/datasets/isb1/HF_PUBLISH.md
@@ -0,0 +1,124 @@
+# HF publication recipe for ISB1 converted traces
+
+Mirror `datasets/isb1/converted/` to Hugging Face so Cam's
+`TRACE_DIR=hf_<org>--<name>` path works immediately with kv-cache-tester.
+Recommended target: `semianalysisai/isb1-cc-traces`.
+
+## 1. Target namespace
+
+- Dataset repo: `semianalysisai/isb1-cc-traces`
+- Source directory: `datasets/isb1/converted/`
+- Consumer contract: Cam's replay scripts interpret `hf_<org>--<name>` as a
+  Hugging Face dataset reference before calling `trace_replay_tester.py`
+
+## 2. Prereqs
+
+- `huggingface-cli >= 0.20`
+- `HF_TOKEN` with write scope to the destination org
+- Local validation already green:
+  `python3 tools/validate_kvcache_tester_trace.py datasets/isb1/converted/`
+
+Authenticate first:
+
+```bash
+export HF_TOKEN=hf_xxx
+huggingface-cli login --token "$HF_TOKEN"
+```
+
+## 3. Dataset card template
+
+Create the HF dataset `README.md` with this content:
+
+```markdown
+---
+license: apache-2.0
+task_categories: [text-generation]
+language: [en]
+pretty_name: ISB1 Converted kv-cache-tester Traces
+tags: [kv-cache, trace-replay, inference-benchmark, semianalysis, isb1]
+---
+
+# ISB1 Converted kv-cache-tester Traces
+
+This dataset mirrors `datasets/isb1/converted/` from SemiAnalysisAI/InferenceX
+PR #1032 so Cam's kv-cache-tester replay flow from PR #993 can consume ISB1
+traces directly through the `hf_<org>--<name>` `TRACE_DIR` convention.
+
+## Contents
+
+- 179 pre-converted trace JSON files
+- 8k / 32k / 64k / 131k / 500k preview / 1m preview coverage
+- Kimi K2.5 / DSR1 / GPT-OSS / Qwen3.5 coverage
+- `manifest.json` metadata catalog
+
+## Provenance
+
+- Source repo: `SemiAnalysisAI/InferenceX`
+- Source PR: `#1032`
+- Consumer workflow: `callanjfox/kv-cache-tester` PR `#993`
+- License: Apache-2.0
+```
+
+## 4. Upload command
+
+```bash
+huggingface-cli upload \
+  semianalysisai/isb1-cc-traces \
+  datasets/isb1/converted/ \
+  . \
+  --repo-type dataset \
+  --revision main
+```
+
+If the repo does not exist yet, create it in the HF UI first, then rerun the
+upload.
+
+## 5. Cam's Slurm integration
+
+After publication, switch Cam's script from a local directory to the HF path:
+
+```bash
+TRACE_DIR=hf_semianalysisai--isb1-cc-traces  # replaces datasets/isb1/converted
+```
+
+That triggers the `hf_<org>--<name>` branch in Cam's PR #993 replay script
+(`benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh`, lines 54-58),
+which rewrites the value into `--hf-dataset <org>/<name>` before invoking
+`trace_replay_tester.py`.
+
+## 6. Versioning
+
+When new traces land:
+
+1. Regenerate `datasets/isb1/converted/manifest.json`
+2. Re-run local validation on the converted directory
+3. Upload the updated directory to HF `main`
+4. Create a matching HF tag such as `v0.2.0` or `pr1032-r2`
+5. Record the InferenceX commit SHA and HF revision together
+
+Consumers who need immutability should pin an HF revision instead of floating
+on `main`.
+
+## 7. Verification
+
+```bash
+rm -rf /tmp/verify
+huggingface-cli download semianalysisai/isb1-cc-traces \
+  --repo-type dataset \
+  --local-dir /tmp/verify
+python3 tools/validate_kvcache_tester_trace.py /tmp/verify
+```
+
+Expected result:
+
+- Download succeeds with all trace JSONs present
+- Validator reports all converted traces passing
+- Cam's replay wrapper accepts
+  `TRACE_DIR=hf_semianalysisai--isb1-cc-traces` with no shell-script changes
+
+## Notes
+
+- Publish converted artifacts and metadata only
+- Keep the layout compatible with `trace_replay_tester.py`
+- If the org name changes, update both the upload command and `TRACE_DIR`
+  example together

From 40bad610858aa7dfae322f1d3076ea4a17f83b81 Mon Sep 17 00:00:00 2001
From: William Chen <57119977+OCWC22@users.noreply.github.com>
Date: Tue, 21 Apr 2026 11:33:45 -0700
Subject: [PATCH 17/18] feat(isb1): HF publish package for ISB-1
 kv-cache-tester corpus

Add the HF dataset card, dry-run-safe publisher script, and publish runbook
for the converted ISB-1 trace corpus. This packages the zero-friction
consumer path via TRACE_DIR=hf_<org>--<name> and implements PR A from the
investigation report without changing Cam's harness.
---
 datasets/isb1/HF_PUBLISH.md      | 205 ++++++++++++++---
 datasets/isb1/hf_dataset_card.md | 144 ++++++++++++++
 tools/publish_hf_dataset.py      | 276 +++++++++++++++++++++++++++
 3 files changed, 550 insertions(+), 75 deletions(-)
 create mode 100644 datasets/isb1/hf_dataset_card.md
 create mode 100755 tools/publish_hf_dataset.py

diff --git a/datasets/isb1/HF_PUBLISH.md b/datasets/isb1/HF_PUBLISH.md
index a4f4f391f..7b9c8ffc5 100644
--- a/datasets/isb1/HF_PUBLISH.md
+++ b/datasets/isb1/HF_PUBLISH.md
@@ -2,123 +2,178 @@
 Mirror `datasets/isb1/converted/` to Hugging Face so Cam's
 `TRACE_DIR=hf_<org>--<name>` path works immediately with kv-cache-tester.
-Recommended target: `semianalysisai/isb1-cc-traces`.
+Preferred target: `semianalysisai/isb1-cc-traces`.
+Fallback if org write access is unavailable: `ocwc22/isb1-cc-traces`.
 
-## 1. Target namespace
+## 1. What gets published
 
-- Dataset repo: `semianalysisai/isb1-cc-traces`
-- Source directory: `datasets/isb1/converted/`
-- Consumer contract: Cam's replay scripts interpret `hf_<org>--<name>` as a
-  Hugging Face dataset reference before calling `trace_replay_tester.py`
+This publish package is the checked-in trio below:
 
+- `datasets/isb1/converted/` — 179 validated kv-cache-tester trace JSON files
+- `datasets/isb1/converted/manifest.json` — corpus metadata (`1226` total requests)
+- `datasets/isb1/hf_dataset_card.md` — staged to HF as `README.md`
 
-## 2. Prereqs
 
-- `huggingface-cli >= 0.20`
-- `HF_TOKEN` with write scope to the destination org
-- Local validation already green:
-  `python3 tools/validate_kvcache_tester_trace.py datasets/isb1/converted/`
+The consumer contract is unchanged: Cam's replay scripts interpret
+`hf_<org>--<name>` as a Hugging Face dataset source, hydrate it locally, and
+then invoke the existing replay path.
 
-Authenticate first:
+
+## 2. Pre-flight validation
+
+Run the stdlib validator before every publish attempt:
 
 ```bash
-export HF_TOKEN=hf_xxx
-huggingface-cli login --token "$HF_TOKEN"
+python3 tools/validate_kvcache_tester_trace.py datasets/isb1/converted/
 ```
 
-## 3. Dataset card template
+Expected result:
+
+- `✓ 179 files valid | 0 failed`
+- Exit code `0`
+
+If validation fails, stop and fix the source corpus before publishing. Do not
+push a broken dataset mirror to HF.
 
-Create the HF dataset `README.md` with this content:
+## 3. Token setup
+## 3. Token setup
 
-```markdown
----
-license: apache-2.0
-task_categories: [text-generation]
-language: [en]
-pretty_name: ISB1 Converted kv-cache-tester Traces
-tags: [kv-cache, trace-replay, inference-benchmark, semianalysis, isb1]
----
+Authenticate with a token that has write access to the destination namespace:
 
-# ISB1 Converted kv-cache-tester Traces
+```bash
+huggingface-cli login
+```
 
-This dataset mirrors `datasets/isb1/converted/` from SemiAnalysisAI/InferenceX
-PR #1032 so Cam's kv-cache-tester replay flow from PR #993 can consume ISB1
-traces directly through the `hf_<org>--<name>` `TRACE_DIR` convention.
+If you prefer explicit token injection:
 
-## Contents
+```bash
+export HF_TOKEN=hf_xxx
+huggingface-cli login --token "$HF_TOKEN"
+```
 
-- 179 pre-converted trace JSON files
-- 8k / 32k / 64k / 131k / 500k preview / 1m preview coverage
-- Kimi K2.5 / DSR1 / GPT-OSS / Qwen3.5 coverage
-- `manifest.json` metadata catalog
+## 4. Dry-run the publish package locally
 
-## Provenance
+The uploader script stages the converted corpus plus the dataset card and
+prints the exact file list it would upload without making any remote changes.
 
-- Source repo: `SemiAnalysisAI/InferenceX`
-- Source PR: `#1032`
-- Consumer workflow: `callanjfox/kv-cache-tester` PR `#993`
-- License: Apache-2.0
+```bash
+python3 tools/publish_hf_dataset.py \
+  --source datasets/isb1/converted/ \
+  --repo semianalysisai/isb1-cc-traces \
+  --private \
+  --dry-run
 ```
 
-## 4. Upload command
+Use the fallback namespace instead if needed:
 
 ```bash
-huggingface-cli upload \
-  semianalysisai/isb1-cc-traces \
-  datasets/isb1/converted/ \
-  . \
-  --repo-type dataset \
-  --revision main
+python3 tools/publish_hf_dataset.py \
+  --source datasets/isb1/converted/ \
+  --repo ocwc22/isb1-cc-traces \
+  --private \
+  --dry-run
 ```
 
-If the repo does not exist yet, create it in the HF UI first, then rerun the
-upload.
+## 5. Publish for real
 
-## 5. Cam's Slurm integration
+Once the dry-run output looks correct and HF auth is configured, publish with
+one of the exact commands below.
 
-After publication, switch Cam's script from a local directory to the HF path:
+Private-first publish:
 
 ```bash
-TRACE_DIR=hf_semianalysisai--isb1-cc-traces # replaces datasets/isb1/converted
+python3 tools/publish_hf_dataset.py \
+  --source datasets/isb1/converted/ \
+  --repo semianalysisai/isb1-cc-traces \
+  --private \
+  --commit-message "Publish ISB-1 kv-cache-tester traces"
 ```
 
-That triggers the `hf_<org>--<name>` branch in Cam's PR #993 replay script
-(`benchmarks/single_node/multiturn_fp4_b200_trace_replay.sh`, lines 54-58),
-which rewrites the value into `--hf-dataset <org>/<name>` before invoking
-`trace_replay_tester.py`.
+Or make the dataset public at creation time:
 
-## 6. Versioning
+```bash
+python3 tools/publish_hf_dataset.py \
+  --source datasets/isb1/converted/ \
+  --repo semianalysisai/isb1-cc-traces \
+  --public \
+  --commit-message "Publish ISB-1 kv-cache-tester traces"
+```
 
-When new traces land:
+Fallback org:
 
-1. Regenerate `datasets/isb1/converted/manifest.json`
-2. Re-run local validation on the converted directory
-3. Upload the updated directory to HF `main`
-4. Create a matching HF tag such as `v0.2.0` or `pr1032-r2`
-5. Record the InferenceX commit SHA and HF revision together
+```bash
+python3 tools/publish_hf_dataset.py \
+  --source datasets/isb1/converted/ \
+  --repo ocwc22/isb1-cc-traces \
+  --public \
+  --commit-message "Publish ISB-1 kv-cache-tester traces"
+```
 
-Consumers who need immutability should pin an HF revision instead of floating
-on `main`.
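+Either way, the `--repo` value maps one-to-one onto the kv-cache-tester
+`TRACE_DIR` alias. A condensed sketch of that mapping (mirroring the
+`trace_dir_alias` line that `tools/publish_hf_dataset.py` prints; the helper
+name here is illustrative):
+
+```python
+def trace_dir_alias(repo: str) -> str:
+    """Map an "<org>/<name>" repo id to the "hf_<org>--<name>" TRACE_DIR form."""
+    if repo.count("/") != 1:
+        raise ValueError("--repo must be in <org>/<name> form")
+    return "hf_" + repo.replace("/", "--")
+
+assert trace_dir_alias("semianalysisai/isb1-cc-traces") == \
+    "hf_semianalysisai--isb1-cc-traces"
+```
+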
+The script will:
+
+1. Stage `datasets/isb1/converted/` into a temporary upload tree
+2. Copy `datasets/isb1/hf_dataset_card.md` into that tree as `README.md`
+3. Create the dataset repo if it does not already exist
+4. Upload the staged folder with `huggingface_hub`
+5. Verify the published snapshot with `snapshot_download` into `/tmp`
+
+## 6. Post-publish verification
 
-## 7. Verification
+### Repository-level verification
+
+Re-download the published dataset and re-run the validator against the hydrated
+copy:
 
 ```bash
-rm -rf /tmp/verify
 huggingface-cli download semianalysisai/isb1-cc-traces \
   --repo-type dataset \
-  --local-dir /tmp/verify
-python3 tools/validate_kvcache_tester_trace.py /tmp/verify
+  --local-dir /tmp/isb1-cc-traces-verify
+python3 tools/validate_kvcache_tester_trace.py /tmp/isb1-cc-traces-verify
 ```
 
-Expected result:
+### Harness-level verification
+
+The exact consumer path for Cam is the existing `TRACE_DIR=hf_<org>--<name>`
+contract. In the replay harness checkout, the closest end-to-end verification
+command is:
+
+```bash
+TRACE_DIR=hf_semianalysisai--isb1-cc-traces \
+bash experimental/multiturn/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh
+```
+
+If the SemiAnalysisAI org is not available, swap in the fallback namespace:
+
+```bash
+TRACE_DIR=hf_ocwc22--isb1-cc-traces \
+bash experimental/multiturn/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh
+```
+
+## 7. Consumer note for Cam
 
-- Download succeeds with all trace JSONs present
-- Validator reports all converted traces passing
-- Cam's replay wrapper accepts
-  `TRACE_DIR=hf_semianalysisai--isb1-cc-traces` with no shell-script changes
+This is the zero-friction handoff:
+
+```bash
+TRACE_DIR=hf_semianalysisai--isb1-cc-traces \
+bash experimental/multiturn/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh
+```
+
+No code change is required in Cam's harness. The only user action is publishing
+this dataset repo once with valid HF credentials.
+
+## 8. Versioning guidance
+
+When new traces land:
+
+1. Regenerate `datasets/isb1/converted/manifest.json`
+2. Re-run `tools/validate_kvcache_tester_trace.py`
+3. Re-run the uploader dry-run
+4. Publish with a commit message that records the corpus revision
+5. Record the InferenceX commit SHA and the HF dataset revision together
+
+Consumers who need immutability should pin an HF revision instead of floating
+on `main`.
 
 ## Notes
 
 - Publish converted artifacts and metadata only
-- Keep the layout compatible with `trace_replay_tester.py`
-- If the org name changes, update both the upload command and `TRACE_DIR`
-  example together
+- Do not modify `datasets/isb1/converted/**` during publication prep
+- Keep the uploaded layout compatible with kv-cache-tester's existing
+  `TRACE_DIR=hf_<org>--<name>` convention

diff --git a/datasets/isb1/hf_dataset_card.md b/datasets/isb1/hf_dataset_card.md
new file mode 100644
index 000000000..0815b03a9
--- /dev/null
+++ b/datasets/isb1/hf_dataset_card.md
@@ -0,0 +1,144 @@
+---
+pretty_name: ISB-1 KV-Cache-Tester Traces
+tags:
+  - kv-cache
+  - inference-benchmarking
+  - vllm
+  - long-context
+  - multi-turn
+license: apache-2.0
+task_categories:
+  - text-generation
+size_categories:
+  - 1M<n<10M
+---
+
+# ISB-1 KV-Cache-Tester Traces
+
+This dataset mirrors the converted ISB-1 kv-cache-tester trace corpus from
+`SemiAnalysisAI/InferenceX` so the existing replay harness can consume it
+through `TRACE_DIR=hf_<org>--<name>` with no harness changes.
+
+Current corpus summary:
+
+- 179 validated trace JSON files
+- 1,226 total requests across the corpus
+- ~9.26 MiB on disk including `manifest.json`
+- Context bands spanning 8k, 32k, 64k, 131k, 500k preview, and 1m preview
+- Model coverage including DeepSeek-R1-0528, GPT-OSS-120B, GLM-5,
+  MiniMax-M2.5, and Qwen3.5-397B-A17B variants present in the source corpus
+
+Preferred dataset name:
+
+- `semianalysisai/isb1-cc-traces`
+
+Fallback if SemiAnalysisAI org write access is unavailable:
+
+- `ocwc22/isb1-cc-traces`
+
+The intended consumer is Cam's existing kv-cache-tester replay flow, which
+already accepts `hf_<org>--<name>` as a `TRACE_DIR` source and hydrates the
+remote dataset locally before replay.
+
+## Schema
+
+Each JSON file is one replayable trace object with the following top-level
+fields:
+
+| Field | Type | Notes |
+| --- | --- | --- |
+| `id` | `str` | Stable trace/session identifier |
+| `models` | `list[str]` | Model family identifiers for the trace |
+| `block_size` | `int` | Hash-block size used for `hash_ids`; current corpus uses `64` |
+| `hash_id_scope` | `str` | `local` or `global`; validated by the stdlib checker |
+| `requests` | `list[object]` | Ordered replay requests |
+| `tool_tokens` | `int` | Optional token accounting |
+| `system_tokens` | `int` | Optional token accounting |
+| `totals` / `isb1` | `object` | Source metadata carried through from conversion |
+
+Each request entry includes the fields kv-cache-tester expects today:
+
+- `type`
+- `t`
+- `in`
+- `out`
+- `hash_ids`
+- optional `model`, `stop`, `input_types`, `output_types`, `api_time`,
+  `think_time`
+
+Before publishing or consuming the corpus, validate it with the bundled
+stdlib-only checker:
+
+```bash
+python3 tools/validate_kvcache_tester_trace.py datasets/isb1/converted/
+```
+
+Exit codes:
+
+- `0` = all trace files valid
+- `1` = one or more trace files failed validation
+- `2` = usage or path error
+
+## How to use
+
+For zero-friction consumption through the existing replay wrapper, point
+`TRACE_DIR` at the HF dataset name in `hf_<org>--<name>` form:
+
+```bash
+TRACE_DIR=hf_semianalysisai--isb1-cc-traces \
+bash experimental/multiturn/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh
+```
+
+That is the whole integration contract: the wrapper handles the HF download and
+passes the hydrated local mirror into the replay runner. If the preferred org
+is unavailable, the fallback form is identical:
+
+```bash
+TRACE_DIR=hf_ocwc22--isb1-cc-traces \
+bash experimental/multiturn/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh
+```
+
+For local inspection after download, the dataset also includes:
+
+- `manifest.json` — corpus summary and per-trace metadata
+- the original directory layout under `core/`, `extension_32k/`,
+  `extension_64k/`, `extension_131k/`, and `preview/`
+
+## License
+
+This corpus follows the repository license in `SemiAnalysisAI/InferenceX`,
+which is Apache-2.0.
+
+## Citation
+
+If you use this corpus in a benchmark, report, or derivative evaluation, cite
+both the InferenceX repository and the dataset name/revision that you consumed.
+A lightweight citation template is below.
+
+```bibtex
+@misc{isb1_kvcache_tester_traces_2026,
+  title = {ISB-1 KV-Cache-Tester Traces},
+  author = {SemiAnalysisAI},
+  year = {2026},
+  howpublished = {Hugging Face dataset repository},
+  note = {Preferred repo: semianalysisai/isb1-cc-traces; fallback: ocwc22/isb1-cc-traces}
+}
+```

diff --git a/tools/publish_hf_dataset.py b/tools/publish_hf_dataset.py
new file mode 100755
index 000000000..32c316d5f
--- /dev/null
+++ b/tools/publish_hf_dataset.py
@@ -0,0 +1,276 @@
+#!/usr/bin/env python3
+"""Publish the ISB-1 kv-cache-tester corpus to a Hugging Face dataset repo.
+
+This script stages `datasets/isb1/converted/` together with the checked-in
+Hugging Face dataset card (`datasets/isb1/hf_dataset_card.md`) and either:
+
+- prints a dry-run plan without making network changes, or
+- creates/uploads a dataset repository via `huggingface_hub`.
+
+Examples:
+    python3 tools/publish_hf_dataset.py \
+        --source datasets/isb1/converted/ \
+        --repo semianalysisai/isb1-cc-traces \
+        --private \
+        --dry-run
+
+    python3 tools/publish_hf_dataset.py \
+        --source datasets/isb1/converted/ \
+        --repo semianalysisai/isb1-cc-traces \
+        --public \
+        --commit-message "Publish ISB-1 kv-cache-tester traces"
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import shutil
+import sys
+import tempfile
+from collections import Counter
+from pathlib import Path
+from typing import Iterable
+
+DEFAULT_SOURCE = Path("datasets/isb1/converted")
+DEFAULT_CARD = Path("datasets/isb1/hf_dataset_card.md")
+DEFAULT_COMMIT_MESSAGE = "Publish ISB-1 kv-cache-tester traces"
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        prog="publish_hf_dataset.py",
+        description=(
+            "Stage and publish the ISB-1 converted kv-cache-tester corpus to a "
+            "Hugging Face dataset repository."
+        ),
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=(
+            "Example:\n"
+            "  python3 tools/publish_hf_dataset.py --source datasets/isb1/converted/ "
+            "--repo semianalysisai/isb1-cc-traces --private --dry-run"
+        ),
+    )
+    parser.add_argument(
+        "--source",
+        type=Path,
+        default=DEFAULT_SOURCE,
+        help="directory containing converted trace JSON files and manifest.json",
+    )
+    parser.add_argument(
+        "--repo",
+        required=True,
+        help="destination dataset repo in <org>/<name> form",
+    )
+    visibility = parser.add_mutually_exclusive_group()
+    visibility.add_argument(
+        "--private",
+        action="store_true",
+        help="create/publish as a private dataset repo (default)",
+    )
+    visibility.add_argument(
+        "--public",
+        action="store_true",
+        help="create/publish as a public dataset repo",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="print the staged upload plan without calling the Hugging Face Hub",
+    )
+    parser.add_argument(
+        "--commit-message",
+        default=DEFAULT_COMMIT_MESSAGE,
+        help=f"commit message for the upload commit (default: {DEFAULT_COMMIT_MESSAGE!r})",
+    )
+    return parser.parse_args(argv)
+
+
+def fail(message: str, exit_code: int = 2) -> int:
+    print(message, file=sys.stderr)
+    return exit_code
+
+
+def require_existing_dir(path: Path, label: str) -> Path:
+    resolved = path.resolve()
+    if not resolved.exists():
+        raise FileNotFoundError(f"{label} not found: {resolved}")
+    if not resolved.is_dir():
+        raise NotADirectoryError(f"{label} must be a directory: {resolved}")
+    return resolved
+
+
+def require_existing_file(path: Path, label: str) -> Path:
+    resolved = path.resolve()
+    if not resolved.exists():
+        raise FileNotFoundError(f"{label} not found: {resolved}")
+    if not resolved.is_file():
+        raise FileNotFoundError(f"{label} must be a file: {resolved}")
+    return resolved
+
+
+def load_manifest(source_dir: Path) -> dict:
+    manifest_path = source_dir / "manifest.json"
+    require_existing_file(manifest_path, "manifest")
+    try:
+        return json.loads(manifest_path.read_text())
+    except json.JSONDecodeError as exc:
+        raise ValueError(f"invalid manifest JSON: {manifest_path}: {exc}") from exc
+
+
+def infer_dataset_card(source_dir: Path) -> Path:
+    candidate = source_dir.parent / "hf_dataset_card.md"
+    return require_existing_file(candidate, "dataset card")
+
+
+def iter_upload_files(stage_dir: Path) -> list[Path]:
+    return sorted(path for path in stage_dir.rglob("*") if path.is_file())
+
+
+def human_bytes(num_bytes: int) -> str:
+    units = ["B", "KiB", "MiB", "GiB", "TiB"]
+    value = float(num_bytes)
+    for unit in units:
+        if value < 1024 or unit == units[-1]:
+            return f"{value:.2f} {unit}" if unit != "B" else f"{int(value)} B"
+        value /= 1024
+    return f"{num_bytes} B"
+
+
+def summarize_manifest(manifest: dict) -> dict[str, object]:
+    traces = manifest.get("traces", [])
+    by_scale = Counter()
+    for trace in traces:
+        by_scale[str(trace.get("scale_band", "unknown"))] += 1
+    return {
+        "schema_version": manifest.get("schema_version", "unknown"),
+        "generated_at": manifest.get("generated_at", "unknown"),
+        "total_traces": manifest.get("total_traces", len(traces)),
+        "total_requests": manifest.get("total_requests", "unknown"),
+        "by_scale_band": dict(sorted(by_scale.items())),
+    }
+
+
+def print_summary(*, repo: str, private: bool, manifest_summary: dict[str, object], files: Iterable[Path], stage_dir: Path) -> None:
+    files = list(files)
+    total_bytes = sum(path.stat().st_size for path in files)
+    print(f"repo: {repo}")
+    print(f"visibility: {'private' if private else 'public'}")
+    print(f"dataset_uri: https://huggingface.co/datasets/{repo}")
+    print(f"trace_dir_alias: hf_{repo.replace('/', '--')}")
+    print(f"staged_dir: {stage_dir}")
+    print("manifest:")
+    for key, value in manifest_summary.items():
+        print(f"  {key}: {value}")
+    print(f"files_to_upload: {len(files)} files | {human_bytes(total_bytes)}")
+    for file_path in files:
+        rel = file_path.relative_to(stage_dir)
+        print(f"  - {rel} ({human_bytes(file_path.stat().st_size)})")
+
+
+def stage_upload_tree(source_dir: Path, dataset_card_path: Path, work_dir: Path) -> Path:
+    stage_dir = work_dir / "hf_dataset_upload"
+    shutil.copytree(source_dir, stage_dir)
+    shutil.copy2(dataset_card_path, stage_dir / "README.md")
+    return stage_dir
+
+
+def load_hf_api() -> tuple[object, object]:
+    try:
+        from huggingface_hub import HfApi, snapshot_download
+    except ImportError as exc:
+        raise RuntimeError(
+            "huggingface_hub is required for live publish operations. "
+            "Install it with `python3 -m pip install huggingface_hub`."
+        ) from exc
+    return HfApi, snapshot_download
+
+
+def verify_remote_snapshot(snapshot_download: object, repo: str) -> Path:
+    verify_root = Path(tempfile.mkdtemp(prefix="isb1-hf-verify-"))
+    snapshot_path = snapshot_download(
+        repo_id=repo,
+        repo_type="dataset",
+        local_dir=str(verify_root),
+        local_dir_use_symlinks=False,
+    )
+    return Path(snapshot_path)
+
+
+def main(argv: list[str] | None = None) -> int:
+    args = parse_args(argv)
+
+    if "/" not in args.repo or args.repo.count("/") != 1:
+        return fail("--repo must be in <org>/<name> form")
+
+    private = not args.public
+
+    try:
+        source_dir = require_existing_dir(args.source, "source directory")
+        manifest = load_manifest(source_dir)
+        dataset_card_path = infer_dataset_card(source_dir)
+    except (FileNotFoundError, NotADirectoryError, ValueError) as exc:
+        return fail(str(exc))
+
+    with tempfile.TemporaryDirectory(prefix="isb1-hf-stage-") as temp_root:
+        stage_dir = stage_upload_tree(source_dir, dataset_card_path, Path(temp_root))
+        manifest_summary = summarize_manifest(manifest)
+        upload_files = iter_upload_files(stage_dir)
+        print_summary(
+            repo=args.repo,
+            private=private,
+            manifest_summary=manifest_summary,
+            files=upload_files,
+            stage_dir=stage_dir,
+        )
+
+        if args.dry_run:
+            print("dry_run: true")
+            print("remote_actions: skipped")
+            print(
+                "note: dry-run stages README.md + manifest + trace files without calling "
+                "create_repo/upload_folder/snapshot_download"
+            )
+            return 0
+
+        try:
+            HfApi, snapshot_download = load_hf_api()
+            from huggingface_hub.errors import RepositoryNotFoundError
+        except RuntimeError as exc:
+            return fail(str(exc))
+        except ImportError as exc:
+            return fail(f"huggingface_hub import failed: {exc}")
+
+        api = HfApi()
+        repo_exists = True
+        try:
+            api.repo_info(repo_id=args.repo, repo_type="dataset")
+        except RepositoryNotFoundError:
+            repo_exists = False
+
+        if not repo_exists:
+            api.create_repo(
+                repo_id=args.repo,
+                repo_type="dataset",
+                private=private,
+                exist_ok=True,
+            )
+            print(f"created_repo: {args.repo}")
+        else:
+            print(f"created_repo: skipped (already exists: {args.repo})")
+
+        api.upload_folder(
+            repo_id=args.repo,
+            repo_type="dataset",
+            folder_path=str(stage_dir),
+            commit_message=args.commit_message,
+        )
+        print(f"uploaded_repo: {args.repo}")
+        snapshot_path = verify_remote_snapshot(snapshot_download, args.repo)
+        print(f"verified_snapshot: {snapshot_path}")
+        print(f"publish_complete: https://huggingface.co/datasets/{args.repo}")
+        return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From 38fd91a779573ec70e50371a815ffa6c2ce15f95 Mon Sep 17 00:00:00 2001
From: William Chen <57119977+OCWC22@users.noreply.github.com>
Date: Tue, 21 Apr 2026 21:32:07 -0700
Subject: [PATCH 18/18] feat(isb1): add noprefix sweep cells + DSR1 131k HF
 trace_replay cell
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR B (kv-cache-tester lane):

- Extend offload: ["on","off"] to ["on","off","noprefix"] on every H200 fp8
  Qwen3 cell and the H100 fp8 Qwen3 lmcache cell.
- Document the three offload values in the header comment so the sweep
  generator emits noprefix cells alongside on/off.
- Cam's multiturn_fp8_h100_lmcache_aiperf.sh:123-126 already wires
  --no-enable-prefix-caching; noprefix just lets the sweep invoke it.

PR D (DSR1 131k HF trace_replay):

- New cell b200-fp4-dsr1-isb1-code-131k-hf pointing at the freshly published
  HF dataset (wchen22/isb1-cc-traces, alias hf_wchen22--isb1-cc-traces).
  Pairs with --no-max-tokens (a01b775b).
- Header comment extended to document HF alias as a valid TRACE_DIR.

HF publish gotcha:

- HF_PUBLISH.md gains a "Python version" section (new §3): the publisher
  needs huggingface_hub which requires Python >= 3.10. macOS system python3
  (3.9) will fail with ModuleNotFoundError. Prefer
  /opt/homebrew/opt/python@3.13/bin/python3.13.
- Remaining section numbers shifted +1 accordingly.
---
 .github/configs/multiturn-agentic-trace-isb1.yaml | 37 ++++++++++++++-----
 datasets/isb1/HF_PUBLISH.md                       | 29 ++++++++++++---
 2 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/.github/configs/multiturn-agentic-trace-isb1.yaml b/.github/configs/multiturn-agentic-trace-isb1.yaml
index 9921d3bc4..c0cfa7a87 100644
--- a/.github/configs/multiturn-agentic-trace-isb1.yaml
+++ b/.github/configs/multiturn-agentic-trace-isb1.yaml
@@ -7,19 +7,29 @@
 # 131k code/chat cells map to datasets/isb1/converted/extension_131k/*_131k1k*/.
 # 500k preview cells map to datasets/isb1/converted/preview/long_context_500k/.
 # 1m preview cells map to datasets/isb1/converted/preview/long_context_1m/.
-# Expected TRACE_DIR is either datasets/isb1/converted/ or one of those subdirs.
+# Expected TRACE_DIR is either datasets/isb1/converted/ or one of those subdirs,
+# or an HF alias like hf_wchen22--isb1-cc-traces (resolved by the replay
+# harness's TRACE_DIR=hf_<org>--<name> hydration path).
+#
+# offload values:
+#   on       — KV offload enabled (VLLM_USE_SIMPLE_KV_OFFLOAD=1)
+#   off      — KV offload disabled (baseline)
+#   noprefix — offload off AND --no-enable-prefix-caching (clean-cache floor).
+#              Cam's h100 lane already wires the flag in
+#              multiturn_fp8_h100_lmcache_aiperf.sh:123-126; these cells just
+#              surface the third mode so the sweep generator emits it.
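+#
+# Illustrative expansion of a single noprefix cell (sweep-generator output is
+# engine-invocation flags; shown here as a comment only, per the header above):
+#   h200-fp8-qwen3-isb1-code-8k / tp2 / users=2 / offload=noprefix
+#     -> KV offload disabled AND --no-enable-prefix-caching passed through,
+#        giving the clean-cache floor described above.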
 h200-fp8-qwen3-isb1-code-8k:
-  tp2: {users: [2, 4, 8, 16, 32, 64, 128], offload: ["on", "off"]}
-  tp4: {users: [2, 4, 8, 16, 32, 64, 128], offload: ["on", "off"]}
+  tp2: {users: [2, 4, 8, 16, 32, 64, 128], offload: ["on", "off", "noprefix"]}
+  tp4: {users: [2, 4, 8, 16, 32, 64, 128], offload: ["on", "off", "noprefix"]}
 
 h200-fp8-qwen3-isb1-chat-32k:
-  tp2: {users: [1, 2, 4, 8, 16, 32], offload: ["on", "off"]}
-  tp4: {users: [1, 2, 4, 8, 16, 32, 64], offload: ["on", "off"]}
+  tp2: {users: [1, 2, 4, 8, 16, 32], offload: ["on", "off", "noprefix"]}
+  tp4: {users: [1, 2, 4, 8, 16, 32, 64], offload: ["on", "off", "noprefix"]}
 
 h200-fp8-qwen3-isb1-code-131k:
-  tp4: {users: [1, 2, 4, 8], offload: ["on", "off"]}
-  tp8: {users: [1, 2, 4, 8, 16], offload: ["on", "off"]}
+  tp4: {users: [1, 2, 4, 8], offload: ["on", "off", "noprefix"]}
+  tp8: {users: [1, 2, 4, 8, 16], offload: ["on", "off", "noprefix"]}
 
 b200-fp4-dsr1-isb1-code-8k:
   tp4: {ep: 4, users: [4, 8, 16, 32, 64, 128, 256], offload: ["on", "off"]}
@@ -32,6 +42,15 @@ b200-fp4-dsr1-isb1-chat-32k:
 b200-fp4-dsr1-isb1-code-131k:
   tp8: {ep: 8, users: [1, 2, 4, 8, 16], offload: ["on", "off"]}
 
+# DSR1 131k reasoning cell — trace_replay backed by the HF publish
+# (wchen22/isb1-cc-traces). Exercises Cam's Apr 20 --no-max-tokens flag
+# against a reasoning corpus without requiring local dataset checkout.
+# TRACE_DIR alias: hf_wchen22--isb1-cc-traces
+# (subset consumed by this cell: extension_131k/code_131k1k* and
+#  extension_131k/chat_131k1k*)
+b200-fp4-dsr1-isb1-code-131k-hf:
+  tp8: {ep: 8, users: [1, 2, 4, 8, 16], offload: ["on", "off"]}
+
 b200-fp4-qwen3-isb1-chat-500k-preview:
   tp4: {users: [1, 2, 4], offload: ["on", "off"]}
   tp8: {users: [1, 2, 4, 8], offload: ["on", "off"]}
@@ -47,8 +66,8 @@ mi355x-fp8-qwen3-isb1-chat-32k:
   tp4: {users: [1, 2, 4, 8, 16, 32], offload: ["on", "off"]}
 
 h100-fp8-qwen3-isb1-code-8k-lmcache:
-  tp2: {users: [1, 2, 4, 8, 16, 32], offload: ["on", "off"]}
-  tp4: {users: [1, 2, 4, 8, 16, 32, 64], offload: ["on", "off"]}
+  tp2: {users: [1, 2, 4, 8, 16, 32], offload: ["on", "off", "noprefix"]}
+  tp4: {users: [1, 2, 4, 8, 16, 32, 64], offload: ["on", "off", "noprefix"]}
 
 h200-fp8-qwen3-isb1-debug:
   tp2: {users: [2], offload: ["off"]}

diff --git a/datasets/isb1/HF_PUBLISH.md b/datasets/isb1/HF_PUBLISH.md
index 7b9c8ffc5..28f5ae164 100644
--- a/datasets/isb1/HF_PUBLISH.md
+++ b/datasets/isb1/HF_PUBLISH.md
@@ -33,7 +33,24 @@ Expected result:
 If validation fails, stop and fix the source corpus before publishing. Do not
 push a broken dataset mirror to HF.
 
-## 3. Token setup
+## 3. Python version
+
+`tools/publish_hf_dataset.py` imports `huggingface_hub >= 0.24`, which in turn
+requires Python 3.10+. On macOS the system `/usr/bin/python3` is 3.9 and does
+not ship `huggingface_hub`; do not use it.
+
+Use Python 3.13 explicitly:
+
+```bash
+/opt/homebrew/opt/python@3.13/bin/python3.13 -m pip install --user huggingface_hub
+/opt/homebrew/opt/python@3.13/bin/python3.13 tools/publish_hf_dataset.py --help
+```
+
+Or activate a virtualenv / pyenv shim that resolves to 3.10+ before running any
+of the commands below. If you see `ModuleNotFoundError: huggingface_hub`, you
+are on 3.9 — switch interpreters first.
+
+## 4. Token setup
 
 Authenticate with a token that has write access to the destination namespace:
 
@@ -48,7 +65,7 @@ export HF_TOKEN=hf_xxx
 huggingface-cli login --token "$HF_TOKEN"
 ```
 
-## 4. Dry-run the publish package locally
+## 5. Dry-run the publish package locally
 
 The uploader script stages the converted corpus plus the dataset card and
 prints the exact file list it would upload without making any remote changes.
@@ -71,7 +88,7 @@ python3 tools/publish_hf_dataset.py \
   --dry-run
 ```
 
-## 5. Publish for real
+## 6. Publish for real
 
 Once the dry-run output looks correct and HF auth is configured, publish with
 one of the exact commands below.
@@ -114,7 +131,7 @@ The script will:
 4. Upload the staged folder with `huggingface_hub`
 5. Verify the published snapshot with `snapshot_download` into `/tmp`
 
-## 6. Post-publish verification
+## 7. Post-publish verification
 
 ### Repository-level verification
 
@@ -146,7 +163,7 @@ TRACE_DIR=hf_ocwc22--isb1-cc-traces \
 bash experimental/multiturn/benchmarks/single_node/multiturn_fp8_h200_trace_replay.sh
 ```
 
-## 7. Consumer note for Cam
+## 8. Consumer note for Cam
 
 This is the zero-friction handoff:
 
@@ -158,7 +175,7 @@ bash experimental/multiturn/benchmarks/single_node/multiturn_fp8_h200_trace_repl
 No code change is required in Cam's harness. The only user action is publishing
 this dataset repo once with valid HF credentials.
 
-## 8. Versioning guidance
+## 9. Versioning guidance
 
 When new traces land: