From 10a5c20304c46785d09fe293801d0ab3ccafa2e6 Mon Sep 17 00:00:00 2001
From: William Chen <57119977+OCWC22@users.noreply.github.com>
Date: Tue, 28 Apr 2026 12:49:07 -0700
Subject: [PATCH] feat(agentx): add DSv4 agentic cells for B200, B300, H200

PR #1201 currently includes DSR1 agentic cells but not DSv4 variants.
Add DSv4 B200/B300/H200 cells that mirror the DSR1 B200 agentic flow and
reuse benchmark_lib.sh helpers from PR #1201.
This is purely additive and does not change replay methodology, metrics,
or workflow behavior.

Signed-off-by: William Chen <57119977+OCWC22@users.noreply.github.com>
---
 .../single_node/agentic/dsv4_fp4_b200.sh      | 70 +++++++++++++++++
 .../single_node/agentic/dsv4_fp4_b300.sh      | 69 +++++++++++++++++
 .../single_node/agentic/dsv4_fp8_h200.sh      | 75 +++++++++++++++++++
 3 files changed, 214 insertions(+)
 create mode 100755 benchmarks/single_node/agentic/dsv4_fp4_b200.sh
 create mode 100755 benchmarks/single_node/agentic/dsv4_fp4_b300.sh
 create mode 100755 benchmarks/single_node/agentic/dsv4_fp8_h200.sh

diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200.sh
new file mode 100755
index 000000000..ad9b1bd51
--- /dev/null
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b200.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for DSv4 FP4 on B200 using SGLang.
+# Required env vars: MODEL, TP, CONC, RESULT_DIR
+# Optional env vars: PORT, DURATION, MAX_DELAY, ADVANCE_MIN, ADVANCE_MAX
+#   (defaults are assigned below)
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC RESULT_DIR
+# DURATION, MAX_DELAY and ADVANCE_* are not read again in this script; presumably the benchmark_lib.sh helpers use them (TODO confirm).
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+MAX_DELAY=${MAX_DELAY:-60}
+ADVANCE_MIN=${ADVANCE_MIN:-0.0}
+ADVANCE_MAX=${ADVANCE_MAX:-0.7}
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+# A MODEL that is not an absolute path is downloaded with the hf CLI before serving.
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+nvidia-smi
+
+# ---- Resolve traces and install deps ----------------------------------------
+resolve_trace_source
+install_agentic_deps
+
+# ---- Start SGLang server ----------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+echo "Starting SGLang server..."
+export PYTHONNOUSERSITE=1
+# NOTE(review): unlike the B300 variant, this disables the radix cache and has no floor on --max-running-requests - confirm intended.
+sglang serve \
+    --model-path "$MODEL" \
+    --host 0.0.0.0 \
+    --port "$PORT" \
+    --trust-remote-code \
+    --tp "$TP" \
+    --disable-radix-cache \
+    --max-running-requests "$((CONC * 3 / 2))" \
+    --mem-fraction-static 0.90 \
+    --swa-full-tokens-ratio 0.1 \
+    --moe-runner-backend flashinfer_mxfp4 \
+    --chunked-prefill-size 8192 \
+    --disable-flashinfer-autotune \
+    --enable-metrics > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+# Kill the background server whenever this script exits. NOTE(review): overrides any EXIT trap from benchmark_lib.sh - confirm none.
+trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
+# REPLAY_CMD is expanded unquoted so it word-splits into a command; "|| true" lets result collection run even if the replay fails.
+set -x
+$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
+set +x
+
+write_agentic_result_json "$RESULT_DIR"
+
+# ---- Post-processing --------------------------------------------------------
+python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
+    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300.sh
new file mode 100755
index 000000000..e66fe0abd
--- /dev/null
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b300.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for DSv4 FP4 on B300 using SGLang.
+# Required env vars: MODEL, TP, CONC, RESULT_DIR
+# Optional env vars: PORT, DURATION, MAX_DELAY, ADVANCE_MIN, ADVANCE_MAX
+#   (defaults are assigned below)
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC RESULT_DIR
+# DURATION, MAX_DELAY and ADVANCE_* are not read again in this script; presumably the benchmark_lib.sh helpers use them (TODO confirm).
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+MAX_DELAY=${MAX_DELAY:-60}
+ADVANCE_MIN=${ADVANCE_MIN:-0.0}
+ADVANCE_MAX=${ADVANCE_MAX:-0.7}
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+# A MODEL that is not an absolute path is downloaded with the hf CLI before serving.
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+nvidia-smi
+
+# ---- Resolve traces and install deps ----------------------------------------
+resolve_trace_source
+install_agentic_deps
+
+# ---- Start SGLang server ----------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+echo "Starting SGLang server..."
+export PYTHONNOUSERSITE=1
+# --max-running-requests below is CONC * 3 / 2 (integer arithmetic) with a floor of 8.
+sglang serve \
+    --model-path "$MODEL" \
+    --host 0.0.0.0 \
+    --port "$PORT" \
+    --trust-remote-code \
+    --tp "$TP" \
+    --max-running-requests "$((CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8))" \
+    --mem-fraction-static 0.90 \
+    --swa-full-tokens-ratio 0.1 \
+    --moe-runner-backend flashinfer_mxfp4 \
+    --chunked-prefill-size 8192 \
+    --disable-flashinfer-autotune \
+    --enable-metrics > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+# Kill the background server whenever this script exits. NOTE(review): overrides any EXIT trap from benchmark_lib.sh - confirm none.
+trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
+# REPLAY_CMD is expanded unquoted so it word-splits into a command; "|| true" lets result collection run even if the replay fails.
+set -x
+$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
+set +x
+
+write_agentic_result_json "$RESULT_DIR"
+
+# ---- Post-processing --------------------------------------------------------
+python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
+    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
diff --git a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
new file mode 100755
index 000000000..f22a69826
--- /dev/null
+++ b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for DSv4 FP8 on H200 using vLLM.
+# Required env vars: MODEL, TP, CONC, RESULT_DIR
+# Optional env vars: PORT, DURATION, MAX_DELAY, ADVANCE_MIN, ADVANCE_MAX
+#   (defaults are assigned below)
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC RESULT_DIR
+# DURATION, MAX_DELAY and ADVANCE_* are not read again in this script; presumably the benchmark_lib.sh helpers use them (TODO confirm).
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+MAX_DELAY=${MAX_DELAY:-60}
+ADVANCE_MIN=${ADVANCE_MIN:-0.0}
+ADVANCE_MAX=${ADVANCE_MAX:-0.7}
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+# A MODEL that is not an absolute path is downloaded with the hf CLI before serving.
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+nvidia-smi
+
+# ---- Resolve traces and install deps ----------------------------------------
+resolve_trace_source
+install_agentic_deps
+
+# ---- Start vLLM server ------------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+echo "Starting server..."
+export VLLM_ENGINE_READY_TIMEOUT_S=3600
+# NOTE(review): TP is passed as --data-parallel-size and CONC is not used by this server command - confirm intended.
+vllm serve "$MODEL" \
+    --host 0.0.0.0 \
+    --port "$PORT" \
+    --trust-remote-code \
+    --kv-cache-dtype fp8 \
+    --block-size 256 \
+    --no-enable-prefix-caching \
+    --enable-expert-parallel \
+    --data-parallel-size "$TP" \
+    --max-model-len 800000 \
+    --gpu-memory-utilization 0.95 \
+    --max-num-seqs 512 \
+    --max-num-batched-tokens 512 \
+    --no-enable-flashinfer-autotune \
+    --compilation-config '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' \
+    --tokenizer-mode deepseek_v4 \
+    --tool-call-parser deepseek_v4 \
+    --enable-auto-tool-choice \
+    --reasoning-parser deepseek_v4 > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+# Kill the background server whenever this script exits. NOTE(review): overrides any EXIT trap from benchmark_lib.sh - confirm none.
+trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
+# REPLAY_CMD is expanded unquoted so it word-splits into a command; "|| true" lets result collection run even if the replay fails.
+set -x
+$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
+set +x
+
+write_agentic_result_json "$RESULT_DIR"
+
+# ---- Post-processing --------------------------------------------------------
+python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
+    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true