diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 32de6f552..67ae47b4c 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -993,7 +993,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.18.0 + image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg @@ -1046,7 +1046,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.18.0 + image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x-disagg diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 81da415e8..cd4794ed5 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -32,8 +32,13 @@ fi export IBDEVICES # Shared: Auto-detect default network interface (portable across clusters) -export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) -export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) +# Only auto-detect if not already set by the runner/environment +if [[ -z "$GLOO_SOCKET_IFNAME" ]]; then + export GLOO_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1) +fi +if [[ -z "$NCCL_SOCKET_IFNAME" ]]; then + export NCCL_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1) +fi set +x diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index abb80b97b..f29b4b71b 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -416,6 +416,7 @@ if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \ \"$VLLM_ROUTER_IMAGE\" \ bash -lc \"mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \ --vllm-pd-disaggregation \ + --kv-connector moriio \ --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \ --port ${ROUTER_PORT} \ --host 0.0.0.0 \ diff --git a/benchmarks/multi_node/amd_utils/moriio_proxy.py b/benchmarks/multi_node/amd_utils/moriio_proxy.py deleted file mode 100644 index 7d1e8454b..000000000 --- a/benchmarks/multi_node/amd_utils/moriio_proxy.py +++ /dev/null @@ -1,327 +0,0 @@ -#!/usr/bin/env python3 -# MoRI-IO proxy server for vLLM PD disaggregation. -# -# Based on vLLM's examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py -# with the following adaptations for production multi-node use: -# - Ports configurable via PROXY_HTTP_PORT / PROXY_PING_PORT env vars -# - /health endpoint for sync.py barrier readiness checks -# - Uses stdlib `re` instead of `regex` to avoid extra dep -# -# The proxy performs two roles that vllm-router cannot: -# 1. ZMQ service discovery — prefill/decode workers register their RDMA ports -# 2. 
Request enrichment — injects remote endpoint info into kv_transfer_params - -import asyncio -import copy -import logging -import os -import re -import socket -import threading -import time -import uuid - -import aiohttp -import msgpack -import zmq -from quart import Quart, make_response, request - -logger = logging.getLogger("moriio_proxy") -logger.setLevel(logging.DEBUG) -handler = logging.StreamHandler() -handler.setFormatter(logging.Formatter( - "%(asctime)s %(levelname)s [%(name)s] %(message)s")) -logger.addHandler(handler) - -prefill_instances: list[dict] = [] -decode_instances: list[dict] = [] -request_nums = 0 -app = Quart(__name__) - -STREAM_IDLE_TIMEOUT = int(os.environ.get("PROXY_STREAM_IDLE_TIMEOUT", "300")) - -IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)") - -TRANSFER_TYPE = None - - -def _append_whole_dict_unique(target_list, data_dict): - new_filtered = {k: v for k, v in data_dict.items() if k != "index"} - for existed in target_list: - existed_filtered = {k: v for k, v in existed.items() if k != "index"} - if existed_filtered == new_filtered: - return False - logger.info("Registered instance: role=%s addr=%s hs_port=%s notify=%s dp=%s tp=%s", - data_dict.get("role"), data_dict.get("request_address"), - data_dict.get("handshake_port"), data_dict.get("notify_port"), - data_dict.get("dp_size"), data_dict.get("tp_size")) - target_list.append(data_dict) - transfer_mode = data_dict.get("transfer_mode", "unknown") - global TRANSFER_TYPE - - if TRANSFER_TYPE is None: - TRANSFER_TYPE = transfer_mode - logger.info("Transfer mode set to: %s", TRANSFER_TYPE) - elif transfer_mode != TRANSFER_TYPE: - raise ValueError(f"mismatched transfer mode {TRANSFER_TYPE} vs {transfer_mode}") - - return True - - -_list_lock = threading.RLock() - - -def _listen_for_register(hostname, port): - context = zmq.Context() - router_socket = context.socket(zmq.ROUTER) - router_socket.bind(f"tcp://{hostname}:{port}") - poller = zmq.Poller() - poller.register(router_socket, zmq.POLLIN) - global prefill_instances - global decode_instances - - while True: - socks = dict(poller.poll()) - if router_socket in socks: - remote_addr, msg = router_socket.recv_multipart() - data = msgpack.loads(msg) - if data["type"] == "HELLO": - pass - elif ( - data["type"] == "register" - and data["role"] == "P" - and data["request_address"] not in prefill_instances - ): - with _list_lock: - _append_whole_dict_unique(prefill_instances, data) - - elif ( - data["type"] == "register" - and data["role"] == "D" - and data["request_address"] not in decode_instances - ): - with _list_lock: - _append_whole_dict_unique(decode_instances, data) - - -def start_service_discovery(hostname, port): - if not hostname: - hostname = socket.gethostname() - if port == 0: - raise ValueError("Port cannot be 0") - - _listener_thread = threading.Thread( - target=_listen_for_register, args=(hostname, port), daemon=True - ) - _listener_thread.start() - logger.info("Service discovery listening on %s:%s", hostname, port) - return _listener_thread - - -async def send_request_to_prefill( - endpoint, req_data, request_id, d_endpoint, dip, dport, selected_prefill_dp_rank -): - req_data_copy = req_data - - req_data_copy["kv_transfer_params"].update( - { - "do_remote_decode": True, - "do_remote_prefill": False, - "remote_handshake_port": d_endpoint["handshake_port"], - "remote_notify_port": d_endpoint["notify_port"], - "remote_engine_id": None, - "remote_block_ids": None, - "remote_host": dip, - "remote_port": dport, - } - ) - 
req_data_copy["stream"] = False - req_data_copy["max_tokens"] = 1 - if "max_completion_tokens" in req_data_copy: - req_data_copy["max_completion_tokens"] = 1 - if "stream_options" in req_data_copy: - del req_data_copy["stream_options"] - async with aiohttp.ClientSession( - timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) - ) as session: - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "X-Request-Id": request_id, - } - if selected_prefill_dp_rank is not None: - headers["X-data-parallel-rank"] = str(selected_prefill_dp_rank) - async with session.post( - url=endpoint, json=req_data_copy, headers=headers - ) as response: - if response.status == 200: - return await response.json() - else: - raise RuntimeError( - f"Prefill response status={response.status}" - ) - - -async def start_decode_request(endpoint, req_data, request_id): - session = aiohttp.ClientSession( - timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) - ) - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "X-Request-Id": request_id, - } - response = await session.post(url=endpoint, json=req_data, headers=headers) - return session, response - - -async def stream_decode_response(session, response, request_id): - try: - if response.status == 200: - chunk_iter = response.content.iter_chunked(1024).__aiter__() - while True: - try: - chunk_bytes = await asyncio.wait_for( - chunk_iter.__anext__(), timeout=STREAM_IDLE_TIMEOUT, - ) - yield chunk_bytes - except StopAsyncIteration: - break - except asyncio.TimeoutError: - logger.error( - "Decode stream %s idle for %ds, aborting", - request_id, STREAM_IDLE_TIMEOUT, - ) - break - else: - raise RuntimeError( - f"Decode response status={response.status}" - ) - finally: - await response.release() - await session.close() - - -@app.route("/health", methods=["GET"]) -async def health_check(): - with _list_lock: - p_count = len(prefill_instances) - d_count = len(decode_instances) - return await make_response( - ({"status": "ok", "prefill_instances": p_count, "decode_instances": d_count}, 200) - ) - - -@app.route("/v1/completions", methods=["POST"]) -@app.route("/v1/chat/completions", methods=["POST"]) -async def handle_request(): - try: - with _list_lock: - global request_nums - request_nums += 1 - - def extract_ip_port_fast(url): - match = IP_PORT_PATTERN.search(url) - if not match: - raise ValueError(f"Invalid URL format: {url}") - return match.groups() - - req_data = await request.get_json() - request_id = str(uuid.uuid4()) - - if not prefill_instances or not decode_instances: - return await make_response( - ("Service Unavailable: No prefill or decode instances registered.", 503) - ) - - pid = request_nums % len(prefill_instances) - did = request_nums % len(decode_instances) - prefill_instance_endpoint = prefill_instances[pid] - decode_instance_endpoint = decode_instances[did] - - selected_prefill_dp_rank = None - if prefill_instance_endpoint["dp_size"] > 1: - selected_prefill_dp_rank = request_nums % prefill_instance_endpoint["dp_size"] - - dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"]) - - req_data_to_prefill = copy.deepcopy(req_data) - req_data_to_prefill["kv_transfer_params"] = {"transfer_id": request_id} - req_data["kv_transfer_params"] = {"transfer_id": request_id} - req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = ( - decode_instance_endpoint["dp_size"] - ) - req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = ( - decode_instance_endpoint["tp_size"] - ) - - 
send_prefill_task = asyncio.create_task( - send_request_to_prefill( - prefill_instance_endpoint["request_address"], - req_data_to_prefill, - request_id, - decode_instance_endpoint, - dip, - dport, - selected_prefill_dp_rank, - ) - ) - ip, port = extract_ip_port_fast(prefill_instance_endpoint["request_address"]) - - req_data["max_tokens"] -= 1 - - req_data["kv_transfer_params"] = { - "transfer_id": request_id, - "do_remote_decode": False, - "do_remote_prefill": True, - "remote_handshake_port": prefill_instance_endpoint["handshake_port"], - "remote_notify_port": prefill_instance_endpoint["notify_port"], - "remote_engine_id": None, - "remote_block_ids": None, - "remote_host": ip, - "remote_port": port, - } - if TRANSFER_TYPE == "READ": - prefill_response = await send_prefill_task - req_data["kv_transfer_params"]["remote_engine_id"] = prefill_response[ - "kv_transfer_params" - ]["remote_engine_id"] - req_data["kv_transfer_params"]["remote_block_ids"] = prefill_response[ - "kv_transfer_params" - ]["remote_block_ids"] - - req_data["kv_transfer_params"]["remote_dp_size"] = prefill_instance_endpoint[ - "dp_size" - ] - req_data["kv_transfer_params"]["remote_tp_size"] = prefill_instance_endpoint[ - "tp_size" - ] - - if selected_prefill_dp_rank is not None: - req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank - - decode_request_task = asyncio.create_task( - start_decode_request( - decode_instance_endpoint["request_address"], req_data, request_id - ) - ) - - session, decode_response = await decode_request_task - stream_generator = stream_decode_response(session, decode_response, request_id) - response = await make_response(stream_generator) - return response - except Exception as e: - logger.exception("Error handling request: %s", e) - return await make_response((f"Internal Server Error: {e!s}", 500)) - - -if __name__ == "__main__": - http_port = int(os.environ.get("PROXY_HTTP_PORT", "30000")) - ping_port = int(os.environ.get("PROXY_PING_PORT", "36367")) - - t = start_service_discovery("0.0.0.0", ping_port) - app.debug = False - app.config["BODY_TIMEOUT"] = 360000 - app.config["RESPONSE_TIMEOUT"] = 360000 - - logger.info("MoRI-IO proxy starting: HTTP=%d, ZMQ=%d", http_port, ping_port) - app.run(host="0.0.0.0", port=http_port) - t.join() diff --git a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py index 8290276fb..ac830eb1f 100644 --- a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py +++ b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py @@ -137,7 +137,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, - reduce_results=False, renormalize=True, scoring_func=getattr(config, "scoring_func", "softmax"), e_score_correction_bias=self.e_score_correction_bias, @@ -185,7 +184,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: ) final_hidden_states = final_hidden_states[:num_tokens] elif self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + from vllm.distributed.communication_op import tensor_model_parallel_all_reduce + final_hidden_states = tensor_model_parallel_all_reduce( final_hidden_states ) diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 73cad3adc..9acb05f54 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -242,7 +242,7 @@ done echo "Prefill 
node IPs: ${PREFILL_ARGS}" echo "Decode node IPs: ${DECODE_ARGS}" -# MoRI-IO proxy ZMQ registration port (must match moriio_proxy.py PROXY_PING_PORT) +# MoRI-IO proxy ZMQ registration port (must match vllm-router --vllm-discovery-address) PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" # vLLM environment (UCX transport vars are set at the Docker level in job.slurm) @@ -281,26 +281,8 @@ if [ "$NODE_RANK" -eq 0 ]; then setup_vllm_env - # Start MoRI-IO proxy FIRST — workers register via ZMQ on startup - # Skipped when ROUTER_TYPE=vllm-router (external router container started by job.slurm) - if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then - echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." - PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ - python3 $WS_PATH/moriio_proxy.py" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PROXY_CMD" - else - PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" - set -x - eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & - set +x - proxy_pid=$! - sleep 3 - fi - else - echo "Using external vLLM router (ROUTER_TYPE=${ROUTER_TYPE:-vllm-router})" - fi + # Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE) + echo "Using external vllm-router container (started by job.slurm on this node)" PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ @@ -343,7 +325,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "DRY RUN: $HEALTH_BARRIER_CMD" else eval "$HEALTH_BARRIER_CMD" - echo "${ROUTER_TYPE} is ready for benchmarking" + echo "MoRI-IO proxy is ready for benchmarking" fi echo "Ready for benchmarking on ${host_name}:${host_ip}" @@ -375,14 +357,8 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Killing the prefill server" if [[ "$DRY_RUN" -eq 0 ]]; then - if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then - [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true - fi [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true sleep 2 - if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then - pkill -f moriio_proxy 2>/dev/null || true - fi pkill -f "vllm serve" 2>/dev/null || true fi diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 589399f74..958cb9808 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -242,43 +242,48 @@ patch_mori_fp8_compat() { import re, os, sys patched = [] -# 1. 
Patch layer.py: remove multi-line AITER assertion for MoRI +# Patch layer.py: remove AITER requirement assertion(s) for MoRI try: import vllm.model_executor.layers.fused_moe.layer as lm f = lm.__file__ src = open(f).read() - if "Mori needs to be used with aiter" in src: + if "[PATCHED] AITER requirement removed for MoRI-EP + FP8" in src: + print("[SETUP] layer.py MoRI-FP8 patch already applied") + elif "Mori needs to be used with aiter" in src: + # v0.19+: two consecutive assertions inside `if self.moe_config.use_mori_kernels:` new = re.sub( - r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)\s*" + r"assert not self\.aiter_fmoe_shared_expert_enabled,\s*\([^)]*\)", "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", src, flags=re.DOTALL) + if new == src: + # v0.17.1/v0.18.0: only the first assertion existed + new = re.sub( + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", + src, flags=re.DOTALL) if new != src: open(f, "w").write(new) patched.append("layer.py") + else: + print("[SETUP] ERROR: layer.py pattern found but regex had no effect", file=sys.stderr) + sys.exit(1) + else: + print("[SETUP] ERROR: layer.py AITER assertion pattern not found — vLLM API may have changed", file=sys.stderr) + sys.exit(1) except Exception as e: - print(f"[SETUP] WARN patch layer.py: {e}", file=sys.stderr) + print(f"[SETUP] ERROR patch layer.py: {e}", file=sys.stderr) + sys.exit(1) -# 2. Patch mori_prepare_finalize.py: remove defer_input_quant restriction -try: - import vllm.model_executor.layers.fused_moe.mori_prepare_finalize as mm - f = mm.__file__ - src = open(f).read() - if "defer_input_quant" in src: - new = re.sub( - r"raise NotImplementedError\([^)]*defer_input_quant[^)]*\)", - "pass # [PATCHED] defer_input_quant check removed for MoRI-EP + FP8", - src) - if new != src: - open(f, "w").write(new) - patched.append("mori_prepare_finalize.py") -except Exception as e: - print(f"[SETUP] WARN patch mori_pf: {e}", file=sys.stderr) +# prepare_finalize/mori.py (v0.19+) already handles defer_input_quant correctly +# (skips FP8 quant when True). No patch needed for that file. 
+# Added in 0.18.1: https://github.com/vllm-project/vllm/commit/6a9cceb219fcbd6b1eb540ddfdc77ec160f0e209 if patched: print(f"[SETUP] Patched: {chr(44).join(patched)}") else: print("[SETUP] No MoRI-FP8 patches needed") -' +' || exit 1 _SETUP_INSTALLED+=("MoRI-FP8-patch") } @@ -881,7 +886,6 @@ except Exception as e: # install_libionic # install_mori install_amd_quark -install_mori_proxy_deps patch_mori_fp8_compat patch_moriio_save_kv_timeout patch_moriio_transfer_timeout diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_128_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_128_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..4f805f5a9 --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_128_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-095035", "backend": "openai", "model_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 1280, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 128, "duration": 156.629553565057, "completed": 1280, "total_input_tokens": 1310720, "total_output_tokens": 1310720, "request_throughput": 8.17214868373065, "request_goodput:": null, "output_throughput": 8368.280252140186, "total_token_throughput": 16736.560504280373, "mean_ttft_ms": 322.05012791437184, "median_ttft_ms": 168.4953118674457, "std_ttft_ms": 412.7725204129608, "p90_ttft_ms": 1269.4133618613705, "p99_ttft_ms": 1515.174685146194, "p99.9_ttft_ms": 1516.856569517171, "mean_tpot_ms": 14.871257350024614, "median_tpot_ms": 14.892716348905914, "std_tpot_ms": 0.12168611094612052, "p90_tpot_ms": 14.975844017677883, "p99_tpot_ms": 15.061890008918056, "p99.9_tpot_ms": 15.066467860250443, "mean_itl_ms": 14.871688992762476, "median_itl_ms": 14.909795951098204, "std_itl_ms": 1.126674865744914, "p90_itl_ms": 15.301332250237465, "p99_itl_ms": 18.650689502246678, "p99.9_itl_ms": 22.104734647087753, "mean_e2el_ms": 15535.346396989553, "median_e2el_ms": 15410.800829995424, "std_e2el_ms": 414.3454674568179, "p90_e2el_ms": 16522.48927145265, "p99_e2el_ms": 16675.76581487432, "p99.9_e2el_ms": 16677.12394464435} \ No newline at end of file diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_16_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_16_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..5c258b87c --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_16_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-110159", "backend": "openai", "model_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 160, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 16, "duration": 94.94771882798523, "completed": 160, "total_input_tokens": 163840, "total_output_tokens": 163840, "request_throughput": 1.68513790510195, "request_goodput:": null, "output_throughput": 1725.5812148243967, "total_token_throughput": 3451.1624296487935, "mean_ttft_ms": 185.00612755160546, "median_ttft_ms": 104.43462803959846, "std_ttft_ms": 193.8479555106311, "p90_ttft_ms": 668.5733429389074, "p99_ttft_ms": 687.9313375009224, 
"p99.9_ttft_ms": 698.2081180762034, "mean_tpot_ms": 9.043755853690438, "median_tpot_ms": 9.03941193840877, "std_tpot_ms": 0.032097259503204134, "p90_tpot_ms": 9.08699567049181, "p99_tpot_ms": 9.092274875981406, "p99.9_tpot_ms": 9.094480784422148, "mean_itl_ms": 9.043755927185433, "median_itl_ms": 9.067311882972717, "std_itl_ms": 0.49330033887680946, "p90_itl_ms": 9.187642950564623, "p99_itl_ms": 9.348576478660105, "p99.9_itl_ms": 11.5778636110482, "mean_e2el_ms": 9436.768365876924, "median_e2el_ms": 9359.918448608369, "std_e2el_ms": 203.51181544144868, "p90_e2el_ms": 9920.965689211152, "p99_e2el_ms": 9977.636895296164, "p99.9_e2el_ms": 9977.824207578553} \ No newline at end of file diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_256_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_256_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..1850ee750 --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_256_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-095432", "backend": "openai", "model_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 2560, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 256, "duration": 187.5680166000966, "completed": 2560, "total_input_tokens": 2621440, "total_output_tokens": 2621440, "request_throughput": 13.648382311671154, "request_goodput:": null, "output_throughput": 13975.943487151262, "total_token_throughput": 27951.886974302524, "mean_ttft_ms": 382.50921859926166, "median_ttft_ms": 215.23276844527572, "std_ttft_ms": 389.20735932882576, "p90_ttft_ms": 1051.413601823151, "p99_ttft_ms": 1578.5206863190976, "p99.9_ttft_ms": 1665.9692472564057, "mean_tpot_ms": 17.745007218556957, "median_tpot_ms": 17.7608991554098, "std_tpot_ms": 0.1668966632857337, "p90_tpot_ms": 17.881159080449176, "p99_tpot_ms": 17.904399000923036, "p99.9_tpot_ms": 17.906466535026507, "mean_itl_ms": 17.745312193796607, "median_itl_ms": 17.753243912011385, "std_itl_ms": 2.8374261437282113, "p90_itl_ms": 18.29339493997395, "p99_itl_ms": 30.380772710777826, "p99.9_itl_ms": 41.25628810096536, "mean_e2el_ms": 18535.651603183032, "median_e2el_ms": 18436.250409460627, "std_e2el_ms": 403.4017476106507, "p90_e2el_ms": 19055.39850196801, "p99_e2el_ms": 19630.57034333702, "p99.9_e2el_ms": 19677.968813177664} \ No newline at end of file diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_32_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_32_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..4242f60f4 --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_32_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-094424", "backend": "openai", "model_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 320, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 32, "duration": 109.75491525093094, "completed": 320, "total_input_tokens": 327680, "total_output_tokens": 327680, "request_throughput": 2.9155869627195194, "request_goodput:": null, "output_throughput": 2985.561049824788, 
"total_token_throughput": 5971.122099649576, "mean_ttft_ms": 327.7923499357712, "median_ttft_ms": 139.84792854171246, "std_ttft_ms": 325.32643470611464, "p90_ttft_ms": 761.9237934472039, "p99_ttft_ms": 1553.907009542454, "p99.9_ttft_ms": 1554.0890494412743, "mean_tpot_ms": 10.35171252518911, "median_tpot_ms": 10.37312014370279, "std_tpot_ms": 0.2204616711640924, "p90_tpot_ms": 10.569622599677286, "p99_tpot_ms": 10.68635546505808, "p99.9_tpot_ms": 10.692018508377648, "mean_itl_ms": 10.351712596903877, "median_itl_ms": 10.424092994071543, "std_itl_ms": 0.6230783496277226, "p90_itl_ms": 10.737212025560439, "p99_itl_ms": 11.208135604392732, "p99.9_itl_ms": 13.578608148033387, "mean_e2el_ms": 10917.594263204228, "median_e2el_ms": 10899.026850122027, "std_e2el_ms": 363.70930694261756, "p90_e2el_ms": 11385.994586907327, "p99_e2el_ms": 11952.086912665982, "p99.9_e2el_ms": 11952.363938602852} \ No newline at end of file diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_4_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_4_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..103292520 --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_4_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-105748", "backend": "openai", "model_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 40, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 4, "duration": 89.19548447802663, "completed": 40, "total_input_tokens": 40960, "total_output_tokens": 40960, "request_throughput": 0.44845319507013864, "request_goodput:": null, "output_throughput": 459.21607175182197, "total_token_throughput": 918.4321435036439, "mean_ttft_ms": 330.7963805680629, "median_ttft_ms": 124.30587713606656, "std_ttft_ms": 300.8566190220928, "p90_ttft_ms": 797.8223511483521, "p99_ttft_ms": 987.5254542799665, "p99.9_ttft_ms": 1082.7990462793985, "mean_tpot_ms": 8.386647437519565, "median_tpot_ms": 8.389879373914562, "std_tpot_ms": 0.02406742291045804, "p90_tpot_ms": 8.410128863539297, "p99_tpot_ms": 8.42375959217583, "p99.9_tpot_ms": 8.429241839897129, "mean_itl_ms": 8.386647518691657, "median_itl_ms": 8.409935398958623, "std_itl_ms": 0.4271045776615353, "p90_itl_ms": 8.471527020446956, "p99_itl_ms": 8.548026895150542, "p99.9_itl_ms": 9.727208444615831, "mean_e2el_ms": 8910.336709150579, "median_e2el_ms": 8702.599443029612, "std_e2el_ms": 289.10261376307136, "p90_e2el_ms": 9388.286692928523, "p99_e2el_ms": 9497.63202768052, "p99.9_e2el_ms": 9560.362910952654} \ No newline at end of file diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_512_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_512_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..93b38b837 --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_512_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-095932", "backend": "openai", "model_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 5120, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 512, 
"duration": 232.66371850995347, "completed": 5120, "total_input_tokens": 5242880, "total_output_tokens": 5242880, "request_throughput": 22.006009500707624, "request_goodput:": null, "output_throughput": 22534.153728724606, "total_token_throughput": 45068.30745744921, "mean_ttft_ms": 485.9170994026499, "median_ttft_ms": 313.3755950257182, "std_ttft_ms": 479.43043981674873, "p90_ttft_ms": 1244.9121220968664, "p99_ttft_ms": 2244.784710702952, "p99.9_ttft_ms": 2325.581170682097, "mean_tpot_ms": 22.05546968971106, "median_tpot_ms": 22.107187851891105, "std_tpot_ms": 0.2246878257279778, "p90_tpot_ms": 22.263784393693665, "p99_tpot_ms": 22.29470054129683, "p99.9_tpot_ms": 22.34338827603105, "mean_itl_ms": 22.055469743536744, "median_itl_ms": 21.63424016907811, "std_itl_ms": 7.963070491395084, "p90_itl_ms": 22.762464964762344, "p99_itl_ms": 58.16545383073397, "p99.9_itl_ms": 86.60649072681669, "mean_e2el_ms": 23048.662591977063, "median_e2el_ms": 22988.531311624683, "std_e2el_ms": 468.6015130352978, "p90_e2el_ms": 23656.58285028767, "p99_e2el_ms": 24586.990111244377, "p99.9_e2el_ms": 24895.06961080851} \ No newline at end of file diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_64_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_64_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..0cec5d70e --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_64_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-094713", "backend": "openai", "model_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 640, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 64, "duration": 131.00373828504235, "completed": 640, "total_input_tokens": 655360, "total_output_tokens": 655360, "request_throughput": 4.885356772090476, "request_goodput:": null, "output_throughput": 5002.605334620647, "total_token_throughput": 10005.210669241294, "mean_ttft_ms": 236.25605633751547, "median_ttft_ms": 176.69113306328654, "std_ttft_ms": 184.64948422493916, "p90_ttft_ms": 612.4625990400091, "p99_ttft_ms": 685.4502430907452, "p99.9_ttft_ms": 715.0928137269802, "mean_tpot_ms": 12.526103166546621, "median_tpot_ms": 12.51935791408594, "std_tpot_ms": 0.2060056266287734, "p90_tpot_ms": 12.759973599943258, "p99_tpot_ms": 12.92620933710034, "p99.9_tpot_ms": 12.929059177312093, "mean_itl_ms": 12.526103234641905, "median_itl_ms": 12.575171422213316, "std_itl_ms": 0.7854384445450877, "p90_itl_ms": 13.051916868425906, "p99_itl_ms": 13.57216314645484, "p99.9_itl_ms": 16.35113976360316, "mean_e2el_ms": 13050.45959571471, "median_e2el_ms": 13016.266988008283, "std_e2el_ms": 277.330954378513, "p90_e2el_ms": 13396.61577034276, "p99_e2el_ms": 13560.334076725412, "p99.9_e2el_ms": 13561.001875217538} \ No newline at end of file diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_8_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_8_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..0ec79b6a4 --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_8_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-105950", "backend": "openai", "model_id": 
"/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 80, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 8, "duration": 88.1823810969945, "completed": 80, "total_input_tokens": 81920, "total_output_tokens": 81920, "request_throughput": 0.9072107035985516, "request_goodput:": null, "output_throughput": 928.9837604849168, "total_token_throughput": 1857.9675209698337, "mean_ttft_ms": 122.96592185739428, "median_ttft_ms": 112.40720690693706, "std_ttft_ms": 66.13911304000679, "p90_ttft_ms": 255.94735525082797, "p99_ttft_ms": 311.73925049602985, "p99.9_ttft_ms": 314.68144817650324, "mean_tpot_ms": 8.482334189249984, "median_tpot_ms": 8.484079436520124, "std_tpot_ms": 0.08878052985776738, "p90_tpot_ms": 8.605768184886767, "p99_tpot_ms": 8.633496982299468, "p99.9_tpot_ms": 8.63350683135524, "mean_itl_ms": 8.482334263969731, "median_itl_ms": 8.499014074914157, "std_itl_ms": 0.4519442913074439, "p90_itl_ms": 8.674743911251426, "p99_itl_ms": 8.812509721610695, "p99.9_itl_ms": 10.00087821437054, "mean_e2el_ms": 8800.393797460129, "median_e2el_ms": 8809.698014520109, "std_e2el_ms": 92.8142026327281, "p90_e2el_ms": 8913.593775313348, "p99_e2el_ms": 8953.53475105483, "p99.9_e2el_ms": 8953.599093816709} \ No newline at end of file diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_128_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_128_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..88629b669 --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_128_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-121723", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 1280, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 128, "duration": 244.16467579081655, "completed": 1280, "total_input_tokens": 1310720, "total_output_tokens": 1310720, "request_throughput": 5.2423635640751565, "request_goodput:": null, "output_throughput": 5368.18028961296, "total_token_throughput": 10736.36057922592, "mean_ttft_ms": 354.4963012025619, "median_ttft_ms": 221.00601554848254, "std_ttft_ms": 472.55769183914384, "p90_ttft_ms": 367.79938223771757, "p99_ttft_ms": 2608.2393946917728, "p99.9_ttft_ms": 2782.5139312951364, "mean_tpot_ms": 23.336153631907884, "median_tpot_ms": 23.378271256657253, "std_tpot_ms": 0.1400068026502214, "p90_tpot_ms": 23.406807750391568, "p99_tpot_ms": 23.42566725517553, "p99.9_tpot_ms": 23.436426655725757, "mean_itl_ms": 23.3361536941776, "median_itl_ms": 23.340390995144844, "std_itl_ms": 1.2778758518534754, "p90_itl_ms": 24.728624033741653, "p99_itl_ms": 26.77674562903122, "p99.9_itl_ms": 29.069117963546944, "mean_e2el_ms": 24227.38146664433, "median_e2el_ms": 24133.458781056106, "std_e2el_ms": 434.30045305130164, "p90_e2el_ms": 24261.306971753947, "p99_e2el_ms": 26417.07157871453, "p99.9_e2el_ms": 26614.837866391288} \ No newline at end of file diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_16_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_16_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 
index 000000000..dc27001d6 --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_16_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-120317", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 160, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 16, "duration": 181.8641711010132, "completed": 160, "total_input_tokens": 163840, "total_output_tokens": 163840, "request_throughput": 0.879777468158535, "request_goodput:": null, "output_throughput": 900.8921273943398, "total_token_throughput": 1801.7842547886796, "mean_ttft_ms": 177.28006526303943, "median_ttft_ms": 137.23533554002643, "std_ttft_ms": 71.58428526644974, "p90_ttft_ms": 269.01265301276, "p99_ttft_ms": 412.05388789297996, "p99.9_ttft_ms": 437.451443815371, "mean_tpot_ms": 17.564719299398767, "median_tpot_ms": 17.56061658305084, "std_tpot_ms": 0.03530473075764802, "p90_tpot_ms": 17.610166981731627, "p99_tpot_ms": 17.63727733525352, "p99.9_tpot_ms": 17.651837991991645, "mean_itl_ms": 17.564719371964884, "median_itl_ms": 17.580883926711977, "std_itl_ms": 1.0867409227478173, "p90_itl_ms": 19.01229675859213, "p99_itl_ms": 19.53902784269303, "p99.9_itl_ms": 20.444900048896738, "mean_e2el_ms": 18145.987908547977, "median_e2el_ms": 18126.11438310705, "std_e2el_ms": 69.77299297083215, "p90_e2el_ms": 18231.70208907686, "p99_e2el_ms": 18345.085098790005, "p99.9_e2el_ms": 18374.506261226954} \ No newline at end of file diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_256_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_256_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..0a0906a0c --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_256_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-122337", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 2560, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 256, "duration": 293.4073180370033, "completed": 2560, "total_input_tokens": 2621440, "total_output_tokens": 2621440, "request_throughput": 8.72507208452498, "request_goodput:": null, "output_throughput": 8934.47381455358, "total_token_throughput": 17868.94762910716, "mean_ttft_ms": 499.91884301671234, "median_ttft_ms": 232.72915300913155, "std_ttft_ms": 936.3295262768914, "p90_ttft_ms": 352.94642923399795, "p99_ttft_ms": 4966.478346844669, "p99.9_ttft_ms": 5474.548177599907, "mean_tpot_ms": 27.88984300256413, "median_tpot_ms": 27.98402836757511, "std_tpot_ms": 0.2996025480834357, "p90_tpot_ms": 28.034266286901946, "p99_tpot_ms": 28.066722955850597, "p99.9_tpot_ms": 28.08507930946706, "mean_itl_ms": 27.889843062892968, "median_itl_ms": 27.93224505148828, "std_itl_ms": 2.0988254491284923, "p90_itl_ms": 29.45356967393309, "p99_itl_ms": 34.86446616007015, "p99.9_itl_ms": 40.692481586942456, "mean_e2el_ms": 29031.22823463982, "median_e2el_ms": 28862.933419528417, "std_e2el_ms": 885.7421631371468, "p90_e2el_ms": 28989.10636473447, "p99_e2el_ms": 33438.947307681665, "p99.9_e2el_ms": 34007.66125454707} \ No newline at end of file 
diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_32_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_32_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..33bcbb85a --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_32_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-120731", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 320, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 32, "duration": 197.75558240781538, "completed": 320, "total_input_tokens": 327680, "total_output_tokens": 327680, "request_throughput": 1.618159124024574, "request_goodput:": null, "output_throughput": 1656.9949430011638, "total_token_throughput": 3313.9898860023277, "mean_ttft_ms": 204.8250449974148, "median_ttft_ms": 165.07942252792418, "std_ttft_ms": 126.25809383468614, "p90_ttft_ms": 280.0931277219208, "p99_ttft_ms": 760.1671954616904, "p99.9_ttft_ms": 763.005586763844, "mean_tpot_ms": 19.072629646838525, "median_tpot_ms": 19.09136980360525, "std_tpot_ms": 0.05851032738792819, "p90_tpot_ms": 19.13042017240496, "p99_tpot_ms": 19.18148197103492, "p99.9_tpot_ms": 19.181995146826385, "mean_itl_ms": 19.072629718559696, "median_itl_ms": 19.08195298165083, "std_itl_ms": 1.1082617310125253, "p90_itl_ms": 20.502249943092465, "p99_itl_ms": 21.305559629108753, "p99.9_itl_ms": 22.560074097709737, "mean_e2el_ms": 19716.125173713226, "median_e2el_ms": 19690.169369452633, "std_e2el_ms": 124.41108603823491, "p90_e2el_ms": 19801.692966977134, "p99_e2el_ms": 20280.91403166065, "p99.9_e2el_ms": 20281.5225590982} \ No newline at end of file diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_4_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_4_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..3b03fa3e5 --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_4_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-115544", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 40, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 4, "duration": 158.28731181519106, "completed": 40, "total_input_tokens": 40960, "total_output_tokens": 40960, "request_throughput": 0.2527050307525732, "request_goodput:": null, "output_throughput": 258.769951490635, "total_token_throughput": 517.53990298127, "mean_ttft_ms": 117.1287115837913, "median_ttft_ms": 105.45016499236226, "std_ttft_ms": 22.95514831968995, "p90_ttft_ms": 154.71577032003552, "p99_ttft_ms": 179.9939913023263, "p99.9_ttft_ms": 192.60747395735254, "mean_tpot_ms": 15.340623984750376, "median_tpot_ms": 15.343764986895673, "std_tpot_ms": 0.038580294888251945, "p90_tpot_ms": 15.396409688466472, "p99_tpot_ms": 15.41568306727628, "p99.9_tpot_ms": 15.422512741592364, "mean_itl_ms": 15.34062407968064, "median_itl_ms": 15.342000522650778, "std_itl_ms": 1.104424460872785, "p90_itl_ms": 16.805011359974742, "p99_itl_ms": 17.192507253494114, "p99.9_itl_ms": 17.785792525159223, "mean_e2el_ms": 
15810.587047983427, "median_e2el_ms": 15814.386753481813, "std_e2el_ms": 40.38963926676447, "p90_e2el_ms": 15859.638332994655, "p99_e2el_ms": 15892.085841062944, "p99.9_e2el_ms": 15900.850841090782} \ No newline at end of file diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_512_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_512_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..bda123e9d --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_512_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-123216", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 5120, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 512, "duration": 409.99618642101996, "completed": 5120, "total_input_tokens": 5242880, "total_output_tokens": 5242880, "request_throughput": 12.487921033349163, "request_goodput:": null, "output_throughput": 12787.631138149543, "total_token_throughput": 25575.262276299087, "mean_ttft_ms": 799.6392629810998, "median_ttft_ms": 281.9825775222853, "std_ttft_ms": 1862.6660647354645, "p90_ttft_ms": 422.8423163294793, "p99_ttft_ms": 9882.19552612165, "p99.9_ttft_ms": 10696.999395577937, "mean_tpot_ms": 38.74069696199115, "median_tpot_ms": 39.040027885107854, "std_tpot_ms": 0.9434676549467771, "p90_tpot_ms": 39.248179043834355, "p99_tpot_ms": 39.34789874325328, "p99.9_tpot_ms": 39.38483638085906, "mean_itl_ms": 38.74069702274606, "median_itl_ms": 38.97913300897926, "std_itl_ms": 7.37413467906563, "p90_itl_ms": 41.30664523690939, "p99_itl_ms": 64.09359137760478, "p99.9_itl_ms": 91.91624939884088, "mean_e2el_ms": 40431.37225509804, "median_e2el_ms": 40222.33807248995, "std_e2el_ms": 1868.5858228603415, "p90_e2el_ms": 40530.33041614108, "p99_e2el_ms": 49498.614948240574, "p99.9_e2el_ms": 50402.42158004316} \ No newline at end of file diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_64_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_64_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..39a9836cc --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_64_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-121211", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 640, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 64, "duration": 219.57327283197083, "completed": 640, "total_input_tokens": 655360, "total_output_tokens": 655360, "request_throughput": 2.9147445485761017, "request_goodput:": null, "output_throughput": 2984.698417741928, "total_token_throughput": 5969.396835483856, "mean_ttft_ms": 272.4847578108893, "median_ttft_ms": 223.96848094649613, "std_ttft_ms": 233.0339475669637, "p90_ttft_ms": 315.4728528345004, "p99_ttft_ms": 1419.038086910732, "p99.9_ttft_ms": 1425.9020658000386, "mean_tpot_ms": 21.105819801529368, "median_tpot_ms": 21.12906251010737, "std_tpot_ms": 0.07842898272041984, "p90_tpot_ms": 21.173637142069893, "p99_tpot_ms": 21.188599876768137, "p99.9_tpot_ms": 
21.2060247620869, "mean_itl_ms": 21.105819863068465, "median_itl_ms": 21.13879646640271, "std_itl_ms": 1.1227502711857107, "p90_itl_ms": 22.462772903963923, "p99_itl_ms": 23.81146681262179, "p99.9_itl_ms": 25.36561909643935, "mean_e2el_ms": 21863.738414775435, "median_e2el_ms": 21827.26457947865, "std_e2el_ms": 211.00382490901427, "p90_e2el_ms": 21929.400861775503, "p99_e2el_ms": 22961.20647754753, "p99.9_e2el_ms": 22970.07569239778} \ No newline at end of file diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_8_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_8_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..ff8c39ac9 --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_8_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-115923", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 80, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 8, "duration": 169.385015378939, "completed": 80, "total_input_tokens": 81920, "total_output_tokens": 81920, "request_throughput": 0.4722967956818868, "request_goodput:": null, "output_throughput": 483.6319187782521, "total_token_throughput": 967.2638375565042, "mean_ttft_ms": 148.00462815037463, "median_ttft_ms": 132.5412035221234, "std_ttft_ms": 37.51148982241326, "p90_ttft_ms": 168.5610823798926, "p99_ttft_ms": 278.3863469376229, "p99.9_ttft_ms": 278.64237096509896, "mean_tpot_ms": 16.392615779661135, "median_tpot_ms": 16.387911661726513, "std_tpot_ms": 0.034123230191850644, "p90_tpot_ms": 16.43408909303483, "p99_tpot_ms": 16.473588053212627, "p99.9_tpot_ms": 16.473637208842806, "mean_itl_ms": 16.392615859419287, "median_itl_ms": 16.397346975281835, "std_itl_ms": 1.0821039401103671, "p90_itl_ms": 17.86093518603593, "p99_itl_ms": 18.27556065050885, "p99.9_itl_ms": 18.872217730851858, "mean_e2el_ms": 16917.650570743717, "median_e2el_ms": 16907.48401999008, "std_e2el_ms": 42.08324227539982, "p90_e2el_ms": 16975.576334632933, "p99_e2el_ms": 17025.145734883845, "p99.9_e2el_ms": 17025.642524216324} \ No newline at end of file diff --git a/benchmarks/multi_node/vllm_disagg_utils/Dockerfile b/benchmarks/multi_node/vllm_disagg_utils/Dockerfile new file mode 100644 index 000000000..899e026f7 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/Dockerfile @@ -0,0 +1,21 @@ +# Also installs +# - RDMA libs for Thor2 +# +# Build with `docker build -t vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c-bnxt .` to use in run_P.sh +# From 4/24/2026, 0.19+ +FROM vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c + +# RDMA drivers for broadcom devices +RUN apt-get update && apt-get install -y autoconf libibverbs-dev ibverbs-utils libtool unzip wget +RUN wget https://docs.broadcom.com/docs-and-downloads/ethernet-network-adapters/NXE/Thor2/GCA1/bcm5760x_230.2.52.0a.zip && \ + unzip bcm5760x_230.2.52.0a.zip && \ + cd bcm5760x_230.2.52.0a/drivers_linux/bnxt_rocelib/ && \ + results=$(find -name "libbnxt*.tar.gz") && tar -xf $results && \ + untar_dir=$(find . -maxdepth 1 -type d -name "libbnxt*" ! 
-name "*.tar.gz" | head -n 1) && cd $untar_dir && sh autogen.sh && ./configure && make && \ + find /usr/lib64/ /usr/lib -name "libbnxt_re-rdmav*.so" -exec mv {} {}.inbox \; && \ + make install all && sudo sh -c "echo /usr/local/lib >> /etc/ld.so.conf" && \ + sudo ldconfig && \ + cp -f bnxt_re.driver /etc/libibverbs.d/ && \ + ibv_devices +RUN rm -rf bcm5760x_230.2.52.0a && \ + rm -rf bcm5760x_230.2.52.0a.zip diff --git a/benchmarks/multi_node/vllm_disagg_utils/run_D.sh b/benchmarks/multi_node/vllm_disagg_utils/run_D.sh new file mode 100644 index 000000000..cc43d7df1 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/run_D.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +export IBDEVICES="rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7" + +# export MODEL_NAME="DeepSeek-R1-0528" # key from models_vllm.yaml +# export MODEL_DIR="$HOME/.cache/huggingface/hub" +# export MODEL_PATH="$HOME/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-0528/snapshots/4236a6af538feda4548eca9ab308586007567f52" +export MODEL_NAME="MiniMax-M2.5" # key from models_vllm.yaml +export MODEL_DIR="$HOME/.cache/huggingface/hub" +export MODEL_PATH="$HOME/.cache/huggingface/hub/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/" +export NODE0_ADDR="10.21.9.47" # prefill (rank-0) node's IP +export IPADDRS="10.21.9.47,10.21.9.29" # prefill IP, then decode IPs +export xP=1 yD=1 +export NNODES=2 +export GPUS_PER_NODE=8 + +export NODE_RANK=1 +export DRY_RUN=0 + +export BENCH_INPUT_LEN=1024 +export BENCH_OUTPUT_LEN=1024 +export BENCH_MAX_CONCURRENCY="32x64x128x256x512" + +# Repo root (3 levels up from this script's directory) +export DI_REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" +# Mount point inside the container (must match WS_PATH computation below) +export DOCKER_MOUNT_PATH="/workspace" +# Container-side path to the scripts directory +export WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" +# Remap host MODEL_PATH into the container's /models mount +export DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}" + +export SLURM_JOB_ID=1 +mkdir -p "/tmp/slurm_job-${SLURM_JOB_ID}" + +CONTAINER_NAME="vllm-disagg-decode" +docker rm -f "$CONTAINER_NAME" 2>/dev/null || true + +docker run --rm \ + --name "$CONTAINER_NAME" \ + --init \ + --stop-timeout 10 \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --device=/dev/infiniband/rdma_cm \ + --device=/dev/infiniband/uverbs0 \ + --device=/dev/infiniband/uverbs1 \ + --device=/dev/infiniband/uverbs2 \ + --device=/dev/infiniband/uverbs3 \ + --device=/dev/infiniband/uverbs4 \ + --device=/dev/infiniband/uverbs5 \ + --device=/dev/infiniband/uverbs6 \ + --device=/dev/infiniband/uverbs7 \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v /sys:/sys \ + $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \ + -v ${MODEL_DIR}:/models \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + -v /tmp:/run_logs \ + -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ + -e SLURM_JOB_ID=$SLURM_JOB_ID \ + -e NNODES=$NNODES \ + -e NODE0_ADDR=$NODE0_ADDR \ + -e IPADDRS=$IPADDRS \ + -e MODEL_DIR=/models \ + -e MODEL_NAME=$MODEL_NAME \ + -e MODEL_PATH=$DOCKER_MODEL_PATH \ + -e WS_PATH=${WS_PATH} \ + -e GPUS_PER_NODE=$GPUS_PER_NODE \ + -e NODE_RANK=$NODE_RANK \ + -e xP=$xP \ + -e yD=$yD \ + -e IBDEVICES=$IBDEVICES \ + -e DRY_RUN=$DRY_RUN \ + -e 
ENGINE=vllm-disagg \ + -e HF_HUB_CACHE=/models \ + -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma \ + -e UCX_SOCKADDR_TLS_PRIORITY=tcp \ + -e UCX_MEMTYPE_CACHE=y \ + -e UCX_RNDV_SCHEME=get_zcopy \ + -e UCX_RNDV_THRESH=4k \ + -e UCX_ROCM_IPC_MIN_ZCOPY=0 \ + -e UCX_LOG_LEVEL=warn \ + -e HSA_ENABLE_SDMA=1 \ + -e VLLM_USE_V1=1 \ + -e VLLM_ROCM_USE_AITER=1 \ + -e VLLM_ROCM_USE_AITER_RMSNORM=1 \ + -e VLLM_ENGINE_READY_TIMEOUT_S=3600 \ + -e PREFILL_NODES=1 \ + -e DECODE_NODES=1 \ + -e PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} \ + -e VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} \ + -e PYTHONPYCACHEPREFIX=/tmp/pycache \ + -e PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP:-true} \ + -e PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP:-false} \ + -e DECODE_ENABLE_EP=${DECODE_ENABLE_EP:-true} \ + -e DECODE_ENABLE_DP=${DECODE_ENABLE_DP:-false} \ + -e PREFILL_TP_SIZE=${PREFILL_TP_SIZE:-8} \ + -e DECODE_TP_SIZE=${DECODE_TP_SIZE:-8} \ + --entrypoint /bin/bash \ + vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c-bnxt \ + -lc "mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && ${WS_PATH}/server.sh 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/server_\$(hostname).log" diff --git a/benchmarks/multi_node/vllm_disagg_utils/run_P.sh b/benchmarks/multi_node/vllm_disagg_utils/run_P.sh new file mode 100755 index 000000000..ca33f0148 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/run_P.sh @@ -0,0 +1,144 @@ +#!/bin/bash +export IBDEVICES="rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7" + +# export MODEL_NAME="DeepSeek-R1-0528" # key from models_vllm.yaml +# export MODEL_DIR="$HOME/.cache/huggingface/hub" +# export MODEL_PATH="$HOME/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-0528/snapshots/4236a6af538feda4548eca9ab308586007567f52" +export MODEL_NAME="MiniMax-M2.5" # key from models_vllm.yaml +export MODEL_DIR="$HOME/.cache/huggingface/hub" +export MODEL_PATH="$HOME/.cache/huggingface/hub/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/" +export NODE0_ADDR="10.21.9.47" # this node's IP (prefill) +export IPADDRS="10.21.9.47,10.21.9.29" # prefill IP, then decode IPs +export xP=1 yD=1 +export NNODES=2 +export GPUS_PER_NODE=8 + +export NODE_RANK=0 +export DRY_RUN=0 + +export BENCH_INPUT_LEN=1024 +export BENCH_OUTPUT_LEN=1024 +export BENCH_MAX_CONCURRENCY="4x8x16x32x64x128x256x512" + +# Repo root (3 levels up from this script's directory) +export DI_REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." 
&& pwd)" +# Mount point inside the container (must match WS_PATH computation below) +export DOCKER_MOUNT_PATH="/workspace" +# Container-side path to the scripts directory +export WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" +# Remap host MODEL_PATH into the container's /models mount +export DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}" + +export SLURM_JOB_ID=1 +mkdir -p "/tmp/slurm_job-${SLURM_JOB_ID}" + +ROUTER_PORT="${ROUTER_PORT:-30000}" +PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy}" +ROUTER_CONT_NAME="router_vllm_local_${SLURM_JOB_ID}" + +# Launch vllm-router as a separate container (mirrors job.slurm behavior) +docker rm -f "$ROUTER_CONT_NAME" 2>/dev/null || true +docker run -d \ + --name "$ROUTER_CONT_NAME" \ + --network host \ + -v /tmp:/run_logs \ + "$VLLM_ROUTER_IMAGE" \ + bash -lc "mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \ + --vllm-pd-disaggregation \ + --kv-connector moriio \ + --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \ + --port ${ROUTER_PORT} \ + --host 0.0.0.0 \ + --policy consistent_hash \ + --prefill-policy consistent_hash \ + --decode-policy consistent_hash \ + --log-level info 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_\$(hostname).log" + +CONTAINER_NAME="vllm-disagg-prefill" +docker rm -f "$CONTAINER_NAME" 2>/dev/null || true + +docker run --rm \ + --name "$CONTAINER_NAME" \ + --init \ + --stop-timeout 10 \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --device=/dev/infiniband/rdma_cm \ + --device=/dev/infiniband/uverbs0 \ + --device=/dev/infiniband/uverbs1 \ + --device=/dev/infiniband/uverbs2 \ + --device=/dev/infiniband/uverbs3 \ + --device=/dev/infiniband/uverbs4 \ + --device=/dev/infiniband/uverbs5 \ + --device=/dev/infiniband/uverbs6 \ + --device=/dev/infiniband/uverbs7 \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v /sys:/sys \ + $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \ + -v ${MODEL_DIR}:/models \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + -v /tmp:/run_logs \ + -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ + -e SLURM_JOB_ID=$SLURM_JOB_ID \ + -e NNODES=$NNODES \ + -e NODE0_ADDR=$NODE0_ADDR \ + -e IPADDRS=$IPADDRS \ + -e MODEL_DIR=/models \ + -e MODEL_NAME=$MODEL_NAME \ + -e MODEL_PATH=$DOCKER_MODEL_PATH \ + -e WS_PATH=${WS_PATH} \ + -e GPUS_PER_NODE=$GPUS_PER_NODE \ + -e NODE_RANK=$NODE_RANK \ + -e xP=$xP \ + -e yD=$yD \ + -e IBDEVICES=$IBDEVICES \ + -e DRY_RUN=$DRY_RUN \ + -e ENGINE=vllm-disagg \ + -e HF_HUB_CACHE=/models \ + -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma \ + -e UCX_SOCKADDR_TLS_PRIORITY=tcp \ + -e UCX_MEMTYPE_CACHE=y \ + -e UCX_RNDV_SCHEME=get_zcopy \ + -e UCX_RNDV_THRESH=4k \ + -e UCX_ROCM_IPC_MIN_ZCOPY=0 \ + -e UCX_LOG_LEVEL=warn \ + -e HSA_ENABLE_SDMA=1 \ + -e GLOO_SOCKET_IFNAME=ens51np0 \ + -e NCCL_SOCKET_IFNAME=ens51np0 \ + -e VLLM_USE_V1=1 \ + -e VLLM_ROCM_USE_AITER=1 \ + -e VLLM_ROCM_USE_AITER_RMSNORM=1 \ + -e VLLM_ENGINE_READY_TIMEOUT_S=3600 \ + -e PREFILL_NODES=1 \ + -e DECODE_NODES=1 \ + -e PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} \ + -e VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} \ + -e PYTHONPYCACHEPREFIX=/tmp/pycache \ + -e PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP:-true} \ + -e 
PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP:-false} \ + -e DECODE_ENABLE_EP=${DECODE_ENABLE_EP:-true} \ + -e DECODE_ENABLE_DP=${DECODE_ENABLE_DP:-false} \ + -e PREFILL_TP_SIZE=${PREFILL_TP_SIZE:-8} \ + -e DECODE_TP_SIZE=${DECODE_TP_SIZE:-8} \ + -e BENCH_INPUT_LEN=${BENCH_INPUT_LEN:-1024} \ + -e BENCH_OUTPUT_LEN=${BENCH_OUTPUT_LEN:-1024} \ + -e BENCH_RANDOM_RANGE_RATIO=${BENCH_RANDOM_RANGE_RATIO:-1} \ + -e BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} \ + -e BENCH_MAX_CONCURRENCY=${BENCH_MAX_CONCURRENCY:-512} \ + -e BENCH_REQUEST_RATE=${BENCH_REQUEST_RATE:-inf} \ + -e TQDM_MININTERVAL=${TQDM_MININTERVAL:-20} \ + --entrypoint /bin/bash \ + vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c-bnxt \ + -lc "mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && ${WS_PATH}/server.sh 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/server_\$(hostname).log" + +docker rm -f "$ROUTER_CONT_NAME" 2>/dev/null || true
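
The two launchers above hard-code a specific two-node pair; the following is a minimal smoke-test sketch, assuming the 10.21.9.47 / 10.21.9.29 addresses and MiniMax-M2.5 snapshot path taken from run_P.sh / run_D.sh, the `-bnxt` image built from the Dockerfile above, and that vllm-router exposes the same OpenAI-compatible /v1/completions route that the removed moriio_proxy.py served:

# prefill node (10.21.9.47): launches the vllm-router container and the prefill server container
bash benchmarks/multi_node/vllm_disagg_utils/run_P.sh

# decode node (10.21.9.29): launches the decode server container
bash benchmarks/multi_node/vllm_disagg_utils/run_D.sh

# once both workers have registered with the router (discovery on PROXY_PING_PORT 36367),
# probe it on ROUTER_PORT from any host that can reach the prefill node
curl -s http://10.21.9.47:30000/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "prompt": "hello", "max_tokens": 16}'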