diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 32de6f552..67ae47b4c 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -993,7 +993,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.18.0 + image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg @@ -1046,7 +1046,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.18.0 + image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x-disagg diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 81da415e8..cd4794ed5 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -32,8 +32,13 @@ fi export IBDEVICES # Shared: Auto-detect default network interface (portable across clusters) -export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) -export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) +# Only auto-detect if not already set by the runner/environment +if [[ -z "$GLOO_SOCKET_IFNAME" ]]; then + export GLOO_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1) +fi +if [[ -z "$NCCL_SOCKET_IFNAME" ]]; then + export NCCL_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1) +fi set +x diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index abb80b97b..f29b4b71b 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -416,6 +416,7 @@ if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \ \"$VLLM_ROUTER_IMAGE\" \ bash -lc \"mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \ --vllm-pd-disaggregation \ + --kv-connector moriio \ --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \ --port ${ROUTER_PORT} \ --host 0.0.0.0 \ diff --git a/benchmarks/multi_node/amd_utils/moriio_proxy.py b/benchmarks/multi_node/amd_utils/moriio_proxy.py deleted file mode 100644 index 7d1e8454b..000000000 --- a/benchmarks/multi_node/amd_utils/moriio_proxy.py +++ /dev/null @@ -1,327 +0,0 @@ -#!/usr/bin/env python3 -# MoRI-IO proxy server for vLLM PD disaggregation. -# -# Based on vLLM's examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py -# with the following adaptations for production multi-node use: -# - Ports configurable via PROXY_HTTP_PORT / PROXY_PING_PORT env vars -# - /health endpoint for sync.py barrier readiness checks -# - Uses stdlib `re` instead of `regex` to avoid extra dep -# -# The proxy performs two roles that vllm-router cannot: -# 1. ZMQ service discovery — prefill/decode workers register their RDMA ports -# 2. 
Request enrichment — injects remote endpoint info into kv_transfer_params - -import asyncio -import copy -import logging -import os -import re -import socket -import threading -import time -import uuid - -import aiohttp -import msgpack -import zmq -from quart import Quart, make_response, request - -logger = logging.getLogger("moriio_proxy") -logger.setLevel(logging.DEBUG) -handler = logging.StreamHandler() -handler.setFormatter(logging.Formatter( - "%(asctime)s %(levelname)s [%(name)s] %(message)s")) -logger.addHandler(handler) - -prefill_instances: list[dict] = [] -decode_instances: list[dict] = [] -request_nums = 0 -app = Quart(__name__) - -STREAM_IDLE_TIMEOUT = int(os.environ.get("PROXY_STREAM_IDLE_TIMEOUT", "300")) - -IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)") - -TRANSFER_TYPE = None - - -def _append_whole_dict_unique(target_list, data_dict): - new_filtered = {k: v for k, v in data_dict.items() if k != "index"} - for existed in target_list: - existed_filtered = {k: v for k, v in existed.items() if k != "index"} - if existed_filtered == new_filtered: - return False - logger.info("Registered instance: role=%s addr=%s hs_port=%s notify=%s dp=%s tp=%s", - data_dict.get("role"), data_dict.get("request_address"), - data_dict.get("handshake_port"), data_dict.get("notify_port"), - data_dict.get("dp_size"), data_dict.get("tp_size")) - target_list.append(data_dict) - transfer_mode = data_dict.get("transfer_mode", "unknown") - global TRANSFER_TYPE - - if TRANSFER_TYPE is None: - TRANSFER_TYPE = transfer_mode - logger.info("Transfer mode set to: %s", TRANSFER_TYPE) - elif transfer_mode != TRANSFER_TYPE: - raise ValueError(f"mismatched transfer mode {TRANSFER_TYPE} vs {transfer_mode}") - - return True - - -_list_lock = threading.RLock() - - -def _listen_for_register(hostname, port): - context = zmq.Context() - router_socket = context.socket(zmq.ROUTER) - router_socket.bind(f"tcp://{hostname}:{port}") - poller = zmq.Poller() - poller.register(router_socket, zmq.POLLIN) - global prefill_instances - global decode_instances - - while True: - socks = dict(poller.poll()) - if router_socket in socks: - remote_addr, msg = router_socket.recv_multipart() - data = msgpack.loads(msg) - if data["type"] == "HELLO": - pass - elif ( - data["type"] == "register" - and data["role"] == "P" - and data["request_address"] not in prefill_instances - ): - with _list_lock: - _append_whole_dict_unique(prefill_instances, data) - - elif ( - data["type"] == "register" - and data["role"] == "D" - and data["request_address"] not in decode_instances - ): - with _list_lock: - _append_whole_dict_unique(decode_instances, data) - - -def start_service_discovery(hostname, port): - if not hostname: - hostname = socket.gethostname() - if port == 0: - raise ValueError("Port cannot be 0") - - _listener_thread = threading.Thread( - target=_listen_for_register, args=(hostname, port), daemon=True - ) - _listener_thread.start() - logger.info("Service discovery listening on %s:%s", hostname, port) - return _listener_thread - - -async def send_request_to_prefill( - endpoint, req_data, request_id, d_endpoint, dip, dport, selected_prefill_dp_rank -): - req_data_copy = req_data - - req_data_copy["kv_transfer_params"].update( - { - "do_remote_decode": True, - "do_remote_prefill": False, - "remote_handshake_port": d_endpoint["handshake_port"], - "remote_notify_port": d_endpoint["notify_port"], - "remote_engine_id": None, - "remote_block_ids": None, - "remote_host": dip, - "remote_port": dport, - } - ) - 
req_data_copy["stream"] = False - req_data_copy["max_tokens"] = 1 - if "max_completion_tokens" in req_data_copy: - req_data_copy["max_completion_tokens"] = 1 - if "stream_options" in req_data_copy: - del req_data_copy["stream_options"] - async with aiohttp.ClientSession( - timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) - ) as session: - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "X-Request-Id": request_id, - } - if selected_prefill_dp_rank is not None: - headers["X-data-parallel-rank"] = str(selected_prefill_dp_rank) - async with session.post( - url=endpoint, json=req_data_copy, headers=headers - ) as response: - if response.status == 200: - return await response.json() - else: - raise RuntimeError( - f"Prefill response status={response.status}" - ) - - -async def start_decode_request(endpoint, req_data, request_id): - session = aiohttp.ClientSession( - timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) - ) - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "X-Request-Id": request_id, - } - response = await session.post(url=endpoint, json=req_data, headers=headers) - return session, response - - -async def stream_decode_response(session, response, request_id): - try: - if response.status == 200: - chunk_iter = response.content.iter_chunked(1024).__aiter__() - while True: - try: - chunk_bytes = await asyncio.wait_for( - chunk_iter.__anext__(), timeout=STREAM_IDLE_TIMEOUT, - ) - yield chunk_bytes - except StopAsyncIteration: - break - except asyncio.TimeoutError: - logger.error( - "Decode stream %s idle for %ds, aborting", - request_id, STREAM_IDLE_TIMEOUT, - ) - break - else: - raise RuntimeError( - f"Decode response status={response.status}" - ) - finally: - await response.release() - await session.close() - - -@app.route("/health", methods=["GET"]) -async def health_check(): - with _list_lock: - p_count = len(prefill_instances) - d_count = len(decode_instances) - return await make_response( - ({"status": "ok", "prefill_instances": p_count, "decode_instances": d_count}, 200) - ) - - -@app.route("/v1/completions", methods=["POST"]) -@app.route("/v1/chat/completions", methods=["POST"]) -async def handle_request(): - try: - with _list_lock: - global request_nums - request_nums += 1 - - def extract_ip_port_fast(url): - match = IP_PORT_PATTERN.search(url) - if not match: - raise ValueError(f"Invalid URL format: {url}") - return match.groups() - - req_data = await request.get_json() - request_id = str(uuid.uuid4()) - - if not prefill_instances or not decode_instances: - return await make_response( - ("Service Unavailable: No prefill or decode instances registered.", 503) - ) - - pid = request_nums % len(prefill_instances) - did = request_nums % len(decode_instances) - prefill_instance_endpoint = prefill_instances[pid] - decode_instance_endpoint = decode_instances[did] - - selected_prefill_dp_rank = None - if prefill_instance_endpoint["dp_size"] > 1: - selected_prefill_dp_rank = request_nums % prefill_instance_endpoint["dp_size"] - - dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"]) - - req_data_to_prefill = copy.deepcopy(req_data) - req_data_to_prefill["kv_transfer_params"] = {"transfer_id": request_id} - req_data["kv_transfer_params"] = {"transfer_id": request_id} - req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = ( - decode_instance_endpoint["dp_size"] - ) - req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = ( - decode_instance_endpoint["tp_size"] - ) - - 
send_prefill_task = asyncio.create_task( - send_request_to_prefill( - prefill_instance_endpoint["request_address"], - req_data_to_prefill, - request_id, - decode_instance_endpoint, - dip, - dport, - selected_prefill_dp_rank, - ) - ) - ip, port = extract_ip_port_fast(prefill_instance_endpoint["request_address"]) - - req_data["max_tokens"] -= 1 - - req_data["kv_transfer_params"] = { - "transfer_id": request_id, - "do_remote_decode": False, - "do_remote_prefill": True, - "remote_handshake_port": prefill_instance_endpoint["handshake_port"], - "remote_notify_port": prefill_instance_endpoint["notify_port"], - "remote_engine_id": None, - "remote_block_ids": None, - "remote_host": ip, - "remote_port": port, - } - if TRANSFER_TYPE == "READ": - prefill_response = await send_prefill_task - req_data["kv_transfer_params"]["remote_engine_id"] = prefill_response[ - "kv_transfer_params" - ]["remote_engine_id"] - req_data["kv_transfer_params"]["remote_block_ids"] = prefill_response[ - "kv_transfer_params" - ]["remote_block_ids"] - - req_data["kv_transfer_params"]["remote_dp_size"] = prefill_instance_endpoint[ - "dp_size" - ] - req_data["kv_transfer_params"]["remote_tp_size"] = prefill_instance_endpoint[ - "tp_size" - ] - - if selected_prefill_dp_rank is not None: - req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank - - decode_request_task = asyncio.create_task( - start_decode_request( - decode_instance_endpoint["request_address"], req_data, request_id - ) - ) - - session, decode_response = await decode_request_task - stream_generator = stream_decode_response(session, decode_response, request_id) - response = await make_response(stream_generator) - return response - except Exception as e: - logger.exception("Error handling request: %s", e) - return await make_response((f"Internal Server Error: {e!s}", 500)) - - -if __name__ == "__main__": - http_port = int(os.environ.get("PROXY_HTTP_PORT", "30000")) - ping_port = int(os.environ.get("PROXY_PING_PORT", "36367")) - - t = start_service_discovery("0.0.0.0", ping_port) - app.debug = False - app.config["BODY_TIMEOUT"] = 360000 - app.config["RESPONSE_TIMEOUT"] = 360000 - - logger.info("MoRI-IO proxy starting: HTTP=%d, ZMQ=%d", http_port, ping_port) - app.run(host="0.0.0.0", port=http_port) - t.join() diff --git a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py index 8290276fb..ac830eb1f 100644 --- a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py +++ b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py @@ -137,7 +137,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, - reduce_results=False, renormalize=True, scoring_func=getattr(config, "scoring_func", "softmax"), e_score_correction_bias=self.e_score_correction_bias, @@ -185,7 +184,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: ) final_hidden_states = final_hidden_states[:num_tokens] elif self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + from vllm.distributed.communication_op import tensor_model_parallel_all_reduce + final_hidden_states = tensor_model_parallel_all_reduce( final_hidden_states ) diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 73cad3adc..9acb05f54 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -242,7 +242,7 @@ done echo "Prefill 
node IPs: ${PREFILL_ARGS}" echo "Decode node IPs: ${DECODE_ARGS}" -# MoRI-IO proxy ZMQ registration port (must match moriio_proxy.py PROXY_PING_PORT) +# MoRI-IO proxy ZMQ registration port (must match vllm-router --vllm-discovery-address) PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" # vLLM environment (UCX transport vars are set at the Docker level in job.slurm) @@ -281,26 +281,8 @@ if [ "$NODE_RANK" -eq 0 ]; then setup_vllm_env - # Start MoRI-IO proxy FIRST — workers register via ZMQ on startup - # Skipped when ROUTER_TYPE=vllm-router (external router container started by job.slurm) - if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then - echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." - PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ - python3 $WS_PATH/moriio_proxy.py" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PROXY_CMD" - else - PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" - set -x - eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & - set +x - proxy_pid=$! - sleep 3 - fi - else - echo "Using external vLLM router (ROUTER_TYPE=${ROUTER_TYPE:-vllm-router})" - fi + # Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE) + echo "Using external vllm-router container (started by job.slurm on this node)" PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ @@ -343,7 +325,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "DRY RUN: $HEALTH_BARRIER_CMD" else eval "$HEALTH_BARRIER_CMD" - echo "${ROUTER_TYPE} is ready for benchmarking" + echo "MoRI-IO proxy is ready for benchmarking" fi echo "Ready for benchmarking on ${host_name}:${host_ip}" @@ -375,14 +357,8 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Killing the prefill server" if [[ "$DRY_RUN" -eq 0 ]]; then - if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then - [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true - fi [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true sleep 2 - if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then - pkill -f moriio_proxy 2>/dev/null || true - fi pkill -f "vllm serve" 2>/dev/null || true fi diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 589399f74..958cb9808 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -242,43 +242,48 @@ patch_mori_fp8_compat() { import re, os, sys patched = [] -# 1. 
Patch layer.py: remove multi-line AITER assertion for MoRI +# Patch layer.py: remove AITER requirement assertion(s) for MoRI try: import vllm.model_executor.layers.fused_moe.layer as lm f = lm.__file__ src = open(f).read() - if "Mori needs to be used with aiter" in src: + if "[PATCHED] AITER requirement removed for MoRI-EP + FP8" in src: + print("[SETUP] layer.py MoRI-FP8 patch already applied") + elif "Mori needs to be used with aiter" in src: + # v0.19+: two consecutive assertions inside `if self.moe_config.use_mori_kernels:` new = re.sub( - r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)\s*" + r"assert not self\.aiter_fmoe_shared_expert_enabled,\s*\([^)]*\)", "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", src, flags=re.DOTALL) + if new == src: + # v0.17.1/v0.18.0: only the first assertion existed + new = re.sub( + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", + src, flags=re.DOTALL) if new != src: open(f, "w").write(new) patched.append("layer.py") + else: + print("[SETUP] ERROR: layer.py pattern found but regex had no effect", file=sys.stderr) + sys.exit(1) + else: + print("[SETUP] ERROR: layer.py AITER assertion pattern not found — vLLM API may have changed", file=sys.stderr) + sys.exit(1) except Exception as e: - print(f"[SETUP] WARN patch layer.py: {e}", file=sys.stderr) + print(f"[SETUP] ERROR patch layer.py: {e}", file=sys.stderr) + sys.exit(1) -# 2. Patch mori_prepare_finalize.py: remove defer_input_quant restriction -try: - import vllm.model_executor.layers.fused_moe.mori_prepare_finalize as mm - f = mm.__file__ - src = open(f).read() - if "defer_input_quant" in src: - new = re.sub( - r"raise NotImplementedError\([^)]*defer_input_quant[^)]*\)", - "pass # [PATCHED] defer_input_quant check removed for MoRI-EP + FP8", - src) - if new != src: - open(f, "w").write(new) - patched.append("mori_prepare_finalize.py") -except Exception as e: - print(f"[SETUP] WARN patch mori_pf: {e}", file=sys.stderr) +# prepare_finalize/mori.py (v0.19+) already handles defer_input_quant correctly +# (skips FP8 quant when True). No patch needed for that file. 
+# Added in 0.18.1: https://github.com/vllm-project/vllm/commit/6a9cceb219fcbd6b1eb540ddfdc77ec160f0e209 if patched: print(f"[SETUP] Patched: {chr(44).join(patched)}") else: print("[SETUP] No MoRI-FP8 patches needed") -' +' || exit 1 _SETUP_INSTALLED+=("MoRI-FP8-patch") } @@ -881,7 +886,6 @@ except Exception as e: # install_libionic # install_mori install_amd_quark -install_mori_proxy_deps patch_mori_fp8_compat patch_moriio_save_kv_timeout patch_moriio_transfer_timeout diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_128_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_128_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..4f805f5a9 --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_128_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-095035", "backend": "openai", "model_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 1280, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 128, "duration": 156.629553565057, "completed": 1280, "total_input_tokens": 1310720, "total_output_tokens": 1310720, "request_throughput": 8.17214868373065, "request_goodput:": null, "output_throughput": 8368.280252140186, "total_token_throughput": 16736.560504280373, "mean_ttft_ms": 322.05012791437184, "median_ttft_ms": 168.4953118674457, "std_ttft_ms": 412.7725204129608, "p90_ttft_ms": 1269.4133618613705, "p99_ttft_ms": 1515.174685146194, "p99.9_ttft_ms": 1516.856569517171, "mean_tpot_ms": 14.871257350024614, "median_tpot_ms": 14.892716348905914, "std_tpot_ms": 0.12168611094612052, "p90_tpot_ms": 14.975844017677883, "p99_tpot_ms": 15.061890008918056, "p99.9_tpot_ms": 15.066467860250443, "mean_itl_ms": 14.871688992762476, "median_itl_ms": 14.909795951098204, "std_itl_ms": 1.126674865744914, "p90_itl_ms": 15.301332250237465, "p99_itl_ms": 18.650689502246678, "p99.9_itl_ms": 22.104734647087753, "mean_e2el_ms": 15535.346396989553, "median_e2el_ms": 15410.800829995424, "std_e2el_ms": 414.3454674568179, "p90_e2el_ms": 16522.48927145265, "p99_e2el_ms": 16675.76581487432, "p99.9_e2el_ms": 16677.12394464435} \ No newline at end of file diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_16_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_16_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..5c258b87c --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_16_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-110159", "backend": "openai", "model_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 160, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 16, "duration": 94.94771882798523, "completed": 160, "total_input_tokens": 163840, "total_output_tokens": 163840, "request_throughput": 1.68513790510195, "request_goodput:": null, "output_throughput": 1725.5812148243967, "total_token_throughput": 3451.1624296487935, "mean_ttft_ms": 185.00612755160546, "median_ttft_ms": 104.43462803959846, "std_ttft_ms": 193.8479555106311, "p90_ttft_ms": 668.5733429389074, "p99_ttft_ms": 687.9313375009224, 
"p99.9_ttft_ms": 698.2081180762034, "mean_tpot_ms": 9.043755853690438, "median_tpot_ms": 9.03941193840877, "std_tpot_ms": 0.032097259503204134, "p90_tpot_ms": 9.08699567049181, "p99_tpot_ms": 9.092274875981406, "p99.9_tpot_ms": 9.094480784422148, "mean_itl_ms": 9.043755927185433, "median_itl_ms": 9.067311882972717, "std_itl_ms": 0.49330033887680946, "p90_itl_ms": 9.187642950564623, "p99_itl_ms": 9.348576478660105, "p99.9_itl_ms": 11.5778636110482, "mean_e2el_ms": 9436.768365876924, "median_e2el_ms": 9359.918448608369, "std_e2el_ms": 203.51181544144868, "p90_e2el_ms": 9920.965689211152, "p99_e2el_ms": 9977.636895296164, "p99.9_e2el_ms": 9977.824207578553} \ No newline at end of file diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_256_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_256_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..1850ee750 --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_256_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-095432", "backend": "openai", "model_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 2560, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 256, "duration": 187.5680166000966, "completed": 2560, "total_input_tokens": 2621440, "total_output_tokens": 2621440, "request_throughput": 13.648382311671154, "request_goodput:": null, "output_throughput": 13975.943487151262, "total_token_throughput": 27951.886974302524, "mean_ttft_ms": 382.50921859926166, "median_ttft_ms": 215.23276844527572, "std_ttft_ms": 389.20735932882576, "p90_ttft_ms": 1051.413601823151, "p99_ttft_ms": 1578.5206863190976, "p99.9_ttft_ms": 1665.9692472564057, "mean_tpot_ms": 17.745007218556957, "median_tpot_ms": 17.7608991554098, "std_tpot_ms": 0.1668966632857337, "p90_tpot_ms": 17.881159080449176, "p99_tpot_ms": 17.904399000923036, "p99.9_tpot_ms": 17.906466535026507, "mean_itl_ms": 17.745312193796607, "median_itl_ms": 17.753243912011385, "std_itl_ms": 2.8374261437282113, "p90_itl_ms": 18.29339493997395, "p99_itl_ms": 30.380772710777826, "p99.9_itl_ms": 41.25628810096536, "mean_e2el_ms": 18535.651603183032, "median_e2el_ms": 18436.250409460627, "std_e2el_ms": 403.4017476106507, "p90_e2el_ms": 19055.39850196801, "p99_e2el_ms": 19630.57034333702, "p99.9_e2el_ms": 19677.968813177664} \ No newline at end of file diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_32_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_32_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..4242f60f4 --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_32_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-094424", "backend": "openai", "model_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 320, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 32, "duration": 109.75491525093094, "completed": 320, "total_input_tokens": 327680, "total_output_tokens": 327680, "request_throughput": 2.9155869627195194, "request_goodput:": null, "output_throughput": 2985.561049824788, 
"total_token_throughput": 5971.122099649576, "mean_ttft_ms": 327.7923499357712, "median_ttft_ms": 139.84792854171246, "std_ttft_ms": 325.32643470611464, "p90_ttft_ms": 761.9237934472039, "p99_ttft_ms": 1553.907009542454, "p99.9_ttft_ms": 1554.0890494412743, "mean_tpot_ms": 10.35171252518911, "median_tpot_ms": 10.37312014370279, "std_tpot_ms": 0.2204616711640924, "p90_tpot_ms": 10.569622599677286, "p99_tpot_ms": 10.68635546505808, "p99.9_tpot_ms": 10.692018508377648, "mean_itl_ms": 10.351712596903877, "median_itl_ms": 10.424092994071543, "std_itl_ms": 0.6230783496277226, "p90_itl_ms": 10.737212025560439, "p99_itl_ms": 11.208135604392732, "p99.9_itl_ms": 13.578608148033387, "mean_e2el_ms": 10917.594263204228, "median_e2el_ms": 10899.026850122027, "std_e2el_ms": 363.70930694261756, "p90_e2el_ms": 11385.994586907327, "p99_e2el_ms": 11952.086912665982, "p99.9_e2el_ms": 11952.363938602852} \ No newline at end of file diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_4_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_4_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..103292520 --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_4_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-105748", "backend": "openai", "model_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 40, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 4, "duration": 89.19548447802663, "completed": 40, "total_input_tokens": 40960, "total_output_tokens": 40960, "request_throughput": 0.44845319507013864, "request_goodput:": null, "output_throughput": 459.21607175182197, "total_token_throughput": 918.4321435036439, "mean_ttft_ms": 330.7963805680629, "median_ttft_ms": 124.30587713606656, "std_ttft_ms": 300.8566190220928, "p90_ttft_ms": 797.8223511483521, "p99_ttft_ms": 987.5254542799665, "p99.9_ttft_ms": 1082.7990462793985, "mean_tpot_ms": 8.386647437519565, "median_tpot_ms": 8.389879373914562, "std_tpot_ms": 0.02406742291045804, "p90_tpot_ms": 8.410128863539297, "p99_tpot_ms": 8.42375959217583, "p99.9_tpot_ms": 8.429241839897129, "mean_itl_ms": 8.386647518691657, "median_itl_ms": 8.409935398958623, "std_itl_ms": 0.4271045776615353, "p90_itl_ms": 8.471527020446956, "p99_itl_ms": 8.548026895150542, "p99.9_itl_ms": 9.727208444615831, "mean_e2el_ms": 8910.336709150579, "median_e2el_ms": 8702.599443029612, "std_e2el_ms": 289.10261376307136, "p90_e2el_ms": 9388.286692928523, "p99_e2el_ms": 9497.63202768052, "p99.9_e2el_ms": 9560.362910952654} \ No newline at end of file diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_512_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_512_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..93b38b837 --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_512_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-095932", "backend": "openai", "model_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 5120, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 512, 
"duration": 232.66371850995347, "completed": 5120, "total_input_tokens": 5242880, "total_output_tokens": 5242880, "request_throughput": 22.006009500707624, "request_goodput:": null, "output_throughput": 22534.153728724606, "total_token_throughput": 45068.30745744921, "mean_ttft_ms": 485.9170994026499, "median_ttft_ms": 313.3755950257182, "std_ttft_ms": 479.43043981674873, "p90_ttft_ms": 1244.9121220968664, "p99_ttft_ms": 2244.784710702952, "p99.9_ttft_ms": 2325.581170682097, "mean_tpot_ms": 22.05546968971106, "median_tpot_ms": 22.107187851891105, "std_tpot_ms": 0.2246878257279778, "p90_tpot_ms": 22.263784393693665, "p99_tpot_ms": 22.29470054129683, "p99.9_tpot_ms": 22.34338827603105, "mean_itl_ms": 22.055469743536744, "median_itl_ms": 21.63424016907811, "std_itl_ms": 7.963070491395084, "p90_itl_ms": 22.762464964762344, "p99_itl_ms": 58.16545383073397, "p99.9_itl_ms": 86.60649072681669, "mean_e2el_ms": 23048.662591977063, "median_e2el_ms": 22988.531311624683, "std_e2el_ms": 468.6015130352978, "p90_e2el_ms": 23656.58285028767, "p99_e2el_ms": 24586.990111244377, "p99.9_e2el_ms": 24895.06961080851} \ No newline at end of file diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_64_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_64_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..0cec5d70e --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_64_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-094713", "backend": "openai", "model_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 640, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 64, "duration": 131.00373828504235, "completed": 640, "total_input_tokens": 655360, "total_output_tokens": 655360, "request_throughput": 4.885356772090476, "request_goodput:": null, "output_throughput": 5002.605334620647, "total_token_throughput": 10005.210669241294, "mean_ttft_ms": 236.25605633751547, "median_ttft_ms": 176.69113306328654, "std_ttft_ms": 184.64948422493916, "p90_ttft_ms": 612.4625990400091, "p99_ttft_ms": 685.4502430907452, "p99.9_ttft_ms": 715.0928137269802, "mean_tpot_ms": 12.526103166546621, "median_tpot_ms": 12.51935791408594, "std_tpot_ms": 0.2060056266287734, "p90_tpot_ms": 12.759973599943258, "p99_tpot_ms": 12.92620933710034, "p99.9_tpot_ms": 12.929059177312093, "mean_itl_ms": 12.526103234641905, "median_itl_ms": 12.575171422213316, "std_itl_ms": 0.7854384445450877, "p90_itl_ms": 13.051916868425906, "p99_itl_ms": 13.57216314645484, "p99.9_itl_ms": 16.35113976360316, "mean_e2el_ms": 13050.45959571471, "median_e2el_ms": 13016.266988008283, "std_e2el_ms": 277.330954378513, "p90_e2el_ms": 13396.61577034276, "p99_e2el_ms": 13560.334076725412, "p99.9_e2el_ms": 13561.001875217538} \ No newline at end of file diff --git a/benchmarks/multi_node/results/gpt-oss-120b/concurrency_8_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_8_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..0ec79b6a4 --- /dev/null +++ b/benchmarks/multi_node/results/gpt-oss-120b/concurrency_8_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-105950", "backend": "openai", "model_id": 
"/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "tokenizer_id": "/models/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a/", "best_of": 1, "num_prompts": 80, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 8, "duration": 88.1823810969945, "completed": 80, "total_input_tokens": 81920, "total_output_tokens": 81920, "request_throughput": 0.9072107035985516, "request_goodput:": null, "output_throughput": 928.9837604849168, "total_token_throughput": 1857.9675209698337, "mean_ttft_ms": 122.96592185739428, "median_ttft_ms": 112.40720690693706, "std_ttft_ms": 66.13911304000679, "p90_ttft_ms": 255.94735525082797, "p99_ttft_ms": 311.73925049602985, "p99.9_ttft_ms": 314.68144817650324, "mean_tpot_ms": 8.482334189249984, "median_tpot_ms": 8.484079436520124, "std_tpot_ms": 0.08878052985776738, "p90_tpot_ms": 8.605768184886767, "p99_tpot_ms": 8.633496982299468, "p99.9_tpot_ms": 8.63350683135524, "mean_itl_ms": 8.482334263969731, "median_itl_ms": 8.499014074914157, "std_itl_ms": 0.4519442913074439, "p90_itl_ms": 8.674743911251426, "p99_itl_ms": 8.812509721610695, "p99.9_itl_ms": 10.00087821437054, "mean_e2el_ms": 8800.393797460129, "median_e2el_ms": 8809.698014520109, "std_e2el_ms": 92.8142026327281, "p90_e2el_ms": 8913.593775313348, "p99_e2el_ms": 8953.53475105483, "p99.9_e2el_ms": 8953.599093816709} \ No newline at end of file diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_128_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_128_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..88629b669 --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_128_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-121723", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 1280, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 128, "duration": 244.16467579081655, "completed": 1280, "total_input_tokens": 1310720, "total_output_tokens": 1310720, "request_throughput": 5.2423635640751565, "request_goodput:": null, "output_throughput": 5368.18028961296, "total_token_throughput": 10736.36057922592, "mean_ttft_ms": 354.4963012025619, "median_ttft_ms": 221.00601554848254, "std_ttft_ms": 472.55769183914384, "p90_ttft_ms": 367.79938223771757, "p99_ttft_ms": 2608.2393946917728, "p99.9_ttft_ms": 2782.5139312951364, "mean_tpot_ms": 23.336153631907884, "median_tpot_ms": 23.378271256657253, "std_tpot_ms": 0.1400068026502214, "p90_tpot_ms": 23.406807750391568, "p99_tpot_ms": 23.42566725517553, "p99.9_tpot_ms": 23.436426655725757, "mean_itl_ms": 23.3361536941776, "median_itl_ms": 23.340390995144844, "std_itl_ms": 1.2778758518534754, "p90_itl_ms": 24.728624033741653, "p99_itl_ms": 26.77674562903122, "p99.9_itl_ms": 29.069117963546944, "mean_e2el_ms": 24227.38146664433, "median_e2el_ms": 24133.458781056106, "std_e2el_ms": 434.30045305130164, "p90_e2el_ms": 24261.306971753947, "p99_e2el_ms": 26417.07157871453, "p99.9_e2el_ms": 26614.837866391288} \ No newline at end of file diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_16_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_16_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 
index 000000000..dc27001d6 --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_16_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-120317", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 160, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 16, "duration": 181.8641711010132, "completed": 160, "total_input_tokens": 163840, "total_output_tokens": 163840, "request_throughput": 0.879777468158535, "request_goodput:": null, "output_throughput": 900.8921273943398, "total_token_throughput": 1801.7842547886796, "mean_ttft_ms": 177.28006526303943, "median_ttft_ms": 137.23533554002643, "std_ttft_ms": 71.58428526644974, "p90_ttft_ms": 269.01265301276, "p99_ttft_ms": 412.05388789297996, "p99.9_ttft_ms": 437.451443815371, "mean_tpot_ms": 17.564719299398767, "median_tpot_ms": 17.56061658305084, "std_tpot_ms": 0.03530473075764802, "p90_tpot_ms": 17.610166981731627, "p99_tpot_ms": 17.63727733525352, "p99.9_tpot_ms": 17.651837991991645, "mean_itl_ms": 17.564719371964884, "median_itl_ms": 17.580883926711977, "std_itl_ms": 1.0867409227478173, "p90_itl_ms": 19.01229675859213, "p99_itl_ms": 19.53902784269303, "p99.9_itl_ms": 20.444900048896738, "mean_e2el_ms": 18145.987908547977, "median_e2el_ms": 18126.11438310705, "std_e2el_ms": 69.77299297083215, "p90_e2el_ms": 18231.70208907686, "p99_e2el_ms": 18345.085098790005, "p99.9_e2el_ms": 18374.506261226954} \ No newline at end of file diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_256_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_256_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..0a0906a0c --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_256_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-122337", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 2560, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 256, "duration": 293.4073180370033, "completed": 2560, "total_input_tokens": 2621440, "total_output_tokens": 2621440, "request_throughput": 8.72507208452498, "request_goodput:": null, "output_throughput": 8934.47381455358, "total_token_throughput": 17868.94762910716, "mean_ttft_ms": 499.91884301671234, "median_ttft_ms": 232.72915300913155, "std_ttft_ms": 936.3295262768914, "p90_ttft_ms": 352.94642923399795, "p99_ttft_ms": 4966.478346844669, "p99.9_ttft_ms": 5474.548177599907, "mean_tpot_ms": 27.88984300256413, "median_tpot_ms": 27.98402836757511, "std_tpot_ms": 0.2996025480834357, "p90_tpot_ms": 28.034266286901946, "p99_tpot_ms": 28.066722955850597, "p99.9_tpot_ms": 28.08507930946706, "mean_itl_ms": 27.889843062892968, "median_itl_ms": 27.93224505148828, "std_itl_ms": 2.0988254491284923, "p90_itl_ms": 29.45356967393309, "p99_itl_ms": 34.86446616007015, "p99.9_itl_ms": 40.692481586942456, "mean_e2el_ms": 29031.22823463982, "median_e2el_ms": 28862.933419528417, "std_e2el_ms": 885.7421631371468, "p90_e2el_ms": 28989.10636473447, "p99_e2el_ms": 33438.947307681665, "p99.9_e2el_ms": 34007.66125454707} \ No newline at end of file 
diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_32_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_32_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..33bcbb85a --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_32_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-120731", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 320, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 32, "duration": 197.75558240781538, "completed": 320, "total_input_tokens": 327680, "total_output_tokens": 327680, "request_throughput": 1.618159124024574, "request_goodput:": null, "output_throughput": 1656.9949430011638, "total_token_throughput": 3313.9898860023277, "mean_ttft_ms": 204.8250449974148, "median_ttft_ms": 165.07942252792418, "std_ttft_ms": 126.25809383468614, "p90_ttft_ms": 280.0931277219208, "p99_ttft_ms": 760.1671954616904, "p99.9_ttft_ms": 763.005586763844, "mean_tpot_ms": 19.072629646838525, "median_tpot_ms": 19.09136980360525, "std_tpot_ms": 0.05851032738792819, "p90_tpot_ms": 19.13042017240496, "p99_tpot_ms": 19.18148197103492, "p99.9_tpot_ms": 19.181995146826385, "mean_itl_ms": 19.072629718559696, "median_itl_ms": 19.08195298165083, "std_itl_ms": 1.1082617310125253, "p90_itl_ms": 20.502249943092465, "p99_itl_ms": 21.305559629108753, "p99.9_itl_ms": 22.560074097709737, "mean_e2el_ms": 19716.125173713226, "median_e2el_ms": 19690.169369452633, "std_e2el_ms": 124.41108603823491, "p90_e2el_ms": 19801.692966977134, "p99_e2el_ms": 20280.91403166065, "p99.9_e2el_ms": 20281.5225590982} \ No newline at end of file diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_4_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_4_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..3b03fa3e5 --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_4_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-115544", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 40, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 4, "duration": 158.28731181519106, "completed": 40, "total_input_tokens": 40960, "total_output_tokens": 40960, "request_throughput": 0.2527050307525732, "request_goodput:": null, "output_throughput": 258.769951490635, "total_token_throughput": 517.53990298127, "mean_ttft_ms": 117.1287115837913, "median_ttft_ms": 105.45016499236226, "std_ttft_ms": 22.95514831968995, "p90_ttft_ms": 154.71577032003552, "p99_ttft_ms": 179.9939913023263, "p99.9_ttft_ms": 192.60747395735254, "mean_tpot_ms": 15.340623984750376, "median_tpot_ms": 15.343764986895673, "std_tpot_ms": 0.038580294888251945, "p90_tpot_ms": 15.396409688466472, "p99_tpot_ms": 15.41568306727628, "p99.9_tpot_ms": 15.422512741592364, "mean_itl_ms": 15.34062407968064, "median_itl_ms": 15.342000522650778, "std_itl_ms": 1.104424460872785, "p90_itl_ms": 16.805011359974742, "p99_itl_ms": 17.192507253494114, "p99.9_itl_ms": 17.785792525159223, "mean_e2el_ms": 
15810.587047983427, "median_e2el_ms": 15814.386753481813, "std_e2el_ms": 40.38963926676447, "p90_e2el_ms": 15859.638332994655, "p99_e2el_ms": 15892.085841062944, "p99.9_e2el_ms": 15900.850841090782} \ No newline at end of file diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_512_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_512_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..bda123e9d --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_512_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-123216", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 5120, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 512, "duration": 409.99618642101996, "completed": 5120, "total_input_tokens": 5242880, "total_output_tokens": 5242880, "request_throughput": 12.487921033349163, "request_goodput:": null, "output_throughput": 12787.631138149543, "total_token_throughput": 25575.262276299087, "mean_ttft_ms": 799.6392629810998, "median_ttft_ms": 281.9825775222853, "std_ttft_ms": 1862.6660647354645, "p90_ttft_ms": 422.8423163294793, "p99_ttft_ms": 9882.19552612165, "p99.9_ttft_ms": 10696.999395577937, "mean_tpot_ms": 38.74069696199115, "median_tpot_ms": 39.040027885107854, "std_tpot_ms": 0.9434676549467771, "p90_tpot_ms": 39.248179043834355, "p99_tpot_ms": 39.34789874325328, "p99.9_tpot_ms": 39.38483638085906, "mean_itl_ms": 38.74069702274606, "median_itl_ms": 38.97913300897926, "std_itl_ms": 7.37413467906563, "p90_itl_ms": 41.30664523690939, "p99_itl_ms": 64.09359137760478, "p99.9_itl_ms": 91.91624939884088, "mean_e2el_ms": 40431.37225509804, "median_e2el_ms": 40222.33807248995, "std_e2el_ms": 1868.5858228603415, "p90_e2el_ms": 40530.33041614108, "p99_e2el_ms": 49498.614948240574, "p99.9_e2el_ms": 50402.42158004316} \ No newline at end of file diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_64_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_64_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..39a9836cc --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_64_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-121211", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 640, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 64, "duration": 219.57327283197083, "completed": 640, "total_input_tokens": 655360, "total_output_tokens": 655360, "request_throughput": 2.9147445485761017, "request_goodput:": null, "output_throughput": 2984.698417741928, "total_token_throughput": 5969.396835483856, "mean_ttft_ms": 272.4847578108893, "median_ttft_ms": 223.96848094649613, "std_ttft_ms": 233.0339475669637, "p90_ttft_ms": 315.4728528345004, "p99_ttft_ms": 1419.038086910732, "p99.9_ttft_ms": 1425.9020658000386, "mean_tpot_ms": 21.105819801529368, "median_tpot_ms": 21.12906251010737, "std_tpot_ms": 0.07842898272041984, "p90_tpot_ms": 21.173637142069893, "p99_tpot_ms": 21.188599876768137, "p99.9_tpot_ms": 
21.2060247620869, "mean_itl_ms": 21.105819863068465, "median_itl_ms": 21.13879646640271, "std_itl_ms": 1.1227502711857107, "p90_itl_ms": 22.462772903963923, "p99_itl_ms": 23.81146681262179, "p99.9_itl_ms": 25.36561909643935, "mean_e2el_ms": 21863.738414775435, "median_e2el_ms": 21827.26457947865, "std_e2el_ms": 211.00382490901427, "p90_e2el_ms": 21929.400861775503, "p99_e2el_ms": 22961.20647754753, "p99.9_e2el_ms": 22970.07569239778} \ No newline at end of file diff --git a/benchmarks/multi_node/results/minimax-m2.5/concurrency_8_req_rate_inf_gpus_16_ctx_8_gen_8.json b/benchmarks/multi_node/results/minimax-m2.5/concurrency_8_req_rate_inf_gpus_16_ctx_8_gen_8.json new file mode 100644 index 000000000..ff8c39ac9 --- /dev/null +++ b/benchmarks/multi_node/results/minimax-m2.5/concurrency_8_req_rate_inf_gpus_16_ctx_8_gen_8.json @@ -0,0 +1 @@ +{"date": "20260428-115923", "backend": "openai", "model_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "tokenizer_id": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "best_of": 1, "num_prompts": 80, "request_rate": "inf", "burstiness": 1.0, "max_concurrency": 8, "duration": 169.385015378939, "completed": 80, "total_input_tokens": 81920, "total_output_tokens": 81920, "request_throughput": 0.4722967956818868, "request_goodput:": null, "output_throughput": 483.6319187782521, "total_token_throughput": 967.2638375565042, "mean_ttft_ms": 148.00462815037463, "median_ttft_ms": 132.5412035221234, "std_ttft_ms": 37.51148982241326, "p90_ttft_ms": 168.5610823798926, "p99_ttft_ms": 278.3863469376229, "p99.9_ttft_ms": 278.64237096509896, "mean_tpot_ms": 16.392615779661135, "median_tpot_ms": 16.387911661726513, "std_tpot_ms": 0.034123230191850644, "p90_tpot_ms": 16.43408909303483, "p99_tpot_ms": 16.473588053212627, "p99.9_tpot_ms": 16.473637208842806, "mean_itl_ms": 16.392615859419287, "median_itl_ms": 16.397346975281835, "std_itl_ms": 1.0821039401103671, "p90_itl_ms": 17.86093518603593, "p99_itl_ms": 18.27556065050885, "p99.9_itl_ms": 18.872217730851858, "mean_e2el_ms": 16917.650570743717, "median_e2el_ms": 16907.48401999008, "std_e2el_ms": 42.08324227539982, "p90_e2el_ms": 16975.576334632933, "p99_e2el_ms": 17025.145734883845, "p99.9_e2el_ms": 17025.642524216324} \ No newline at end of file diff --git a/benchmarks/multi_node/vllm_disagg_utils/Dockerfile b/benchmarks/multi_node/vllm_disagg_utils/Dockerfile new file mode 100644 index 000000000..899e026f7 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/Dockerfile @@ -0,0 +1,21 @@ +# Also installs +# - RDMA libs for Thor2 +# +# Build with `docker build -t vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c-bnxt .` to use in run_P.sh +# From 4/24/2026, 0.19+ +FROM vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c + +# RDMA drivers for broadcom devices +RUN apt-get update && apt-get install -y autoconf libibverbs-dev ibverbs-utils libtool unzip wget +RUN wget https://docs.broadcom.com/docs-and-downloads/ethernet-network-adapters/NXE/Thor2/GCA1/bcm5760x_230.2.52.0a.zip && \ + unzip bcm5760x_230.2.52.0a.zip && \ + cd bcm5760x_230.2.52.0a/drivers_linux/bnxt_rocelib/ && \ + results=$(find -name "libbnxt*.tar.gz") && tar -xf $results && \ + untar_dir=$(find . -maxdepth 1 -type d -name "libbnxt*" ! 
-name "*.tar.gz" | head -n 1) && cd $untar_dir && sh autogen.sh && ./configure && make && \ + find /usr/lib64/ /usr/lib -name "libbnxt_re-rdmav*.so" -exec mv {} {}.inbox \; && \ + make install all && sudo sh -c "echo /usr/local/lib >> /etc/ld.so.conf" && \ + sudo ldconfig && \ + cp -f bnxt_re.driver /etc/libibverbs.d/ && \ + ibv_devices +RUN rm -rf bcm5760x_230.2.52.0a && \ + rm -rf bcm5760x_230.2.52.0a.zip diff --git a/benchmarks/multi_node/vllm_disagg_utils/run_D.sh b/benchmarks/multi_node/vllm_disagg_utils/run_D.sh new file mode 100644 index 000000000..cc43d7df1 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/run_D.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +export IBDEVICES="rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7" + +# export MODEL_NAME="DeepSeek-R1-0528" # key from models_vllm.yaml +# export MODEL_DIR="$HOME/.cache/huggingface/hub" +# export MODEL_PATH="$HOME/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-0528/snapshots/4236a6af538feda4548eca9ab308586007567f52" +export MODEL_NAME="MiniMax-M2.5" # key from models_vllm.yaml +export MODEL_DIR="$HOME/.cache/huggingface/hub" +export MODEL_PATH="$HOME/.cache/huggingface/hub/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/" +export NODE0_ADDR="10.21.9.47" # prefill (rank-0) node's IP +export IPADDRS="10.21.9.47,10.21.9.29" # prefill IP, then decode IPs +export xP=1 yD=1 +export NNODES=2 +export GPUS_PER_NODE=8 + +export NODE_RANK=1 +export DRY_RUN=0 + +export BENCH_INPUT_LEN=1024 +export BENCH_OUTPUT_LEN=1024 +export BENCH_MAX_CONCURRENCY="32x64x128x256x512" + +# Repo root (3 levels up from this script's directory) +export DI_REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" +# Mount point inside the container (must match WS_PATH computation below) +export DOCKER_MOUNT_PATH="/workspace" +# Container-side path to the scripts directory +export WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" +# Remap host MODEL_PATH into the container's /models mount +export DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}" + +export SLURM_JOB_ID=1 +mkdir -p "/tmp/slurm_job-${SLURM_JOB_ID}" + +CONTAINER_NAME="vllm-disagg-decode" +docker rm -f "$CONTAINER_NAME" 2>/dev/null || true + +docker run --rm \ + --name "$CONTAINER_NAME" \ + --init \ + --stop-timeout 10 \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --device=/dev/infiniband/rdma_cm \ + --device=/dev/infiniband/uverbs0 \ + --device=/dev/infiniband/uverbs1 \ + --device=/dev/infiniband/uverbs2 \ + --device=/dev/infiniband/uverbs3 \ + --device=/dev/infiniband/uverbs4 \ + --device=/dev/infiniband/uverbs5 \ + --device=/dev/infiniband/uverbs6 \ + --device=/dev/infiniband/uverbs7 \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v /sys:/sys \ + $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \ + -v ${MODEL_DIR}:/models \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + -v /tmp:/run_logs \ + -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ + -e SLURM_JOB_ID=$SLURM_JOB_ID \ + -e NNODES=$NNODES \ + -e NODE0_ADDR=$NODE0_ADDR \ + -e IPADDRS=$IPADDRS \ + -e MODEL_DIR=/models \ + -e MODEL_NAME=$MODEL_NAME \ + -e MODEL_PATH=$DOCKER_MODEL_PATH \ + -e WS_PATH=${WS_PATH} \ + -e GPUS_PER_NODE=$GPUS_PER_NODE \ + -e NODE_RANK=$NODE_RANK \ + -e xP=$xP \ + -e yD=$yD \ + -e IBDEVICES=$IBDEVICES \ + -e DRY_RUN=$DRY_RUN \ + -e 
ENGINE=vllm-disagg \ + -e HF_HUB_CACHE=/models \ + -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma \ + -e UCX_SOCKADDR_TLS_PRIORITY=tcp \ + -e UCX_MEMTYPE_CACHE=y \ + -e UCX_RNDV_SCHEME=get_zcopy \ + -e UCX_RNDV_THRESH=4k \ + -e UCX_ROCM_IPC_MIN_ZCOPY=0 \ + -e UCX_LOG_LEVEL=warn \ + -e HSA_ENABLE_SDMA=1 \ + -e VLLM_USE_V1=1 \ + -e VLLM_ROCM_USE_AITER=1 \ + -e VLLM_ROCM_USE_AITER_RMSNORM=1 \ + -e VLLM_ENGINE_READY_TIMEOUT_S=3600 \ + -e PREFILL_NODES=1 \ + -e DECODE_NODES=1 \ + -e PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} \ + -e VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} \ + -e PYTHONPYCACHEPREFIX=/tmp/pycache \ + -e PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP:-true} \ + -e PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP:-false} \ + -e DECODE_ENABLE_EP=${DECODE_ENABLE_EP:-true} \ + -e DECODE_ENABLE_DP=${DECODE_ENABLE_DP:-false} \ + -e PREFILL_TP_SIZE=${PREFILL_TP_SIZE:-8} \ + -e DECODE_TP_SIZE=${DECODE_TP_SIZE:-8} \ + --entrypoint /bin/bash \ + vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c-bnxt \ + -lc "mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && ${WS_PATH}/server.sh 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/server_\$(hostname).log" diff --git a/benchmarks/multi_node/vllm_disagg_utils/run_P.sh b/benchmarks/multi_node/vllm_disagg_utils/run_P.sh new file mode 100755 index 000000000..ca33f0148 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/run_P.sh @@ -0,0 +1,144 @@ +#!/bin/bash +export IBDEVICES="rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7" + +# export MODEL_NAME="DeepSeek-R1-0528" # key from models_vllm.yaml +# export MODEL_DIR="$HOME/.cache/huggingface/hub" +# export MODEL_PATH="$HOME/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-0528/snapshots/4236a6af538feda4548eca9ab308586007567f52" +export MODEL_NAME="MiniMax-M2.5" # key from models_vllm.yaml +export MODEL_DIR="$HOME/.cache/huggingface/hub" +export MODEL_PATH="$HOME/.cache/huggingface/hub/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/" +export NODE0_ADDR="10.21.9.47" # this node's IP (prefill) +export IPADDRS="10.21.9.47,10.21.9.29" # prefill IP, then decode IPs +export xP=1 yD=1 +export NNODES=2 +export GPUS_PER_NODE=8 + +export NODE_RANK=0 +export DRY_RUN=0 + +export BENCH_INPUT_LEN=1024 +export BENCH_OUTPUT_LEN=1024 +export BENCH_MAX_CONCURRENCY="4x8x16x32x64x128x256x512" + +# Repo root (3 levels up from this script's directory) +export DI_REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." 
&& pwd)" +# Mount point inside the container (must match WS_PATH computation below) +export DOCKER_MOUNT_PATH="/workspace" +# Container-side path to the scripts directory +export WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" +# Remap host MODEL_PATH into the container's /models mount +export DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}" + +export SLURM_JOB_ID=1 +mkdir -p "/tmp/slurm_job-${SLURM_JOB_ID}" + +ROUTER_PORT="${ROUTER_PORT:-30000}" +PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy}" +ROUTER_CONT_NAME="router_vllm_local_${SLURM_JOB_ID}" + +# Launch vllm-router as a separate container (mirrors job.slurm behavior) +docker rm -f "$ROUTER_CONT_NAME" 2>/dev/null || true +docker run -d \ + --name "$ROUTER_CONT_NAME" \ + --network host \ + -v /tmp:/run_logs \ + "$VLLM_ROUTER_IMAGE" \ + bash -lc "mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \ + --vllm-pd-disaggregation \ + --kv-connector moriio \ + --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \ + --port ${ROUTER_PORT} \ + --host 0.0.0.0 \ + --policy consistent_hash \ + --prefill-policy consistent_hash \ + --decode-policy consistent_hash \ + --log-level info 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_\$(hostname).log" + +CONTAINER_NAME="vllm-disagg-prefill" +docker rm -f "$CONTAINER_NAME" 2>/dev/null || true + +docker run --rm \ + --name "$CONTAINER_NAME" \ + --init \ + --stop-timeout 10 \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --device=/dev/infiniband/rdma_cm \ + --device=/dev/infiniband/uverbs0 \ + --device=/dev/infiniband/uverbs1 \ + --device=/dev/infiniband/uverbs2 \ + --device=/dev/infiniband/uverbs3 \ + --device=/dev/infiniband/uverbs4 \ + --device=/dev/infiniband/uverbs5 \ + --device=/dev/infiniband/uverbs6 \ + --device=/dev/infiniband/uverbs7 \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v /sys:/sys \ + $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \ + -v ${MODEL_DIR}:/models \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + -v /tmp:/run_logs \ + -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ + -e SLURM_JOB_ID=$SLURM_JOB_ID \ + -e NNODES=$NNODES \ + -e NODE0_ADDR=$NODE0_ADDR \ + -e IPADDRS=$IPADDRS \ + -e MODEL_DIR=/models \ + -e MODEL_NAME=$MODEL_NAME \ + -e MODEL_PATH=$DOCKER_MODEL_PATH \ + -e WS_PATH=${WS_PATH} \ + -e GPUS_PER_NODE=$GPUS_PER_NODE \ + -e NODE_RANK=$NODE_RANK \ + -e xP=$xP \ + -e yD=$yD \ + -e IBDEVICES=$IBDEVICES \ + -e DRY_RUN=$DRY_RUN \ + -e ENGINE=vllm-disagg \ + -e HF_HUB_CACHE=/models \ + -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma \ + -e UCX_SOCKADDR_TLS_PRIORITY=tcp \ + -e UCX_MEMTYPE_CACHE=y \ + -e UCX_RNDV_SCHEME=get_zcopy \ + -e UCX_RNDV_THRESH=4k \ + -e UCX_ROCM_IPC_MIN_ZCOPY=0 \ + -e UCX_LOG_LEVEL=warn \ + -e HSA_ENABLE_SDMA=1 \ + -e GLOO_SOCKET_IFNAME=ens51np0 \ + -e NCCL_SOCKET_IFNAME=ens51np0 \ + -e VLLM_USE_V1=1 \ + -e VLLM_ROCM_USE_AITER=1 \ + -e VLLM_ROCM_USE_AITER_RMSNORM=1 \ + -e VLLM_ENGINE_READY_TIMEOUT_S=3600 \ + -e PREFILL_NODES=1 \ + -e DECODE_NODES=1 \ + -e PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} \ + -e VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} \ + -e PYTHONPYCACHEPREFIX=/tmp/pycache \ + -e PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP:-true} \ + -e 
PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP:-false} \ + -e DECODE_ENABLE_EP=${DECODE_ENABLE_EP:-true} \ + -e DECODE_ENABLE_DP=${DECODE_ENABLE_DP:-false} \ + -e PREFILL_TP_SIZE=${PREFILL_TP_SIZE:-8} \ + -e DECODE_TP_SIZE=${DECODE_TP_SIZE:-8} \ + -e BENCH_INPUT_LEN=${BENCH_INPUT_LEN:-1024} \ + -e BENCH_OUTPUT_LEN=${BENCH_OUTPUT_LEN:-1024} \ + -e BENCH_RANDOM_RANGE_RATIO=${BENCH_RANDOM_RANGE_RATIO:-1} \ + -e BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} \ + -e BENCH_MAX_CONCURRENCY=${BENCH_MAX_CONCURRENCY:-512} \ + -e BENCH_REQUEST_RATE=${BENCH_REQUEST_RATE:-inf} \ + -e TQDM_MININTERVAL=${TQDM_MININTERVAL:-20} \ + --entrypoint /bin/bash \ + vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c-bnxt \ + -lc "mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && ${WS_PATH}/server.sh 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/server_\$(hostname).log" + +docker rm -f "$ROUTER_CONT_NAME" 2>/dev/null || true
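
The two launchers above hard-code a specific two-node pair; the following is a minimal smoke-test sketch, assuming the 10.21.9.47 / 10.21.9.29 addresses and MiniMax-M2.5 snapshot path taken from run_P.sh / run_D.sh, the `-bnxt` image built from the Dockerfile above, and that vllm-router exposes the same OpenAI-compatible /v1/completions route that the removed moriio_proxy.py served:

# prefill node (10.21.9.47): launches the vllm-router container and the prefill server container
bash benchmarks/multi_node/vllm_disagg_utils/run_P.sh

# decode node (10.21.9.29): launches the decode server container
bash benchmarks/multi_node/vllm_disagg_utils/run_D.sh

# once both workers have registered with the router (discovery on PROXY_PING_PORT 36367),
# probe it on ROUTER_PORT from any host that can reach the prefill node
curl -s http://10.21.9.47:30000/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "/models/models--MiniMaxAI--MiniMax-M2.5/snapshots/f710177d938eff80b684d42c5aa84b382612f21f/", "prompt": "hello", "max_tokens": 16}'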