From ba03630403b9a8c0efd37968abf34548e7573b12 Mon Sep 17 00:00:00 2001 From: tianhao909 <843101550@qq.com> Date: Tue, 14 Apr 2026 08:51:33 +0000 Subject: [PATCH 1/8] feat: SimAI 1.6 GPU memory module with PD-separation (review fixes) --- README.md | 124 +- README_CN.md | 268 +++++ vidur-alibabacloud/.gitignore | 17 + vidur-alibabacloud/README-vidur.md | 162 ++- vidur-alibabacloud/README.md | 340 ++++-- vidur-alibabacloud/README_CN.md | 458 ++++++++ .../hf_configs/deepseek_R1_0528_config.json | 67 ++ .../data/hf_configs/deepseek_v3_config.json | 70 ++ .../qwen3-235B-A22B_FP8_config.json | 49 + .../hf_configs/qwen3-235B-A22B_config.json | 38 + .../data/hf_configs/qwen3-30B-A3B_config.json | 38 + .../data/hf_configs/qwen3-8B_config.json | 30 + ...wen3-next-80B-A3B_Instruct_FP8_config.json | 43 + .../hf_configs/qwen3-next-80B-A3B_config.json | 43 + .../vidur-ali-scenarios/run_scenarios.sh | 361 ++++++ .../tests/test_pd_separation.py | 167 +++ vidur-alibabacloud/vidur/config/config.py | 111 +- .../vidur/config/device_sku_config.py | 38 +- .../vidur/config/model_config.py | 122 +- .../vidur/config/node_sku_config.py | 11 +- vidur-alibabacloud/vidur/entities/batch.py | 1 - vidur-alibabacloud/vidur/entities/cluster.py | 142 ++- .../vidur/entities/execution_time.py | 1027 +++++++++++++++-- vidur-alibabacloud/vidur/entities/replica.py | 182 ++- vidur-alibabacloud/vidur/entities/request.py | 156 +-- .../vidur/events/batch_end_event.py | 178 ++- .../vidur/events/batch_stage_arrival_event.py | 1 + .../vidur/events/batch_stage_end_event.py | 15 +- .../vidur/events/replica_schedule_event.py | 4 +- .../events/replica_stage_schedule_event.py | 9 +- .../base_execution_time_predictor.py | 113 +- .../communication_time_predictor.py | 30 +- .../sklearn_execution_time_predictor.py | 84 +- .../vidur/metrics/cdf_sketch.py | 4 +- .../vidur/metrics/data_series.py | 60 +- .../vidur/metrics/metrics_store.py | 3 +- .../profiling/collectives/benchmark_runner.py | 4 +- 
.../profiling/collectives/collectives_impl.py | 9 +- .../splitwise_global_scheduler.py | 63 +- .../splitwise_replica_scheduler.py | 141 ++- .../replica_stage_schduler.py | 7 +- .../vidur/scheduler/utils/memory_planner.py | 302 ++++- vidur-alibabacloud/vidur/simulator.py | 22 +- .../vidur/types/device_sku_type.py | 3 + .../vidur/types/node_sku_type.py | 1 + .../vidur/utils/mfu_calculator.py | 93 +- .../vidur/utils/param_counter.py | 457 +++++++- 47 files changed, 5056 insertions(+), 612 deletions(-) create mode 100644 README_CN.md create mode 100644 vidur-alibabacloud/README_CN.md create mode 100644 vidur-alibabacloud/data/hf_configs/deepseek_R1_0528_config.json create mode 100644 vidur-alibabacloud/data/hf_configs/deepseek_v3_config.json create mode 100644 vidur-alibabacloud/data/hf_configs/qwen3-235B-A22B_FP8_config.json create mode 100644 vidur-alibabacloud/data/hf_configs/qwen3-235B-A22B_config.json create mode 100644 vidur-alibabacloud/data/hf_configs/qwen3-30B-A3B_config.json create mode 100644 vidur-alibabacloud/data/hf_configs/qwen3-8B_config.json create mode 100644 vidur-alibabacloud/data/hf_configs/qwen3-next-80B-A3B_Instruct_FP8_config.json create mode 100644 vidur-alibabacloud/data/hf_configs/qwen3-next-80B-A3B_config.json create mode 100644 vidur-alibabacloud/examples/vidur-ali-scenarios/run_scenarios.sh create mode 100644 vidur-alibabacloud/tests/test_pd_separation.py diff --git a/README.md b/README.md index 85fbcb63..d2b2aba3 100755 --- a/README.md +++ b/README.md @@ -1,12 +1,29 @@ +
+ 中文  |  English +
+ +# SimAI + +[](LICENSE) +[](https://ennanzhai.github.io/pub/nsdi25spring-simai.pdf) + # Latest News ### Recent Updates +- [2026/03] **SimAI 1.6 Released!** This release adds GPU memory modeling for inference simulation. Key features include: + + - **GPU Memory Module:** Accurate parameter counting and KV cache management for DeepSeek-V3-671B, Qwen3-MoE-235B, and Qwen3-Next-80B. + - **PD-Separation Memory Planning:** Independent parameter memory and KV cache budget calculation for Prefill and Decode phases. See [memory_planner.py](./vidur-alibabacloud/vidur/scheduler/utils/memory_planner.py). + - **Improved Decode Time Estimation:** Linear interpolation replacing nearest-neighbor for AICB decode time prediction, with global cache for cross-run reuse. See [execution_time.py](./vidur-alibabacloud/vidur/entities/execution_time.py). + - **4-Scenario Test Suite:** End-to-end validation covering Qwen3-Next-80B, DeepSeek-671B, and Qwen3-MoE-235B. See [run_scenarios.sh](./vidur-alibabacloud/examples/vidur-ali-scenarios/run_scenarios.sh). + - **Code Quality:** Replaced print with logging, added bilingual docstrings, removed ~390 lines of dead code, standardized TODOs, and added type annotations across vidur-alibabacloud modules. + - [2025/12] **SimAI 1.5 Released!** This release brings end-to-end simulation for multi-request **inference** workloads. Key features include: - - - **Advanced Inference Simulation:** Model complex scenarios with Prefill/Decode separation. - - **Modern Model Support:** Now includes DeepSeek, Qwen3Moe and Qwen3Next. See [AICB's README](./aicb/README.md) for more detailed information. - - **Request Scheduling:** Request scheduling is now handled by a component adapted from Microsoft's [Vidur](https://github.com/microsoft/vidur). See [Vidur-Alibabacloud's README](./vidur-alibabacloud/README.md) for more detailed information. + + - **Advanced Inference Simulation:** Model complex scenarios with Prefill/Decode separation. 
+ - **Modern Model Support:** Now includes DeepSeek, Qwen3Moe and Qwen3Next. See [AICB's README](./aicb/README.md) for more detailed information. + - **Request Scheduling:** Request scheduling is now handled by a component adapted from Microsoft's [Vidur](https://github.com/microsoft/vidur). See [Vidur-Alibabacloud's README](./vidur-alibabacloud/README.md) for more detailed information. - [2025/11] [AICB](https://github.com/aliyun/aicb/tree/master) now supports generating **prefill/decode** inference workloads for **DeepSeek**, **Qwen3-MoE** and **Qwen3-Next**. @@ -14,7 +31,8 @@ - [2025/06] The code of SimCCL is first released in the branch [SimCCL](https://github.com/aliyun/SimAI/tree/SimCCL) and will be released in SimCCL repository soon. -**We warmly welcome contributions from the community!** If you are interested in helping shape the future of SimAI, please feel free to open an issue to discuss your ideas or submit a pull request. +**We warmly welcome contributions from the community!** If you are interested in helping shape the future of SimAI, please feel free to open an issue to discuss your ideas or submit a pull request. +
+ 中文  |  English +
+ +# SimAI + +[](LICENSE) +[](https://ennanzhai.github.io/pub/nsdi25spring-simai.pdf) + +# 最新动态 + +### 近期更新 + +- [2026/03] **SimAI 1.6 正式发布!** 本版本新增推理仿真的 GPU 内存建模能力。主要特性包括: + + - **GPU 内存计算模块:** 支持 DeepSeek-V3-671B、Qwen3-MoE-235B、Qwen3-Next-80B 的精确参数计数与 KV Cache 管理。 + - **PD 分离内存规划:** Prefill 与 Decode 阶段独立的参数内存和 KV Cache 预算计算。详见 [memory_planner.py](./vidur-alibabacloud/vidur/scheduler/utils/memory_planner.py)。 + - **Decode 时间估算改进:** 首尾线性插值替代最近邻的 AICB decode 时间预测,全局缓存支持跨运行复用。详见 [execution_time.py](./vidur-alibabacloud/vidur/entities/execution_time.py)。 + - **4 场景端到端测试:** 覆盖 Qwen3-Next-80B、DeepSeek-671B、Qwen3-MoE-235B 的完整验证套件。详见 [run_scenarios.sh](./vidur-alibabacloud/examples/vidur-ali-scenarios/run_scenarios.sh)。 + - **代码质量提升:** logging 替换 print 输出、双语 docstring、清理 ~390 行死代码、TODO 规范化、类型标注补全。 + +- [2025/12] **SimAI 1.5 正式发布!** 本版本新增对多请求**推理**工作负载的端到端仿真支持,主要特性包括: + + - **高级推理仿真:** 支持 Prefill/Decode 分离等复杂场景建模。 + - **主流模型支持:** 新增 DeepSeek、Qwen3Moe 和 Qwen3Next 模型。详见 [AICB README](./aicb/README.md)。 + - **请求调度:** 请求调度组件基于微软 [Vidur](https://github.com/microsoft/vidur) 适配,详见 [Vidur-Alibabacloud README](./vidur-alibabacloud/README_CN.md)。 + +- [2025/11] [AICB](https://github.com/aliyun/aicb/tree/master) 新增对 **DeepSeek**、**Qwen3-MoE** 和 **Qwen3-Next** 的 **prefill/decode** 推理工作负载生成支持。 + +- [2025/09] [AICB](https://github.com/aliyun/aicb/tree/master) 新增 DeepSeek 训练工作负载生成支持。感谢 [@parthpower](https://github.com/parthpower) 的贡献。 + +- [2025/06] SimCCL 代码首次在 [SimCCL](https://github.com/aliyun/SimAI/tree/SimCCL) 分支发布,后续将在独立仓库正式开源。 + +**欢迎社区贡献!** 如有想法,欢迎提交 Issue 讨论或发起 Pull Request。 + ++ |--- AICB +SimAI --|--- SimCCL + |--- astra-sim-alibabacloud + |--- ns-3-alibabacloud + |--- vidur-alibabacloud ++ +在纯仿真能力基础上,SimAI 
已演进为一个由四个组件([aicb](https://github.com/aliyun/aicb)、[SimCCL](https://github.com/aliyun/SimCCL)、[astra-sim-alibabacloud](https://github.com/aliyun/SimAI/tree/master/astra-sim-alibabacloud)、[ns-3-alibabacloud](https://github.com/aliyun/ns-3-alibabacloud))构成的全栈工具套件。这些组件可以灵活组合以实现不同功能。我们鼓励用户探索更多可能性。 + +下图为 SimAI 模拟器架构图: + + +astra-sim-alibabacloud 基于 [astra-sim](https://github.com/astra-sim/astra-sim/tree/ASTRA-sim-1.0) 扩展开发。感谢 astra-sim 团队的优秀工作和开源贡献。我们在其基础上集成了 NCCL 算法并添加了若干新特性。 + +## 应用场景 + +SimAI 支持三种主要运行模式: + +**SimAI-Analytical** 通过使用总线带宽(busbw)抽象网络通信细节来估算集合通信时间,实现快速仿真。目前支持用户自定义 busbw,自动计算 busbw 功能即将推出。 + +**SimAI-Simulation** 提供基于细粒度网络通信建模的全栈仿真。利用 NS-3 或其他网络模拟器(当前 NS-3 已开源)实现对所有通信行为的详细仿真,力求高保真还原真实训练环境。 + +**SimAI-Physical** *(Beta)* 支持在 CPU RDMA 集群环境下生成物理流量,通过生成类 NCCL 的流量模式深入研究 LLM 训练中的 NIC 行为。当前处于内测阶段。 + +| 场景 | 描述 | 组件组合 | +|------|------|----------| +| 1. AICB 测试套件 | 在 GPU 集群上使用 AICB 测试套件运行通信模式 | [AICB](https://github.com/aliyun/aicb) | +| 2. AICB/AIOB 工作负载 | 建模**推理**/训练过程的计算/通信模式以生成工作负载 | [AICB](https://github.com/aliyun/aicb) | +| 3. 集合通信分析 | 将集合通信操作分解为点对点通信集合 | [SimCCL](https://github.com/aliyun/SimCCL) | +| 4. 无 GPU 集合通信 | 在非 GPU 集群上执行 RDMA 集合通信流量 | [AICB](https://github.com/aliyun/aicb) + [SimCCL](https://github.com/aliyun/SimCCL) + [astra-sim-alibabacloud](https://github.com/aliyun/SimAI/tree/master/astra-sim-alibabacloud)(physical) | +| 5. SimAI-Analytical | 在任意服务器上快速进行 AICB 工作负载分析与仿真(忽略底层网络细节) | [AICB](https://github.com/aliyun/aicb) + [astra-sim-alibabacloud](https://github.com/aliyun/SimAI/tree/master/astra-sim-alibabacloud)(analytical) | +| 6. SimAI-Simulation | 在任意服务器上进行全栈仿真 | [AICB](https://github.com/aliyun/aicb) + [SimCCL](https://github.com/aliyun/SimCCL) + [astra-sim-alibabacloud](https://github.com/aliyun/SimAI/tree/master/astra-sim-alibabacloud)(simulation) + [ns-3-alibabacloud](https://github.com/aliyun/ns-3-alibabacloud) | +| 7. 
多请求推理仿真 | 在单 GPU 服务器上进行多请求**推理**全栈仿真 | [AICB](https://github.com/aliyun/aicb) + [SimCCL](https://github.com/aliyun/SimCCL) + [vidur-alibabacloud](./vidur-alibabacloud) + [astra-sim-alibabacloud](https://github.com/aliyun/SimAI/tree/master/astra-sim-alibabacloud)(analytical/simulation) | + +## 引用 + +SimAI 论文已被 NSDI'25 Spring 接收,详情请参阅: + +*SimAI: Unifying Architecture Design and Performance Tuning for Large-Scale Large Language Model Training with Scalability and Precision.* + +[[pdf](https://ennanzhai.github.io/pub/nsdi25spring-simai.pdf)] / [[slides](./docs/SimAI_Intro_Online.pdf)] / [[video](https://n.dingtalk.com/dingding/live-room/index.html?roomId=OF5BkBUXVxmgsK7x&liveUuid=305736cd-aa70-498b-8003-2b471a53decd)] + +欢迎基于 SimAI 开展创新研究和功能扩展。欢迎加入社区群或通过邮件联系我们交流,我们可提供技术支持。 + +# 快速开始 + +以下为简单示例。完整教程请参见:[**SimAI@Tutorial**](./docs/Tutorial.md)、[**aicb@Tutorial**](https://github.com/aliyun/aicb/blob/master/training/tutorial.md)、[SimCCL@Tutorial]、[ns-3-alibabacloud@Tutorial] + +## 环境搭建 + +请按照以下步骤快速搭建环境并运行 SimAI。 + +### 从源码安装 + +以下步骤已在 Ubuntu 20.04 的 GCC/G++ 9.4.0、python 3.8.10 环境下验证。 + +可使用官方 Ubuntu 20.04 镜像,**不要安装 ninja**。 + +(对于工作负载生成场景,推荐直接使用 NGC 容器镜像。) + +```bash +# 克隆仓库 +$ git clone https://github.com/aliyun/SimAI.git +$ cd ./SimAI/ + +# 初始化子模块 +$ git submodule update --init --recursive +# 更新到最新提交 +$ git submodule update --remote + +# 编译 SimAI-Analytical +$ ./scripts/build.sh -c analytical + +# 编译 SimAI-Simulation (ns3) +$ ./scripts/build.sh -c ns3 +``` + +## 使用 SimAI-Analytical + +```bash +$ ./bin/SimAI_analytical -w example/workload_analytical.txt -g 9216 -g_p_s 8 -r test- -busbw example/busbw.yaml +``` + +若需自动计算总线带宽,请尝试: + +```bash +$ ./bin/SimAI_analytical -w ./example/workload_analytical.txt -g 9216 -nv 360 -nic 48.5 -n_p_s 8 -g_p_s 8 -r example- +``` + +## 使用 SimAI-Simulation + +```bash +# 生成网络拓扑 +$ python3 ./astra-sim-alibabacloud/inputs/topo/gen_Topo_Template.py -topo Spectrum-X -g 128 -gt A100 -bw 100Gbps -nvbw 2400Gbps + +# 运行仿真 +$ AS_SEND_LAT=3 
AS_NVLS_ENABLE=1 ./bin/SimAI_simulator -t 16 -w ./example/microAllReduce.txt -n ./Spectrum-X_128g_8gps_100Gbps_A100 -c astra-sim-alibabacloud/inputs/config/SimAI.conf +``` + +## 使用多请求推理仿真 + +详情请参见 `vidur-alibabacloud` 目录下的 [README](./vidur-alibabacloud/README_CN.md)。该模块利用 AICB 对**推理**工作负载的计算时间进行 profiling。由于依赖 DeepGEMM 和 FlashMLA 等特定硬件加速库,目前仅兼容基于 **Hopper(SM90)** 和 **Blackwell(SM100)** 架构的 NVIDIA GPU。 + +```bash +# 从 Dockerfile 构建 +docker build -t image:latest . +docker run --gpus all -it --rm image:latest +``` + +**注意:** 若使用 Hopper GPU,请在 Dockerfile 中添加 `ENV FLASH_MLA_DISABLE_SM100=1`。 + +如需快速验证所有支持的推理场景(Qwen3-Next-80B、DeepSeek-671B、Qwen3-MoE-235B),可使用内置的四场景测试套件: + +```bash +# 前置条件:conda activate vidur +bash vidur-alibabacloud/examples/vidur-ali-scenarios/run_scenarios.sh --all +# 或单独运行某个场景: +bash vidur-alibabacloud/examples/vidur-ali-scenarios/run_scenarios.sh --scenario 1 +``` + +> **前置条件:** 需先激活 `conda activate vidur` 环境。详见 [环境配置](./vidur-alibabacloud/README_CN.md#-环境配置)。 +> +> 完整场景配置表与输出文件说明请参见 [Vidur-AlibabaCloud README](./vidur-alibabacloud/README_CN.md#四场景配置说明)。 + +# 致谢 + +衷心感谢以下人员和机构对本项目的贡献: + + +- TianHao Fu (Peking University) and [TELOS-syslab](https://github.com/TELOS-syslab/) +- Parth Parikh (KEYSIGHT) +- Sarah-Michelle Hammer & Ziyi Wang (TU-Berlin) +- Xinyue Li (BUPT) +- Tong Chen (Zhejiang University) +- Ming Wang (BUPT) +- Tao Jiang (Institute of Computing Technology, Chinese Academy of Sciences) + +……以及众多来自社区的个人贡献者(详见 [Contributors to aliyun/SimAI](https://github.com/aliyun/SimAI/graphs/contributors))。 + +同时感谢 Chenning Li(MIT CSAIL)发起了将 SimAI 集成到 [M4](https://github.com/netiken/m4) 的合作——M4 是一个新型创新模拟器。 + +**本项目持续欢迎更多贡献与建议。** + +# 贡献指南 + +欢迎参与贡献!开始前请阅读以下指引: + +| | | +|---|---| +| [贡献指南](./CONTRIBUTING.zh-CN.md) | 如何提交 Issue 和 Pull Request | +| [安全政策](./SECURITY_CN.md) | 如何报告安全漏洞 | +| [行为准则](./CODE_OF_CONDUCT_CN.md) | 社区行为规范 | +| [更新日志](./CHANGELOG_CN.md) | v1.5 起的版本历史 | + +# 联系我们 + +如有任何问题,欢迎发送邮件至:Gang Lu(yunding.lg@alibaba-inc.com)、Feiyang 
Xue(xuefeiyang.xfy@alibaba-inc.com)或 Qingxu Li(qingxu.lqx@alibaba-inc.com)。 + +欢迎加入 SimAI 社区交流群,左侧为钉钉群,右侧为微信群。 + +
+
++ 中文  |  English +
+# Vidur-AlibabaCloud -Vidur ([original](https://github.com/microsoft/vidur)) is a simulation framework for large language model (LLM) inference systems. -**Vidur-AlibabaCloud** (this repository) is a customized version optimized for Alibaba Cloud **SimAI** scenarios. It supports advanced features such as **Prefill–Decode (PD) disaggregation** and includes dedicated adaptations for state-of-the-art (SOTA) LLM models including **DeepSeek-V3-671B**, **Qwen3-MoE-235B**, **Qwen3-Next-80B**, and other models. +[](https://www.python.org/downloads/) +[](LICENSE) + +Vidur ([original](https://github.com/microsoft/vidur)) is a simulation framework for large language model (LLM) inference systems. +**Vidur-AlibabaCloud** (this repository) is a customized version optimized for Alibaba Cloud **SimAI** scenarios. It supports advanced features such as **Prefill–Decode (PD) disaggregation** and includes dedicated adaptations for SOTA LLM models including **DeepSeek-V3-671B**, **Qwen3-MoE-235B**, **Qwen3-Next-80B**, and others. + + +--- + +## Table of Contents + +- [Key Features](#key-features) +- [GPU Memory Calculation](#gpu-memory-calculation) +- [Supported Models](#supported-models) +- [Environment Setup](#-environment-setup) +- [Running Examples](#%EF%B8%8F-running-examples) + - [4-Scenario Configuration](#4-scenario-configuration) + - [Output Files](#output-files) +- [Key Input Parameters](#-key-input-parameter-reference) +- [Key Output Interpretation](#-key-output-interpretation) +- [Known Issues](#%EF%B8%8F-known-issues) +- [Help](#-help) --- ## Key Features -+ **Prefill–Decode (PD) Separation** – Enables running the prefill and decode stages on different nodes, allowing elastic resource allocation and performance isolation. -(Inspired by [splitwise-sim](https://github.com/Mutinifni/splitwise-sim)). 
-+ **Flexible Parallelism** – Supports: - - **Data Parallel (DP)** - - **Tensor Parallel (TP)** - - **Pipeline Parallel (PP)** - - **Expert Parallel (EP)** (support in progress) -Works for both **dense** and **Mixture-of-Experts (MoE)** models (MoE support in progress). -+ **Multiple Execution-Time Prediction Backends** – Choose from: - - **AICB/AIOB** - Partially supports computation kernels and TP, DP, PP, EP communication size for DeepSeek-V3-671B, Qwen3-Moe-235B, Qwen3-Next-80B - - **SimAi_simulation** – SimAI NS-3-based network simulation (supports TP) - - **SimAi_analytical** – SimAI analytical performance model (supports TP) - - **Native Vidur [original]** – Supports TP, DP, PP -+ **Workload Generation & Replay** – Replay real-world traces or generate synthetic requests using fixed or Poisson distributions. -+ **Fine-Grained Metrics** – Records: - - TTFT – Time to First Token - - TBT / TPOT – Time Between Tokens / Time Per Output Token - - End-to-end latency - - Communication cost - - Computation cost - - Scheduling delay + +- **Prefill–Decode (PD) Separation** — Enables running the prefill and decode stages on different nodes, allowing elastic resource allocation and performance isolation. + (Inspired by [splitwise-sim](https://github.com/Mutinifni/splitwise-sim)) +- **Flexible Parallelism** — Supports: + - **Data Parallel (DP)** + - **Tensor Parallel (TP)** + - **Pipeline Parallel (PP)** + - **Expert Parallel (EP)** (support in progress) + + Works for both **dense** and **Mixture-of-Experts (MoE)** models (MoE support in progress). 
+- **Multiple Execution-Time Prediction Backends** — Choose from: + - **AICB/AIOB** — Partially supports computation kernels and TP, DP, PP, EP communication size for DeepSeek-V3-671B, Qwen3-MoE-235B, Qwen3-Next-80B + - **SimAI Simulation** — SimAI NS-3-based network simulation (supports TP) + - **SimAI Analytical** — SimAI analytical performance model (supports TP) + - **Native Vidur [original]** — Supports TP, DP, PP +- **Workload Generation & Replay** — Replay real-world traces or generate synthetic requests using fixed or Poisson distributions. +- **Fine-Grained Metrics** — Records: + - TTFT — Time to First Token + - TBT / TPOT — Time Between Tokens / Time Per Output Token + - End-to-end latency + - Communication cost + - Computation cost + - Scheduling delay + +--- + +## GPU Memory Calculation + +This module provides accurate GPU memory estimation for modern MoE (Mixture-of-Experts) models during inference simulation, covering **model parameter memory**, **KV cache memory**, and **maximum batch size** calculation under Prefill–Decode (PD) disaggregation. + +### Supported Attention Architectures + +| Architecture | Model | Description | +|---|---|---| +| **MLA** (Multi-head Latent Attention) | DeepSeek-V3-671B | Uses LoRA-compressed KV cache (`kv_lora_rank` + `qk_rope_head_dim`) for reduced memory footprint | +| **MHA / GQA** (Multi-Head / Grouped-Query Attention) | Qwen3-MoE-235B | Standard KV cache with `num_kv_heads * head_dim` per token per layer | +| **Hybrid Full + Linear Attention** | Qwen3-Next-80B | Alternates between full attention and linear (GDN) attention every 4 layers | + +### Key Components + +- **`ParamCounter`** (`vidur/utils/param_counter.py`) — Computes per-layer and per-device parameter counts for MLA, MHA/GQA, linear attention, and MoE expert weights, with FP8 quantization support. Under PD disaggregation, it returns separate `(total_params, prefill_params, decode_params)` based on `prefill_world_size` / `decode_world_size`. 
+- **`MemoryPlanner`** (`vidur/scheduler/utils/memory_planner.py`) — Plans GPU memory budget: `available = GPU_mem * (1 - margin) - param_mem`, then computes KV cache capacity and maximum concurrent requests. Includes OOM detection with actionable suggestions. +- **Per-request KV cache tracking** (`vidur/entities/replica.py`) — Allocates and releases KV cache memory on a per-request basis, enabling accurate remaining-capacity queries at runtime. + +### References & Acknowledgments + +The GPU memory calculation module was developed with reference to the following works: + +- [InferSim](https://github.com/alibaba/InferSim) — Parameter counting and KV cache estimation methodology +- [DeepSeek V3 Parameter Size Analysis](https://yangwenbo.com/articles/deepseek-v3-parameter-size.html) — DeepSeek V3 MLA parameter derivation +- [DeepSeek V3 Parameter Derivation (Chinese)](https://zhuanlan.zhihu.com/p/21455638257) — Detailed MLA weight decomposition + +We gratefully acknowledge these resources for providing the foundational analysis that guided our implementation. 
--- ## Supported Models -+ **DeepSeek-V3-671B** (SimAI PP/EP communication、GPU memory allocation module adaptations in progress) -+ **Qwen3-Moe-235B**, **Qwen3-Next-80B** (SimAI PP/EP communication、GPU memory allocation module adaptations in progress) -+ **meta-llama/Meta-Llama-3-8B** / **Meta-Llama-3-70B** -+ **meta-llama/Llama-2-7b-hf** / **Llama-2-70b-hf** -+ **codellama/CodeLlama-34b-Instruct-hf** -+ **internlm/internlm-20b** -+ **Qwen/Qwen-72B** + +- **DeepSeek-V3-671B** (SimAI PP/EP communication and GPU memory allocation module adaptations in progress) +- **Qwen3-MoE-235B**, **Qwen3-Next-80B** (SimAI PP/EP communication and GPU memory allocation module adaptations in progress) +- **meta-llama/Meta-Llama-3-8B** / **Meta-Llama-3-70B** +- **meta-llama/Llama-2-7b-hf** / **Llama-2-70b-hf** +- **codellama/CodeLlama-34b-Instruct-hf** +- **internlm/internlm-20b** +- **Qwen/Qwen-72B** --- ## 📦 Environment Setup + ### 1. Create Conda Environment + ```bash conda env create -p ./env -f ./environment.yml ``` ### 2. (Optional) Update Dev Dependencies + ```bash conda env update -f environment-dev.yml ``` ### 3. Activate Environment + ```bash conda activate vidur ``` ### 4. Install Python Dependencies (Using Alibaba Cloud PyPI Mirror) + ```bash pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ pip install -r requirements-dev.txt -i https://mirrors.aliyun.com/pypi/simple/ @@ -66,13 +127,16 @@ pip install -r requirements-dev.txt -i https://mirrors.aliyun.com/pypi/simple/ --- -## ▶️ Running Example -### Run DeepSeek-671B **with** AICB -**Requirements: **SimAI and AICB Docker environment (see [README](../README.md) for setup instructions). +## ▶️ Running Examples + +### Run DeepSeek-671B with AICB + +**Requirements:** SimAI and AICB Docker environment (see [README](../README.md) for setup instructions). 
+ +After setting up the environment, run the following commands: -After setting up the environment, run the following commands: +#### DeepSeek-671B with AICB (Fixed Length Generator) -#### Run DeepSeek-671B **with** AICB (Fixed Length Generator) ```bash cd SimAI/vidur-alibabacloud @@ -94,10 +158,11 @@ python -m vidur.main --replica_config_pd_p2p_comm_bandwidth 800 \ --replica_config_tensor_parallel_size 2 \ --replica_config_num_pipeline_stages 1 \ --replica_config_expert_model_parallel_size 8 \ - --random_forrest_execution_time_predictor_config_backend aicb + --random_forrest_execution_time_predictor_config_backend aicb ``` -#### Run DeepSeek-671B **with** AICB (Trace Length Generator) +#### DeepSeek-671B with AICB (Trace Length Generator) + ```bash cd SimAI/vidur-alibabacloud @@ -124,11 +189,9 @@ python -m vidur.main \ ``` > ✅ Full parameter descriptions are available via `python -m vidur.main -h`. -> - +### Run Llama-3-8B with SimAI Simulation -### Run Llama-3-8B **with** simai_simulation ```bash cd SimAI @@ -136,8 +199,8 @@ cd SimAI ./scripts/build.sh -c ns3 # Create network topo (Spectrum-X_128g_8gps_100Gbps_A100) -python3 ./astra-sim-alibabacloud/inputs/topo/gen_Topo_Template.py -topo Spectrum-X -g 128 -gt A100 -bw 100Gbps -nvbw 2400Gbps - +python3 ./astra-sim-alibabacloud/inputs/topo/gen_Topo_Template.py \ + -topo Spectrum-X -g 128 -gt A100 -bw 100Gbps -nvbw 2400Gbps cd SimAI/vidur-alibabacloud @@ -163,18 +226,16 @@ python -m vidur.main \ --random_forrest_execution_time_predictor_config_backend simai_simulation \ --random_forrest_execution_time_predictor_config_simai_dir ../ \ --random_forrest_execution_time_predictor_config_simai_simulation_topo ../Spectrum-X_128g_8gps_100Gbps_A100 \ - --random_forrest_execution_time_predictor_config_simai_simulation_config ../astra-sim-alibabacloud/inputs/config/SimAI.conf + --random_forrest_execution_time_predictor_config_simai_simulation_config ../astra-sim-alibabacloud/inputs/config/SimAI.conf ``` -> -> +### Run 
Llama-3-8B with SimAI Analytical -### Run Llama-3-8B **with** simai_analytical ```bash cd SimAI # Compile SimAI-Analytical -$ ./scripts/build.sh -c analytical +./scripts/build.sh -c analytical cd SimAI/vidur-alibabacloud @@ -200,10 +261,8 @@ python -m vidur.main \ --random_forrest_execution_time_predictor_config_backend simai_analytical ``` -> -> +### Run Llama-3-8B with Native Vidur [original] -### Run Llama-3-8B **with** native Vidur [original] ```bash cd SimAI/vidur-alibabacloud @@ -229,125 +288,200 @@ python -m vidur.main \ --random_forrest_execution_time_predictor_config_backend vidur ``` -> -> +### Run 4-Scenario Suite + +For a quick validation of all supported configurations, use the bundled test script: + +```bash +bash examples/vidur-ali-scenarios/run_scenarios.sh --all +``` + +See `bash examples/vidur-ali-scenarios/run_scenarios.sh --help` for details. +#### 4-Scenario Configuration +The following scenarios are pre-configured in `run_scenarios.sh`. All scenarios share the hardware configuration below. + +**Shared Hardware Configuration:** +- GPU: H20 (h20_dgx), NVLink: 1600 Gbps, RDMA: 800 Gbps +- PD P2P bandwidth: 800 Gbps, dtype: fp8 +- Request: Poisson QPS=100, 4 requests, fixed prefill=100 / decode=8 tokens + +| Scenario | Model | PD Separation | World Size | TP | PP | EP | Global Scheduler | +|----------|-------|---------------|------------|----|----|------------|------------------| +| 1 | Qwen3-Next-80B (MoE) | No | 32 (dp=32) | 1 | 1 | 1 (default) | lor | +| 2 | Qwen3-Next-80B (MoE) | Yes (P=2, D=6) | 8 | 1 | 1 | 1 (default) | split_wise | +| 3 | DeepSeek-671B (MoE) | Yes (P=2, D=6) | 8 | 8 | 1 | 8 | split_wise | +| 4 | Qwen3-MoE-235B (MoE) | Yes (P=2, D=6) | 8 | 4 | 1 | 4 | split_wise | + +> **Note:** All four models use Mixture-of-Experts (MoE) architecture. The EP column reflects the explicit `--replica_config_expert_model_parallel_size` value set in the script; scenarios without an explicit EP setting use the default value of 1. 
+ +#### Output Files + +**Output path depends on how you run the simulation:** + +- **`run_scenarios.sh`** --- outputs to `examples/vidur-ali-scenarios/simulator_output/` +- **Direct `python -m vidur.main`** --- outputs to `./simulator_output/` (or the path specified by `--metrics_config_output_dir`) + +Each run produces the following directory: + +``` ++ 中文  |  English +
+ +# Vidur-AlibabaCloud + +[](https://www.python.org/downloads/) +[](LICENSE) + +Vidur([原版](https://github.com/microsoft/vidur))是一个大语言模型(LLM)推理系统的模拟框架。 +**Vidur-AlibabaCloud**(本仓库)是针对阿里云 **SimAI** 场景优化的定制版本。支持 **Prefill–Decode(PD)分离**等高级特性,并针对 **DeepSeek-V3-671B**、**Qwen3-MoE-235B**、**Qwen3-Next-80B** 等 SOTA 大模型进行了专门适配。 + +--- + +## 目录 + +- [主要特性](#主要特性) +- [GPU 显存计算模块](#gpu-显存计算模块) +- [支持的模型](#支持的模型) +- [📦 环境配置](#-环境配置) +- [▶️ 运行示例](#️-运行示例) + - [四场景配置说明](#四场景配置说明) + - [输出文件说明](#输出文件说明) +- [🔧 关键输入参数参考](#-关键输入参数参考) +- [📊 输出结果解读](#-输出结果解读) +- [⚠️ 已知问题](#️-已知问题) +- [📚 帮助](#-帮助) + +--- + +## 主要特性 + +- **Prefill–Decode(PD)分离** — 支持 prefill 和 decode 阶段在不同节点运行,实现弹性资源分配和性能隔离。 + (参考 [splitwise-sim](https://github.com/Mutinifni/splitwise-sim)) +- **灵活的并行策略** — 支持: + - **数据并行(DP)** + - **张量并行(TP)** + - **流水线并行(PP)** + - **专家并行(EP)**(适配中) + + 同时支持 **Dense** 模型和 **混合专家(MoE)** 模型(MoE 适配中)。 +- **多种执行时间预测后端** — 可选: + - **AICB/AIOB** — 部分支持 DeepSeek-V3-671B、Qwen3-MoE-235B、Qwen3-Next-80B 的计算核与 TP、DP、PP、EP 通信量建模 + - **SimAI 仿真(Simulation)** — 基于 SimAI NS-3 的网络通信全栈仿真(支持 TP) + - **SimAI 解析(Analytical)** — SimAI 解析性能模型(支持 TP) + - **原版 Vidur [original]** — 支持 TP、DP、PP +- **负载生成与回放** — 支持真实 trace 回放,或使用固定/泊松分布生成合成请求。 +- **细粒度指标** — 记录: + - TTFT — 首 token 时延 + - TBT / TPOT — 相邻 token 时延 / 每输出 token 耗时 + - 端到端延迟 + - 通信开销 + - 计算开销 + - 调度延迟 + +--- + +## GPU 显存计算模块 + +本模块为现代 MoE(混合专家)模型的推理仿真提供精确的 GPU 显存估算,涵盖**模型参数显存**、**KV Cache 显存**以及 Prefill–Decode(PD)分离架构下的**最大批处理量**计算。 + +### 支持的注意力架构 + +| 架构 | 模型 | 说明 | +|---|---|---| +| **MLA**(多头潜在注意力) | DeepSeek-V3-671B | 使用 LoRA 压缩的 KV Cache(`kv_lora_rank` + `qk_rope_head_dim`),显著降低显存占用 | +| **MHA / GQA**(多头 / 分组查询注意力) | Qwen3-MoE-235B | 标准 KV Cache,每 token 每层使用 `num_kv_heads * head_dim` | +| **混合全注意力 + 线性注意力** | Qwen3-Next-80B | 每 4 层交替使用全注意力和线性(GDN)注意力 | + +### 核心组件 + +- **`ParamCounter`**(`vidur/utils/param_counter.py`)— 计算每层和每设备的参数量,支持 MLA、MHA/GQA、线性注意力和 MoE 专家权重,支持 FP8 量化。在 PD 分离架构下,根据 `prefill_world_size` / `decode_world_size` 分别返回 
`(total_params, prefill_params, decode_params)` 三元组。 +- **`MemoryPlanner`**(`vidur/scheduler/utils/memory_planner.py`)— 规划 GPU 显存预算:`available = GPU_mem * (1 - margin) - param_mem`,计算 KV Cache 容量和最大并发请求数,包含 OOM 检测与建议输出。 +- **逐请求 KV Cache 追踪**(`vidur/entities/replica.py`)— 按请求粒度分配和释放 KV Cache 显存,支持运行时精确查询剩余容量。 + +### 参考与致谢 + +本 GPU 显存计算模块的开发参考了以下工作: + +- [InferSim](https://github.com/alibaba/InferSim) — 参数量计算与 KV Cache 估算方法论 +- [DeepSeek V3 Parameter Size Analysis](https://yangwenbo.com/articles/deepseek-v3-parameter-size.html) — DeepSeek V3 MLA 参数推导 +- [DeepSeek V3 参数推导详解](https://zhuanlan.zhihu.com/p/21455638257) — MLA 权重分解详细分析 + +衷心感谢以上资源为我们的实现提供了基础性的分析与指导。 + +--- + +## 支持的模型 + +- **DeepSeek-V3-671B**(SimAI PP/EP 通信及 GPU 显存管理模块适配中) +- **Qwen3-MoE-235B**、**Qwen3-Next-80B**(SimAI PP/EP 通信及 GPU 显存管理模块适配中) +- **meta-llama/Meta-Llama-3-8B** / **Meta-Llama-3-70B** +- **meta-llama/Llama-2-7b-hf** / **Llama-2-70b-hf** +- **codellama/CodeLlama-34b-Instruct-hf** +- **internlm/internlm-20b** +- **Qwen/Qwen-72B** + +--- + +## 📦 环境配置 + +### 1. 创建 Conda 环境 + +```bash +conda env create -p ./env -f ./environment.yml +``` + +### 2.(可选)更新开发依赖 + +```bash +conda env update -f environment-dev.yml +``` + +### 3. 激活环境 + +```bash +conda activate vidur +``` + +### 4. 
安装 Python 依赖(使用阿里云 PyPI 镜像) + +```bash +pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ +pip install -r requirements-dev.txt -i https://mirrors.aliyun.com/pypi/simple/ +``` + +--- + +## ▶️ 运行示例 + +### 使用 AICB 运行 DeepSeek-671B + +**前置条件:** 需要 SimAI 和 AICB Docker 环境(参见 [README](../README.md) 了解搭建方法)。 + +完成环境配置后,运行以下命令: + +#### DeepSeek-671B + AICB(固定长度生成器) + +```bash +cd SimAI/vidur-alibabacloud + +python -m vidur.main --replica_config_pd_p2p_comm_bandwidth 800 \ + --replica_config_nvlink_bandwidth 1600 \ + --replica_config_rdma_bandwidth 800 \ + --replica_config_pd_p2p_comm_dtype float32 \ + --poisson_request_interval_generator_config_qps 100 \ + --synthetic_request_generator_config_num_requests 5 \ + --length_generator_config_type fixed \ + --fixed_request_length_generator_config_prefill_tokens 1024 \ + --fixed_request_length_generator_config_decode_tokens 10 \ + --trace_request_length_generator_config_trace_file ./data/processed_traces/splitwise_conv.csv \ + --cluster_config_num_replicas 4 \ + --replica_config_pd_node_ratio 0.5 \ + --global_scheduler_config_type split_wise \ + --replica_scheduler_config_type split_wise \ + --replica_config_model_name deepseek-671B \ + --replica_config_tensor_parallel_size 2 \ + --replica_config_num_pipeline_stages 1 \ + --replica_config_expert_model_parallel_size 8 \ + --random_forrest_execution_time_predictor_config_backend aicb +``` + +#### DeepSeek-671B + AICB(Trace 长度生成器) + +```bash +cd SimAI/vidur-alibabacloud + +python -m vidur.main \ + --replica_config_pd_p2p_comm_bandwidth 800 \ + --replica_config_nvlink_bandwidth 1600 \ + --replica_config_rdma_bandwidth 800 \ + --replica_config_pd_p2p_comm_dtype float32 \ + --poisson_request_interval_generator_config_qps 100 \ + --synthetic_request_generator_config_num_requests 10 \ + --length_generator_config_type trace \ + --trace_request_length_generator_config_max_tokens 1024 \ + --trace_request_length_generator_config_trace_file 
./data/processed_traces/splitwise_conv.csv \ + --interval_generator_config_type poisson \ + --cluster_config_num_replicas 4 \ + --replica_config_pd_node_ratio 0.5 \ + --global_scheduler_config_type split_wise \ + --replica_scheduler_config_type split_wise \ + --replica_config_model_name deepseek-671B \ + --replica_config_tensor_parallel_size 2 \ + --replica_config_num_pipeline_stages 1 \ + --replica_config_expert_model_parallel_size 8 \ + --random_forrest_execution_time_predictor_config_backend aicb +``` + +> ✅ 完整参数说明可通过 `python -m vidur.main -h` 查看。 + +### 使用 SimAI 仿真运行 Llama-3-8B + +```bash +cd SimAI + +# 编译 SimAI-Simulation(ns3) +./scripts/build.sh -c ns3 + +# 生成网络拓扑(Spectrum-X_128g_8gps_100Gbps_A100) +python3 ./astra-sim-alibabacloud/inputs/topo/gen_Topo_Template.py \ + -topo Spectrum-X -g 128 -gt A100 -bw 100Gbps -nvbw 2400Gbps + +cd SimAI/vidur-alibabacloud + +python -m vidur.main \ + --replica_config_pd_p2p_comm_bandwidth 800 \ + --replica_config_nvlink_bandwidth 1600 \ + --replica_config_rdma_bandwidth 800 \ + --replica_config_pd_p2p_comm_dtype float32 \ + --poisson_request_interval_generator_config_qps 100 \ + --synthetic_request_generator_config_num_requests 10 \ + --length_generator_config_type trace \ + --trace_request_length_generator_config_max_tokens 2048 \ + --trace_request_length_generator_config_trace_file ./data/processed_traces/splitwise_conv.csv \ + --interval_generator_config_type poisson \ + --cluster_config_num_replicas 4 \ + --replica_config_pd_node_ratio 0.5 \ + --global_scheduler_config_type split_wise \ + --replica_scheduler_config_type split_wise \ + --replica_config_model_name meta-llama/Meta-Llama-3-8B \ + --replica_config_tensor_parallel_size 4 \ + --replica_config_num_pipeline_stages 1 \ + --replica_config_expert_model_parallel_size 1 \ + --random_forrest_execution_time_predictor_config_backend simai_simulation \ + --random_forrest_execution_time_predictor_config_simai_dir ../ \ + 
--random_forrest_execution_time_predictor_config_simai_simulation_topo ../Spectrum-X_128g_8gps_100Gbps_A100 \ + --random_forrest_execution_time_predictor_config_simai_simulation_config ../astra-sim-alibabacloud/inputs/config/SimAI.conf +``` + +### 使用 SimAI 解析模型运行 Llama-3-8B + +```bash +cd SimAI + +# 编译 SimAI-Analytical +./scripts/build.sh -c analytical + +cd SimAI/vidur-alibabacloud + +python -m vidur.main \ + --replica_config_pd_p2p_comm_bandwidth 800 \ + --replica_config_nvlink_bandwidth 1600 \ + --replica_config_rdma_bandwidth 800 \ + --replica_config_pd_p2p_comm_dtype float32 \ + --poisson_request_interval_generator_config_qps 100 \ + --synthetic_request_generator_config_num_requests 10 \ + --length_generator_config_type trace \ + --trace_request_length_generator_config_max_tokens 2048 \ + --trace_request_length_generator_config_trace_file ./data/processed_traces/splitwise_conv.csv \ + --interval_generator_config_type poisson \ + --cluster_config_num_replicas 4 \ + --replica_config_pd_node_ratio 0.5 \ + --global_scheduler_config_type split_wise \ + --replica_scheduler_config_type split_wise \ + --replica_config_model_name meta-llama/Meta-Llama-3-8B \ + --replica_config_tensor_parallel_size 4 \ + --replica_config_num_pipeline_stages 1 \ + --replica_config_expert_model_parallel_size 1 \ + --random_forrest_execution_time_predictor_config_backend simai_analytical +``` + +### 使用原版 Vidur 运行 Llama-3-8B + +```bash +cd SimAI/vidur-alibabacloud + +python -m vidur.main \ + --replica_config_pd_p2p_comm_bandwidth 800 \ + --replica_config_nvlink_bandwidth 1600 \ + --replica_config_rdma_bandwidth 800 \ + --replica_config_pd_p2p_comm_dtype float32 \ + --poisson_request_interval_generator_config_qps 100 \ + --synthetic_request_generator_config_num_requests 10 \ + --length_generator_config_type trace \ + --trace_request_length_generator_config_max_tokens 2048 \ + --trace_request_length_generator_config_trace_file ./data/processed_traces/splitwise_conv.csv \ + 
--interval_generator_config_type poisson \
+    --cluster_config_num_replicas 4 \
+    --replica_config_pd_node_ratio 0.5 \
+    --global_scheduler_config_type split_wise \
+    --replica_scheduler_config_type split_wise \
+    --replica_config_model_name meta-llama/Meta-Llama-3-8B \
+    --replica_config_tensor_parallel_size 4 \
+    --replica_config_num_pipeline_stages 1 \
+    --replica_config_expert_model_parallel_size 1 \
+    --random_forrest_execution_time_predictor_config_backend vidur
+```
+
+### 运行四场景套件
+
+使用内置脚本快速验证所有支持的配置:
+
+```bash
+bash examples/vidur-ali-scenarios/run_scenarios.sh --all
+```
+
+详细信息请运行 `bash examples/vidur-ali-scenarios/run_scenarios.sh --help`。
+
+#### 四场景配置说明
+
+以下场景已在 `run_scenarios.sh` 中预配置,所有场景共享下方硬件配置。
+
+**共用硬件配置:**
+- GPU:H20(h20_dgx),NVLink:1600 Gbps,RDMA:800 Gbps
+- PD P2P 带宽:800 Gbps,数据类型:fp8
+- 请求生成:Poisson QPS=100,共 4 个请求,固定 prefill=100 / decode=8 tokens
+
+| 场景 | 模型 | PD 分离 | World Size | TP | PP | EP | 全局调度器 |
+|------|------|---------|------------|----|----|------------|------------|
+| 1 | Qwen3-Next-80B (MoE) | 无 | 32 (dp=32) | 1 | 1 | 1(默认) | lor |
+| 2 | Qwen3-Next-80B (MoE) | 是(P=2, D=6) | 8 | 1 | 1 | 1(默认) | split_wise |
+| 3 | DeepSeek-671B (MoE) | 是(P=2, D=6) | 8 | 8 | 1 | 8 | split_wise |
+| 4 | Qwen3-MoE-235B (MoE) | 是(P=2, D=6) | 8 | 4 | 1 | 4 | split_wise |
+
+> **说明:** 四个场景所用的模型均为混合专家(MoE)架构。EP 列反映脚本中 `--replica_config_expert_model_parallel_size` 的显式设定值;未显式指定时使用默认值 1。
+
+#### 输出文件说明
+
+**输出路径取决于运行方式:**
+
+- **`run_scenarios.sh`** — 输出到 `examples/vidur-ali-scenarios/simulator_output/`
+- **直接 `python -m vidur.main`** — 输出到 `./simulator_output/`(或通过 `--metrics_config_output_dir` 指定的路径)
+
+每次运行产生如下目录:
+
+```
-
+
- 中文  |  English + 中文  |  English  |  日本語
# SimAI @@ -11,13 +11,10 @@ ### Recent Updates -- [2026/03] **SimAI 1.6 Released!** This release adds GPU memory modeling for inference simulation. Key features include: - - - **GPU Memory Module:** Accurate parameter counting and KV cache management for DeepSeek-V3-671B, Qwen3-MoE-235B, and Qwen3-Next-80B. - - **PD-Separation Memory Planning:** Independent parameter memory and KV cache budget calculation for Prefill and Decode phases. See [memory_planner.py](./vidur-alibabacloud/vidur/scheduler/utils/memory_planner.py). - - **Improved Decode Time Estimation:** Linear interpolation replacing nearest-neighbor for AICB decode time prediction, with global cache for cross-run reuse. See [execution_time.py](./vidur-alibabacloud/vidur/entities/execution_time.py). - - **4-Scenario Test Suite:** End-to-end validation covering Qwen3-Next-80B, DeepSeek-671B, and Qwen3-MoE-235B. See [run_scenarios.sh](./vidur-alibabacloud/examples/vidur-ali-scenarios/run_scenarios.sh). - - **Code Quality:** Replaced print with logging, added bilingual docstrings, removed ~390 lines of dead code, standardized TODOs, and added type annotations across vidur-alibabacloud modules. +- [2026/04] **SimAI 1.6 Released!** Key updates: + - GPU memory modeling for inference simulation (parameter counting & KV cache). + - Linear interpolation for decode time estimation (replacing nearest-neighbor). + - Prefill-Decode Disaggregation memory planning (independent budgets for Prefill/Decode). - [2025/12] **SimAI 1.5 Released!** This release brings end-to-end simulation for multi-request **inference** workloads. 
Key features include: @@ -47,7 +44,7 @@ | Date | Event | Location | Content | Type | |:----------------:|:------------------------------------------------------------------------ |:----------------------- |:-------------------------------------------------------- |:-------------:| -| Mar 16, 2026 | SimAI 1.6 | 🌐 Online | The release of SimAI 1.6 | 💻 Virtual | +| Apr 23, 2026 | SimAI 1.6 | 🌐 Online | The release of SimAI 1.6 | 💻 Virtual | | Dec 30, 2025 | SimAI 1.5 | 🌐 Online | The release of SimAI 1.5 | 💻 Virtual | | Jun 4, 2025 | The first workshop of the SimAI community | 📍 Peking University | Three talks from community contributors | 🎓 On-site | | May 24, 2025 | The 28th Chinasys workshop | 📍 Chongqing University | An invited talk about SimAI | 🎓 On-site | diff --git a/README_CN.md b/README_CN.md index 0baf73a3..357afc4d 100644 --- a/README_CN.md +++ b/README_CN.md @@ -1,5 +1,5 @@- 中文  |  English + 中文  |  English  |  日本語
# SimAI @@ -11,13 +11,10 @@ ### 近期更新 -- [2026/03] **SimAI 1.6 正式发布!** 本版本新增推理仿真的 GPU 内存建模能力。主要特性包括: - - - **GPU 内存计算模块:** 支持 DeepSeek-V3-671B、Qwen3-MoE-235B、Qwen3-Next-80B 的精确参数计数与 KV Cache 管理。 - - **PD 分离内存规划:** Prefill 与 Decode 阶段独立的参数内存和 KV Cache 预算计算。详见 [memory_planner.py](./vidur-alibabacloud/vidur/scheduler/utils/memory_planner.py)。 - - **Decode 时间估算改进:** 首尾线性插值替代最近邻的 AICB decode 时间预测,全局缓存支持跨运行复用。详见 [execution_time.py](./vidur-alibabacloud/vidur/entities/execution_time.py)。 - - **4 场景端到端测试:** 覆盖 Qwen3-Next-80B、DeepSeek-671B、Qwen3-MoE-235B 的完整验证套件。详见 [run_scenarios.sh](./vidur-alibabacloud/examples/vidur-ali-scenarios/run_scenarios.sh)。 - - **代码质量提升:** logging 替换 print 输出、双语 docstring、清理 ~390 行死代码、TODO 规范化、类型标注补全。 +- [2026/04] **SimAI 1.6 正式发布!** 主要更新: + - 推理仿真 GPU 显存建模(参数计数与 KV Cache 管理)。 + - Decode 耗时线性插值估算(替代最近邻查找)。 + - PD 分离内存规划(Prefill/Decode 独立预算)。 - [2025/12] **SimAI 1.5 正式发布!** 本版本新增对多请求**推理**工作负载的端到端仿真支持,主要特性包括: @@ -46,7 +43,7 @@ | 日期 | 活动 | 地点 | 内容 | 形式 | |:----------------:|:------------------------------------------------------------------------ |:----------------------- |:-------------------------------------------------------- |:-------------:| -| Mar 16, 2026 | SimAI 1.6 | 🌐 线上 | SimAI 1.6 正式发布 | 💻 线上直播 | +| Apr 23, 2026 | SimAI 1.6 | 🌐 线上 | SimAI 1.6 正式发布 | 💻 线上直播 | | Dec 30, 2025 | SimAI 1.5 | 🌐 线上 | SimAI 1.5 正式发布 | 💻 线上直播 | | Jun 4, 2025 | SimAI 社区第一届研讨会 | 📍 北京大学 | 三场社区贡献者演讲 | 🎓 线下 | | May 24, 2025 | 第 28 届 Chinasys 研讨会 | 📍 重庆大学 | SimAI 受邀演讲 | 🎓 线下 | diff --git a/vidur-alibabacloud/README.md b/vidur-alibabacloud/README.md index df99cfcd..31bacf22 100644 --- a/vidur-alibabacloud/README.md +++ b/vidur-alibabacloud/README.md @@ -31,7 +31,7 @@ Vidur ([original](https://github.com/microsoft/vidur)) is a simulation framework ## Key Features -- **Prefill–Decode (PD) Separation** — Enables running the prefill and decode stages on different nodes, allowing elastic resource allocation and performance isolation. 
+- **Prefill–Decode (PD) Disaggregation** — Enables running the prefill and decode stages on different nodes, allowing elastic resource allocation and performance isolation. (Inspired by [splitwise-sim](https://github.com/Mutinifni/splitwise-sim)) - **Flexible Parallelism** — Supports: - **Data Parallel (DP)** @@ -332,7 +332,7 @@ The following scenarios are pre-configured in `run_scenarios.sh`. All scenarios - PD P2P bandwidth: 800 Gbps, dtype: fp8 - Request: Poisson QPS=100, 4 requests, fixed prefill=100 / decode=8 tokens -| Scenario | Model | PD Separation | World Size | TP | PP | EP | Global Scheduler | +| Scenario | Model | PD Disaggregation | World Size | TP | PP | EP | Global Scheduler | |----------|-------|---------------|------------|----|----|------------|------------------| | 1 | Qwen3-Next-80B (MoE) | No | 32 (dp=32) | 1 | 1 | auto (=world_size) | lor | | 2 | Qwen3-Next-80B (MoE) | Yes (P=2, D=6) | 8 | 1 | 1 | auto (=world_size) | split_wise | @@ -385,7 +385,7 @@ Each run produces the following directory: | `--trace_request_length_generator_config_trace_file` | `data/processed_traces/sharegpt_8k_filtered_stats_llama2_tokenizer.csv` | Path to trace file for request lengths | | `--interval_generator_config_type` | poisson | Inter-arrival time generator type | | `--cluster_config_num_replicas` | 1 | Total number of replicas (i.e., data parallelism degree) | -| `--replica_config_pd_node_ratio` | 1 | Fraction of replicas allocated as prefill (P) nodes. 1 = MIXED mode (no PD separation). (0, 1) = PD separation enabled. E.g., 0.5 means P:D = 1:1. | +| `--replica_config_pd_node_ratio` | 1 | Fraction of replicas allocated as prefill (P) nodes. 1 = MIXED mode (no PD disaggregation). (0, 1) = PD disaggregation enabled. E.g., 0.5 means P:D = 1:1. | | `--global_scheduler_config_type` | round_robin | Global scheduler type (`split_wise`, `round_robin`, etc.) 
| | `--replica_scheduler_config_type` | sarathi | Per-replica scheduler type | | `--replica_config_model_name` | meta-llama/Llama-2-7b-hf | Model name (DeepSeek-671B, Qwen3-MoE-235B, Qwen3-Next-80B, etc.) | @@ -397,7 +397,7 @@ Each run produces the following directory: | `--random_forrest_execution_time_predictor_config_simai_simulation_topo` | `../example/topo` | Path to SimAI topology file (only effective when backend = `simai_simulation`) | | `--random_forrest_execution_time_predictor_config_simai_simulation_config` | `../astra-sim-alibabacloud/inputs/config/SimAI.conf` | Path to SimAI configuration file (only effective when backend = `simai_simulation`) | -### PD Separation Parameters +### PD Disaggregation Parameters When `pd_node_ratio` < 1, the following optional parameters become effective: @@ -409,7 +409,7 @@ When `pd_node_ratio` < 1, the following optional parameters become effective: | `--replica_config_decode_num_pipeline_stages` | None | Decode-specific PP size. Falls back to `num_pipeline_stages` if not set. | | `--replica_config_num_prefill_replicas` | None | Directly specify prefill replica count (takes priority over `pd_node_ratio`). | -**Example: DeepSeek-671B with PD separation (P:D = 2:6)** +**Example: DeepSeek-671B with PD disaggregation (P:D = 2:6)** ```bash python -m vidur.main \ @@ -461,7 +461,7 @@ Simulation results are saved to: **Notes:** - All time-related fields are in **seconds (s)**, based on monotonic clock or Unix timestamps. -- In non-PD-separated deployments, `prefill_replica_id` and `decode_replica_id` are typically identical. +- In non-PD-disaggregated deployments, `prefill_replica_id` and `decode_replica_id` are typically identical. - If `request_num_decode_tokens = 0`, `tbt` is undefined (may be recorded as `NaN` or `0`). - TBT is not yet logged in `request_metrics.csv`; it can be computed manually for now. 
From d17c6a14ef515439aff1e2f88a7a2c30cbde6119 Mon Sep 17 00:00:00 2001 From: tianhao909 <843101550@qq.com> Date: Thu, 23 Apr 2026 10:04:00 +0000 Subject: [PATCH 8/8] docs(vidur-alibabacloud): restore README-vidur.md to upstream + migrate scenario commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [EN] - Restore README-vidur.md to match official Microsoft Vidur README (remove SimAI extensions) - Migrate 4-scenario manual CLI commands to README.md and README_CN.md - Add detailed per-scenario python commands after "Run 4-Scenario Suite" section - Keep run_scenarios.sh quick-start + full manual commands for advanced users [ZH] - 恢复 README-vidur.md 与官方 Microsoft Vidur README 完全一致(移除 SimAI 扩展内容) - 将四场景手动 CLI 命令迁移至 README.md 和 README_CN.md - 在"运行四场景套件"小节后添加逐场景完整 python 命令 - 保留脚本一键运行 + 完整手动命令两种方式供用户选择 --- vidur-alibabacloud/README-vidur.md | 157 ----------------------------- vidur-alibabacloud/README.md | 139 +++++++++++++++++++++++++ vidur-alibabacloud/README_CN.md | 141 ++++++++++++++++++++++++++ 3 files changed, 280 insertions(+), 157 deletions(-) diff --git a/vidur-alibabacloud/README-vidur.md b/vidur-alibabacloud/README-vidur.md index 786ea177..d0def19e 100644 --- a/vidur-alibabacloud/README-vidur.md +++ b/vidur-alibabacloud/README-vidur.md @@ -166,160 +166,3 @@ trademarks or logos is subject to and must follow Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party's policies. 
-## SimAI / AICB 场景示例(一键运行) - -> 以下命令均在 `vidur-alibabacloud/` 目录下执行,需提前激活 `vidur` conda 环境。 -> 使用 AICB 后端 (`--random_forrest_execution_time_predictor_config_backend aicb`), -> 设备为 H20 DGX (`h20_dgx`),请求生成为 Poisson QPS=100,固定长度 prefill=100/decode=8。 -> 所有输入输出文件统一汇聚至 `examples/vidur-ali-scenarios/` 目录: -> - 脚本: `examples/vidur-ali-scenarios/run_scenarios.sh` -> - 运行日志: `examples/vidur-ali-scenarios/logs/scenario_