From 317fa57f9fb713890d48a241df07696a9c425b8e Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Thu, 18 May 2023 14:40:26 +0800 Subject: [PATCH 01/14] [doc]update gradient accumulation --- .../gradient_accumulation_with_booster.md | 145 +++++++++++++++ .../gradient_accumulation_with_booster.md | 168 ++++++++++++++++++ 2 files changed, 313 insertions(+) create mode 100644 docs/source/en/features/gradient_accumulation_with_booster.md create mode 100644 docs/source/zh-Hans/features/gradient_accumulation_with_booster.md diff --git a/docs/source/en/features/gradient_accumulation_with_booster.md b/docs/source/en/features/gradient_accumulation_with_booster.md new file mode 100644 index 000000000000..62bf8da4a47d --- /dev/null +++ b/docs/source/en/features/gradient_accumulation_with_booster.md @@ -0,0 +1,145 @@ +# Gradient Accumulation(latest) + +Author: Shenggui Li, Yongbin Li + +**Prerequisite** +- [Define Your Configuration](../basics/define_your_config.md) +- [Booster Training](../basics/booster_api.md) + +## Introduction + +Gradient accumulation is a common way to enlarge your batch size for training. +When training large-scale models, memory can easily become the bottleneck and the batch size can be very small, (e.g. 2), leading to unsatisfactory convergence. Gradient accumulation works by adding up the gradients calculated in multiple iterations, +and only update the parameters in the preset iteration. + +## Usage + +It is simple to use gradient accumulation in Colossal-AI. Just call `booster.no_sync()` which returns a context manager. It accumulate gradients without synchronization meanwhile you should not update the gradients. + +## Hands-on Practice + +We Now demonstrate gradient accumulation. In this example, we let the gradient accumulation size to be 4. + +### Step 2. Import libraries in train.py +Create a `train.py` and import the necessary dependencies. The version of `torch` should not be lower than 1.8.1. + +```python +import os +from pathlib import Path + +import torch +from torchvision import transforms +from torchvision.datasets import CIFAR10 +from torchvision.models import resnet18 +from torch.utils.data import DataLoader + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import TorchDDPPlugin +from colossalai.logging import get_dist_logger +``` + +### Step 2. Initialize Distributed Environment + +We then need to initialize distributed environment. For demo purpose, we uses `launch_from_torch`. You can refer to [Launch Colossal-AI](../basics/launch_colossalai.md) for other initialization methods. + +```python +# initialize distributed setting +parser = colossalai.get_default_parser() +args = parser.parse_args() +# launch from torch +colossalai.launch_from_torch(config=dict()) + +``` + +### Step 3. Create training components + +Build your model, optimizer, loss function, lr scheduler and dataloaders. Note that the root path of the dataset is obtained from the environment variable `DATA`. You may `export DATA=/path/to/data` or change `Path(os.environ['DATA'])` to a path on your machine. Data will be automatically downloaded to the root path. 
+ +```python + # define the constants + BATCH_SIZE = 128 + GRADIENT_ACCUMULATION = 4 + + # build resnet + model = resnet18(num_classes=10) + + # build dataloaders + train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), + download=True, + transform=transforms.Compose([ + transforms.RandomCrop(size=32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]), + ])) + + # build criterion + criterion = torch.nn.CrossEntropyLoss() + + # optimizer + optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) +``` + +### Step 4. Inject Feature +Create a `TorchDDPPlugin` object to instantiate a `Booster`, and boost these training components with booster. + +```python + plugin = TorchDDPPlugin() + booster = Booster(plugin=plugin) + train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True) + model, optimizer, criterion, train_dataloader, _ = booster.boost(model=model, + optimizer=optimizer, + criterion=criterion, + dataloader=train_dataloader) +``` + +### Step 5. Train with Booster +Use booster in a normal training loops, and verify gradient accumulation. `param_by_iter` is to record the distributed training information. +```python +for idx, (img, label) in enumerate(train_dataloader): + sync_context = booster.no_sync(model) + img = img.cuda() + label = label.cuda() + model.zero_grad() + if idx % (GRADIENT_ACCUMULATION - 1) != 0: + with sync_context: + output = model(img) + train_loss = criterion(output, label) + booster.backward(train_loss, optimizer) + else: + output = model(img) + train_loss = criterion(output, label) + booster.backward(train_loss, optimizer) + optimizer.step() + model.zero_grad() + + ele_1st = next(model.parameters()).flatten()[0] + param_by_iter.append(str(ele_1st.item())) + + if idx != 0 and idx % (GRADIENT_ACCUMULATION - 1) == 0: + break + + for iteration, val in enumerate(param_by_iter): + print(f'iteration {iteration} - value: {val}') + + if param_by_iter[-1] != param_by_iter[0]: + print('The parameter is only updated in the last iteration') + +``` + +### Step 6. Invoke Training Scripts +To verify gradient accumulation, we can just check the change of parameter values. When gradient accumulation is set, parameters are only updated in the last step. You can run the script using this command: +```shell +colossalai run --nproc_per_node 1 train.py --config config.py +``` + +You will see output similar to the text below. This shows gradient is indeed accumulated as the parameter is not updated +in the first 3 steps, but only updated in the last step. 
+ +```text +iteration 0, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) +iteration 1, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) +iteration 2, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) +iteration 3, first 10 elements of param: tensor([-0.0141, 0.0464, 0.0507, 0.0321, 0.0356, -0.0150, 0.0172, -0.0118, 0.0222, 0.0473], device='cuda:0', grad_fn=) +``` diff --git a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md new file mode 100644 index 000000000000..3b6284d8fb0e --- /dev/null +++ b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md @@ -0,0 +1,168 @@ +# 梯度累积 + +作者: Shenggui Li, Yongbin Li + +**前置教程** +- [定义配置文件](../basics/define_your_config.md) +- [训练中使用Booster](../basics/engine_trainer.md) # todo 待更新链接。 + +**示例代码** +- [ColossalAI-Examples Gradient Accumulation](ColossalAI/examples/tutorial/feathures/gradient_accumulation/README.md) + +## 引言 + +梯度累积是一种常见的增大训练 batch size 的方式。 在训练大模型时,内存经常会成为瓶颈,并且 batch size 通常会很小(如2),这导致收敛性无法保证。梯度累积将多次迭代的梯度累加,并仅在达到预设迭代次数时更新参数。 + +## 使用 + +在 Colossal-AI 中使用梯度累积非常简单,booster提供no_sync返回一个文件管理器,在该文件管理器下取消同步并且不更新梯度,则可以进行梯度累积, 在config.py中gradient_accumulation=4,表示进行梯度累积次数为4。 + +```python +gradient_accumulation = +``` + +## 实例 + +我们提供了一个 [运行实例](ColossalAI/examples/tutorial/feathures/gradient_accumulation/README.md) +来展现梯度累积。在这个例子中,梯度累积次数被设置为4,你可以通过一下命令启动脚本。 + +### 步骤 1. 创建配置文件 + +Create a `config.py`. +```python +BATCH_SIZE = 128 +NUM_EPOCHS = 200 + +gradient_accumulation = 4 +``` + +### 步骤 2. 在 train.py 导入相关库 +创建train.py并导入必要依赖。 `torch` 的版本应不低于1.8.1。 + +```python +import os +from pathlib import Path + +import torch +from torchvision import transforms +from torchvision.datasets import CIFAR10 +from torchvision.models import resnet18 + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import TorchDDPPlugin +from colossalai.logging import get_dist_logger +``` + +### 步骤 3. 初始化分布式环境 + +我们需要初始化分布式环境。为了快速演示,我们使用`launch_from_torch`。你可以参考 [Launch Colossal-AI](../basics/launch_colossalai.md) +使用其他初始化方法。 + +```python +# initialize distributed setting +parser = colossalai.get_default_parser() +args = parser.parse_args() + +# launch from torch +colossalai.launch_from_torch(config=dict()) + +``` + +### 步骤 4. 
创建训练组件 + +构建你的模型、优化器、损失函数、学习率调整器和数据加载器。注意数据集的路径从环境变量`DATA`获得。你可以通过 `export DATA=/path/to/data` 或 `Path(os.environ['DATA'])`,在你的机器上设置路径。数据将会被自动下载到该路径。 + +```python + # define the constant + BATCH_SIZE = 128 + GRADIENT_ACCUMULATION = 4 + + # build resnet + model = resnet18(num_classes=10) + + # build dataloaders + train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), + download=True, + transform=transforms.Compose([ + transforms.RandomCrop(size=32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]), + ])) + + # train_dataloader = get_dataloader( + # dataset=train_dataset, + # shuffle=True, + # batch_size=gpc.config.BATCH_SIZE, + # pin_memory=True, + # ) + + # build criterion + criterion = torch.nn.CrossEntropyLoss() + + # optimizer + optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) +``` + +### 步骤 5. 注入特性 +创建一个`TorchDDPPlugin`对象,并作为参实例化`Booster`, 调用booster注入特性. + +```python + plugin = TorchDDPPlugin() + booster = Booster(plugin=plugin) + train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True) + model, optimizer, criterion, train_dataloader, _ = booster.boost(model=model, + optimizer=optimizer, + criterion=criterion, + dataloader=train_dataloader) +``` + +### 步骤 6. 使用booster训练 +使用booster构建一个普通的训练循环。 验证梯度累积. `param_by_iter` 记录分布训练的信息。 +```python +for idx, (img, label) in enumerate(train_dataloader): + sync_context = booster.no_sync(model) + img = img.cuda() + label = label.cuda() + model.zero_grad() + if idx % (gpc.config.gradient_accumulation - 1) != 0: + with sync_context: + output = model(img) + train_loss = criterion(output, label) + booster.backward(train_loss, optimizer) + else: + output = model(img) + train_loss = criterion(output, label) + booster.backward(train_loss, optimizer) + optimizer.step() + model.zero_grad() + + ele_1st = next(model.parameters()).flatten()[0] + param_by_iter.append(str(ele_1st.item())) + + if idx != 0 and idx % (gpc.config.gradient_accumulation - 1) == 0: + break + + for iteration, val in enumerate(param_by_iter): + print(f'iteration {iteration} - value: {val}') + + if param_by_iter[-1] != param_by_iter[0]: + print('The parameter is only updated in the last iteration') + +``` + +### 步骤 7. 
启动训练脚本 +为了验证梯度累积,我们可以只检查参数值的变化。当设置梯度累加时,仅在最后一步更新参数。您可以使用以下命令运行脚本: +```shell +colossalai run --nproc_per_node 1 train.py --config config.py +``` + +你将会看到类似下方的文本输出。这展现了梯度虽然在前3个迭代中被计算,但直到最后一次迭代,参数才被更新。 + +```text +iteration 0, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) +iteration 1, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) +iteration 2, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) +iteration 3, first 10 elements of param: tensor([-0.0141, 0.0464, 0.0507, 0.0321, 0.0356, -0.0150, 0.0172, -0.0118, 0.0222, 0.0473], device='cuda:0', grad_fn=) +``` From 7ca288b48126dcd3815208af350e1585c45d5d57 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Thu, 18 May 2023 14:42:18 +0800 Subject: [PATCH 02/14] [doc]update gradient accumulation --- docs/source/en/features/gradient_accumulation.md | 2 +- docs/source/zh-Hans/features/gradient_accumulation.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/features/gradient_accumulation.md b/docs/source/en/features/gradient_accumulation.md index ecc209fbac8d..23fe5869096c 100644 --- a/docs/source/en/features/gradient_accumulation.md +++ b/docs/source/en/features/gradient_accumulation.md @@ -1,4 +1,4 @@ -# Gradient Accumulation +# Gradient Accumulation(outdated) Author: Shenggui Li, Yongbin Li diff --git a/docs/source/zh-Hans/features/gradient_accumulation.md b/docs/source/zh-Hans/features/gradient_accumulation.md index e21e5fcd43d8..0cc86e82b424 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation.md +++ b/docs/source/zh-Hans/features/gradient_accumulation.md @@ -1,4 +1,4 @@ -# 梯度累积 +# 梯度累积(旧版本) 作者: Shenggui Li, Yongbin Li From aba9f3413d2e18a038bb5082e8f710ce5b9dbdcf Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Thu, 18 May 2023 14:52:41 +0800 Subject: [PATCH 03/14] [doc]update gradient accumulation --- .../gradient_accumulation_with_booster.md | 18 ++++++++++-------- .../gradient_accumulation_with_booster.md | 18 ++++++++++-------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/docs/source/en/features/gradient_accumulation_with_booster.md b/docs/source/en/features/gradient_accumulation_with_booster.md index 62bf8da4a47d..2747da303c7a 100644 --- a/docs/source/en/features/gradient_accumulation_with_booster.md +++ b/docs/source/en/features/gradient_accumulation_with_booster.md @@ -29,6 +29,7 @@ from pathlib import Path import torch from torchvision import transforms +from titans.utils import barrier_context from torchvision.datasets import CIFAR10 from torchvision.models import resnet18 from torch.utils.data import DataLoader @@ -65,14 +66,15 @@ Build your model, optimizer, loss function, lr scheduler and dataloaders. 
Note t model = resnet18(num_classes=10) # build dataloaders - train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), - download=True, - transform=transforms.Compose([ - transforms.RandomCrop(size=32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]), - ])) + with barrier_context: + train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), + download=True, + transform=transforms.Compose([ + transforms.RandomCrop(size=32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]), + ])) # build criterion criterion = torch.nn.CrossEntropyLoss() diff --git a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md index 3b6284d8fb0e..d05ea7a6ea89 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md +++ b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md @@ -45,6 +45,7 @@ from pathlib import Path import torch from torchvision import transforms +from titans.utils import barrier_context from torchvision.datasets import CIFAR10 from torchvision.models import resnet18 @@ -82,14 +83,15 @@ colossalai.launch_from_torch(config=dict()) model = resnet18(num_classes=10) # build dataloaders - train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), - download=True, - transform=transforms.Compose([ - transforms.RandomCrop(size=32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]), - ])) + with barrier_context: + train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), + download=True, + transform=transforms.Compose([ + transforms.RandomCrop(size=32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]), + ])) # train_dataloader = get_dataloader( # dataset=train_dataset, From 152082fa2a2fee47a6b352b2f95879f1a2dcdc8f Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Thu, 18 May 2023 14:56:22 +0800 Subject: [PATCH 04/14] [doc]update gradient accumulation --- docs/source/en/features/gradient_accumulation_with_booster.md | 2 +- .../zh-Hans/features/gradient_accumulation_with_booster.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/features/gradient_accumulation_with_booster.md b/docs/source/en/features/gradient_accumulation_with_booster.md index 2747da303c7a..3a37879db3c8 100644 --- a/docs/source/en/features/gradient_accumulation_with_booster.md +++ b/docs/source/en/features/gradient_accumulation_with_booster.md @@ -1,6 +1,6 @@ # Gradient Accumulation(latest) -Author: Shenggui Li, Yongbin Li +Author: [Mingyan Jiang](https://github.com/jiangmingyan) **Prerequisite** - [Define Your Configuration](../basics/define_your_config.md) diff --git a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md index d05ea7a6ea89..53029e0ab45b 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md +++ b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md @@ -1,6 +1,6 @@ # 梯度累积 -作者: Shenggui Li, Yongbin Li +作者: [Mingyan Jiang](https://github.com/jiangmingyan) **前置教程** - 
[定义配置文件](../basics/define_your_config.md) From ca6143ca1ea80265bc0b4d72f7ec99822dc5127e Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Thu, 18 May 2023 15:04:42 +0800 Subject: [PATCH 05/14] [doc]update gradient accumulation, fix --- .../gradient_accumulation_with_booster.md | 9 ++--- .../gradient_accumulation_with_booster.md | 35 +++---------------- 2 files changed, 7 insertions(+), 37 deletions(-) diff --git a/docs/source/en/features/gradient_accumulation_with_booster.md b/docs/source/en/features/gradient_accumulation_with_booster.md index 3a37879db3c8..61751d5462f6 100644 --- a/docs/source/en/features/gradient_accumulation_with_booster.md +++ b/docs/source/en/features/gradient_accumulation_with_booster.md @@ -8,9 +8,7 @@ Author: [Mingyan Jiang](https://github.com/jiangmingyan) ## Introduction -Gradient accumulation is a common way to enlarge your batch size for training. -When training large-scale models, memory can easily become the bottleneck and the batch size can be very small, (e.g. 2), leading to unsatisfactory convergence. Gradient accumulation works by adding up the gradients calculated in multiple iterations, -and only update the parameters in the preset iteration. +Gradient accumulation is a common way to enlarge your batch size for training. When training large-scale models, memory can easily become the bottleneck and the batch size can be very small, (e.g. 2), leading to unsatisfactory convergence. Gradient accumulation works by adding up the gradients calculated in multiple iterations, and only update the parameters in the preset iteration. ## Usage @@ -20,7 +18,7 @@ It is simple to use gradient accumulation in Colossal-AI. Just call `booster.no_ We Now demonstrate gradient accumulation. In this example, we let the gradient accumulation size to be 4. -### Step 2. Import libraries in train.py +### Step 1. Import libraries in train.py Create a `train.py` and import the necessary dependencies. The version of `torch` should not be lower than 1.8.1. ```python @@ -41,7 +39,6 @@ from colossalai.logging import get_dist_logger ``` ### Step 2. Initialize Distributed Environment - We then need to initialize distributed environment. For demo purpose, we uses `launch_from_torch`. You can refer to [Launch Colossal-AI](../basics/launch_colossalai.md) for other initialization methods. ```python @@ -50,11 +47,9 @@ parser = colossalai.get_default_parser() args = parser.parse_args() # launch from torch colossalai.launch_from_torch(config=dict()) - ``` ### Step 3. Create training components - Build your model, optimizer, loss function, lr scheduler and dataloaders. Note that the root path of the dataset is obtained from the environment variable `DATA`. You may `export DATA=/path/to/data` or change `Path(os.environ['DATA'])` to a path on your machine. Data will be automatically downloaded to the root path. 
```python diff --git a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md index 53029e0ab45b..c20907115122 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md +++ b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md @@ -6,37 +6,20 @@ - [定义配置文件](../basics/define_your_config.md) - [训练中使用Booster](../basics/engine_trainer.md) # todo 待更新链接。 -**示例代码** -- [ColossalAI-Examples Gradient Accumulation](ColossalAI/examples/tutorial/feathures/gradient_accumulation/README.md) - ## 引言 梯度累积是一种常见的增大训练 batch size 的方式。 在训练大模型时,内存经常会成为瓶颈,并且 batch size 通常会很小(如2),这导致收敛性无法保证。梯度累积将多次迭代的梯度累加,并仅在达到预设迭代次数时更新参数。 ## 使用 -在 Colossal-AI 中使用梯度累积非常简单,booster提供no_sync返回一个文件管理器,在该文件管理器下取消同步并且不更新梯度,则可以进行梯度累积, 在config.py中gradient_accumulation=4,表示进行梯度累积次数为4。 - -```python -gradient_accumulation = -``` +在 Colossal-AI 中使用梯度累积非常简单,booster提供no_sync返回一个文件管理器,在该文件管理器下取消同步并且累积梯度, 在本示例中,gradient_accumulation=4,表示进行梯度累积次数为4。 ## 实例 我们提供了一个 [运行实例](ColossalAI/examples/tutorial/feathures/gradient_accumulation/README.md) 来展现梯度累积。在这个例子中,梯度累积次数被设置为4,你可以通过一下命令启动脚本。 -### 步骤 1. 创建配置文件 - -Create a `config.py`. -```python -BATCH_SIZE = 128 -NUM_EPOCHS = 200 - -gradient_accumulation = 4 -``` - -### 步骤 2. 在 train.py 导入相关库 +### 步骤 1. 在 train.py 导入相关库 创建train.py并导入必要依赖。 `torch` 的版本应不低于1.8.1。 ```python @@ -55,10 +38,9 @@ from colossalai.booster.plugin import TorchDDPPlugin from colossalai.logging import get_dist_logger ``` -### 步骤 3. 初始化分布式环境 +### 步骤 2. 初始化分布式环境 -我们需要初始化分布式环境。为了快速演示,我们使用`launch_from_torch`。你可以参考 [Launch Colossal-AI](../basics/launch_colossalai.md) -使用其他初始化方法。 +我们需要初始化分布式环境。为了快速演示,我们使用`launch_from_torch`。你可以参考 [Launch Colossal-AI](../basics/launch_colossalai.md)使用其他初始化方法。 ```python # initialize distributed setting @@ -93,13 +75,6 @@ colossalai.launch_from_torch(config=dict()) transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]), ])) - # train_dataloader = get_dataloader( - # dataset=train_dataset, - # shuffle=True, - # batch_size=gpc.config.BATCH_SIZE, - # pin_memory=True, - # ) - # build criterion criterion = torch.nn.CrossEntropyLoss() @@ -121,7 +96,7 @@ colossalai.launch_from_torch(config=dict()) ``` ### 步骤 6. 使用booster训练 -使用booster构建一个普通的训练循环。 验证梯度累积. `param_by_iter` 记录分布训练的信息。 +使用booster构建一个普通的训练循环,验证梯度累积。 `param_by_iter` 记录分布训练的信息。 ```python for idx, (img, label) in enumerate(train_dataloader): sync_context = booster.no_sync(model) From 5c5c07d35f0346749a9f384675d17692405408f9 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Thu, 18 May 2023 15:05:50 +0800 Subject: [PATCH 06/14] [doc]update gradient accumulation, fix --- .../features/gradient_accumulation_with_booster.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md index c20907115122..016bca8a4a7c 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md +++ b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md @@ -52,7 +52,7 @@ colossalai.launch_from_torch(config=dict()) ``` -### 步骤 4. 创建训练组件 +### 步骤 3. 
创建训练组件 构建你的模型、优化器、损失函数、学习率调整器和数据加载器。注意数据集的路径从环境变量`DATA`获得。你可以通过 `export DATA=/path/to/data` 或 `Path(os.environ['DATA'])`,在你的机器上设置路径。数据将会被自动下载到该路径。 @@ -82,7 +82,7 @@ colossalai.launch_from_torch(config=dict()) optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) ``` -### 步骤 5. 注入特性 +### 步骤 4. 注入特性 创建一个`TorchDDPPlugin`对象,并作为参实例化`Booster`, 调用booster注入特性. ```python @@ -95,7 +95,7 @@ colossalai.launch_from_torch(config=dict()) dataloader=train_dataloader) ``` -### 步骤 6. 使用booster训练 +### 步骤 5. 使用booster训练 使用booster构建一个普通的训练循环,验证梯度累积。 `param_by_iter` 记录分布训练的信息。 ```python for idx, (img, label) in enumerate(train_dataloader): @@ -129,7 +129,7 @@ for idx, (img, label) in enumerate(train_dataloader): ``` -### 步骤 7. 启动训练脚本 +### 步骤 6. 启动训练脚本 为了验证梯度累积,我们可以只检查参数值的变化。当设置梯度累加时,仅在最后一步更新参数。您可以使用以下命令运行脚本: ```shell colossalai run --nproc_per_node 1 train.py --config config.py From cd0c23cb4696ab0bfb67cf54af01c4cdbc3ef45c Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Thu, 18 May 2023 15:50:07 +0800 Subject: [PATCH 07/14] [doc]update gradient accumulation, fix --- .../en/features/gradient_accumulation_with_booster.md | 4 ++-- .../features/gradient_accumulation_with_booster.md | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/source/en/features/gradient_accumulation_with_booster.md b/docs/source/en/features/gradient_accumulation_with_booster.md index 61751d5462f6..c4f0b9be37e8 100644 --- a/docs/source/en/features/gradient_accumulation_with_booster.md +++ b/docs/source/en/features/gradient_accumulation_with_booster.md @@ -27,7 +27,6 @@ from pathlib import Path import torch from torchvision import transforms -from titans.utils import barrier_context from torchvision.datasets import CIFAR10 from torchvision.models import resnet18 from torch.utils.data import DataLoader @@ -36,6 +35,7 @@ import colossalai from colossalai.booster import Booster from colossalai.booster.plugin import TorchDDPPlugin from colossalai.logging import get_dist_logger +from colossalai.cluster.dist_coordinator import priority_execution ``` ### Step 2. Initialize Distributed Environment @@ -61,7 +61,7 @@ Build your model, optimizer, loss function, lr scheduler and dataloaders. Note t model = resnet18(num_classes=10) # build dataloaders - with barrier_context: + with priority_execution(): train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), download=True, transform=transforms.Compose([ diff --git a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md index 016bca8a4a7c..cc31a8180124 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md +++ b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md @@ -12,12 +12,11 @@ ## 使用 -在 Colossal-AI 中使用梯度累积非常简单,booster提供no_sync返回一个文件管理器,在该文件管理器下取消同步并且累积梯度, 在本示例中,gradient_accumulation=4,表示进行梯度累积次数为4。 +在 Colossal-AI 中使用梯度累积非常简单,booster提供no_sync返回一个文件管理器,在该文件管理器下取消同步并且累积梯度。 ## 实例 -我们提供了一个 [运行实例](ColossalAI/examples/tutorial/feathures/gradient_accumulation/README.md) -来展现梯度累积。在这个例子中,梯度累积次数被设置为4,你可以通过一下命令启动脚本。 +我们将介绍如何使用梯度累积。在这个例子中,梯度累积次数被设置为4。 ### 步骤 1. 
在 train.py 导入相关库 创建train.py并导入必要依赖。 `torch` 的版本应不低于1.8.1。 @@ -28,7 +27,6 @@ from pathlib import Path import torch from torchvision import transforms -from titans.utils import barrier_context from torchvision.datasets import CIFAR10 from torchvision.models import resnet18 @@ -36,6 +34,7 @@ import colossalai from colossalai.booster import Booster from colossalai.booster.plugin import TorchDDPPlugin from colossalai.logging import get_dist_logger +from colossalai.cluster.dist_coordinator import priority_execution ``` ### 步骤 2. 初始化分布式环境 @@ -65,7 +64,7 @@ colossalai.launch_from_torch(config=dict()) model = resnet18(num_classes=10) # build dataloaders - with barrier_context: + with priority_execution(): train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), download=True, transform=transforms.Compose([ From 69ed17d0adde0999318f5de7313bb8246a9a7900 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Thu, 18 May 2023 15:54:03 +0800 Subject: [PATCH 08/14] [doc]update gradient accumulation, add sidebars --- docs/sidebars.json | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/sidebars.json b/docs/sidebars.json index 44287c17eadf..6effe1220692 100644 --- a/docs/sidebars.json +++ b/docs/sidebars.json @@ -45,6 +45,7 @@ "features/gradient_clipping", "features/gradient_handler", "features/zero_with_chunk", + "features/gradient_accumulation_with_booster", { "type": "category", "label": "Tensor Parallel", From 7c6fd63b8cba18cca283cec4f673cbcdbfc68d9c Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Thu, 18 May 2023 16:43:49 +0800 Subject: [PATCH 09/14] [doc]update gradient accumulation, fix --- docs/source/en/features/gradient_accumulation.md | 2 +- docs/source/en/features/gradient_accumulation_with_booster.md | 2 +- docs/source/zh-Hans/features/gradient_accumulation.md | 2 +- .../zh-Hans/features/gradient_accumulation_with_booster.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/en/features/gradient_accumulation.md b/docs/source/en/features/gradient_accumulation.md index 23fe5869096c..5dcda351db05 100644 --- a/docs/source/en/features/gradient_accumulation.md +++ b/docs/source/en/features/gradient_accumulation.md @@ -1,4 +1,4 @@ -# Gradient Accumulation(outdated) +# Gradient Accumulation (outdated) Author: Shenggui Li, Yongbin Li diff --git a/docs/source/en/features/gradient_accumulation_with_booster.md b/docs/source/en/features/gradient_accumulation_with_booster.md index c4f0b9be37e8..0cb18e1ff325 100644 --- a/docs/source/en/features/gradient_accumulation_with_booster.md +++ b/docs/source/en/features/gradient_accumulation_with_booster.md @@ -1,4 +1,4 @@ -# Gradient Accumulation(latest) +# Gradient Accumulation (latest) Author: [Mingyan Jiang](https://github.com/jiangmingyan) diff --git a/docs/source/zh-Hans/features/gradient_accumulation.md b/docs/source/zh-Hans/features/gradient_accumulation.md index 0cc86e82b424..6a52cf7bf1de 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation.md +++ b/docs/source/zh-Hans/features/gradient_accumulation.md @@ -1,4 +1,4 @@ -# 梯度累积(旧版本) +# 梯度累积 (旧版本) 作者: Shenggui Li, Yongbin Li diff --git a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md index cc31a8180124..88a77e498dd7 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md +++ b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md @@ -1,4 +1,4 @@ -# 梯度累积 +# 梯度累积 (最新) 作者: [Mingyan 
Jiang](https://github.com/jiangmingyan) From 51878b60fddd671cc3fc9d97c922a58fb0db72a3 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Thu, 18 May 2023 17:58:50 +0800 Subject: [PATCH 10/14] [doc]update gradient accumulation, fix --- docs/source/en/features/gradient_accumulation_with_booster.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/features/gradient_accumulation_with_booster.md b/docs/source/en/features/gradient_accumulation_with_booster.md index 0cb18e1ff325..799fd71de8eb 100644 --- a/docs/source/en/features/gradient_accumulation_with_booster.md +++ b/docs/source/en/features/gradient_accumulation_with_booster.md @@ -16,7 +16,7 @@ It is simple to use gradient accumulation in Colossal-AI. Just call `booster.no_ ## Hands-on Practice -We Now demonstrate gradient accumulation. In this example, we let the gradient accumulation size to be 4. +We now demonstrate gradient accumulation. In this example, we let the gradient accumulation size to be 4. ### Step 1. Import libraries in train.py Create a `train.py` and import the necessary dependencies. The version of `torch` should not be lower than 1.8.1. From e28236ca9cfcbfee00b64c0af31c724a027d5419 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Fri, 19 May 2023 12:12:22 +0800 Subject: [PATCH 11/14] [doc]update gradient accumulation, fix --- docs/source/en/features/gradient_accumulation.md | 2 +- .../features/gradient_accumulation_with_booster.md | 12 +++++++----- .../features/gradient_accumulation_with_booster.md | 14 ++++++++------ 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/docs/source/en/features/gradient_accumulation.md b/docs/source/en/features/gradient_accumulation.md index 5dcda351db05..071fe509510c 100644 --- a/docs/source/en/features/gradient_accumulation.md +++ b/docs/source/en/features/gradient_accumulation.md @@ -1,4 +1,4 @@ -# Gradient Accumulation (outdated) +# Gradient Accumulation (Outdated) Author: Shenggui Li, Yongbin Li diff --git a/docs/source/en/features/gradient_accumulation_with_booster.md b/docs/source/en/features/gradient_accumulation_with_booster.md index 799fd71de8eb..ee7fd77a5d94 100644 --- a/docs/source/en/features/gradient_accumulation_with_booster.md +++ b/docs/source/en/features/gradient_accumulation_with_booster.md @@ -1,10 +1,10 @@ -# Gradient Accumulation (latest) +# Gradient Accumulation (Latest) Author: [Mingyan Jiang](https://github.com/jiangmingyan) **Prerequisite** - [Define Your Configuration](../basics/define_your_config.md) -- [Booster Training](../basics/booster_api.md) +- [Training Booster](../basics/booster_api.md) ## Introduction @@ -12,7 +12,7 @@ Gradient accumulation is a common way to enlarge your batch size for training. W ## Usage -It is simple to use gradient accumulation in Colossal-AI. Just call `booster.no_sync()` which returns a context manager. It accumulate gradients without synchronization meanwhile you should not update the gradients. +It is simple to use gradient accumulation in Colossal-AI. Just call `booster.no_sync()` which returns a context manager. It accumulate gradients without synchronization, meanwhile you should not update the gradients. ## Hands-on Practice @@ -53,7 +53,7 @@ colossalai.launch_from_torch(config=dict()) Build your model, optimizer, loss function, lr scheduler and dataloaders. Note that the root path of the dataset is obtained from the environment variable `DATA`. 
You may `export DATA=/path/to/data` or change `Path(os.environ['DATA'])` to a path on your machine. Data will be automatically downloaded to the root path. ```python - # define the constants + # define the training hyperparameters BATCH_SIZE = 128 GRADIENT_ACCUMULATION = 4 @@ -79,7 +79,7 @@ Build your model, optimizer, loss function, lr scheduler and dataloaders. Note t ``` ### Step 4. Inject Feature -Create a `TorchDDPPlugin` object to instantiate a `Booster`, and boost these training components with booster. +Create a `TorchDDPPlugin` object to instantiate a `Booster`, and boost these training components. ```python plugin = TorchDDPPlugin() @@ -140,3 +140,5 @@ iteration 1, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0 iteration 2, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) iteration 3, first 10 elements of param: tensor([-0.0141, 0.0464, 0.0507, 0.0321, 0.0356, -0.0150, 0.0172, -0.0118, 0.0222, 0.0473], device='cuda:0', grad_fn=) ``` + + diff --git a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md index 88a77e498dd7..33b3d10dda83 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md +++ b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md @@ -1,10 +1,10 @@ -# 梯度累积 (最新) +# 梯度累积 (最新版本) 作者: [Mingyan Jiang](https://github.com/jiangmingyan) **前置教程** - [定义配置文件](../basics/define_your_config.md) -- [训练中使用Booster](../basics/engine_trainer.md) # todo 待更新链接。 +- [训练中使用Booster](../basics/booster_api.md) ## 引言 @@ -56,7 +56,7 @@ colossalai.launch_from_torch(config=dict()) 构建你的模型、优化器、损失函数、学习率调整器和数据加载器。注意数据集的路径从环境变量`DATA`获得。你可以通过 `export DATA=/path/to/data` 或 `Path(os.environ['DATA'])`,在你的机器上设置路径。数据将会被自动下载到该路径。 ```python - # define the constant + # define the training hyperparameters BATCH_SIZE = 128 GRADIENT_ACCUMULATION = 4 @@ -82,7 +82,7 @@ colossalai.launch_from_torch(config=dict()) ``` ### 步骤 4. 注入特性 -创建一个`TorchDDPPlugin`对象,并作为参实例化`Booster`, 调用booster注入特性. 
+创建一个`TorchDDPPlugin`对象,并作为参实例化`Booster`, 调用`booster.boost`注入特性。 ```python plugin = TorchDDPPlugin() @@ -102,7 +102,7 @@ for idx, (img, label) in enumerate(train_dataloader): img = img.cuda() label = label.cuda() model.zero_grad() - if idx % (gpc.config.gradient_accumulation - 1) != 0: + if idx % (GRADIENT_ACCUMULATION - 1) != 0: with sync_context: output = model(img) train_loss = criterion(output, label) @@ -117,7 +117,7 @@ for idx, (img, label) in enumerate(train_dataloader): ele_1st = next(model.parameters()).flatten()[0] param_by_iter.append(str(ele_1st.item())) - if idx != 0 and idx % (gpc.config.gradient_accumulation - 1) == 0: + if idx != 0 and idx % (GRADIENT_ACCUMULATION - 1) == 0: break for iteration, val in enumerate(param_by_iter): @@ -142,3 +142,5 @@ iteration 1, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0 iteration 2, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) iteration 3, first 10 elements of param: tensor([-0.0141, 0.0464, 0.0507, 0.0321, 0.0356, -0.0150, 0.0172, -0.0118, 0.0222, 0.0473], device='cuda:0', grad_fn=) ``` + + From 7eac931605d6b9f20f372a73b7908b13dc9c68ea Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Fri, 19 May 2023 16:08:26 +0800 Subject: [PATCH 12/14] [doc]update gradient accumulation, resolve comments --- docs/sidebars.json | 2 +- .../gradient_accumulation_with_booster.md | 64 +++++++++---------- .../gradient_accumulation_with_booster.md | 64 +++++++++---------- 3 files changed, 65 insertions(+), 65 deletions(-) diff --git a/docs/sidebars.json b/docs/sidebars.json index 6effe1220692..ab9ce23e67d3 100644 --- a/docs/sidebars.json +++ b/docs/sidebars.json @@ -41,11 +41,11 @@ "collapsed": true, "items": [ "features/mixed_precision_training", + "features/gradient_accumulation_with_booster", "features/gradient_accumulation", "features/gradient_clipping", "features/gradient_handler", "features/zero_with_chunk", - "features/gradient_accumulation_with_booster", { "type": "category", "label": "Tensor Parallel", diff --git a/docs/source/en/features/gradient_accumulation_with_booster.md b/docs/source/en/features/gradient_accumulation_with_booster.md index ee7fd77a5d94..c3e205e16f21 100644 --- a/docs/source/en/features/gradient_accumulation_with_booster.md +++ b/docs/source/en/features/gradient_accumulation_with_booster.md @@ -12,7 +12,7 @@ Gradient accumulation is a common way to enlarge your batch size for training. W ## Usage -It is simple to use gradient accumulation in Colossal-AI. Just call `booster.no_sync()` which returns a context manager. It accumulate gradients without synchronization, meanwhile you should not update the gradients. +It is simple to use gradient accumulation in Colossal-AI. Just call `booster.no_sync()` which returns a context manager. It accumulate gradients without synchronization, meanwhile you should not update the weights. ## Hands-on Practice @@ -53,52 +53,52 @@ colossalai.launch_from_torch(config=dict()) Build your model, optimizer, loss function, lr scheduler and dataloaders. Note that the root path of the dataset is obtained from the environment variable `DATA`. You may `export DATA=/path/to/data` or change `Path(os.environ['DATA'])` to a path on your machine. Data will be automatically downloaded to the root path. 
```python - # define the training hyperparameters - BATCH_SIZE = 128 - GRADIENT_ACCUMULATION = 4 - - # build resnet - model = resnet18(num_classes=10) - - # build dataloaders - with priority_execution(): - train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), - download=True, - transform=transforms.Compose([ - transforms.RandomCrop(size=32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]), - ])) - - # build criterion - criterion = torch.nn.CrossEntropyLoss() - - # optimizer - optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) +# define the training hyperparameters +BATCH_SIZE = 128 +GRADIENT_ACCUMULATION = 4 + +# build resnet +model = resnet18(num_classes=10) + +# build dataloaders +with priority_execution(): + train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), + download=True, + transform=transforms.Compose([ + transforms.RandomCrop(size=32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]), + ])) + +# build criterion +criterion = torch.nn.CrossEntropyLoss() + +# optimizer +optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) ``` ### Step 4. Inject Feature Create a `TorchDDPPlugin` object to instantiate a `Booster`, and boost these training components. ```python - plugin = TorchDDPPlugin() - booster = Booster(plugin=plugin) - train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True) - model, optimizer, criterion, train_dataloader, _ = booster.boost(model=model, - optimizer=optimizer, - criterion=criterion, - dataloader=train_dataloader) +plugin = TorchDDPPlugin() +booster = Booster(plugin=plugin) +train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True) +model, optimizer, criterion, train_dataloader, _ = booster.boost(model=model, + optimizer=optimizer, + criterion=criterion, + dataloader=train_dataloader) ``` ### Step 5. Train with Booster Use booster in a normal training loops, and verify gradient accumulation. `param_by_iter` is to record the distributed training information. 
```python +model.zero_grad() for idx, (img, label) in enumerate(train_dataloader): sync_context = booster.no_sync(model) img = img.cuda() label = label.cuda() - model.zero_grad() if idx % (GRADIENT_ACCUMULATION - 1) != 0: with sync_context: output = model(img) diff --git a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md index 33b3d10dda83..f6b5189c4af4 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md +++ b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md @@ -56,52 +56,52 @@ colossalai.launch_from_torch(config=dict()) 构建你的模型、优化器、损失函数、学习率调整器和数据加载器。注意数据集的路径从环境变量`DATA`获得。你可以通过 `export DATA=/path/to/data` 或 `Path(os.environ['DATA'])`,在你的机器上设置路径。数据将会被自动下载到该路径。 ```python - # define the training hyperparameters - BATCH_SIZE = 128 - GRADIENT_ACCUMULATION = 4 - - # build resnet - model = resnet18(num_classes=10) - - # build dataloaders - with priority_execution(): - train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), - download=True, - transform=transforms.Compose([ - transforms.RandomCrop(size=32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]), - ])) - - # build criterion - criterion = torch.nn.CrossEntropyLoss() - - # optimizer - optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) +# define the training hyperparameters +BATCH_SIZE = 128 +GRADIENT_ACCUMULATION = 4 + +# build resnet +model = resnet18(num_classes=10) + +# build dataloaders +with priority_execution(): + train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), + download=True, + transform=transforms.Compose([ + transforms.RandomCrop(size=32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]), + ])) + +# build criterion +criterion = torch.nn.CrossEntropyLoss() + +# optimizer +optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) ``` ### 步骤 4. 注入特性 创建一个`TorchDDPPlugin`对象,并作为参实例化`Booster`, 调用`booster.boost`注入特性。 ```python - plugin = TorchDDPPlugin() - booster = Booster(plugin=plugin) - train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True) - model, optimizer, criterion, train_dataloader, _ = booster.boost(model=model, - optimizer=optimizer, - criterion=criterion, - dataloader=train_dataloader) +plugin = TorchDDPPlugin() +booster = Booster(plugin=plugin) +train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True) +model, optimizer, criterion, train_dataloader, _ = booster.boost(model=model, + optimizer=optimizer, + criterion=criterion, + dataloader=train_dataloader) ``` ### 步骤 5. 
使用booster训练 使用booster构建一个普通的训练循环,验证梯度累积。 `param_by_iter` 记录分布训练的信息。 ```python +optimizer.zero_grad() for idx, (img, label) in enumerate(train_dataloader): sync_context = booster.no_sync(model) img = img.cuda() label = label.cuda() - model.zero_grad() if idx % (GRADIENT_ACCUMULATION - 1) != 0: with sync_context: output = model(img) @@ -112,7 +112,7 @@ for idx, (img, label) in enumerate(train_dataloader): train_loss = criterion(output, label) booster.backward(train_loss, optimizer) optimizer.step() - model.zero_grad() + optimizer.zero_grad() ele_1st = next(model.parameters()).flatten()[0] param_by_iter.append(str(ele_1st.item())) From 53b900752b6472df844a28f3243ea23220db1f70 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Mon, 22 May 2023 11:28:18 +0800 Subject: [PATCH 13/14] [doc]update gradient accumulation, resolve comments --- .../zh-Hans/features/gradient_accumulation_with_booster.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md index f6b5189c4af4..4dc0b3db4a86 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md +++ b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md @@ -12,7 +12,7 @@ ## 使用 -在 Colossal-AI 中使用梯度累积非常简单,booster提供no_sync返回一个文件管理器,在该文件管理器下取消同步并且累积梯度。 +在 Colossal-AI 中使用梯度累积非常简单,booster提供no_sync返回一个上下文管理器,在该上下文管理器下取消同步并且累积梯度。 ## 实例 From 01b90baa1b4cd452ccf728c16054ce9210d63827 Mon Sep 17 00:00:00 2001 From: Mingyan Jiang <1829166702@qq.com> Date: Mon, 22 May 2023 19:08:13 +0800 Subject: [PATCH 14/14] fix --- docs/source/en/features/gradient_accumulation.md | 2 ++ docs/source/en/features/gradient_accumulation_with_booster.md | 4 ++-- docs/source/zh-Hans/features/gradient_accumulation.md | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/source/en/features/gradient_accumulation.md b/docs/source/en/features/gradient_accumulation.md index 071fe509510c..91d89b815bf7 100644 --- a/docs/source/en/features/gradient_accumulation.md +++ b/docs/source/en/features/gradient_accumulation.md @@ -43,3 +43,5 @@ iteration 1, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0 iteration 2, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) iteration 3, first 10 elements of param: tensor([-0.0141, 0.0464, 0.0507, 0.0321, 0.0356, -0.0150, 0.0172, -0.0118, 0.0222, 0.0473], device='cuda:0', grad_fn=) ``` + + diff --git a/docs/source/en/features/gradient_accumulation_with_booster.md b/docs/source/en/features/gradient_accumulation_with_booster.md index c3e205e16f21..f319ef5b2db3 100644 --- a/docs/source/en/features/gradient_accumulation_with_booster.md +++ b/docs/source/en/features/gradient_accumulation_with_booster.md @@ -94,7 +94,7 @@ model, optimizer, criterion, train_dataloader, _ = booster.boost(model=model, ### Step 5. Train with Booster Use booster in a normal training loops, and verify gradient accumulation. `param_by_iter` is to record the distributed training information. 
```python -model.zero_grad() +optimizer.zero_grad() for idx, (img, label) in enumerate(train_dataloader): sync_context = booster.no_sync(model) img = img.cuda() @@ -109,7 +109,7 @@ for idx, (img, label) in enumerate(train_dataloader): train_loss = criterion(output, label) booster.backward(train_loss, optimizer) optimizer.step() - model.zero_grad() + optimizer.zero_grad() ele_1st = next(model.parameters()).flatten()[0] param_by_iter.append(str(ele_1st.item())) diff --git a/docs/source/zh-Hans/features/gradient_accumulation.md b/docs/source/zh-Hans/features/gradient_accumulation.md index 6a52cf7bf1de..fc8b29bbe8f1 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation.md +++ b/docs/source/zh-Hans/features/gradient_accumulation.md @@ -38,3 +38,4 @@ iteration 1, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0 iteration 2, first 10 elements of param: tensor([-0.0208, 0.0189, 0.0234, 0.0047, 0.0116, -0.0283, 0.0071, -0.0359, -0.0267, -0.0006], device='cuda:0', grad_fn=) iteration 3, first 10 elements of param: tensor([-0.0141, 0.0464, 0.0507, 0.0321, 0.0356, -0.0150, 0.0172, -0.0118, 0.0222, 0.0473], device='cuda:0', grad_fn=) ``` +
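
Taken together, the gradient-accumulation pattern that this patch series documents condenses to roughly the sketch below. It reuses only the calls shown in the diffs above (`colossalai.launch_from_torch`, `TorchDDPPlugin`, `Booster.boost`, `plugin.prepare_dataloader`, `booster.no_sync`, `booster.backward`); the synthetic `TensorDataset`, the `ACCUM_STEPS` name, the batch size, the learning rate, and the `(idx + 1) % ACCUM_STEPS` step condition are illustrative choices added for this recap, not part of the patches.

```python
# Minimal sketch of gradient accumulation with booster.no_sync(), assuming the
# Booster / TorchDDPPlugin API shown in the docs above and a CUDA device.
import torch
from torch.utils.data import TensorDataset
from torchvision.models import resnet18

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin

ACCUM_STEPS = 4  # number of micro-batches accumulated per optimizer step (illustrative)

# launch the distributed environment, as in the docs above
colossalai.launch_from_torch(config=dict())

model = resnet18(num_classes=10)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# a tiny synthetic dataset so the sketch stays self-contained
dataset = TensorDataset(torch.randn(256, 3, 32, 32), torch.randint(0, 10, (256,)))

plugin = TorchDDPPlugin()
booster = Booster(plugin=plugin)
dataloader = plugin.prepare_dataloader(dataset, batch_size=32, shuffle=True, drop_last=True)
model, optimizer, criterion, dataloader, _ = booster.boost(model=model,
                                                           optimizer=optimizer,
                                                           criterion=criterion,
                                                           dataloader=dataloader)

optimizer.zero_grad()
for idx, (img, label) in enumerate(dataloader):
    img, label = img.cuda(), label.cuda()
    is_sync_step = (idx + 1) % ACCUM_STEPS == 0
    if not is_sync_step:
        # accumulate gradients locally and skip the DDP all-reduce for this micro-batch
        with booster.no_sync(model):
            loss = criterion(model(img), label)
            booster.backward(loss, optimizer)
    else:
        # last micro-batch of the window: synchronize gradients, then update the weights
        loss = criterion(model(img), label)
        booster.backward(loss, optimizer)
        optimizer.step()
        optimizer.zero_grad()
```

As with the snippets above, this assumes the script is launched with `colossalai run --nproc_per_node 1 train.py` (or `torchrun`) so the distributed environment variables are set. Depending on how the loss is defined, one may also divide each micro-batch loss by `ACCUM_STEPS` so that the accumulated gradient matches that of a single large batch.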