From 9a4842c571cd63e6a660182a234bc6ff60991ba0 Mon Sep 17 00:00:00 2001
From: Jianghai <72591262+CjhHa1@users.noreply.github.com>
Date: Mon, 17 Jul 2023 17:30:57 +0800
Subject: [PATCH 1/3] revise shardformer readme (#4246)

---
 colossalai/shardformer/README.md | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/colossalai/shardformer/README.md b/colossalai/shardformer/README.md
index 6ae32e4fbd42..bf4215c52980 100644
--- a/colossalai/shardformer/README.md
+++ b/colossalai/shardformer/README.md
@@ -22,7 +22,6 @@
 - [System Performance](#system-performance)
 - [Convergence](#convergence)

-
 ## 🔗 Introduction

 **Shardformer** is a module that automatically parallelizes the mainstream models in libraries such as HuggingFace and TIMM. This module aims to make parallelization hassle-free for users who are not from a systems background.
@@ -33,7 +32,7 @@

 The sample API usage is given below:

-``` python
+```python
 from colossalai.shardformer import ShardConfig, ShardFormer
 from transformers import BertForMaskedLM
@@ -74,6 +73,7 @@
 shard_former.optimize(model, my_policy)
 ```

+
 ## 🗺 Roadmap

 We will follow this roadmap to develop Shardformer:

@@ -117,15 +117,13 @@ Please refer to the code for more details.

-
-
 ### Distributed Modules

 `ShardFormer` replaces the original PyTorch module with a distributed module.

 The distributed module keeps the same attributes as the original module but replaces the original parameters with distributed parameters and defines a new `forward` function to execute distributed computation.

 Each distributed module implements its `from_native_module` static method to convert the PyTorch module to its corresponding distributed module.

-```python
+````python
 class ParallelModule(torch.nn.Module):

     @abstractmethod
@@ -140,7 +138,7 @@ class ParallelModule(torch.nn.Module):
         my_linear = Linear1D_Col.from_native_module(my_linear, process_group)
     ```
     """
-```
+````

 ### Shard Config
@@ -169,7 +167,7 @@ We abstract the policy into four stages:
 2. Providing `ModulePolicyDescription`: call `Policy.module_policy` to get a set of `ModulePolicyDescription` objects that tell `ModelSharder` how the submodule's attributes, child parameters, and deeper submodules will be substituted.
 3. Postprocessing: call `Policy.postprocess` to perform some postprocessing work, for example, binding the embedding and classifier head weights of the BERT model.

-``` python
+```python
 @dataclass
 class ModulePolicyDescription:
     r"""
@@ -238,7 +236,6 @@ class Policy(ABC):
         ...
 ```

-
 ### Model Sharder

 `ModelSharder` is the class in charge of sharding the model based on the given policy.
@@ -324,21 +321,20 @@ You can create a new file in the `colossalai/shardformer/policies` folder and na
 Please follow these protocols when writing your policy:

 - You have to make a clear decision about what exactly you want to replace in the original PyTorch module
-  - Use `ModulePolicyDescription.attribute_replacement` to replace the module attributes
-  - Use `ModulePolicyDescription.param_replacement` to replace the module parameters
-  - Use `ModulePolicyDescription.sub_module_replacement` to replace the submodules completely. The target module should implement the `from_native_module` for the .
-  - Use `ModulePolicyDescription.method_replacement` to replace the module methods. **These replacement methods should be put in the `shardformer/modeling/<model_name>.py`**.
+  - Use `ModulePolicyDescription.attribute_replacement` to replace the module attributes
+  - Use `ModulePolicyDescription.param_replacement` to replace the module parameters
+  - Use `ModulePolicyDescription.sub_module_replacement` to replace the submodules completely. The target module should implement the `from_native_module` for the replacement.
+  - Use `ModulePolicyDescription.method_replacement` to replace the module methods. **These replacement methods should be put in the `shardformer/modeling/<model_name>.py`**.
 - You can implement the `ParallelModule` for primitive modules in the `shardformer/layer/<layer_name>.py` file. Primitive modules refer to modules which are not composed of other modules. For example, the `torch.nn.Linear` module is a primitive module, while modules such as the `BertEncoder` module in the `transformers` library are composite modules. Primitive modules do not nest inner `nn.Module` members. For composite modules, you should consider using `ModulePolicyDescription` to implement your replacement.
 - `ParallelModule` is meant to be used in two ways: `ParallelModule.from_native_module` to convert a native PyTorch module to a `ParallelModule`, and `ParallelModule(...)` to instantiate the module directly just like a normal PyTorch module. `ParallelModule` should only be implemented for modules whose weights are sharded.
 If you want to make your module compatible with `ModulePolicyDescription.sub_module_replacement` and there is no weight sharding in your module, you can just implement the `from_native_module` method without inheriting the `ParallelModule`, like `colossalai/shardformer/layer/normalization.py` does.
 - **Do not import any file in the `colossalai/shardformer/policies` and `colossalai/shardformer/modeling` folders, to avoid unwanted import errors**. For example, if a file in these folders accidentally imports the `transformers` library at the top of the file, then the user will have to install the `transformers` library even if they do not use this file. Any file in the `modeling` folder should only be imported by the policy file. A policy implementation should only be imported dynamically via the autopolicy or manually via the `ShardFormer` module.
 - Try to keep your import statements for third-party libraries such as `transformers` within the function body instead of the header section of the file. This is because we do not want to import the library when we do not use the policy. (A minimal policy sketch is given after this patch.)

-
 - Step 2. Register your policy to the autopolicy

 Next, you need to register your policy in the `colossalai/shardformer/policies/autopolicy.py` file.

-For example, if we register the policy for the BERT model, we just add a key-value in the `_POLICY_LIST` dictionary. The key is the `qualname` of the model object (you can get it by model.__class__.__qualname__). The value is a `PolicyLocation` object, which contains the file name and the class name of the policy. We do not import the policy directly because the policy file may contain libraries (such as `transformers`) which we do not want to import when we do not use the policy.
+For example, if we register the policy for the BERT model, we just add a key-value in the `_POLICY_LIST` dictionary. The key is the `qualname` of the model object (you can get it by model.\_\_class\_\_.\_\_qualname\_\_). The value is a `PolicyLocation` object, which contains the file name and the class name of the policy. We do not import the policy directly because the policy file may contain libraries (such as `transformers`) which we do not want to import when we do not use the policy.

 ```python
 _POLICY_LIST = {
@@ -360,7 +356,6 @@ Add your model to the `tests/kit/model_zoo` file. This allows you to define test
 Next, implement your unit test in the `tests/test_shardformer` folder. Please refer to other similar tests for style consistency.

-
 - Step 3. Execute your test

 When you run tests locally, you should run tests for both your newly-added test file and the whole `shardformer` module tests.
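To make the protocol above concrete (see the forward reference in Step 1), here is a minimal sketch of a custom policy. It is not code from this patch: `MyModelBlock`, `num_heads`, and `"dense"` are hypothetical placeholders, and the import path and the `shard_config` attribute are assumptions that should be checked against this revision of ColossalAI.

```python
# A hedged sketch, not code from this patch. The import path below is an
# assumption (the module may be named base_policy in other revisions).
from colossalai.shardformer.layer import Linear1D_Col
from colossalai.shardformer.policies.basepolicy import (
    ModulePolicyDescription,
    Policy,
    SubModuleReplacementDescription,
)


class MyModelPolicy(Policy):

    def config_sanity_check(self):
        # assumed abstract hook: validate self.shard_config here
        pass

    def preprocess(self):
        # stage 1: adjust the model before sharding, e.g. pad the
        # vocabulary to a multiple of the tensor-parallel size
        return self.model

    def module_policy(self):
        # stage 2: tell ModelSharder what to substitute; the key names the
        # submodule whose instances are rewritten (placeholder here)
        num_heads = self.model.config.num_heads  # placeholder config field
        tp_size = self.shard_config.tensor_parallel_size  # assumed attribute
        return {
            "MyModelBlock": ModulePolicyDescription(
                # each rank keeps 1/tp_size of the attention heads
                attribute_replacement={"num_heads": num_heads // tp_size},
                # swap the dense projection for its column-parallel version
                sub_module_replacement=[
                    SubModuleReplacementDescription(
                        suffix="dense",
                        target_module=Linear1D_Col,
                    ),
                ],
            )
        }

    def postprocess(self):
        # stage 3: e.g. re-bind weights that were tied before sharding
        return self.model
```

With this in place, the sample API at the top of this patch would apply it via `shard_former.optimize(model, MyModelPolicy())`.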
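Registering it with the autopolicy would then add one hypothetical entry to `_POLICY_LIST`; the `PolicyLocation` keyword names are assumed from its prose description above:

```python
# Hypothetical entry for colossalai/shardformer/policies/autopolicy.py.
# The key is the model class's qualname (model.__class__.__qualname__);
# the value only names the policy file and class, so nothing is imported
# until the policy is actually used.
_POLICY_LIST = {
    "MyModelForMaskedLM": PolicyLocation(file_name="mymodel", class_name="MyModelPolicy"),
}
```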
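For the other case called out above, a module with no sharded weights that should still work with `ModulePolicyDescription.sub_module_replacement`, a bare `from_native_module` is enough, in the spirit of `colossalai/shardformer/layer/normalization.py`. A minimal sketch with a hypothetical `MyLayerNorm`:

```python
import torch.nn as nn


class MyLayerNorm(nn.LayerNorm):
    """Sketch of a drop-in replacement with no sharded weights: it
    implements from_native_module without inheriting ParallelModule."""

    @staticmethod
    def from_native_module(module: nn.LayerNorm, process_group=None, **kwargs) -> "MyLayerNorm":
        # reuse the original parameters; only the wrapping class changes
        new_module = MyLayerNorm(module.normalized_shape,
                                 eps=module.eps,
                                 elementwise_affine=module.elementwise_affine)
        new_module.weight = module.weight
        new_module.bias = module.bias
        return new_module
```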
From 7ff11b5537123b50d8b1b3b0fbaca0fa31d9481b Mon Sep 17 00:00:00 2001
From: binmakeswell
Date: Mon, 17 Jul 2023 21:07:44 +0800
Subject: [PATCH 2/3] [example] add llama pretraining (#4257)

---
 README.md                         | 11 +++++++++++
 docs/README-zh-Hans.md            | 10 ++++++++++
 examples/language/llama/README.md | 11 +++++++++++
 3 files changed, 32 insertions(+)
 create mode 100644 examples/language/llama/README.md

diff --git a/README.md b/README.md
index 34c8a6b730a3..21670e1e59fb 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,7 @@

 ## Latest News
+* [2023/07] [65B Model Pretraining Accelerated by 38%, Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining)
 * [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
 * [2023/03] [Intel and Colossal-AI Partner to Deliver Cost-Efficient Open-Source Solution for Protein Folding Structure Prediction](https://www.hpc-ai.tech/blog/intel-habana)
 * [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs)
@@ -49,6 +50,7 @@
   • Parallel Training Demo
+    • LLaMA
     • GPT-3
     • GPT-2
     • BERT
     • …
@@ -216,6 +218,15 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)

 ## Parallel Training Demo

+### LLaMA
+

      + +

      + +- 65-billion-parameter large model pretraining accelerated by 38% +[[code]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama) +[[blog]](https://www.hpc-ai.tech/blog/large-model-pretraining) + ### GPT-3

      diff --git a/docs/README-zh-Hans.md b/docs/README-zh-Hans.md index 1dde7a816676..e229c65d890c 100644 --- a/docs/README-zh-Hans.md +++ b/docs/README-zh-Hans.md @@ -24,6 +24,7 @@ ## 新闻 +* [2023/07] [65B Model Pretraining Accelerated by 38%, Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining) * [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) * [2023/03] [Intel and Colossal-AI Partner to Deliver Cost-Efficient Open-Source Solution for Protein Folding Structure Prediction](https://www.hpc-ai.tech/blog/intel-habana) * [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs) @@ -49,6 +50,7 @@

    • 并行训练样例展示
        +
      • LLaMA
      • GPT-3
      • GPT-2
      • BERT
      • @@ -209,6 +211,14 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的

        (返回顶端)

        ## 并行训练样例展示 +### LLaMA +

        + +

        + +- 650亿参数大模型预训练加速38% +[[代码]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama) +[[博客]](https://www.hpc-ai.tech/blog/large-model-pretraining) ### GPT-3

        diff --git a/examples/language/llama/README.md b/examples/language/llama/README.md new file mode 100644 index 000000000000..871804f2ca86 --- /dev/null +++ b/examples/language/llama/README.md @@ -0,0 +1,11 @@ +# Pretraining LLaMA: best practices for building LLaMA-like base models + +

        + +

        + +- 65-billion-parameter large model pretraining accelerated by 38% +[[code]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama) +[[blog]](https://www.hpc-ai.tech/blog/large-model-pretraining) + +> Since the main branch is being updated, in order to maintain the stability of the code, this example is temporarily kept as an [independent branch](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama). From e3da89accf62e612d7855f470fc7fef0e3300be0 Mon Sep 17 00:00:00 2001 From: Qianran Ma Date: Tue, 18 Jul 2023 09:32:20 +0800 Subject: [PATCH 3/3] [NFC] polish colossalai/communication/p2p.py code style --- colossalai/communication/p2p.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/colossalai/communication/p2p.py b/colossalai/communication/p2p.py index 1f20fca4f74d..d28d140168fd 100644 --- a/colossalai/communication/p2p.py +++ b/colossalai/communication/p2p.py @@ -1,16 +1,18 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +import operator +from functools import reduce from typing import List, Tuple, Union + import torch import torch.distributed as dist from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.utils import get_current_device -from functools import reduce -import operator -from .utils import split_tensor_into_1d_equal_chunks, gather_split_1d_tensor + +from .utils import gather_split_1d_tensor, split_tensor_into_1d_equal_chunks TensorShape = Union[torch.Size, List[int], Tuple[int]] @@ -260,7 +262,7 @@ def send_forward_recv_backward(output_tensor, next_rank=None, dtype=torch.float, scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]: - """Batched communication operation. Sends the input tensor to the + """Batched communication operation. Sends the input tensor to the next stage in pipeline, while receives the gradient tensor from the next stage in pipeline as the input gradient tensor of this stage. @@ -319,7 +321,7 @@ def send_forward_recv_forward(output_tensor, next_rank=None, dtype=torch.float, scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]: - """Batched communication operation. Sends the input tensor to the + """Batched communication operation. Sends the input tensor to the next stage in pipeline, while receives the output tensor from the previous stage in pipeline as the input of this stage.