Merged #46
2 changes: 1 addition & 1 deletion .github/workflows/README.md
@@ -43,7 +43,7 @@ I will provide the details of each workflow below.

| Workflow Name | File name | Description |
| ---------------------- | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
| `Build on PR` | `build_on_pr.yml` | This workflow is triggered when the label `Run build and Test` is assigned to a PR. It will run all the unit tests in the repository with 4 GPUs. |
| `Build on PR` | `build_on_pr.yml` | This workflow is triggered when a PR changes essential files. It will run all the unit tests in the repository with 4 GPUs. |
| `Build on Schedule` | `build_on_schedule.yml` | This workflow will run the unit tests everyday with 8 GPUs. The result is sent to Lark. |
| `Report test coverage` | `report_test_coverage.yml` | This PR will put up a comment to report the test coverage results when `Build` is done. |

19 changes: 14 additions & 5 deletions .github/workflows/build_on_pr.yml
@@ -2,16 +2,26 @@ name: Build on PR

on:
pull_request:
types: [synchronize, labeled]
types: [synchronize, opened, reopened]
paths:
- '.github/workflows/build_on_pr.yml' # run command & env variables change
- 'colossalai/**' # source code change
- '!colossalai/**.md' # ignore doc change
- 'op_builder/**' # cuda extension change
- '!op_builder/**.md' # ignore doc change
- 'requirements/**' # requirements change
- 'tests/**' # test change
- '!tests/**.md' # ignore doc change
- 'pytest.ini' # test config change
- 'setup.py' # install command change

jobs:
detect:
name: Detect file change
if: |
github.event.pull_request.draft == false &&
github.base_ref == 'main' &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' &&
contains( github.event.pull_request.labels.*.name, 'Run Build and Test')
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
outputs:
changedExtenisonFiles: ${{ steps.find-extension-change.outputs.all_changed_files }}
anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }}
@@ -66,6 +76,7 @@ jobs:
build:
name: Build and Test Colossal-AI
needs: detect
if: needs.detect.outputs.anyLibraryFileChanged == 'true'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
@@ -110,7 +121,6 @@ jobs:
[ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/

- name: Install Colossal-AI
if: needs.detect.outputs.anyLibraryFileChanged == 'true'
run: |
CUDA_EXT=1 pip install -v -e .
pip install -r requirements/requirements-test.txt
@@ -127,7 +137,6 @@
fi

- name: Execute Unit Testing
if: needs.detect.outputs.anyLibraryFileChanged == 'true'
run: |
CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest --testmon --testmon-cov=. tests/
env:
10 changes: 7 additions & 3 deletions .github/workflows/doc_check_on_pr.yml
@@ -47,12 +47,16 @@ jobs:
# we use the versions in the main branch as the guide for versions to display
# checkout will give your merged branch
# therefore, we need to make the merged branch as the main branch
# there is no local main branch, so it's safe to check out a new main branch from the merged branch
# docer will rebase the remote main branch onto the merged branch, so we have to configure the git user
- name: Make the merged branch main
run: |
cd ColossalAI
curBranch=$(git rev-parse --abbrev-ref HEAD)
git checkout main
git merge $curBranch # fast-forward master up to the merge
git checkout -b main
git branch -u origin/main
git config user.name 'github-actions'
git config user.email 'github-actions@github.com'


- name: Build docs
run: |
2 changes: 1 addition & 1 deletion .github/workflows/doc_test_on_pr.yml
@@ -86,7 +86,7 @@ jobs:
- name: Test the Doc
run: |
source activate pytorch
for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
for file in ${{ needs.detect-changed-doc.outputs.changed_files }}; do
echo "Testing $file now..."
docer test -p $file
done
16 changes: 13 additions & 3 deletions README.md
@@ -127,12 +127,22 @@ distributed training and inference in a few lines.
### ColossalChat

<div align="center">
<a href="https://chat.colossalai.org/">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Chat-demo.png" width="700" />
<a href="https://www.youtube.com/watch?v=HcTiHzApHm0">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20YouTube.png" width="700" />
</a>
</div>

[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): An open-source solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline. [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) [[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) [[demo]](https://chat.colossalai.org)
[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): An open-source solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline.
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat)
[[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
[[demo]](https://www.youtube.com/watch?v=HcTiHzApHm0)
[[tutorial]](https://www.youtube.com/watch?v=-qFBZFmOJfg)

<p id="ColossalChat-Speed" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20Speed.jpg" width=450/>
</p>

- Up to 10 times faster for RLHF PPO Stage3 Training

<p id="ColossalChat_scaling" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT%20scaling.png" width=800/>
22 changes: 18 additions & 4 deletions applications/Chat/README.md
@@ -67,13 +67,24 @@ More details can be found in the latest news.
* [2023/02] [Open Source Solution Replicates ChatGPT Training Process! Ready to go with only 1.6GB GPU Memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)

## Online demo
You can experience the performance of Coati7B on this page.
<div align="center">
<a href="https://www.youtube.com/watch?v=HcTiHzApHm0">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20YouTube.png" width="700" />
</a>
</div>

[chat.colossalai.org](https://chat.colossalai.org/)
[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): An open-source solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline.
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat)
[[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
[[demo]](https://www.youtube.com/watch?v=HcTiHzApHm0)
[[tutorial]](https://www.youtube.com/watch?v=-qFBZFmOJfg)

<p id="ColossalChat-Speed" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20Speed.jpg" width=450/>
</p>

Due to resource constraints, we will only provide this service from 29th Mar 2023 to 5 April 2023. However, we have provided the inference code in the [inference](./inference/) folder. The WebUI will be open-sourced soon as well.
> DeepSpeedChat performance comes from its blog on 12 April 2023; ColossalChat performance can be reproduced on an AWS p4d.24xlarge node with 8 A100-40G GPUs with the following command: `torchrun --standalone --nproc_per_node 8 benchmark_opt_lora_dummy.py --max_timesteps 1 --update_timesteps 1 --use_kernels --strategy colossalai_zero2 --experience_batch_size 64 --train_batch_size 32`

> Warning: Due to model and dataset size limitations, Coati is just a baby model; Coati7B may output incorrect information and lack the ability for multi-turn dialogue. There is still significant room for improvement.
## Install

### Install the environment
@@ -112,12 +123,14 @@ Here is how we collected the data
Stage1 is supervised instructs fine-tuning, which uses the datasets mentioned earlier to fine-tune the model.

You can run the `examples/train_sft.sh` to start a supervised instructs fine-tuning.
[[Stage1 tutorial video]](https://www.youtube.com/watch?v=-qFBZFmOJfg)

### RLHF Training Stage2 - Training reward model

Stage2 trains a reward model, which obtains corresponding scores by manually ranking different outputs for the same prompt and supervises the training of the reward model.

You can run the `examples/train_rm.sh` to start a reward model training.
[[Stage2 tutorial video]](https://www.youtube.com/watch?v=gMx2CApKhuo)

### RLHF Training Stage3 - Training model with reinforcement learning by human feedback

@@ -128,6 +141,7 @@ Stage3 uses reinforcement learning algorithm, which is the most complex part of
</p>

You can run the `examples/train_prompts.sh` to start training PPO with human feedback.
[[Stage3 tutorial video]](https://www.youtube.com/watch?v=Z8wwSHxPL9g)

For more details, see [`examples/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples).

2 changes: 1 addition & 1 deletion applications/Chat/coati/dataset/prompt_dataset.py
@@ -45,7 +45,7 @@ def __init__(self,
self.keyed_prompt[k].extend(tensor.to(torch.cuda.current_device()).unbind())

def __len__(self):
return len(self.keyed_prompt)
return len(self.keyed_prompt["input_ids"])

def __getitem__(self, i) -> Dict[str, torch.Tensor]:
return {k: v[i] for k, v in self.keyed_prompt.items()}
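The `__len__` fix above matters because `keyed_prompt` is a dict keyed by tensor field name: calling `len()` on the dict returns the number of fields, not the number of samples. A minimal sketch of the difference, with plain lists standing in for the unbound tensors:

```python
from collections import defaultdict

# Hypothetical miniature of PromptDataset's storage: a dict mapping each
# tensor field name to a list of per-sample entries.
keyed_prompt = defaultdict(list)
keyed_prompt["input_ids"] = [[0, 1], [2, 3], [4, 5]]        # 3 samples
keyed_prompt["attention_mask"] = [[1, 1], [1, 1], [1, 1]]   # 3 samples

buggy_len = len(keyed_prompt)                # counts keys   -> 2
fixed_len = len(keyed_prompt["input_ids"])   # counts samples -> 3
```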
5 changes: 4 additions & 1 deletion applications/Chat/examples/README.md
@@ -48,6 +48,7 @@ The following pic shows how we collected the data.
## Stage1 - Supervised instructs tuning

Stage1 is supervised instructs fine-tuning, which uses the datasets mentioned earlier to fine-tune the model.
[[Stage1 tutorial video]](https://www.youtube.com/watch?v=-qFBZFmOJfg)

You can run the `examples/train_sft.sh` to start a supervised instructs fine-tuning.

@@ -83,6 +84,7 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
## Stage2 - Training reward model

We train a reward model in stage 2, which obtains corresponding scores by manually ranking different outputs for the same prompt and supervises the training of the reward model.
[[Stage2 tutorial video]](https://www.youtube.com/watch?v=gMx2CApKhuo)

You can run the `examples/train_rm.sh` to start a reward model training.

@@ -141,6 +143,7 @@ Stage3 uses reinforcement learning algorithm, which is the most complex part of

You can run the `examples/train_prompts.sh` to start PPO training.
You can also use the cmd following to start PPO training.
[[Stage3 tutorial video]](https://www.youtube.com/watch?v=Z8wwSHxPL9g)

```
torchrun --standalone --nproc_per_node=4 train_prompts.py \
@@ -153,7 +156,7 @@ torchrun --standalone --nproc_per_node=4 train_prompts.py \
--rm_path /your/rm/model/path
```

Prompt dataset: the instruction dataset mentioned in the above figure which includes the instructions, e.g. you can use the [script](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/example_data_reformat.py) to reformat [seed_prompts_ch.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_ch.jsonl) or [seed_prompts_en.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_en.jsonl) in InstructionWild.
Prompt dataset: the instruction dataset mentioned in the above figure which includes the instructions, e.g. you can use the [script](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/generate_prompt_dataset.py) which samples `instinwild_en.json` or `instinwild_ch.json` in [InstructionWild](https://github.com/XueFuzhao/InstructionWild/tree/main/data#instructwild-data) to generate the prompt dataset.
Pretrain dataset: the pretrain dataset including the instruction and corresponding response, e.g. you can use the [InstructWild Data](https://github.com/XueFuzhao/InstructionWild/tree/main/data) in stage 1 supervised instructs tuning.

### Arg List
12 changes: 0 additions & 12 deletions applications/Chat/examples/example_data_reformat.py

This file was deleted.

30 changes: 30 additions & 0 deletions applications/Chat/examples/generate_prompt_dataset.py
@@ -0,0 +1,30 @@
import argparse

import random
import json

random.seed(42)


def sample(args):
with open(args.dataset_path, mode='r') as f:
dataset_list = json.load(f)

sampled_dataset = [{"instruction": sample["instruction"], "id":idx}
for idx, sample in enumerate(random.sample(dataset_list, args.sample_size))]

with open(args.save_path, mode='w') as f:
json.dump(sampled_dataset, f, indent=4,
default=str, ensure_ascii=False)


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--dataset_path', type=str, default=None,
required=True, help="path to the pretrain dataset")
parser.add_argument('--save_path', type=str, default='prompt.json',
help="path to save the prompt dataset")
parser.add_argument('--sample_size', type=int,
default=16384, help="size of the prompt dataset")
args = parser.parse_args()
sample(args)
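The new script's core logic is a fixed-seed random sample that keeps only the `instruction` field and assigns fresh sequential ids. A minimal sketch of that logic against a synthetic in-memory dataset (the record layout mirrors what the script reads from `instinwild_en.json`, but the data here is made up):

```python
import json
import random

random.seed(42)  # same fixed seed as generate_prompt_dataset.py

# Synthetic stand-in for the InstructionWild file: records with an "instruction" field.
dataset_list = [{"instruction": f"task {i}", "output": "..."} for i in range(100)]

# Sample, keep only the instruction, and re-index with a sequential id.
sample_size = 10
sampled_dataset = [{"instruction": rec["instruction"], "id": idx}
                   for idx, rec in enumerate(random.sample(dataset_list, sample_size))]

# The script would then dump this to --save_path; here we just serialize it.
prompt_json = json.dumps(sampled_dataset, indent=4, ensure_ascii=False)
```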
2 changes: 1 addition & 1 deletion colossalai/amp/torch_amp/_grad_scaler.py
@@ -240,7 +240,7 @@ def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16):
for grads in per_dtype_grads.values():
torch._amp_foreach_non_finite_check_and_unscale_(grads, per_device_found_inf.get(device),
per_device_inv_scale.get(device))
# For tensor parallel paramters it should be all-reduced over tensor parallel process group
# For tensor parallel parameters it should be all-reduced over tensor parallel process group
if gpc.is_initialized(ParallelMode.MODEL) and gpc.get_world_size(ParallelMode.MODEL) > 1:
vals = [val for val in per_device_found_inf._per_device_tensors.values()]
coalesced = _flatten_dense_tensors(vals)
@@ -325,7 +325,7 @@ def matmul_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, L
else:
_is_batch_dims_same = False

# retireve dimensions
# retrieve dimensions
input_dim_00 = input_tensors[0].shape[-2]
input_dim_01 = input_tensors[0].shape[-1]
input_dim_10 = input_tensors[1].shape[-2]
2 changes: 1 addition & 1 deletion colossalai/auto_parallel/passes/runtime_apply_pass.py
@@ -219,7 +219,7 @@ def _comm_spec_apply(gm: torch.fx.GraphModule):
return gm


def _act_annotataion_pass(gm: torch.fx.GraphModule):
def _act_annotation_pass(gm: torch.fx.GraphModule):
"""
This pass is used to add the act annotation to the new inserted nodes.
"""
4 changes: 2 additions & 2 deletions colossalai/auto_parallel/passes/runtime_preparation_pass.py
@@ -54,7 +54,7 @@ def size_processing(size: Union[int, torch.Size],
return size


def solution_annotatation_pass(gm: torch.fx.GraphModule, solution: List[int],
def solution_annotation_pass(gm: torch.fx.GraphModule, solution: List[int],
strategies_constructor: StrategiesConstructor):
"""
This method is used to stick the solution strategy to the nodes and add the information
@@ -496,7 +496,7 @@ def runtime_preparation_pass(gm: torch.fx.GraphModule,
device_mesh: DeviceMesh,
strategies_constructor: StrategiesConstructor,
overlap=False):
gm, sharding_spec_convert_dict, origin_node_sharding_spec_dict, comm_actions_dict = solution_annotatation_pass(
gm, sharding_spec_convert_dict, origin_node_sharding_spec_dict, comm_actions_dict = solution_annotation_pass(
gm, solution, strategies_constructor)
gm = size_value_converting_pass(gm, device_mesh)
gm = node_args_converting_pass(gm, device_mesh)
@@ -75,7 +75,7 @@ def update_resharding_cost(self, strategy: ShardingStrategy) -> None:
prev_strategy.get_sharding_spec_by_name(node_name) for prev_strategy in prev_strategy_vector
]

# create data structrure to store costs
# create data structure to store costs
if node not in resharding_costs:
resharding_costs[node] = []

@@ -24,7 +24,7 @@ class BatchNormStrategyGenerator(StrategyGenerator):
To keep the math consistency, there are two way to do BatchNorm if the input
shards on batch dimension:
1. We gather the input partitions through batch dimension, then do the normal BatchNorm.
2. We do the SyncBatchNorm on the each input partition seperately, the SyncBN op will help
2. We do the SyncBatchNorm on the each input partition separately, the SyncBN op will help
us to keep the computing correctness.
In this generator, both methods will be considered.
"""
@@ -212,7 +212,7 @@ def split_input_batch(self, mesh_dim_0):

# set communication action
# For SyncBN case, we don't need to do communication for weight and bias.
# TODO: the communication happens interally at SyncBN operation. We need to replace the BN operation
# TODO: the communication happens internally at SyncBN operation. We need to replace the BN operation
# to SyncBN operation instead of inserting a communication node.
output_comm_action = self.get_communication_action(
sharding_spec=sharding_spec_mapping["output"],
@@ -250,7 +250,7 @@ def split_input_batch_1d(self, mesh_dim_0, mesh_dim_1):

# set communication action
# For SyncBN case, we don't need to do communication for gradients of weight and bias.
# TODO: the communication happens interally at SyncBN operation. We need to replace the BN operation
# TODO: the communication happens internally at SyncBN operation. We need to replace the BN operation
# to SyncBN operation instead of inserting a communication node.
output_comm_action = self.get_communication_action(
sharding_spec=sharding_spec_mapping["output"],
@@ -298,7 +298,7 @@ def split_input_both_dim(self, mesh_dim_0, mesh_dim_1):

# set communication action
# For SyncBN case, we don't need to do communication for gradients of weight and bias.
# TODO: the communication happens interally at SyncBN operation. We need to replace the BN operation
# TODO: the communication happens internally at SyncBN operation. We need to replace the BN operation
# to SyncBN operation instead of inserting a communication node.
output_comm_action = self.get_communication_action(
sharding_spec=sharding_spec_mapping["output"],
@@ -51,7 +51,7 @@ def update_memory_cost(self, strategy: ShardingStrategy) -> ShardingStrategy:

# compute fwd memory cost in bytes
# as the elementwise ops are not memory-intensive
# we approximate the fwd memroy cost to be the output
# we approximate the fwd memory cost to be the output
# and the backward memory cost to be grad of input and other
input_bytes = self._compute_size_in_bytes(strategy, 'input')
other_bytes = self._compute_size_in_bytes(strategy, 'other')