Closed
Changes from all commits · 426 commits
1a60dc0
[chat] typo accimulation_steps -> accumulation_steps (#3662)
tanitna Apr 28, 2023
bfbf650
fix spelling error
digger-yu May 4, 2023
8ba7858
Update generate_gpt35_answers.py
digger-yu May 4, 2023
7bd0bee
[chat] add opt attn kernel (#3655)
ver217 May 4, 2023
6650dae
[doc] fix chat spelling error (#3671)
digger-yu May 5, 2023
0f785cb
[chat] PPO stage3 doc enhancement (#3679)
Camille7777 May 5, 2023
307894f
[booster] gemini plugin support shard checkpoint (#3610)
flybird11111 May 5, 2023
b36e67c
Merge pull request #3680 from digger-yu/digger-yu-patch-2
TongLi3701 May 5, 2023
b49020c
[CI] Update test_sharded_optim_with_sync_bn.py (#3688)
digger-yu May 5, 2023
d0915f5
[booster] refactor all dp fashion plugins (#3684)
ver217 May 5, 2023
65bdc31
fix some spelling error with applications/Chat/examples/ (#3692)
digger-yu May 6, 2023
d556648
[example] add finetune bert with booster example (#3693)
ver217 May 6, 2023
2da5d81
[chat] fix train_prompts.py gemini strategy bug (#3666)
zhang-yi-chi May 6, 2023
2629f97
[tensor] Refactor handle_trans_spec in DistSpecManager
yhna940 May 6, 2023
f83ea81
[example] add train resnet/vit with booster example (#3694)
ver217 May 8, 2023
3bf09ef
[booster] update prepare dataloader method for plugin (#3706)
ver217 May 8, 2023
6552cbf
[booster] fix no_sync method (#3709)
ver217 May 9, 2023
20068ba
[booster] add tests for ddp and low level zero's checkpointio (#3715)
flybird11111 May 10, 2023
f7361ee
[chat] fix community example ray (#3719)
MisterLin1995 May 10, 2023
b7141c3
[CI] fix some spelling errors (#3707)
digger-yu May 10, 2023
899aa86
[CI] fix typo with tests components (#3695)
digger-yu May 11, 2023
1f73609
[CI] fix typo with tests/ etc. (#3727)
digger-yu May 11, 2023
ad6460c
[NFC] fix typo applications/ and colossalai/ (#3735)
digger-yu May 15, 2023
b37797e
[booster] support torch fsdp plugin in booster (#3697)
wukong1992 May 15, 2023
afb239b
[devops] update torch version of CI (#3725)
ver217 May 15, 2023
6050f37
[booster] removed models that don't support fsdp (#3744)
wukong1992 May 15, 2023
7386c66
[fix] Add init to fix import error when importing _analyzer (#3668)
Wesley-Jzy May 16, 2023
1baeb39
[NFC] fix typo with colossalai/auto_parallel/tensor_shard (#3742)
digger-yu May 17, 2023
c03bd7c
[devops] make build on PR run automatically (#3748)
ver217 May 17, 2023
5dd573c
[devops] fix ci for document check (#3751)
ver217 May 17, 2023
0575983
[chat] fix bugs in stage 3 training (#3759)
chengeharrison May 17, 2023
d449525
[doc] update booster tutorials (#3718)
flybird11111 May 18, 2023
15024e4
[auto] fix install cmd (#3772)
binmakeswell May 18, 2023
48bd056
[doc] update hybrid parallelism doc (#3770)
flybird11111 May 18, 2023
2703a37
[amp] Add naive amp demo (#3774)
flybird11111 May 18, 2023
5452df6
[plugin] torch ddp plugin supports sharded model checkpoint (#3775)
ver217 May 18, 2023
5ce6c9d
[doc] add tutorial for cluster utils (#3763)
ver217 May 19, 2023
21e29e2
[doc] add tutorial for booster plugins (#3758)
ver217 May 19, 2023
32f81f1
[NFC] fix typo colossalai/amp auto_parallel autochunk (#3756)
digger-yu May 19, 2023
b4788d6
[devops] fix doc test on pr (#3782)
ver217 May 19, 2023
ad2cf58
[chat] add performance and tutorial (#3786)
binmakeswell May 19, 2023
60e6a15
[doc] add tutorial for booster checkpoint (#3785)
ver217 May 19, 2023
3c07a28
[plugin] a workaround for zero plugins' optimizer checkpoint (#3780)
ver217 May 19, 2023
72688ad
[doc] add booster docstring and fix autodoc (#3789)
ver217 May 22, 2023
d9393b8
[doc] add deprecated warning on doc Basics section (#3754)
Yanjia0 May 22, 2023
fe1561a
[doc] update gradient cliping document (#3778)
flybird11111 May 22, 2023
62c7e67
[format] applied code formatting on changed files in pull request 378…
github-actions[bot] May 22, 2023
4d29c0f
Fix/docker action (#3266)
liuzeming-yuxi May 22, 2023
788e07d
[workflow] fixed the docker build workflow (#3794)
FrankLeeeee May 22, 2023
f5c425c
fixed the example docstring for booster (#3795)
FrankLeeeee May 22, 2023
ef02d7e
[doc] update gradient accumulation (#3771)
flybird11111 May 23, 2023
ad93c73
[workflow] enable testing for develop & feature branch (#3801)
FrankLeeeee May 23, 2023
615e2e5
[test] fixed lazy init test import error (#3799)
FrankLeeeee May 23, 2023
e871e34
[API] add docstrings and initialization to apex amp, naive amp (#3783)
flybird11111 May 23, 2023
9265f2d
[NFC]fix typo colossalai/auto_parallel nn utils etc. (#3779)
digger-yu May 23, 2023
8c62e50
[doc] update amp document
flybird11111 May 23, 2023
1167bf5
[doc] update amp document
flybird11111 May 23, 2023
a520610
[doc] update amp document
flybird11111 May 23, 2023
75272ef
[doc] add removed warning
flybird11111 May 23, 2023
c425a69
[doc] add removed change of config.py
flybird11111 May 23, 2023
6b305a9
[booster] torch fsdp fix ckpt (#3788)
wukong1992 May 23, 2023
19d1530
[doc] add warning about fsdp plugin (#3813)
ver217 May 23, 2023
1e3b64f
[workflow] enblaed doc build from a forked repo (#3815)
FrankLeeeee May 23, 2023
8aa1fb2
[doc]fix
flybird11111 May 23, 2023
278fcbc
[doc]fix
flybird11111 May 23, 2023
725365f
Merge pull request #3810 from jiangmingyan/amp
flybird11111 May 23, 2023
7f8203a
fix typo colossalai/auto_parallel autochunk fx/passes etc. (#3808)
digger-yu May 24, 2023
269150b
[Docker] Fix a couple of build issues (#3691)
ymwangg May 24, 2023
05b8a8d
[workflow] changed to doc build to be on schedule and release (#3825)
FrankLeeeee May 24, 2023
3496637
[evaluation] add automatic evaluation pipeline (#3821)
chengeharrison May 24, 2023
e90fdb1
fix typo docs/
digger-yu May 24, 2023
518b31c
[docs] change placememt_policy to placement_policy (#3829)
digger-yu May 24, 2023
84500b7
[workflow] fixed testmon cache in build CI (#3806)
FrankLeeeee May 24, 2023
7c9f2ed
[dtensor] polish sharding spec docstring (#3838)
ver217 May 25, 2023
3229f93
[booster] add warning for torch fsdp plugin doc (#3833)
wukong1992 May 25, 2023
54e97ed
[workflow] supported test on CUDA 10.2 (#3841)
FrankLeeeee May 25, 2023
a64df3f
[doc] update document of gemini instruction. (#3842)
flybird11111 May 25, 2023
e2d81eb
[nfc] fix typo colossalai/ applications/ (#3831)
digger-yu May 25, 2023
d42b1be
[release] bump to v0.3.0 (#3830)
FrankLeeeee May 25, 2023
ae959a7
[workflow] fixed workflow check for docker build (#3849)
FrankLeeeee May 25, 2023
b047487
[doc] update nvme offload documents. (#3850)
flybird11111 May 25, 2023
2506e27
[evaluation] improvement on evaluation (#3862)
chengeharrison May 30, 2023
5f79008
[example] update gemini examples (#3868)
flybird11111 May 30, 2023
281b33f
[doc] update document of zero with chunk. (#3855)
flybird11111 May 30, 2023
46503c3
Modify torch version requirement to adapt torch 2.0
MaruyamaAya Jun 1, 2023
70c8cde
[nfc] fix typo colossalai/cli fx kernel (#3847)
digger-yu Jun 2, 2023
60ec33b
Add a new example of Dreambooth training using the booster API
MaruyamaAya Jun 2, 2023
42e3232
roll back
MaruyamaAya Jun 2, 2023
25447d4
modify path
MaruyamaAya Jun 5, 2023
dbb3269
[lazy] refactor lazy init (#3891)
ver217 Jun 5, 2023
8065cc5
Modify torch version requirement to adapt torch 2.0 (#3896)
MaruyamaAya Jun 5, 2023
07cb211
[doc]update moe chinese document. (#3890)
flybird11111 Jun 5, 2023
ae02d4e
[bf16] add bf16 support (#3882)
ver217 Jun 5, 2023
1878749
[nfc] fix typo colossalai/nn (#3887)
digger-yu Jun 5, 2023
57a6d76
support evaluation for english (#3880)
chengeharrison Jun 5, 2023
ec9bbc0
[devops] improving testmon cache (#3902)
ver217 Jun 6, 2023
c1535cc
[doc] fix docs about booster api usage (#3898)
Fridge003 Jun 6, 2023
0e484e6
[nfc]fix typo colossalai/pipeline tensor nn (#3899)
digger-yu Jun 6, 2023
176010f
update performance evaluation
MaruyamaAya Jun 6, 2023
b56c7f4
update shell file
MaruyamaAya Jun 6, 2023
1c1f71c
fixing insecure hash function
MaruyamaAya Jun 6, 2023
b29e1f0
change directory
MaruyamaAya Jun 6, 2023
d3379f0
fixed model saving bugs
MaruyamaAya Jun 6, 2023
79c9f77
fixed port
MaruyamaAya Jun 6, 2023
b4437e8
fixed port
MaruyamaAya Jun 6, 2023
41fb723
[devops] hotfix CI about testmon cache (#3910)
ver217 Jun 6, 2023
b5f0566
[chat] add distributed PPO trainer (#3740)
ver217 Jun 7, 2023
4fc8bc6
modify file path
MaruyamaAya Jun 7, 2023
9c88b6c
[lazy] fix compatibility problem on torch 1.13 (#3911)
ver217 Jun 7, 2023
c622bb3
Merge pull request #3915 from FrankLeeeee/update/develop
FrankLeeeee Jun 7, 2023
d51e83d
Merge pull request #3916 from FrankLeeeee/sync/dtensor-with-develop
FrankLeeeee Jun 7, 2023
c25d421
[devops] hotfix testmon cache clean logic (#3917)
ver217 Jun 7, 2023
5e2132d
[workflow] added docker latest tag for release (#3920)
FrankLeeeee Jun 7, 2023
a55fb00
[booster] update bert example, using booster api (#3885)
wukong1992 Jun 7, 2023
b306cec
[example] Modify palm example with the new booster API (#3913)
MaruyamaAya Jun 7, 2023
a9d1cad
fix typo with colossalai/trainer utils zero (#3908)
digger-yu Jun 7, 2023
c94a335
modify shell for check
MaruyamaAya Jun 7, 2023
12c90db
[doc] add lazy init tutorial (#3922)
ver217 Jun 7, 2023
de0d7df
[nfc] fix typo colossalai/zero (#3923)
digger-yu Jun 7, 2023
9166988
[devops] update torch version in compability test (#3919)
ver217 Jun 8, 2023
eb39154
[dtensor] updated api and doc (#3845)
FrankLeeeee Jun 8, 2023
cf4792c
modify shell for check
MaruyamaAya Jun 8, 2023
e417dd0
[example] update opt example using booster api (#3918)
Fridge003 Jun 8, 2023
039854b
modify shell for check
MaruyamaAya Jun 8, 2023
49567d5
modify shell for check
MaruyamaAya Jun 8, 2023
730a092
modify shell for check
MaruyamaAya Jun 8, 2023
407aa48
fix typo examples/community/roberta (#3925)
digger-yu Jun 8, 2023
a98e16e
Merge pull request #3926 from hpcaitech/feature/dtensor
FrankLeeeee Jun 8, 2023
9b5e7ce
modify shell for check
MaruyamaAya Jun 8, 2023
6a69b44
[shardformer] init shardformer code structure (#3731)
FoolPlayer May 22, 2023
58f6432
[shardformer]: Feature/shardformer, add some docstring and readme (#3…
FoolPlayer May 24, 2023
bc19024
[shardformer] updated readme (#3827)
FrankLeeeee May 24, 2023
537a52b
[shardformer] refactored the user api (#3828)
FrankLeeeee May 24, 2023
997544c
[shardformer] update readme with modules implement doc (#3834)
FoolPlayer May 24, 2023
21a3915
[shardformer] add Dropout layer support different dropout pattern (#3…
FoolPlayer Jun 1, 2023
6370a93
update README (#3909)
FoolPlayer Jun 6, 2023
ef15377
[shardformer] add gpt2 policy and modify shard and slicer to support …
FoolPlayer Jun 7, 2023
33eef71
fix typo examples and docs (#3932)
digger-yu Jun 8, 2023
21c4c0b
support UniEval and add CHRF metric (#3924)
chengeharrison Jun 8, 2023
e277534
Merge pull request #3905 from MaruyamaAya/dreambooth
MaruyamaAya Jun 9, 2023
24651fd
Merge pull request #3931 from FrankLeeeee/sync/develop-to-shardformer
FoolPlayer Jun 9, 2023
ddcf58c
Revert "[sync] sync feature/shardformer with develop"
FrankLeeeee Jun 9, 2023
bd2c7c3
Merge pull request #3942 from hpcaitech/revert-3931-sync/develop-to-s…
FoolPlayer Jun 9, 2023
bd1ab98
[gemini] fixed the gemini checkpoint io (#3934)
FrankLeeeee Jun 9, 2023
e61ffc7
fix typo tests/ (#3936)
digger-yu Jun 9, 2023
1aadeed
fix typo .github/workflows/scripts/ (#3946)
digger-yu Jun 9, 2023
4110d1f
[workflow] cancel duplicated workflow jobs (#3960)
FrankLeeeee Jun 12, 2023
b3ab7fb
[example] update ViT example using booster api (#3940)
Jun 12, 2023
71fe527
[gemini] fixed the gemini checkpoint io (#3934)
FrankLeeeee Jun 9, 2023
6718a2f
[workflow] cancel duplicated workflow jobs (#3960)
FrankLeeeee Jun 12, 2023
2bf6547
Merge pull request #3967 from ver217/update-develop
FrankLeeeee Jun 12, 2023
9d02590
[chat] refactor actor class (#3968)
cwher Jun 13, 2023
8bcad73
[workflow] fixed the directory check in build (#3980)
FrankLeeeee Jun 13, 2023
2925f47
[evaluate] support gpt evaluation with reference (#3972)
chengeharrison Jun 13, 2023
e8ad3c8
[doc] add a note about unit-testing to CONTRIBUTING.md (#3970)
Jun 14, 2023
d4fb7bf
fix typo applications/Chat/coati/ (#3947)
digger-yu Jun 15, 2023
c9cff7e
[checkpointio] General Checkpointing of Sharded Optimizers (#3984)
Jun 15, 2023
725af3e
[booster] make optimizer argument optional for boost (#3993)
cwher Jun 15, 2023
822c3d4
[checkpointio] sharded optimizer checkpoint for DDP plugin (#4002)
Jun 16, 2023
a5883aa
[test] fixed codefactor format report (#4026)
FrankLeeeee Jun 16, 2023
ca768eb
Merge pull request #4025 from hpcaitech/develop
FrankLeeeee Jun 19, 2023
727c459
[nfc] fix dim not defined and fix typo (#3991)
digger-yu Jun 19, 2023
160c64c
[example] fix bucket size in example of gpt gemini (#4028)
Gy-Lu Jun 19, 2023
a52f620
[format] applied code formatting on changed files in pull request 402…
github-actions[bot] Jun 19, 2023
4a81faa
[devops] fix build on pr ci (#4043)
ver217 Jun 19, 2023
b463651
[workflow] cover all public repositories in weekly report (#4069)
FrankLeeeee Jun 22, 2023
0bb0b48
[gemini] fix argument naming during chunk configuration searching
Jun 25, 2023
153b957
[chat] refactor strategy class with booster api (#3987)
cwher Jun 25, 2023
2c8ae37
Merge pull request #4056 from Fridge003/hotfix/fix_gemini_chunk_confi…
Jun 25, 2023
e89b127
[chat]: fix chat evaluation possible bug (#4064)
MichelleMa8 Jun 26, 2023
4da324c
[hotfix]fix argument naming in docs and examples (#4083)
Jun 26, 2023
95e95b6
[testing] move pytest to be inside the function (#4087)
FrankLeeeee Jun 27, 2023
31dc302
[examples] copy resnet example to image (#4090)
CjhHa1 Jun 27, 2023
1ee947f
[workflow] added status check for test coverage workflow (#4106)
FrankLeeeee Jun 28, 2023
2d40759
fix #3852 path error (#4058)
digger-yu Jun 28, 2023
769cddc
fix typo docs/ (#4033)
digger-yu Jun 28, 2023
711e2b4
[doc] update and revise some typos and errs in docs (#4107)
CjhHa1 Jun 28, 2023
b03d64d
[chat] refactor trainer class (#4080)
cwher Jun 29, 2023
edd75a5
[chat] remove naive strategy and split colossalai strategy (#4094)
cwher Jun 29, 2023
09fe9dc
[nfc]fix ColossalaiOptimizer is not defined (#4122)
digger-yu Jun 30, 2023
7e46bc8
fix CheckpointIndexFile is not defined (#4109)
digger-yu Jul 3, 2023
8abc877
fix Tensor is not defined (#4129)
digger-yu Jul 3, 2023
1350ece
[hotfix] fix import bug in checkpoint_io (#4142)
Jul 3, 2023
3d8d5d0
[chat] use official transformers and fix some issues (#4117)
cwher Jul 4, 2023
8d68de7
[shardformer] init shardformer code structure (#3731)
FoolPlayer May 22, 2023
8cc1123
[shardformer]: Feature/shardformer, add some docstring and readme (#3…
FoolPlayer May 24, 2023
235792f
[shardformer] updated readme (#3827)
FrankLeeeee May 24, 2023
4972e1f
[shardformer] refactored the user api (#3828)
FrankLeeeee May 24, 2023
c594dc2
[shardformer] update readme with modules implement doc (#3834)
FoolPlayer May 24, 2023
ab8a47f
[shardformer] add Dropout layer support different dropout pattern (#3…
FoolPlayer Jun 1, 2023
70173e3
update README (#3909)
FoolPlayer Jun 6, 2023
79f8d5d
[shardformer] add gpt2 policy and modify shard and slicer to support …
FoolPlayer Jun 7, 2023
f1cb5ac
[shardformer] Align bert value (#3907)
FoolPlayer Jun 9, 2023
a731304
[shardformer] Unit test (#3928)
FoolPlayer Jun 12, 2023
45927d5
[shardformer] Add dropout layer in shard model and refactor policy ap…
FoolPlayer Jun 12, 2023
6b30dfb
[shardformer] support llama model using shardformer (#3969)
wukong1992 Jun 13, 2023
c1c672d
[shardformer] shardformer support t5 model (#3994)
wukong1992 Jun 15, 2023
f7774ec
[Shardformer] Downstream bert (#3979)
FoolPlayer Jun 15, 2023
a2f9af8
[shardformer] fix an error in readme (#3988)
FoolPlayer Jun 15, 2023
6119712
[device] support init device mesh from process group (#3990)
FrankLeeeee Jun 15, 2023
d3bc530
[shardformer] Refactor shardformer api (#4001)
FoolPlayer Jun 15, 2023
015af59
[shardformer] integrated linear 1D with dtensor (#3996)
FrankLeeeee Jun 15, 2023
dfca967
integrate with dist layer (#4011)
FoolPlayer Jun 16, 2023
3893fa1
[shardformer] refactored embedding and dropout to parallel module (#4…
FrankLeeeee Jun 16, 2023
45d9384
[shardformer] removed inplace tensor sharding (#4018)
FrankLeeeee Jun 16, 2023
507c0ad
add vocabembedding layer
FoolPlayer Jun 16, 2023
df018fc
support bert with new api
FoolPlayer Jun 16, 2023
e253a07
[shardformer] updated doc (#4016)
FrankLeeeee Jun 16, 2023
74d176c
[shardformer] fix bert and gpt downstream with new api (#4024)
FoolPlayer Jun 19, 2023
c1d5453
[shardformer] adapted llama to the new API (#4036)
FrankLeeeee Jun 19, 2023
d857f3d
[shardformer] supported T5 and its variants (#4045)
FrankLeeeee Jun 19, 2023
4021b9a
[shardformer] add gpt2 test and layer class refactor (#4041)
FoolPlayer Jun 20, 2023
58df720
[shardformer] adapted T5 and LLaMa test to use kit (#4049)
FrankLeeeee Jun 21, 2023
f22ddac
[shardformer] refactored the shardformer layer structure (#4053)
FrankLeeeee Jun 21, 2023
7740c55
support kit use for bert/gpt test (#4055)
FoolPlayer Jun 22, 2023
8eb09a4
[shardformer] support module saving and loading (#4062)
FrankLeeeee Jun 22, 2023
0803a61
[shardformer] add linearconv1d test (#4067)
FoolPlayer Jun 22, 2023
70c58cf
[shardformer] supported fused qkv checkpoint (#4073)
FrankLeeeee Jun 23, 2023
92f6791
[shardformer] Add layernorm (#4072)
FoolPlayer Jun 23, 2023
c4b1b65
[test] fixed tests failed due to dtensor change (#4082)
FrankLeeeee Jun 26, 2023
d33a44e
[shardformer] refactored layernorm (#4086)
FrankLeeeee Jun 26, 2023
ac80937
[shardformer] shardformer support opt models (#4091)
flybird11111 Jun 27, 2023
8af29ee
[shardformer] support vision transformer (#4096)
klhhhhh Jun 28, 2023
b1c2901
[shardformer] supported bloom model (#4098)
FrankLeeeee Jun 28, 2023
f3b6aaa
[shardformer] supported fused normalization (#4112)
FrankLeeeee Jun 30, 2023
6a88bae
[shardformer] integrate with data parallelism (#4103)
FrankLeeeee Jun 30, 2023
44a190e
[shardformer] import huggingface implicitly (#4101)
FrankLeeeee Jun 30, 2023
ae035d3
[shardformer] added embedding gradient check (#4124)
FrankLeeeee Jun 30, 2023
7f9b303
[shardformer] write an shardformer example with bert finetuning (#4126)
flybird11111 Jun 30, 2023
74257cb
[shardformer] refactored some doc and api (#4137)
FrankLeeeee Jul 3, 2023
1fb0d95
[shardformer] made tensor parallelism configurable (#4144)
FrankLeeeee Jul 4, 2023
89f45ed
[shardformer] added development protocol for standardization (#4149)
FrankLeeeee Jul 4, 2023
f447ca1
[chat] removed cache file (#4155)
FrankLeeeee Jul 4, 2023
c77b3b1
[format] applied code formatting on changed files in pull request 415…
github-actions[bot] Jul 4, 2023
2ac2404
fix some typo colossalai/shardformer (#4160)
digger-yu Jul 4, 2023
1908caa
[cli] hotfix launch command for multi-nodes (#4165)
ver217 Jul 4, 2023
cc3cbe9
[workflow] show test duration (#4159)
FrankLeeeee Jul 4, 2023
190a6ea
[dtensor] fixed readme file name and removed deprecated file (#4162)
FrankLeeeee Jul 4, 2023
fee32a3
[docker] added ssh and rdma support for docker (#4192)
FrankLeeeee Jul 7, 2023
5891344
Next commit [checkpointio] Unsharded Optimizer Checkpoint for Gemini …
Jul 7, 2023
c1cf752
[docker] fixed ninja build command (#4203)
FrankLeeeee Jul 10, 2023
4e9b09c
Automated submodule synchronization (#4217)
github-actions[bot] Jul 12, 2023
9a4842c
revise shardformer readme (#4246)
CjhHa1 Jul 17, 2023
7ff11b5
[example] add llama pretraining (#4257)
binmakeswell Jul 17, 2023
4b97754
[Kernels] added triton-implemented of self attention for colossal-ai …
tiandiao123 Jul 18, 2023
fc5cef2
[lazy] support init on cuda (#4269)
ver217 Jul 19, 2023
c6f6005
[checkpointio] Sharded Optimizer Checkpoint for Gemini Plugin (#4302)
Jul 21, 2023
917ac28
[chat] train sft support tensorboard
ver217 Jul 21, 2023
fcb0280
[chat] train sft support optimizer save load
ver217 Jul 21, 2023
6be1cad
add tensorboard close logic
CZYCW Jul 26, 2023
4 changes: 2 additions & 2 deletions .compatibility
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
 1.12.0-11.3.0
-1.11.0-11.3.0
-1.10.1-11.3.0
+1.13.0-11.6.0
+2.0.0-11.7.0
4 changes: 4 additions & 0 deletions .coveragerc
@@ -0,0 +1,4 @@
[run]
concurrency = multiprocessing
parallel = true
sigterm = true
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/config.yml
@@ -8,4 +8,4 @@ contact_links:
about: This issue tracker is not for technical support. Please use WeChat, and ask the community for help.
- name: 😊 Advanced question - GitHub Discussions
url: https://github.com/hpcaitech/ColossalAI/discussions
-about: Use GitHub Discussions for advanced and unanswered technical questions, requiring a maintainer's answer.
+about: Use GitHub Discussions for advanced and unanswered technical questions, requiring a maintainer's answer.
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/feature_request.yml
@@ -22,7 +22,7 @@ body:
If applicable, add screenshots to help explain your problem.
**Suggest a potential alternative/fix**
Tell us how we could improve this project.
-**Optional: Affiliation**
+**Optional: Affiliation**
Institution/email information helps better analyze and evaluate users to improve the project. Welcome to establish in-depth cooperation.
placeholder: |
A clear and concise description of your idea.
30 changes: 19 additions & 11 deletions .github/workflows/README.md
@@ -14,7 +14,7 @@
- [Compatibility Test on Dispatch](#compatibility-test-on-dispatch)
- [Release](#release)
- [User Friendliness](#user-friendliness)
-- [Commmunity](#commmunity)
+- [Community](#community)
- [Configuration](#configuration)
- [Progress Log](#progress-log)

@@ -30,7 +30,7 @@ In the section below, we will dive into the details of different workflows avail
Refer to this [documentation](https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow) on how to manually trigger a workflow.
I will provide the details of each workflow below.

-**A PR which changes the `version.txt` is considered as a release PR in the following coontext.**
+**A PR which changes the `version.txt` is considered as a release PR in the following context.**


### Code Style Check
@@ -43,10 +43,18 @@ I will provide the details of each workflow below.

| Workflow Name | File name | Description |
| ---------------------- | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `Build on PR` | `build_on_pr.yml` | This workflow is triggered when the label `Run build and Test` is assigned to a PR. It will run all the unit tests in the repository with 4 GPUs. |
+| `Build on PR` | `build_on_pr.yml` | This workflow is triggered when a PR changes essential files and a branch is created/deleted. It will run all the unit tests in the repository with 4 GPUs. |
| `Build on Schedule` | `build_on_schedule.yml` | This workflow will run the unit tests everyday with 8 GPUs. The result is sent to Lark. |
| `Report test coverage` | `report_test_coverage.yml` | This PR will put up a comment to report the test coverage results when `Build` is done. |

To reduce the average time of the unit test on PR, `Build on PR` workflow manages testmon cache.

1. When creating a new branch, it copies `cache/main/.testmondata*` to `cache/<branch>/`.
2. When creating a new PR or change the base branch of a PR, it copies `cache/<base_ref>/.testmondata*` to `cache/_pull/<pr_number>/`.
3. When running unit tests for each PR, it restores testmon cache from `cache/_pull/<pr_number>/`. After the test, it stores the cache back to `cache/_pull/<pr_number>/`.
4. When a PR is closed, if it's merged, it copies `cache/_pull/<pr_number>/.testmondata*` to `cache/<base_ref>/`. Otherwise, it just removes `cache/_pull/<pr_number>`.
5. When a branch is deleted, it removes `cache/<ref>`.
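
The five cache-management steps above can be sketched as plain shell. This is an illustrative sketch only — the real logic lives inside `build_on_pr.yml`, and the `CACHE_ROOT` path and function names here are assumptions made for demonstration:

```shell
#!/usr/bin/env bash
# Sketch of the testmon cache lifecycle described above (illustrative only).
CACHE_ROOT=${CACHE_ROOT:-/github/home/testmon_cache}

# Step 1: a new branch is seeded from the main branch's cache.
seed_branch() {
  local branch="$1"
  if [ -d "$CACHE_ROOT/main" ]; then
    cp -p -r "$CACHE_ROOT/main" "$CACHE_ROOT/$branch"
  fi
}

# Step 2: a new PR is seeded from its base branch's cache.
seed_pr() {
  local base="$1" pr="$2"
  mkdir -p "$CACHE_ROOT/_pull/$pr"
  cp -p "$CACHE_ROOT/$base"/.testmondata* "$CACHE_ROOT/_pull/$pr/"
}

# Step 4: on close, a merged PR's cache flows back to its base branch;
# an abandoned PR's cache is simply discarded.
close_pr() {
  local base="$1" pr="$2" merged="$3"
  if [ "$merged" = "true" ]; then
    cp -p "$CACHE_ROOT/_pull/$pr"/.testmondata* "$CACHE_ROOT/$base/"
  fi
  rm -rf "$CACHE_ROOT/_pull/$pr"
}

# Step 5: deleting a branch removes its cache directory.
drop_branch() {
  rm -rf "$CACHE_ROOT/${1:?branch name required}"
}
```

Step 3 (restore before the test run, store back after) is the same copy in each direction and is omitted here for brevity.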

### Example Test

| Workflow Name | File name | Description |
@@ -58,23 +66,23 @@ I will provide the details of each workflow below.
#### Example Test on Dispatch

This workflow is triggered by manually dispatching the workflow. It has the following input parameters:
-- `example_directory`: the example directory to test. Multiple directories are supported and must be separated b$$y comma. For example, language/gpt, images/vit. Simply input language or simply gpt does not work.
+- `example_directory`: the example directory to test. Multiple directories are supported and must be separated by comma. For example, language/gpt, images/vit. Simply input language or simply gpt does not work.
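
A comma-separated input like this is typically split inside a workflow step before use. The snippet below is a hypothetical sketch of such parsing (the variable names are illustrative, not taken from the workflow file):

```shell
# Hypothetical sketch: split a comma-separated `example_directory` input
# into individual example directories, as a dispatch step might do.
input="language/gpt,images/vit"   # example value from the dispatch form
IFS=',' read -ra dirs <<< "$input"
for d in "${dirs[@]}"; do
  echo "testing example: $d"
done
```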

### Compatibility Test

| Workflow Name | File name | Description |
| -------------------------------- | ------------------------------------ | -------------------------------------------------------------------------------------------------------------------- |
-| `Compatibility Test on PR` | `compatibility_test_on_pr.yml` | Check Colossal-AI's compatiblity when `version.txt` is changed in a PR. |
-| `Compatibility Test on Schedule` | `compatibility_test_on_schedule.yml` | This workflow will check the compatiblity of Colossal-AI against PyTorch specified in `.compatibility` every Sunday. |
-| `Compatiblity Test on Dispatch` | `compatibility_test_on_dispatch.yml` | Test PyTorch Compatibility manually. |
+| `Compatibility Test on PR` | `compatibility_test_on_pr.yml` | Check Colossal-AI's compatibility when `version.txt` is changed in a PR. |
+| `Compatibility Test on Schedule` | `compatibility_test_on_schedule.yml` | This workflow will check the compatibility of Colossal-AI against PyTorch specified in `.compatibility` every Sunday. |
+| `Compatibility Test on Dispatch` | `compatibility_test_on_dispatch.yml` | Test PyTorch Compatibility manually. |


#### Compatibility Test on Dispatch
This workflow is triggered by manually dispatching the workflow. It has the following input parameters:
- `torch version`:torch version to test against, multiple versions are supported but must be separated by comma. The default is value is all, which will test all available torch versions listed in this [repository](https://github.com/hpcaitech/public_assets/tree/main/colossalai/torch_build/torch_wheels).
- `cuda version`: cuda versions to test against, multiple versions are supported but must be separated by comma. The CUDA versions must be present in our [DockerHub repository](https://hub.docker.com/r/hpcaitech/cuda-conda).

-> It only test the compatiblity of the main branch
+> It only test the compatibility of the main branch


### Release
@@ -97,7 +105,7 @@ This workflow is triggered by manually dispatching the workflow. It has the foll
| `Synchronize submodule` | `submodule.yml` | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers. |
| `Close inactive issues` | `close_inactive.yml` | This workflow will close issues which are stale for 14 days. |

-### Commmunity
+### Community

| Workflow Name | File name | Description |
| -------------------------------------------- | -------------------------------- | -------------------------------------------------------------------------------- |
Expand All @@ -113,7 +121,7 @@ This `.compatibility` file is to tell GitHub Actions which PyTorch and CUDA vers

2. `.cuda_ext.json`

-This file controls which CUDA versions will be checked against CUDA extenson built. You can add a new entry according to the json schema below to check the AOT build of PyTorch extensions before release.
+This file controls which CUDA versions will be checked against CUDA extension built. You can add a new entry according to the json schema below to check the AOT build of PyTorch extensions before release.

```json
{
@@ -144,7 +152,7 @@ This file controls which CUDA versions will be checked against CUDA extenson bui
- [x] check on PR
- [x] regular check
- [x] manual dispatch
-- [x] compatiblity check
+- [x] compatibility check
- [x] check on PR
- [x] manual dispatch
- [x] auto test when release
178 changes: 160 additions & 18 deletions .github/workflows/build_on_pr.yml
@@ -2,22 +2,93 @@ name: Build on PR

on:
pull_request:
-types: [synchronize, labeled]
+types: [synchronize, opened, reopened, ready_for_review, closed, edited]
branches:
- "main"
- "develop"
- "feature/**"
paths:
- ".github/workflows/build_on_pr.yml" # run command & env variables change
- "colossalai/**" # source code change
- "!colossalai/**.md" # ignore doc change
- "op_builder/**" # cuda extension change
- "!op_builder/**.md" # ignore doc change
- "requirements/**" # requirements change
- "tests/**" # test change
- "!tests/**.md" # ignore doc change
- "pytest.ini" # test config change
- "setup.py" # install command change
create:
delete:

jobs:
prepare_cache:
name: Prepare testmon cache
if: |
github.event_name == 'create' &&
github.event.ref_type == 'branch' &&
github.event.repository.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Copy testmon cache
run: | # branch name may contain slash, we need to replace it with space
export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
fi
env:
MAIN_BRANCH: ${{ github.event.master_branch }}
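
The `sed "s/\// /"` call in the step above is worth a note: a ref such as `feature/shardformer` contains a slash, which would otherwise create a nested cache directory, so the workflow flattens it into a space. A minimal sketch (the helper name is illustrative; note that without the `/g` flag only the first slash is replaced, exactly as the workflow writes it):

```shell
# Illustrative helper mirroring the workflow's sed call: map a git ref
# that may contain a slash to a flat directory name.
sanitize_ref() {
  echo "$1" | sed "s/\// /"
}
```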

prepare_cache_for_pr:
name: Prepare testmon cache for PR
if: |
github.event_name == 'pull_request' &&
(github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- name: Copy testmon cache
run: | # branch name may contain slash, we need to replace it with space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then
mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}
fi
env:
PR_NUMBER: ${{ github.event.number }}

detect:
name: Detect file change
if: |
-github.event.pull_request.draft == false &&
-github.base_ref == 'main' &&
-github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' &&
-contains( github.event.pull_request.labels.*.name, 'Run Build and Test')
+github.event_name == 'pull_request' &&
+(github.event.action == 'synchronize' || github.event.action == 'opened' || github.event.action == 'reopened' || github.event.action == 'ready_for_review') &&
+github.event.pull_request.draft == false &&
+github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
outputs:
changedExtenisonFiles: ${{ steps.find-extension-change.outputs.all_changed_files }}
anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }}
changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }}
anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
runs-on: ubuntu-latest
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v2
with:
- name: Locate base commit
id: locate-base-sha
run: |
curBranch=$(git rev-parse --abbrev-ref HEAD)
commonCommit=$(git merge-base origin/main $curBranch)
echo $commonCommit
echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT

- name: Find the changed extension-related files
id: find-extension-change
echo "$file was changed"
done


build:
name: Build and Test Colossal-AI
needs: detect
if: needs.detect.outputs.anyLibraryFileChanged == 'true'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
timeout-minutes: 60
defaults:
run:
shell: bash
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- name: Checkout TensorNVMe
uses: actions/checkout@v2

- name: Restore TensorNVMe Cache
run: |
[ ! -z "$(ls -A /github/home/tensornvme_cache/)" ] && cp -p -r /github/home/tensornvme_cache/* /__w/ColossalAI/ColossalAI/TensorNVMe
if [ -d /github/home/tensornvme_cache ] && [ ! -z "$(ls -A /github/home/tensornvme_cache/)" ]; then
cp -p -r /github/home/tensornvme_cache/* /__w/ColossalAI/ColossalAI/TensorNVMe
fi

- name: Install TensorNVMe
run: |
if: needs.detect.outputs.anyExtensionFileChanged != 'true'
run: |
# -p flag is required to preserve the file timestamp to avoid ninja rebuild
[ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
if [ -d /github/home/cuda_ext_cache ] && [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ]; then
cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
fi

- name: Install Colossal-AI
if: needs.detect.outputs.anyLibraryFileChanged == 'true'
run: |
CUDA_EXT=1 pip install -v -e .
pip install -r requirements/requirements-test.txt
# -p flag is required to preserve the file timestamp to avoid ninja rebuild
cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/

- name: Restore Testmon Cache
run: |
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
fi
env:
PR_NUMBER: ${{ github.event.number }}

- name: Execute Unit Testing
if: needs.detect.outputs.anyLibraryFileChanged == 'true'
run: |
CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest --testmon --testmon-cov=. --durations=10 tests/
env:
DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64

- name: Store Testmon Cache
run: |
mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER}
cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/
env:
PR_NUMBER: ${{ github.event.number }}

- name: Collate artifact
env:
PR_NUMBER: ${{ github.event.number }}
echo $PR_NUMBER > ./report/pr_number

# generate coverage.xml if any
if [ "$anyLibraryFileChanged" == "true" ] && [ -e .coverage ]; then
allFiles=""
for file in $changedLibraryFiles; do
if [ "$allFiles" == "" ]; then
with:
name: report
path: report/

store_cache:
name: Store testmon cache for PR
if: |
github.event_name == 'pull_request' &&
github.event.action == 'closed' &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Store testmon cache if possible
if: github.event.pull_request.merged == true
run: | # branch name may contain slash, we need to replace it with space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /g")
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
mkdir -p "/github/home/testmon_cache/${BASE}" && cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
fi
env:
PR_NUMBER: ${{ github.event.pull_request.number }}

- name: Remove testmon cache
run: |
rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER}
env:
PR_NUMBER: ${{ github.event.pull_request.number }}

remove_cache:
name: Remove testmon cache
if: |
github.event_name == 'delete' &&
github.event.ref_type == 'branch' &&
github.event.repository.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Remove testmon cache
run: | # branch name may contain slash, we need to replace it with space
export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /g")
rm -rf "/github/home/testmon_cache/${BASE}"