From d0accbff422a5154c7797d2c69a2efe3609686cf Mon Sep 17 00:00:00 2001
From: FrankLeeeee <somerlee.9@gmail.com>
Date: Tue, 9 Jan 2024 10:41:51 +0800
Subject: [PATCH 1/6] [workflow] fixed build CI

---
 .github/workflows/build_on_pr.yml             | 25 ++++++++++++---
 .github/workflows/build_on_schedule.yml       |  2 +-
 .github/workflows/doc_test_on_schedule.yml    |  2 +-
 tests/kit/model_zoo/__init__.py               | 32 +++++++++++++++++--
 tests/kit/model_zoo/registry.py               | 17 ++++++----
 .../test_plugin/test_3d_plugin.py             |  1 +
 .../test_plugin/test_gemini_plugin.py         |  4 +--
 .../test_plugin/test_low_level_zero_plugin.py |  9 ++++--
 .../test_plugin/test_torch_ddp_plugin.py      |  9 ++++--
 .../test_plugin/test_torch_fsdp_plugin.py     |  9 ++++--
 .../test_gemini_checkpoint_io.py              | 14 ++++----
 .../test_gemini_torch_compability.py          |  2 +-
 ...st_hybrid_parallel_plugin_checkpoint_io.py |  2 +-
 .../test_plugins_huggingface_compatibility.py |  2 +-
 tests/test_lazy/test_models.py                |  4 +--
 15 files changed, 99 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 8eb358c4f42c..9e8b0a9562e2 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -140,7 +140,7 @@ jobs:
     if: needs.detect.outputs.anyLibraryFileChanged == 'true'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 60
     defaults:
@@ -174,6 +174,7 @@ jobs:
         run: |
           cd TensorNVMe
           cp -p -r ./build /github/home/tensornvme_cache/
+          cp -p -r ./cmake-build /github/home/tensornvme_cache/
 
       - name: Checkout Colossal-AI
         uses: actions/checkout@v2
@@ -208,9 +209,25 @@ jobs:
 
       - name: Execute Unit Testing
         run: |
-          CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/
+          CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \
+          -m "not largedist" \
+          --testmon \
+          --testmon-forceselect \
+          --testmon-cov=. \
+          --durations=0 \
+          --ignore tests/test_analyzer \
+          --ignore tests/test_auto_parallel \
+          --ignore tests/test_fx \
+          --ignore tests/test_autochunk \
+          --ignore tests/test_gptq \
+          --ignore tests/test_infer_ops \
+          --ignore tests/test_legacy \
+          --ignore tests/test_moe \
+          --ignore tests/test_smoothquant \
+          --ignore tests/test_checkpoint_io \
+          --ignore tests/test_shardformer \
+          tests/
         env:
-          DATA: /data/scratch/cifar-10
           NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt
@@ -268,7 +285,7 @@ jobs:
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
       options: --rm
     timeout-minutes: 5
     defaults:
diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index e5afe9622931..c202afb58a35 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -10,7 +10,7 @@ jobs:
   build:
     name: Build and Test Colossal-AI
     if: github.repository == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, 8-gpu]
+    runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml
index b4c77674746f..b3536184d78a 100644
--- a/.github/workflows/doc_test_on_schedule.yml
+++ b/.github/workflows/doc_test_on_schedule.yml
@@ -12,7 +12,7 @@ jobs:
     name: Test the changed Doc
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
       options: --gpus all --rm
     timeout-minutes: 60
     steps:
diff --git a/tests/kit/model_zoo/__init__.py b/tests/kit/model_zoo/__init__.py
index 62b9123b59b0..5f6789ff3357 100644
--- a/tests/kit/model_zoo/__init__.py
+++ b/tests/kit/model_zoo/__init__.py
@@ -1,5 +1,33 @@
-from . import custom, diffusers, timm, torchaudio, torchrec, torchvision, transformers
+import os
+from . import custom, diffusers, timm, torchaudio, torchvision, transformers
 from .executor import run_fwd, run_fwd_bwd
 from .registry import model_zoo
 
-__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd"]
+# We pick a subset of models for fast testing in order to reduce the total testing time
+COMMON_MODELS = [
+    'custom_hanging_param_model',
+    'custom_nested_model',
+    'custom_repeated_computed_layers',
+    'custom_simple_net',
+    'diffusers_clip_text_model',
+    'diffusers_auto_encoder_kl',
+    'diffusers_unet2d_model',
+    'timm_densenet',
+    'timm_resnet',
+    'timm_swin_transformer',
+    'torchaudio_wav2vec2_base',
+    'torchaudio_conformer',
+    'transformers_bert_for_masked_lm',
+    'transformers_bloom_for_causal_lm',
+    'transformers_falcon_for_causal_lm',
+    'transformers_chatglm_for_conditional_generation',
+    'transformers_llama_for_casual_lm',
+    'transformers_vit_for_masked_image_modeling',
+    'transformers_mistral_for_casual_lm'
+]
+
+IS_FAST_TEST = os.environ.get('FAST_TEST', '0') == '1'
+
+
+__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd", 'COMMON_MODELS', 'IS_FAST_TEST']
+
diff --git a/tests/kit/model_zoo/registry.py b/tests/kit/model_zoo/registry.py
index bb522778bb5d..44a0adc6a3af 100644
--- a/tests/kit/model_zoo/registry.py
+++ b/tests/kit/model_zoo/registry.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 from dataclasses import dataclass
-from typing import Callable
+from typing import Callable, List, Union
 
 __all__ = ["ModelZooRegistry", "ModelAttribute", "model_zoo"]
 
@@ -61,7 +61,7 @@ def register(
         """
         self[name] = (model_fn, data_gen_fn, output_transform_fn, loss_fn, model_attribute)
 
-    def get_sub_registry(self, keyword: str):
+    def get_sub_registry(self, keyword: Union[str, List[str]]):
         """
         Get a sub registry with models that contain the keyword.
 
@@ -70,12 +70,15 @@ def get_sub_registry(self, keyword: str):
         """
         new_dict = dict()
 
+        if isinstance(keyword, str):
+            keyword_list = [keyword]
+        else:
+            keyword_list = keyword
+        assert isinstance(keyword_list, (list, tuple))
+
         for k, v in self.items():
-            if keyword == "transformers_gpt":
-                if keyword in k and not "gptj" in k:  # ensure GPT2 does not retrieve GPTJ models
-                    new_dict[k] = v
-            else:
-                if keyword in k:
+            for kw in keyword_list:
+                if kw in k:
                     new_dict[k] = v
 
         assert len(new_dict) > 0, f"No model found with keyword {keyword}"
diff --git a/tests/test_booster/test_plugin/test_3d_plugin.py b/tests/test_booster/test_plugin/test_3d_plugin.py
index ad878fb0c86a..eca5b568843b 100644
--- a/tests/test_booster/test_plugin/test_3d_plugin.py
+++ b/tests/test_booster/test_plugin/test_3d_plugin.py
@@ -68,6 +68,7 @@ def check_3d_plugin(init_method: str = "none", early_stop: bool = True):
     for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.get_sub_registry(
         "transformers_llama_for_casual_lm"
     ).items():
+        print(name)
         err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn)
         torch.cuda.empty_cache()
 
diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py
index d4205e1f9d73..3462d5dde52b 100644
--- a/tests/test_booster/test_plugin/test_gemini_plugin.py
+++ b/tests/test_booster/test_plugin/test_gemini_plugin.py
@@ -13,7 +13,7 @@
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor.colo_parameter import ColoParameter
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, COMMON_MODELS, IS_FAST_TEST
 
 
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size) -> Optional[str]:
@@ -66,7 +66,7 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, t
 # @parameterize('init_method', ['lazy', 'none', 'colo'])
 
 
-@parameterize("subset", ["torchvision", "transformers", "diffusers"])
+@parameterize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "transformers", "diffusers"])
 @parameterize("init_method", ["none"])
 @parameterize("zero_size", [2])
 @parameterize("tp_size", [2])
diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
index 3eaaf882c9ba..bcdcc1470e6c 100644
--- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
+++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
@@ -11,7 +11,7 @@
 
 # from colossalai.nn.optimizer import HybridAdam
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
 
 # These models are not compatible with AMP
 _AMP_ERR_MODELS = ["timm_convit", "deepfm_interactionarch"]
@@ -62,7 +62,12 @@ def check_low_level_zero_plugin(stage: int, early_stop: bool = True):
     ignore_models = _AMP_ERR_MODELS + _LOW_LEVEL_ZERO_ERR_MODELS + _STUCK_MODELS
     skipped_models = []
 
-    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+    if IS_FAST_TEST:
+        registry = model_zoo.get_sub_registry(COMMON_MODELS)
+    else:
+        registry = model_zoo
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
         # FIXME(ver217): fix these models
         if name in ignore_models:
             skipped_models.append(name)
diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
index 1a7ca6f2a30c..fa32feb2ff85 100644
--- a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
@@ -11,7 +11,7 @@
 from colossalai.booster.plugin import TorchDDPPlugin
 from colossalai.interface import OptimizerWrapper
 from colossalai.testing import rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
 
 
 def run_fn(model_fn, data_gen_fn, output_transform_fn):
@@ -40,7 +40,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn):
 
 
 def check_torch_ddp_plugin():
-    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+    if IS_FAST_TEST:
+        registry = model_zoo.get_sub_registry(COMMON_MODELS)
+    else:
+        registry = model_zoo
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
         if name == "dlrm_interactionarch":
             continue
         run_fn(model_fn, data_gen_fn, output_transform_fn)
diff --git a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
index 8bcbffdd06fe..8a14d7cf872d 100644
--- a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py
@@ -12,7 +12,7 @@
 
 from colossalai.interface import OptimizerWrapper
 from colossalai.testing import rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
 
 
 # test basic fsdp function
@@ -42,7 +42,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn):
 
 
 def check_torch_fsdp_plugin():
-    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+    if IS_FAST_TEST:
+        registry = model_zoo.get_sub_registry(COMMON_MODELS)
+    else:
+        registry = model_zoo
+
+    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
         if any(
             element in name
             for element in [
diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
index 8343c5f07e30..49fd85ffba0a 100644
--- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
@@ -7,6 +7,7 @@
 from utils import shared_tempdir
 
 import colossalai
+from colossalai.testing import skip_if_not_enough_gpus
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin
 from colossalai.lazy import LazyInitContext
@@ -68,7 +69,7 @@ def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: b
 @clear_cache_before_run()
 @parameterize("placement_config", OPTIM_PLACEMENT_CONFIGS)
 @parameterize("shard", [True, False])
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
 @parameterize("size_per_shard", [32])
 @parameterize("tp_size", [1, 2])
 @parameterize("zero_size", [2])
@@ -156,13 +157,12 @@ def run_dist(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.parametrize("world_size", [4])
 @rerun_if_address_is_in_use()
-def test_gemini_ckpIO(world_size):
-    spawn(run_dist, world_size)
+def test_gemini_ckpIO():
+    spawn(run_dist, 4)
 
 @pytest.mark.largedist
-@pytest.mark.parametrize("world_size", [8])
+@skip_if_not_enough_gpus(min_gpus=8)
 @rerun_if_address_is_in_use()
-def test_gemini_ckpIO_3d(world_size):
-    spawn(run_dist, world_size)
\ No newline at end of file
+def test_gemini_ckpIO_3d():
+    spawn(run_dist, 8)
\ No newline at end of file
diff --git a/tests/test_checkpoint_io/test_gemini_torch_compability.py b/tests/test_checkpoint_io/test_gemini_torch_compability.py
index bb7a60035e02..44a000113629 100644
--- a/tests/test_checkpoint_io/test_gemini_torch_compability.py
+++ b/tests/test_checkpoint_io/test_gemini_torch_compability.py
@@ -20,7 +20,7 @@
 
 @clear_cache_before_run()
 @parameterize("shard", [False, True])
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
 def exam_torch_load_from_gemini(shard: bool, model_name: str):
     (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
     criterion = lambda x: x.mean()
diff --git a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
index c0bc2d2f5d0a..db3c56da874d 100644
--- a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
@@ -40,7 +40,7 @@
 
 @clear_cache_before_run()
 @parameterize("shard", [True, False])
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
 @parameterize("size_per_shard", [32])
 @parameterize("test_config", TEST_CONFIGS)
 def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_config: dict):
diff --git a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py
index a6f67e0d7729..0353ff115840 100644
--- a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py
+++ b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py
@@ -18,7 +18,7 @@
 
 
 @clear_cache_before_run()
-@parameterize("model_name", ["transformers_gpt"])
+@parameterize("model_name", ["transformers_llama_for_casual_lm"])
 @parameterize("plugin_type", ["ddp", "zero", "gemini"])
 def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per_shard=32):
     (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next(
diff --git a/tests/test_lazy/test_models.py b/tests/test_lazy/test_models.py
index a1b5763d4cd8..ee50e5b61009 100644
--- a/tests/test_lazy/test_models.py
+++ b/tests/test_lazy/test_models.py
@@ -1,11 +1,11 @@
 import pytest
 from lazy_init_utils import SUPPORT_LAZY, check_lazy_init
 
-from tests.kit.model_zoo import model_zoo
+from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS
 
 
 @pytest.mark.skipif(not SUPPORT_LAZY, reason="requires torch >= 1.12.0")
-@pytest.mark.parametrize("subset", ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"])
+@pytest.mark.parametrize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"])
 @pytest.mark.parametrize("default_device", ["cpu", "cuda"])
 def test_torchvision_models_lazy_init(subset, default_device):
     sub_model_zoo = model_zoo.get_sub_registry(subset)

From 92c0a295fcd6ac1e56c05a2b264896c565a3ebf0 Mon Sep 17 00:00:00 2001
From: FrankLeeeee <somerlee.9@gmail.com>
Date: Wed, 10 Jan 2024 18:31:23 +0800
Subject: [PATCH 2/6] polish

---
 .github/workflows/build_on_pr.yml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 9e8b0a9562e2..f4fcd92ba2f2 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -211,9 +211,6 @@ jobs:
         run: |
           CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \
           -m "not largedist" \
-          --testmon \
-          --testmon-forceselect \
-          --testmon-cov=. \
           --durations=0 \
           --ignore tests/test_analyzer \
           --ignore tests/test_auto_parallel \

From fba359dd0fb217f7159e5554c0c1943abe0b99a7 Mon Sep 17 00:00:00 2001
From: FrankLeeeee <somerlee.9@gmail.com>
Date: Wed, 10 Jan 2024 13:06:28 +0000
Subject: [PATCH 3/6] polish

---
 .github/workflows/build_on_pr.yml       | 118 ------------------------
 .github/workflows/build_on_schedule.yml |   2 +-
 2 files changed, 1 insertion(+), 119 deletions(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index f4fcd92ba2f2..50417ac8a3a0 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -22,57 +22,6 @@ on:
   delete:
 
 jobs:
-  prepare_cache:
-    name: Prepare testmon cache
-    if: |
-      github.event_name == 'create' &&
-      github.event.ref_type == 'branch' &&
-      github.event.repository.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Copy testmon cache
-        run: | # branch name may contain slash, we need to replace it with space
-          export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
-          if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
-             cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
-          fi
-        env:
-          MAIN_BRANCH: ${{ github.event.master_branch }}
-
-  prepare_cache_for_pr:
-    name: Prepare testmon cache for PR
-    if: |
-      github.event_name == 'pull_request' &&
-      (github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) &&
-      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-repare-cache
-      cancel-in-progress: true
-    steps:
-      - name: Copy testmon cache
-        run: | # branch name may contain slash, we need to replace it with space
-          export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
-          if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then
-            mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}
-          fi
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-
   detect:
     name: Detect file change
     if: |
@@ -199,14 +148,6 @@ jobs:
           # -p flag is required to preserve the file timestamp to avoid ninja rebuild
           cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
 
-      - name: Restore Testmon Cache
-        run: |
-          if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
-            cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
-          fi
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-
       - name: Execute Unit Testing
         run: |
           CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \
@@ -222,21 +163,12 @@ jobs:
           --ignore tests/test_moe \
           --ignore tests/test_smoothquant \
           --ignore tests/test_checkpoint_io \
-          --ignore tests/test_shardformer \
           tests/
         env:
           NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
-          TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt
           LLAMA_PATH: /data/scratch/llama-tiny
 
-      - name: Store Testmon Cache
-        run: |
-          mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER}
-          cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-
       - name: Collate artifact
         env:
           PR_NUMBER: ${{ github.event.number }}
@@ -274,53 +206,3 @@ jobs:
           name: report
           path: report/
 
-  store_cache:
-    name: Store testmon cache for PR
-    if: |
-      github.event_name == 'pull_request' &&
-      github.event.action == 'closed' &&
-      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Store testmon cache if possible
-        if: github.event.pull_request.merged == true
-        run: | # branch name may contain slash, we need to replace it with space
-          export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
-          if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
-            cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
-          fi
-        env:
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-
-      - name: Remove testmon cache
-        run: |
-          rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER}
-        env:
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-
-  remove_cache:
-    name: Remove testmon cache
-    if: |
-      github.event_name == 'delete' &&
-      github.event.ref_type == 'branch' &&
-      github.event.repository.full_name == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
-    container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --rm
-    timeout-minutes: 5
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Remove testmon cache
-        run: | # branch name may contain slash, we need to replace it with space
-          export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /")
-          rm -rf "/github/home/testmon_cache/${BASE}"
diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index c202afb58a35..2421cb8dde37 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -14,7 +14,7 @@ jobs:
     container:
       image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
-    timeout-minutes: 40
+    timeout-minutes: 90
     steps:
       - name: Check GPU Availability # ensure all GPUs have enough memory
         id: check-avai

From e765f7bf432ec76675750f58025f23399bdd570a Mon Sep 17 00:00:00 2001
From: FrankLeeeee <somerlee.9@gmail.com>
Date: Wed, 10 Jan 2024 13:24:13 +0000
Subject: [PATCH 4/6] polish

---
 .github/workflows/build_on_schedule.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index 2421cb8dde37..29d0f0578332 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -20,10 +20,10 @@ jobs:
         id: check-avai
         run: |
           avai=true
-          for i in $(seq 0 7);
+          for i in $(seq 0 3);
           do
             gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
-            [ "$gpu_used" -gt "10000" ] && avai=false
+            [ "$gpu_used" -gt "2000" ] && avai=false
           done
 
           echo "GPU is available: $avai"
@@ -62,7 +62,7 @@ jobs:
         run: |
           PYTHONPATH=$PWD pytest --durations=0 tests
         env:
-          DATA: /data/scratch/cifar-10
+          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
 

From 0474848d7efbbfb52bbeedbdeadf1a132477468e Mon Sep 17 00:00:00 2001
From: FrankLeeeee <somerlee.9@gmail.com>
Date: Wed, 10 Jan 2024 13:25:22 +0000
Subject: [PATCH 5/6] polish

---
 .github/workflows/build_on_schedule.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index 29d0f0578332..3bee3b4f96e2 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -60,7 +60,10 @@ jobs:
       - name: Unit Testing
         if: steps.check-avai.outputs.avai == 'true'
         run: |
-          PYTHONPATH=$PWD pytest --durations=0 tests
+          PYTHONPATH=$PWD pytest \
+          -m "not largedist" \
+          --durations=0 \
+          tests/
         env:
           NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64

From b11b720a1ee03fdd194a5951ff6aae5be0512fa3 Mon Sep 17 00:00:00 2001
From: FrankLeeeee <somerlee.9@gmail.com>
Date: Wed, 10 Jan 2024 13:28:52 +0000
Subject: [PATCH 6/6] polish

---
 tests/test_booster/test_plugin/test_3d_plugin.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_booster/test_plugin/test_3d_plugin.py b/tests/test_booster/test_plugin/test_3d_plugin.py
index eca5b568843b..ad878fb0c86a 100644
--- a/tests/test_booster/test_plugin/test_3d_plugin.py
+++ b/tests/test_booster/test_plugin/test_3d_plugin.py
@@ -68,7 +68,6 @@ def check_3d_plugin(init_method: str = "none", early_stop: bool = True):
     for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.get_sub_registry(
         "transformers_llama_for_casual_lm"
     ).items():
-        print(name)
         err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn)
         torch.cuda.empty_cache()