From d0accbff422a5154c7797d2c69a2efe3609686cf Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 9 Jan 2024 10:41:51 +0800 Subject: [PATCH 1/6] [workflow] fixed build CI --- .github/workflows/build_on_pr.yml | 25 ++++++++++++--- .github/workflows/build_on_schedule.yml | 2 +- .github/workflows/doc_test_on_schedule.yml | 2 +- tests/kit/model_zoo/__init__.py | 32 +++++++++++++++++-- tests/kit/model_zoo/registry.py | 17 ++++++---- .../test_plugin/test_3d_plugin.py | 1 + .../test_plugin/test_gemini_plugin.py | 4 +-- .../test_plugin/test_low_level_zero_plugin.py | 9 ++++-- .../test_plugin/test_torch_ddp_plugin.py | 9 ++++-- .../test_plugin/test_torch_fsdp_plugin.py | 9 ++++-- .../test_gemini_checkpoint_io.py | 14 ++++---- .../test_gemini_torch_compability.py | 2 +- ...st_hybrid_parallel_plugin_checkpoint_io.py | 2 +- .../test_plugins_huggingface_compatibility.py | 2 +- tests/test_lazy/test_models.py | 4 +-- 15 files changed, 99 insertions(+), 35 deletions(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 8eb358c4f42c..9e8b0a9562e2 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -140,7 +140,7 @@ jobs: if: needs.detect.outputs.anyLibraryFileChanged == 'true' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 + image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny timeout-minutes: 60 defaults: @@ -174,6 +174,7 @@ jobs: run: | cd TensorNVMe cp -p -r ./build /github/home/tensornvme_cache/ + cp -p -r ./cmake-build /github/home/tensornvme_cache/ - name: Checkout Colossal-AI uses: actions/checkout@v2 @@ -208,9 +209,25 @@ jobs: - name: Execute Unit Testing run: | - CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/ + CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \ + -m "not largedist" \ + --testmon \ + --testmon-forceselect \ + --testmon-cov=. \ + --durations=0 \ + --ignore tests/test_analyzer \ + --ignore tests/test_auto_parallel \ + --ignore tests/test_fx \ + --ignore tests/test_autochunk \ + --ignore tests/test_gptq \ + --ignore tests/test_infer_ops \ + --ignore tests/test_legacy \ + --ignore tests/test_moe \ + --ignore tests/test_smoothquant \ + --ignore tests/test_checkpoint_io \ + --ignore tests/test_shardformer \ + tests/ env: - DATA: /data/scratch/cifar-10 NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt @@ -268,7 +285,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 + image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 options: --rm timeout-minutes: 5 defaults: diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index e5afe9622931..c202afb58a35 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -10,7 +10,7 @@ jobs: build: name: Build and Test Colossal-AI if: github.repository == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, 8-gpu] + runs-on: [self-hosted, gpu] container: image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml index b4c77674746f..b3536184d78a 100644 --- a/.github/workflows/doc_test_on_schedule.yml +++ b/.github/workflows/doc_test_on_schedule.yml @@ -12,7 +12,7 @@ jobs: name: Test the changed Doc runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 + image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 options: --gpus all --rm timeout-minutes: 60 steps: diff --git a/tests/kit/model_zoo/__init__.py b/tests/kit/model_zoo/__init__.py index 62b9123b59b0..5f6789ff3357 100644 --- a/tests/kit/model_zoo/__init__.py +++ b/tests/kit/model_zoo/__init__.py @@ -1,5 +1,33 @@ -from . import custom, diffusers, timm, torchaudio, torchrec, torchvision, transformers +import os +from . import custom, diffusers, timm, torchaudio, torchvision, transformers from .executor import run_fwd, run_fwd_bwd from .registry import model_zoo -__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd"] +# We pick a subset of models for fast testing in order to reduce the total testing time +COMMON_MODELS = [ + 'custom_hanging_param_model', + 'custom_nested_model', + 'custom_repeated_computed_layers', + 'custom_simple_net', + 'diffusers_clip_text_model', + 'diffusers_auto_encoder_kl', + 'diffusers_unet2d_model', + 'timm_densenet', + 'timm_resnet', + 'timm_swin_transformer', + 'torchaudio_wav2vec2_base', + 'torchaudio_conformer', + 'transformers_bert_for_masked_lm', + 'transformers_bloom_for_causal_lm', + 'transformers_falcon_for_causal_lm', + 'transformers_chatglm_for_conditional_generation', + 'transformers_llama_for_casual_lm', + 'transformers_vit_for_masked_image_modeling', + 'transformers_mistral_for_casual_lm' +] + +IS_FAST_TEST = os.environ.get('FAST_TEST', '0') == '1' + + +__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd", 'COMMON_MODELS', 'IS_FAST_TEST'] + diff --git a/tests/kit/model_zoo/registry.py b/tests/kit/model_zoo/registry.py index bb522778bb5d..44a0adc6a3af 100644 --- a/tests/kit/model_zoo/registry.py +++ b/tests/kit/model_zoo/registry.py @@ -1,6 +1,6 @@ #!/usr/bin/env python from dataclasses import dataclass -from typing import Callable +from typing import Callable, List, Union __all__ = ["ModelZooRegistry", "ModelAttribute", "model_zoo"] @@ -61,7 +61,7 @@ def register( """ self[name] = (model_fn, data_gen_fn, output_transform_fn, loss_fn, model_attribute) - def get_sub_registry(self, keyword: str): + def get_sub_registry(self, keyword: Union[str, List[str]]): """ Get a sub registry with models that contain the keyword. @@ -70,12 +70,15 @@ def get_sub_registry(self, keyword: str): """ new_dict = dict() + if isinstance(keyword, str): + keyword_list = [keyword] + else: + keyword_list = keyword + assert isinstance(keyword_list, (list, tuple)) + for k, v in self.items(): - if keyword == "transformers_gpt": - if keyword in k and not "gptj" in k: # ensure GPT2 does not retrieve GPTJ models - new_dict[k] = v - else: - if keyword in k: + for kw in keyword_list: + if kw in k: new_dict[k] = v assert len(new_dict) > 0, f"No model found with keyword {keyword}" diff --git a/tests/test_booster/test_plugin/test_3d_plugin.py b/tests/test_booster/test_plugin/test_3d_plugin.py index ad878fb0c86a..eca5b568843b 100644 --- a/tests/test_booster/test_plugin/test_3d_plugin.py +++ b/tests/test_booster/test_plugin/test_3d_plugin.py @@ -68,6 +68,7 @@ def check_3d_plugin(init_method: str = "none", early_stop: bool = True): for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.get_sub_registry( "transformers_llama_for_casual_lm" ).items(): + print(name) err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) torch.cuda.empty_cache() diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py index d4205e1f9d73..3462d5dde52b 100644 --- a/tests/test_booster/test_plugin/test_gemini_plugin.py +++ b/tests/test_booster/test_plugin/test_gemini_plugin.py @@ -13,7 +13,7 @@ from colossalai.nn.optimizer import HybridAdam from colossalai.tensor.colo_parameter import ColoParameter from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn -from tests.kit.model_zoo import model_zoo +from tests.kit.model_zoo import model_zoo, COMMON_MODELS, IS_FAST_TEST def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size) -> Optional[str]: @@ -66,7 +66,7 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, t # @parameterize('init_method', ['lazy', 'none', 'colo']) -@parameterize("subset", ["torchvision", "transformers", "diffusers"]) +@parameterize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "transformers", "diffusers"]) @parameterize("init_method", ["none"]) @parameterize("zero_size", [2]) @parameterize("tp_size", [2]) diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py index 3eaaf882c9ba..bcdcc1470e6c 100644 --- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py +++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py @@ -11,7 +11,7 @@ # from colossalai.nn.optimizer import HybridAdam from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn -from tests.kit.model_zoo import model_zoo +from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS # These models are not compatible with AMP _AMP_ERR_MODELS = ["timm_convit", "deepfm_interactionarch"] @@ -62,7 +62,12 @@ def check_low_level_zero_plugin(stage: int, early_stop: bool = True): ignore_models = _AMP_ERR_MODELS + _LOW_LEVEL_ZERO_ERR_MODELS + _STUCK_MODELS skipped_models = [] - for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items(): + if IS_FAST_TEST: + registry = model_zoo.get_sub_registry(COMMON_MODELS) + else: + registry = model_zoo + + for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items(): # FIXME(ver217): fix these models if name in ignore_models: skipped_models.append(name) diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py index 1a7ca6f2a30c..fa32feb2ff85 100644 --- a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py +++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py @@ -11,7 +11,7 @@ from colossalai.booster.plugin import TorchDDPPlugin from colossalai.interface import OptimizerWrapper from colossalai.testing import rerun_if_address_is_in_use, spawn -from tests.kit.model_zoo import model_zoo +from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS def run_fn(model_fn, data_gen_fn, output_transform_fn): @@ -40,7 +40,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn): def check_torch_ddp_plugin(): - for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items(): + if IS_FAST_TEST: + registry = model_zoo.get_sub_registry(COMMON_MODELS) + else: + registry = model_zoo + + for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items(): if name == "dlrm_interactionarch": continue run_fn(model_fn, data_gen_fn, output_transform_fn) diff --git a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py index 8bcbffdd06fe..8a14d7cf872d 100644 --- a/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py +++ b/tests/test_booster/test_plugin/test_torch_fsdp_plugin.py @@ -12,7 +12,7 @@ from colossalai.interface import OptimizerWrapper from colossalai.testing import rerun_if_address_is_in_use, spawn -from tests.kit.model_zoo import model_zoo +from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS # test basic fsdp function @@ -42,7 +42,12 @@ def run_fn(model_fn, data_gen_fn, output_transform_fn): def check_torch_fsdp_plugin(): - for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items(): + if IS_FAST_TEST: + registry = model_zoo.get_sub_registry(COMMON_MODELS) + else: + registry = model_zoo + + for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items(): if any( element in name for element in [ diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py index 8343c5f07e30..49fd85ffba0a 100644 --- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py @@ -7,6 +7,7 @@ from utils import shared_tempdir import colossalai +from colossalai.testing import skip_if_not_enough_gpus from colossalai.booster import Booster from colossalai.booster.plugin import GeminiPlugin from colossalai.lazy import LazyInitContext @@ -68,7 +69,7 @@ def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: b @clear_cache_before_run() @parameterize("placement_config", OPTIM_PLACEMENT_CONFIGS) @parameterize("shard", [True, False]) -@parameterize("model_name", ["transformers_gpt"]) +@parameterize("model_name", ["transformers_llama_for_casual_lm"]) @parameterize("size_per_shard", [32]) @parameterize("tp_size", [1, 2]) @parameterize("zero_size", [2]) @@ -156,13 +157,12 @@ def run_dist(rank, world_size, port): @pytest.mark.dist -@pytest.mark.parametrize("world_size", [4]) @rerun_if_address_is_in_use() -def test_gemini_ckpIO(world_size): - spawn(run_dist, world_size) +def test_gemini_ckpIO(): + spawn(run_dist, 4) @pytest.mark.largedist -@pytest.mark.parametrize("world_size", [8]) +@skip_if_not_enough_gpus(min_gpus=8) @rerun_if_address_is_in_use() -def test_gemini_ckpIO_3d(world_size): - spawn(run_dist, world_size) \ No newline at end of file +def test_gemini_ckpIO_3d(): + spawn(run_dist, 8) \ No newline at end of file diff --git a/tests/test_checkpoint_io/test_gemini_torch_compability.py b/tests/test_checkpoint_io/test_gemini_torch_compability.py index bb7a60035e02..44a000113629 100644 --- a/tests/test_checkpoint_io/test_gemini_torch_compability.py +++ b/tests/test_checkpoint_io/test_gemini_torch_compability.py @@ -20,7 +20,7 @@ @clear_cache_before_run() @parameterize("shard", [False, True]) -@parameterize("model_name", ["transformers_gpt"]) +@parameterize("model_name", ["transformers_llama_for_casual_lm"]) def exam_torch_load_from_gemini(shard: bool, model_name: str): (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values())) criterion = lambda x: x.mean() diff --git a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py index c0bc2d2f5d0a..db3c56da874d 100644 --- a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py @@ -40,7 +40,7 @@ @clear_cache_before_run() @parameterize("shard", [True, False]) -@parameterize("model_name", ["transformers_gpt"]) +@parameterize("model_name", ["transformers_llama_for_casual_lm"]) @parameterize("size_per_shard", [32]) @parameterize("test_config", TEST_CONFIGS) def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_config: dict): diff --git a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py index a6f67e0d7729..0353ff115840 100644 --- a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py +++ b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py @@ -18,7 +18,7 @@ @clear_cache_before_run() -@parameterize("model_name", ["transformers_gpt"]) +@parameterize("model_name", ["transformers_llama_for_casual_lm"]) @parameterize("plugin_type", ["ddp", "zero", "gemini"]) def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per_shard=32): (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next( diff --git a/tests/test_lazy/test_models.py b/tests/test_lazy/test_models.py index a1b5763d4cd8..ee50e5b61009 100644 --- a/tests/test_lazy/test_models.py +++ b/tests/test_lazy/test_models.py @@ -1,11 +1,11 @@ import pytest from lazy_init_utils import SUPPORT_LAZY, check_lazy_init -from tests.kit.model_zoo import model_zoo +from tests.kit.model_zoo import model_zoo, IS_FAST_TEST, COMMON_MODELS @pytest.mark.skipif(not SUPPORT_LAZY, reason="requires torch >= 1.12.0") -@pytest.mark.parametrize("subset", ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"]) +@pytest.mark.parametrize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "diffusers", "timm", "transformers", "torchaudio", "deepfm", "dlrm"]) @pytest.mark.parametrize("default_device", ["cpu", "cuda"]) def test_torchvision_models_lazy_init(subset, default_device): sub_model_zoo = model_zoo.get_sub_registry(subset) From 92c0a295fcd6ac1e56c05a2b264896c565a3ebf0 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Wed, 10 Jan 2024 18:31:23 +0800 Subject: [PATCH 2/6] polish --- .github/workflows/build_on_pr.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 9e8b0a9562e2..f4fcd92ba2f2 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -211,9 +211,6 @@ jobs: run: | CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \ -m "not largedist" \ - --testmon \ - --testmon-forceselect \ - --testmon-cov=. \ --durations=0 \ --ignore tests/test_analyzer \ --ignore tests/test_auto_parallel \ From fba359dd0fb217f7159e5554c0c1943abe0b99a7 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Wed, 10 Jan 2024 13:06:28 +0000 Subject: [PATCH 3/6] polish --- .github/workflows/build_on_pr.yml | 118 ------------------------ .github/workflows/build_on_schedule.yml | 2 +- 2 files changed, 1 insertion(+), 119 deletions(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index f4fcd92ba2f2..50417ac8a3a0 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -22,57 +22,6 @@ on: delete: jobs: - prepare_cache: - name: Prepare testmon cache - if: | - github.event_name == 'create' && - github.event.ref_type == 'branch' && - github.event.repository.full_name == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, gpu] - container: - image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 - options: --rm - timeout-minutes: 5 - defaults: - run: - shell: bash - steps: - - name: Copy testmon cache - run: | # branch name may contain slash, we need to replace it with space - export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /") - if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then - cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}" - fi - env: - MAIN_BRANCH: ${{ github.event.master_branch }} - - prepare_cache_for_pr: - name: Prepare testmon cache for PR - if: | - github.event_name == 'pull_request' && - (github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) && - github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, gpu] - container: - image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 - options: --rm - timeout-minutes: 5 - defaults: - run: - shell: bash - concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-repare-cache - cancel-in-progress: true - steps: - - name: Copy testmon cache - run: | # branch name may contain slash, we need to replace it with space - export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /") - if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then - mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER} - fi - env: - PR_NUMBER: ${{ github.event.number }} - detect: name: Detect file change if: | @@ -199,14 +148,6 @@ jobs: # -p flag is required to preserve the file timestamp to avoid ninja rebuild cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/ - - name: Restore Testmon Cache - run: | - if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then - cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/ - fi - env: - PR_NUMBER: ${{ github.event.number }} - - name: Execute Unit Testing run: | CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \ @@ -222,21 +163,12 @@ jobs: --ignore tests/test_moe \ --ignore tests/test_smoothquant \ --ignore tests/test_checkpoint_io \ - --ignore tests/test_shardformer \ tests/ env: NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 - TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt LLAMA_PATH: /data/scratch/llama-tiny - - name: Store Testmon Cache - run: | - mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} - cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/ - env: - PR_NUMBER: ${{ github.event.number }} - - name: Collate artifact env: PR_NUMBER: ${{ github.event.number }} @@ -274,53 +206,3 @@ jobs: name: report path: report/ - store_cache: - name: Store testmon cache for PR - if: | - github.event_name == 'pull_request' && - github.event.action == 'closed' && - github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, gpu] - container: - image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 - options: --rm - timeout-minutes: 5 - defaults: - run: - shell: bash - steps: - - name: Store testmon cache if possible - if: github.event.pull_request.merged == true - run: | # branch name may contain slash, we need to replace it with space - export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /") - if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then - cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/" - fi - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - - - name: Remove testmon cache - run: | - rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER} - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - - remove_cache: - name: Remove testmon cache - if: | - github.event_name == 'delete' && - github.event.ref_type == 'branch' && - github.event.repository.full_name == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, gpu] - container: - image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 - options: --rm - timeout-minutes: 5 - defaults: - run: - shell: bash - steps: - - name: Remove testmon cache - run: | # branch name may contain slash, we need to replace it with space - export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /") - rm -rf "/github/home/testmon_cache/${BASE}" diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index c202afb58a35..2421cb8dde37 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -14,7 +14,7 @@ jobs: container: image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny - timeout-minutes: 40 + timeout-minutes: 90 steps: - name: Check GPU Availability # ensure all GPUs have enough memory id: check-avai From e765f7bf432ec76675750f58025f23399bdd570a Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Wed, 10 Jan 2024 13:24:13 +0000 Subject: [PATCH 4/6] polish --- .github/workflows/build_on_schedule.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index 2421cb8dde37..29d0f0578332 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -20,10 +20,10 @@ jobs: id: check-avai run: | avai=true - for i in $(seq 0 7); + for i in $(seq 0 3); do gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits) - [ "$gpu_used" -gt "10000" ] && avai=false + [ "$gpu_used" -gt "2000" ] && avai=false done echo "GPU is available: $avai" @@ -62,7 +62,7 @@ jobs: run: | PYTHONPATH=$PWD pytest --durations=0 tests env: - DATA: /data/scratch/cifar-10 + NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 LLAMA_PATH: /data/scratch/llama-tiny From 0474848d7efbbfb52bbeedbdeadf1a132477468e Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Wed, 10 Jan 2024 13:25:22 +0000 Subject: [PATCH 5/6] polish --- .github/workflows/build_on_schedule.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index 29d0f0578332..3bee3b4f96e2 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -60,7 +60,10 @@ jobs: - name: Unit Testing if: steps.check-avai.outputs.avai == 'true' run: | - PYTHONPATH=$PWD pytest --durations=0 tests + PYTHONPATH=$PWD pytest \ + -m "not largedist" \ + --durations=0 \ + tests/ env: NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 From b11b720a1ee03fdd194a5951ff6aae5be0512fa3 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Wed, 10 Jan 2024 13:28:52 +0000 Subject: [PATCH 6/6] polish --- tests/test_booster/test_plugin/test_3d_plugin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_booster/test_plugin/test_3d_plugin.py b/tests/test_booster/test_plugin/test_3d_plugin.py index eca5b568843b..ad878fb0c86a 100644 --- a/tests/test_booster/test_plugin/test_3d_plugin.py +++ b/tests/test_booster/test_plugin/test_3d_plugin.py @@ -68,7 +68,6 @@ def check_3d_plugin(init_method: str = "none", early_stop: bool = True): for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.get_sub_registry( "transformers_llama_for_casual_lm" ).items(): - print(name) err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) torch.cuda.empty_cache()