From 6fc6a059a0eae4ed752f4f060d5d580fad9a4497 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 13 Feb 2025 14:06:57 +0800
Subject: [PATCH 01/40] fix for async io

---
 colossalai/checkpoint_io/utils.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/colossalai/checkpoint_io/utils.py b/colossalai/checkpoint_io/utils.py
index 50b6f1438961..8984a0b6e721 100644
--- a/colossalai/checkpoint_io/utils.py
+++ b/colossalai/checkpoint_io/utils.py
@@ -309,12 +309,13 @@ def async_save_state_dict_shards(
         checkpoint_file_path = os.path.join(checkpoint, shard_file)

         if state_preprocess:
-            state_dict, _ = _flatten_optim_state_dict(state_dict=shard, seperator=".")
+            state_dict, metadata = _flatten_optim_state_dict(state_dict=shard, seperator=".")
         else:
             state_dict = shard
+            metadata = None

         # Only save on master rank.
-        writer = save(checkpoint_file_path, state_dict=state_dict)
+        writer = save(checkpoint_file_path, state_dict=state_dict, metadata=metadata)
         writers.append(writer)
         shard_filenames.append(shard_file)
         del shard
@@ -371,9 +372,10 @@ def async_move_save_state_dict_shards(
         checkpoint_file_path = os.path.join(checkpoint, shard_file)

         if state_preprocess:
-            state_dict, _ = _flatten_optim_state_dict(state_dict=shard)
+            state_dict, metadata = _flatten_optim_state_dict(state_dict=shard)
         else:
             state_dict = shard
+            metadata = None

         if pinned_state_dict is not None:
             sub_pinned_state_dict = {k: pinned_state_dict[k] for k in state_dict.keys()}
@@ -382,7 +384,7 @@ def async_move_save_state_dict_shards(
             returned_state_dict.update(sub_pinned_state_dict)

         # Only save on master rank.
-        writer = move_and_save(checkpoint_file_path, state_dict, sub_pinned_state_dict)
+        writer = move_and_save(checkpoint_file_path, state_dict, sub_pinned_state_dict, metadata)
         writers.append(writer)
         shard_filenames.append(shard_file)
         del shard
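Note on PATCH 01: the old code discarded the second return value of `_flatten_optim_state_dict`, so the async writers had no way to invert the flattening on load; the patch threads that `metadata` through to `save`/`move_and_save`. A minimal sketch of the idea follows — `flatten`/`unflatten` are illustrative stand-ins, not ColossalAI's actual helpers (the `seperator` spelling mirrors the diff):

```python
# Illustrative sketch only -- not ColossalAI's implementation. It assumes
# `_flatten_optim_state_dict` behaves roughly like `flatten` below: it joins
# nested keys with a separator and returns (flat_dict, metadata) so a loader
# can invert the transform later.
def flatten(state_dict: dict, seperator: str = ".") -> tuple[dict, dict]:
    flat, metadata = {}, {"separator": seperator}
    for outer, inner in state_dict.items():
        for k, v in inner.items():
            flat[f"{outer}{seperator}{k}"] = v
    return flat, metadata

def unflatten(flat: dict, metadata: dict) -> dict:
    sep = metadata["separator"]
    nested: dict = {}
    for flat_key, v in flat.items():
        outer, k = flat_key.split(sep, 1)
        nested.setdefault(outer, {})[k] = v
    return nested

# Round trip: dropping `metadata` (the old `_` in the diff) makes the
# unflatten step impossible, which is what this patch fixes.
flat, meta = flatten({"state": {"step": 3}, "param_groups": {"lr": 1e-3}})
assert unflatten(flat, meta) == {"state": {"step": 3}, "param_groups": {"lr": 1e-3}}
```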
From 3ecb5000e3adca011e6575f4629763f87a47b834 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 27 Mar 2025 18:08:37 +0800
Subject: [PATCH 02/40] test for upgrading transformers

---
 requirements/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index f357c45fde64..696442f2948e 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -16,7 +16,7 @@ ray
 sentencepiece
 google
 protobuf
-transformers==4.39.3
+transformers==4.50.0
 peft>=0.7.1,<=0.13.2
 bitsandbytes>=0.39.0
 rpyc==6.0.0

From 0b81be7f7f0f41bc5852c07c345d9585b4eb8fb7 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 28 Mar 2025 18:04:03 +0800
Subject: [PATCH 03/40] add ci machine

---
 .github/workflows/build_on_pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 89b7f1f3b913..0c5a41b5a27c 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -34,7 +34,7 @@ jobs:
       anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }}
       changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }}
       anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
-    runs-on: ubuntu-latest
+    runs-on: gpu-h20-10
     concurrency:
       group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change
       cancel-in-progress: true

From 6c728df3e38e3592bb210588867c74bd48f32878 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 31 Mar 2025 11:22:59 +0800
Subject: [PATCH 04/40] fix

---
 .github/workflows/build_on_pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 0c5a41b5a27c..308aebe8c651 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -87,7 +87,7 @@ jobs:
     name: Build and Test Colossal-AI
     needs: detect
     if: needs.detect.outputs.anyLibraryFileChanged == 'true'
-    runs-on: [self-hosted, gpu]
+    runs-on: gpu-h20-10
     container:
       image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
       options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch

From 43885a431774c0454d0a3dc3100d7676e8d06103 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 31 Mar 2025 15:17:30 +0800
Subject: [PATCH 05/40] fix

---
 .github/workflows/build_on_pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 308aebe8c651..e84240fa55b6 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -89,7 +89,7 @@ jobs:
     if: needs.detect.outputs.anyLibraryFileChanged == 'true'
     runs-on: gpu-h20-10
     container:
-      image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
+      image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
       options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch
     timeout-minutes: 90
     defaults:
       run:

From 837a503f50097d4c40c2587ce369b7bb5f651c0d Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 31 Mar 2025 15:32:51 +0800
Subject: [PATCH 06/40] fix

---
 requirements/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 86b4c730cd6c..688c47cc2221 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -16,7 +16,7 @@ ray
 sentencepiece
 google
 protobuf
-transformers==4.50.0
+transformers==4.39.3
 peft>=0.7.1,<=0.13.2
 bitsandbytes>=0.39.0
 rpyc==6.0.0

From 8c66b7c3e95fcc65daa14d7aadfd21d49e459ca9 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 31 Mar 2025 15:39:37 +0800
Subject: [PATCH 07/40] fix

---
 requirements/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 688c47cc2221..86b4c730cd6c 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -16,7 +16,7 @@ ray
 sentencepiece
 google
 protobuf
-transformers==4.39.3
+transformers==4.50.0
 peft>=0.7.1,<=0.13.2
 bitsandbytes>=0.39.0
 rpyc==6.0.0

From 621cb93bb12cd245b759fc2703b5a0c2ee0956ef Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 31 Mar 2025 16:16:15 +0800
Subject: [PATCH 08/40] fix

---
 requirements/requirements.txt                         | 2 +-
 tests/test_booster/test_plugin/test_gemini_plugin.py  | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 86b4c730cd6c..688c47cc2221 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -16,7 +16,7 @@ ray
 sentencepiece
 google
 protobuf
-transformers==4.50.0
+transformers==4.39.3
 peft>=0.7.1,<=0.13.2
 bitsandbytes>=0.39.0
 rpyc==6.0.0
diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py
index 2e9b24fecc6d..cf054302e920 100644
--- a/tests/test_booster/test_plugin/test_gemini_plugin.py
+++ b/tests/test_booster/test_plugin/test_gemini_plugin.py
@@ -67,7 +67,6 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, t

 # TODO(ver217): CI does not support lazy now
 # @parameterize('init_method', ['lazy', 'none', 'colo'])
-@parameterize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "transformers", "diffusers"])
 @parameterize("init_method", ["none"])
 @parameterize("zero_size", [2])

From 822556a8ca78c60c6481bdf841e04e0245fe59e5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 31 Mar 2025 08:17:16 +0000
Subject: [PATCH 09/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_booster/test_plugin/test_gemini_plugin.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py
index cf054302e920..2e9b24fecc6d 100644
--- a/tests/test_booster/test_plugin/test_gemini_plugin.py
+++ b/tests/test_booster/test_plugin/test_gemini_plugin.py
@@ -67,6 +67,7 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, t

 # TODO(ver217): CI does not support lazy now
 # @parameterize('init_method', ['lazy', 'none', 'colo'])
+@parameterize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "transformers", "diffusers"])
 @parameterize("init_method", ["none"])
 @parameterize("zero_size", [2])

From 4b8b67ae23896962483a86abe1134263ee5ef008 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Tue, 1 Apr 2025 15:32:11 +0800
Subject: [PATCH 10/40] fix

---
 tests/test_booster/test_mixed_precision/test_fp16_torch.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
index f6d6e8303904..d98171a3d554 100644
--- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py
+++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
@@ -1,5 +1,6 @@
 import torch
 from torch.optim import Adam
+import pytest

 import colossalai
 from colossalai.booster.mixed_precision import FP16TorchMixedPrecision
@@ -35,6 +36,7 @@ def run_torch_amp(rank, world_size, port):
     del model, optimizer, criterion, data, output, mixed_precision


+@pytest.mark.skip("test ci.")
 @rerun_if_address_is_in_use()
 def test_torch_ddp_plugin():
     spawn(run_torch_amp, 1)

From 3491a9f7e3dfa8a17e2ecff86bf3468b5b264c56 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 1 Apr 2025 07:34:48 +0000
Subject: [PATCH 11/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_booster/test_mixed_precision/test_fp16_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
index d98171a3d554..bb76b354dfb0 100644
--- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py
+++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
@@ -1,6 +1,6 @@
+import pytest
 import torch
 from torch.optim import Adam
-import pytest

 import colossalai
 from colossalai.booster.mixed_precision import FP16TorchMixedPrecision

From ca914147eb419b485fbf6b80e181f46a720c3064 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 9 Apr 2025 14:01:47 +0800
Subject: [PATCH 12/40] Update test_fp16_torch.py

---
 tests/test_booster/test_mixed_precision/test_fp16_torch.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
index bb76b354dfb0..f6d6e8303904 100644
--- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py
+++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
@@ -1,4 +1,3 @@
-import pytest
 import torch
 from torch.optim import Adam

@@ -36,7 +35,6 @@ def run_torch_amp(rank, world_size, port):

     del model, optimizer, criterion, data, output, mixed_precision

-@pytest.mark.skip("test ci.")
 @rerun_if_address_is_in_use()
 def test_torch_ddp_plugin():
     spawn(run_torch_amp, 1)

From 397875e640151f2d476459adc0047481e2060ccc Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 9 Apr 2025 15:14:17 +0800
Subject: [PATCH 13/40] Update build_on_pr.yml

---
 .github/workflows/build_on_pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index e84240fa55b6..ed66c04d026b 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -161,7 +161,7 @@ jobs:
             --ignore tests/test_infer_ops \
             --ignore tests/test_legacy \
             --ignore tests/test_smoothquant \
-            tests/
+            tests/test_fp8/
       env:
         LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
         LLAMA_PATH: /data/scratch/llama-tiny

From 28cf1e2c57188b116b467ef14beb7225192e8188 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 9 Apr 2025 15:20:14 +0800
Subject: [PATCH 14/40] fix

---
 tests/test_fp8/test_fp8_allgather.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py
index 91e66e83c67b..df54c252fc5e 100644
--- a/tests/test_fp8/test_fp8_allgather.py
+++ b/tests/test_fp8/test_fp8_allgather.py
@@ -1,4 +1,5 @@
 import torch
+import pytest
 import torch.distributed as dist
 from torch.distributed.distributed_c10d import _get_default_group
 from torch.testing import assert_close
@@ -36,6 +37,7 @@ def run_dist(rank, world_size, port):
     check_4gpu()


+@pytest.mark.skip("tested in corresponding shardformer")
 @rerun_if_address_is_in_use()
 def test_all_gather():
     spawn(run_dist, 4)

From b38d45ee5177d77e29c85d0a5c93c794b2c281c8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 9 Apr 2025 07:23:03 +0000
Subject: [PATCH 15/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_fp8/test_fp8_allgather.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py
index df54c252fc5e..432d24abf951 100644
--- a/tests/test_fp8/test_fp8_allgather.py
+++ b/tests/test_fp8/test_fp8_allgather.py
@@ -1,5 +1,5 @@
-import torch
 import pytest
+import torch
 import torch.distributed as dist
 from torch.distributed.distributed_c10d import _get_default_group
 from torch.testing import assert_close

From c0811d73424ba472046747d1ee674b8eac06f8c0 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 9 Apr 2025 15:52:42 +0800
Subject: [PATCH 16/40] fix

---
 tests/test_device/test_init_logical_pg.py | 2 +-
 tests/test_fp8/test_fp8_allgather.py      | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/test_device/test_init_logical_pg.py b/tests/test_device/test_init_logical_pg.py
index d93f656983d4..20d69b2a7b27 100644
--- a/tests/test_device/test_init_logical_pg.py
+++ b/tests/test_device/test_init_logical_pg.py
@@ -26,7 +26,7 @@ def check_layer(rank, world_size, port):
     dist.all_reduce(tensor, op=ReduceOp.SUM, group=pg)
     assert tensor.equal(tensor_to_check)

-
+@pytest.mark.skip("tested in corresponding shardformer")
 @pytest.mark.dist
 @rerun_if_address_is_in_use()
 def test_logical_pg():
diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py
index 432d24abf951..91e66e83c67b 100644
--- a/tests/test_fp8/test_fp8_allgather.py
+++ b/tests/test_fp8/test_fp8_allgather.py
@@ -1,4 +1,3 @@
-import pytest
 import torch
 import torch.distributed as dist
 from torch.distributed.distributed_c10d import _get_default_group
@@ -37,7 +36,6 @@ def run_dist(rank, world_size, port):
     check_4gpu()


-@pytest.mark.skip("tested in corresponding shardformer")
 @rerun_if_address_is_in_use()
 def test_all_gather():
     spawn(run_dist, 4)

From 466b61e67450b782661be2f9eaf04ee168bf1403 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 9 Apr 2025 07:53:50 +0000
Subject: [PATCH 17/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_device/test_init_logical_pg.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_device/test_init_logical_pg.py b/tests/test_device/test_init_logical_pg.py
index 20d69b2a7b27..a73f0af1670c 100644
--- a/tests/test_device/test_init_logical_pg.py
+++ b/tests/test_device/test_init_logical_pg.py
@@ -26,6 +26,7 @@ def check_layer(rank, world_size, port):
     dist.all_reduce(tensor, op=ReduceOp.SUM, group=pg)
     assert tensor.equal(tensor_to_check)

+
 @pytest.mark.skip("tested in corresponding shardformer")
 @pytest.mark.dist
 @rerun_if_address_is_in_use()

From a4e5ed9990dbeaaeca4ea3355a9129fbb40e3d37 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 9 Apr 2025 16:32:10 +0800
Subject: [PATCH 18/40] fix

---
 tests/test_fp8/test_fp8_allgather.py | 3 ++-
 tests/test_fp8/test_fp8_allreduce.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py
index 91e66e83c67b..e6b6185604f0 100644
--- a/tests/test_fp8/test_fp8_allgather.py
+++ b/tests/test_fp8/test_fp8_allgather.py
@@ -6,13 +6,14 @@
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import _all_gather_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run


 @parameterize(
     "shape",
     [(3, 7, 16)],
 )
+
 @parameterize("dtype", [torch.bfloat16, torch.float16])
 @parameterize("fp8_format", ["e4m3", "e5m2"])
 @parameterize("async_op", [True, False])
diff --git a/tests/test_fp8/test_fp8_allreduce.py b/tests/test_fp8/test_fp8_allreduce.py
index ccc43ed2979f..d7e706ffde78 100644
--- a/tests/test_fp8/test_fp8_allreduce.py
+++ b/tests/test_fp8/test_fp8_allreduce.py
@@ -5,7 +5,7 @@
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import all_reduce_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run


 @parameterize(
@@ -20,6 +20,7 @@
         (8,),
     ],
 )
+@clear_cache_before_run()
 @parameterize("dtype", [torch.float16, torch.bfloat16])
 @parameterize("fp8_format", ["e4m3", "e5m2"])
 @parameterize("async_op", [True, False])

From 57d7b16a186f347a15432ec00e34f4c4105339c7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 9 Apr 2025 08:34:30 +0000
Subject: [PATCH 19/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_fp8/test_fp8_allgather.py | 3 +--
 tests/test_fp8/test_fp8_allreduce.py | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py
index e6b6185604f0..91e66e83c67b 100644
--- a/tests/test_fp8/test_fp8_allgather.py
+++ b/tests/test_fp8/test_fp8_allgather.py
@@ -6,14 +6,13 @@
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import _all_gather_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn


 @parameterize(
     "shape",
     [(3, 7, 16)],
 )
-
 @parameterize("dtype", [torch.bfloat16, torch.float16])
 @parameterize("fp8_format", ["e4m3", "e5m2"])
 @parameterize("async_op", [True, False])
diff --git a/tests/test_fp8/test_fp8_allreduce.py b/tests/test_fp8/test_fp8_allreduce.py
index d7e706ffde78..297b05e4885d 100644
--- a/tests/test_fp8/test_fp8_allreduce.py
+++ b/tests/test_fp8/test_fp8_allreduce.py
@@ -5,7 +5,7 @@
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import all_reduce_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn


 @parameterize(

From 0e900ac5cdcf863a2e1f08ac9883e44f27eff5e5 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 9 Apr 2025 17:29:08 +0800
Subject: [PATCH 20/40] fix

---
 .github/workflows/build_on_pr.yml          | 2 +-
 tests/test_device/test_init_logical_pg.py  | 2 --
 tests/test_fp8/test_fp8_allgather.py       | 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index ed66c04d026b..e84240fa55b6 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -161,7 +161,7 @@ jobs:
             --ignore tests/test_infer_ops \
             --ignore tests/test_legacy \
             --ignore tests/test_smoothquant \
-            tests/test_fp8/
+            tests/
       env:
         LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
         LLAMA_PATH: /data/scratch/llama-tiny
diff --git a/tests/test_device/test_init_logical_pg.py b/tests/test_device/test_init_logical_pg.py
index a73f0af1670c..4be99b17cc0d 100644
--- a/tests/test_device/test_init_logical_pg.py
+++ b/tests/test_device/test_init_logical_pg.py
@@ -1,4 +1,3 @@
-import pytest
 import torch
 import torch.distributed as dist
 from torch.distributed import ReduceOp
@@ -27,7 +26,6 @@ def check_layer(rank, world_size, port):
     assert tensor.equal(tensor_to_check)


-@pytest.mark.skip("tested in corresponding shardformer")
 @pytest.mark.dist
 @rerun_if_address_is_in_use()
 def test_logical_pg():
diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py
index e6b6185604f0..f29512182984 100644
--- a/tests/test_fp8/test_fp8_allgather.py
+++ b/tests/test_fp8/test_fp8_allgather.py
@@ -13,7 +13,7 @@
     "shape",
     [(3, 7, 16)],
 )
-
+@clear_cache_before_run()
 @parameterize("dtype", [torch.bfloat16, torch.float16])
 @parameterize("fp8_format", ["e4m3", "e5m2"])
 @parameterize("async_op", [True, False])

From 603e2296c738d795b43933981df2f9cb58243b36 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 9 Apr 2025 17:56:07 +0800
Subject: [PATCH 21/40] fix

---
 tests/test_fp8/test_fp8_allgather.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py
index a7db4ff733f5..f29512182984 100644
--- a/tests/test_fp8/test_fp8_allgather.py
+++ b/tests/test_fp8/test_fp8_allgather.py
@@ -6,7 +6,7 @@
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import _all_gather_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run


 @parameterize(

From dce221283d6e29b0af44d22eaaf99c3897a902b0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 9 Apr 2025 09:57:33 +0000
Subject: [PATCH 22/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_fp8/test_fp8_allgather.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py
index f29512182984..ebbe2476a5fd 100644
--- a/tests/test_fp8/test_fp8_allgather.py
+++ b/tests/test_fp8/test_fp8_allgather.py
@@ -6,7 +6,7 @@
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import _all_gather_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn


 @parameterize(

From 25c5e420f20f70a19c97f463ceaab91ff1a5c0d1 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 9 Apr 2025 18:24:33 +0800
Subject: [PATCH 23/40] fix

---
 tests/test_device/test_init_logical_pg.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_device/test_init_logical_pg.py b/tests/test_device/test_init_logical_pg.py
index 4be99b17cc0d..d93f656983d4 100644
--- a/tests/test_device/test_init_logical_pg.py
+++ b/tests/test_device/test_init_logical_pg.py
@@ -1,3 +1,4 @@
+import pytest
 import torch
 import torch.distributed as dist
 from torch.distributed import ReduceOp
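Note on PATCHES 18-23: the churn in test_fp8_allgather.py is about where `@clear_cache_before_run()` sits relative to the `@parameterize` stack (PATCH 26 below finally hoists it above the whole stack). The sketch below shows why the position matters; it assumes `@parameterize` re-invokes the wrapped function once per value and `@clear_cache_before_run` runs once around its wrappee — neither body is the real `colossalai.testing` implementation:

```python
# Hedged sketch of decorator ordering; stand-ins, not colossalai's code.
import functools

def parameterize(name, values):
    def deco(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            for v in values:                 # one call per parameter value
                fn(*args, **{**kwargs, name: v})
        return wrapper
    return deco

def clear_cache_before_run():
    def deco(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            print("cache cleared")           # stand-in for torch.cuda.empty_cache()
            return fn(*args, **kwargs)
        return wrapper
    return deco

@clear_cache_before_run()                    # outermost: clears once, before the sweep
@parameterize("dtype", ["fp16", "bf16"])
def check(dtype):
    print("run", dtype)

check()  # -> cache cleared / run fp16 / run bf16
```

Placed below the `@parameterize` stack instead (as PATCH 20 does), the clear would run once per parameter value rather than once per sweep.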
From eaef783ec360e729d83642adf5f9c7351b626b3e Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 10 Apr 2025 10:19:38 +0800
Subject: [PATCH 24/40] fix

---
 .../test_kernels/cuda/test_flash_decoding_attention.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
index e9bf24d53531..c4267d49fd40 100644
--- a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
+++ b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
@@ -18,6 +18,7 @@
     generate_caches_and_block_tables_vllm,
     torch_attn_ref,
 )
+from colossalai.testing import clear_cache_before_run

 q_len = 1
 PARTITION_SIZE = 512
@@ -55,7 +56,7 @@ def numpy_allclose(x, y, rtol, atol):
     np.testing.assert_allclose(x_numpy, y_numpy, rtol=rtol, atol=atol)


-
+@clear_cache_before_run()
 @pytest.mark.parametrize("BATCH_SIZE", [1, 4, 7, 32])
 @pytest.mark.parametrize("BLOCK_SIZE", [8, 16, 32])
 @pytest.mark.parametrize("MAX_NUM_BLOCKS_PER_SEQ", [1, 8, 32, 256, 512])
@@ -196,7 +197,7 @@ def test_flash_decoding_attention(
     HAS_VLLM = False
     print("The subsequent test requires vllm. Please refer to https://github.com/vllm-project/vllm")

-
+@clear_cache_before_run()
 @pytest.mark.skipif(not HAS_VLLM, reason="requires vllm")
 @pytest.mark.parametrize("BATCH_SIZE", [1, 7, 32])
 @pytest.mark.parametrize("BLOCK_SIZE", [6, 32])

From 964f9a7974b59fe72c1fdcce46472530d604d5c2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 10 Apr 2025 02:20:40 +0000
Subject: [PATCH 25/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../test_kernels/cuda/test_flash_decoding_attention.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
index c4267d49fd40..d656c4834a72 100644
--- a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
+++ b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
@@ -11,6 +11,7 @@

 inference_ops = InferenceOpsLoader().load()

+from colossalai.testing import clear_cache_before_run
 from tests.test_infer.test_kernels.triton.kernel_utils import (
     convert_kv_unpad_to_padded,
     create_attention_mask,
@@ -18,7 +19,6 @@
     generate_caches_and_block_tables_vllm,
     torch_attn_ref,
 )
-from colossalai.testing import clear_cache_before_run

 q_len = 1
 PARTITION_SIZE = 512
@@ -56,6 +56,7 @@ def numpy_allclose(x, y, rtol, atol):
     np.testing.assert_allclose(x_numpy, y_numpy, rtol=rtol, atol=atol)


+
 @clear_cache_before_run()
 @pytest.mark.parametrize("BATCH_SIZE", [1, 4, 7, 32])
 @pytest.mark.parametrize("BLOCK_SIZE", [8, 16, 32])
@@ -198,6 +199,7 @@ def test_flash_decoding_attention(
     print("The subsequent test requires vllm. Please refer to https://github.com/vllm-project/vllm")


+
 @clear_cache_before_run()
 @pytest.mark.skipif(not HAS_VLLM, reason="requires vllm")
 @pytest.mark.parametrize("BATCH_SIZE", [1, 7, 32])

From e8a3d52381f88e925db938c188d5bb33be3b45c6 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 10 Apr 2025 12:55:02 +0800
Subject: [PATCH 26/40] fix

---
 tests/test_fp8/test_all_to_all_single.py     | 4 +++-
 tests/test_fp8/test_fp8_all_to_all.py        | 3 ++-
 tests/test_fp8/test_fp8_all_to_all_single.py | 3 ++-
 tests/test_fp8/test_fp8_allgather.py         | 2 +-
 tests/test_fp8/test_fp8_cast.py              | 4 +++-
 tests/test_fp8/test_fp8_fsdp_comm_hook.py    | 4 ++--
 tests/test_fp8/test_fp8_reduce_scatter.py    | 3 ++-
 7 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/tests/test_fp8/test_all_to_all_single.py b/tests/test_fp8/test_all_to_all_single.py
index 722cbce9ac02..0de5e836a930 100644
--- a/tests/test_fp8/test_all_to_all_single.py
+++ b/tests/test_fp8/test_all_to_all_single.py
@@ -6,9 +6,10 @@
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import all_to_all_single_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run


+@clear_cache_before_run()
 @parameterize("shape", [(4,), (1, 8, 16), (4, 8, 16)])
 @parameterize("dtype", [torch.bfloat16, torch.float16])
 @parameterize("async_op", [True, False])
@@ -24,6 +25,7 @@ def check_all2all(shape, dtype, async_op):
     assert_close(output, output_fp8, rtol=0.1, atol=0.1)


+@clear_cache_before_run()
 @parameterize("shape", [(8, 8, 16)])
 @parameterize("dtype", [torch.bfloat16, torch.float16])
 @parameterize("async_op", [True, False])
diff --git a/tests/test_fp8/test_fp8_all_to_all.py b/tests/test_fp8/test_fp8_all_to_all.py
index 98bbbad8550d..236ac2af8e94 100644
--- a/tests/test_fp8/test_fp8_all_to_all.py
+++ b/tests/test_fp8/test_fp8_all_to_all.py
@@ -6,9 +6,10 @@
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import _all_to_all_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run


+@clear_cache_before_run()
 @parameterize("shape", [(16, 8, 4)])
 @parameterize("scatter_dim", [0, 1, 2])
 @parameterize("dtype", [torch.bfloat16, torch.float16])
diff --git a/tests/test_fp8/test_fp8_all_to_all_single.py b/tests/test_fp8/test_fp8_all_to_all_single.py
index 70765f2d48de..b5229d097579 100644
--- a/tests/test_fp8/test_fp8_all_to_all_single.py
+++ b/tests/test_fp8/test_fp8_all_to_all_single.py
@@ -6,11 +6,12 @@
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import all_to_all_single_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run

 dist.all_to_all_single


+@clear_cache_before_run()
 @parameterize("shape", [(4), (8, 7), (4, 8, 16)])
 @parameterize("dtype", [torch.bfloat16, torch.float16])
 @parameterize("fp8_format", ["e4m3", "e5m2"])
diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py
index ebbe2476a5fd..79b55395db8e 100644
--- a/tests/test_fp8/test_fp8_allgather.py
+++ b/tests/test_fp8/test_fp8_allgather.py
@@ -9,11 +9,11 @@
 from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn


+@clear_cache_before_run()
 @parameterize(
     "shape",
     [(3, 7, 16)],
 )
-@clear_cache_before_run()
 @parameterize("dtype", [torch.bfloat16, torch.float16])
 @parameterize("fp8_format", ["e4m3", "e5m2"])
 @parameterize("async_op", [True, False])
diff --git a/tests/test_fp8/test_fp8_cast.py b/tests/test_fp8/test_fp8_cast.py
index db9a909e60a7..88bdc094f431 100644
--- a/tests/test_fp8/test_fp8_cast.py
+++ b/tests/test_fp8/test_fp8_cast.py
@@ -3,9 +3,11 @@

 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import cast_from_fp8, cast_from_fp8_pipeline, cast_to_fp8, cast_to_fp8_pipeline
-from colossalai.testing import parameterize
+from colossalai.testing import parameterize, clear_cache_before_run


+
+@clear_cache_before_run()
 @parameterize("shape", [(100, 10), (10, 100), (3, 7), (2, 1), (1, 2), (2, 2), (4, 2), (5,), (4,), (2,)])
 @parameterize("dtype", [torch.bfloat16, torch.float16, torch.float32])
 @parameterize("fp8_format", ["e4m3", "e5m2"])
diff --git a/tests/test_fp8/test_fp8_fsdp_comm_hook.py b/tests/test_fp8/test_fp8_fsdp_comm_hook.py
index 3d0660961f17..97ba0ff364a8 100644
--- a/tests/test_fp8/test_fp8_fsdp_comm_hook.py
+++ b/tests/test_fp8/test_fp8_fsdp_comm_hook.py
@@ -8,7 +8,7 @@
 from torch.testing import assert_close

 from colossalai import launch
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run


 # example modified from https://pytorch.org/tutorials/intermediate/ddp_tutorial.html
@@ -27,7 +27,7 @@ def __init__(self):
     def forward(self, x):
         return self.net2(self.relu(self.net1(x)))

-
+@clear_cache_before_run()
 @parameterize("mode", ["grad", "params"])
 def run_model(mode):
     rank = dist.get_rank()
diff --git a/tests/test_fp8/test_fp8_reduce_scatter.py b/tests/test_fp8/test_fp8_reduce_scatter.py
index e0b558a257ed..7a2dc31889c9 100644
--- a/tests/test_fp8/test_fp8_reduce_scatter.py
+++ b/tests/test_fp8/test_fp8_reduce_scatter.py
@@ -6,9 +6,10 @@
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import reduce_scatter_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run


+@clear_cache_before_run()
 @parameterize("shape", [(16, 8, 4)])
 @parameterize("scatter_dim", [0, 1, 2])
 @parameterize("dtype", [torch.bfloat16, torch.float16])

From 6997862a91bb871d2c458c8e92bb88032643f59e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 10 Apr 2025 04:58:49 +0000
Subject: [PATCH 27/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_fp8/test_all_to_all_single.py     | 2 +-
 tests/test_fp8/test_fp8_all_to_all.py        | 2 +-
 tests/test_fp8/test_fp8_all_to_all_single.py | 2 +-
 tests/test_fp8/test_fp8_cast.py              | 3 +--
 tests/test_fp8/test_fp8_fsdp_comm_hook.py    | 3 ++-
 tests/test_fp8/test_fp8_reduce_scatter.py    | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/test_fp8/test_all_to_all_single.py b/tests/test_fp8/test_all_to_all_single.py
index 0de5e836a930..448a3f031a29 100644
--- a/tests/test_fp8/test_all_to_all_single.py
+++ b/tests/test_fp8/test_all_to_all_single.py
@@ -6,7 +6,7 @@
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import all_to_all_single_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn


 @clear_cache_before_run()
diff --git a/tests/test_fp8/test_fp8_all_to_all.py b/tests/test_fp8/test_fp8_all_to_all.py
index 236ac2af8e94..a86741b4cb4f 100644
--- a/tests/test_fp8/test_fp8_all_to_all.py
+++ b/tests/test_fp8/test_fp8_all_to_all.py
@@ -6,7 +6,7 @@
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import _all_to_all_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn


 @clear_cache_before_run()
diff --git a/tests/test_fp8/test_fp8_all_to_all_single.py b/tests/test_fp8/test_fp8_all_to_all_single.py
index b5229d097579..a301301b3e75 100644
--- a/tests/test_fp8/test_fp8_all_to_all_single.py
+++ b/tests/test_fp8/test_fp8_all_to_all_single.py
@@ -6,7 +6,7 @@
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import all_to_all_single_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

 dist.all_to_all_single

diff --git a/tests/test_fp8/test_fp8_cast.py b/tests/test_fp8/test_fp8_cast.py
index 88bdc094f431..479cb37701a6 100644
--- a/tests/test_fp8/test_fp8_cast.py
+++ b/tests/test_fp8/test_fp8_cast.py
@@ -3,8 +3,7 @@

 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import cast_from_fp8, cast_from_fp8_pipeline, cast_to_fp8, cast_to_fp8_pipeline
-from colossalai.testing import parameterize, clear_cache_before_run
-
+from colossalai.testing import clear_cache_before_run, parameterize

 @clear_cache_before_run()
diff --git a/tests/test_fp8/test_fp8_fsdp_comm_hook.py b/tests/test_fp8/test_fp8_fsdp_comm_hook.py
index 97ba0ff364a8..a95fbdf013de 100644
--- a/tests/test_fp8/test_fp8_fsdp_comm_hook.py
+++ b/tests/test_fp8/test_fp8_fsdp_comm_hook.py
@@ -8,7 +8,7 @@
 from torch.testing import assert_close

 from colossalai import launch
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn


 # example modified from https://pytorch.org/tutorials/intermediate/ddp_tutorial.html
@@ -27,6 +27,7 @@ def __init__(self):
     def forward(self, x):
         return self.net2(self.relu(self.net1(x)))

+
 @clear_cache_before_run()
 @parameterize("mode", ["grad", "params"])
 def run_model(mode):
diff --git a/tests/test_fp8/test_fp8_reduce_scatter.py b/tests/test_fp8/test_fp8_reduce_scatter.py
index 7a2dc31889c9..a2eac1c7ef72 100644
--- a/tests/test_fp8/test_fp8_reduce_scatter.py
+++ b/tests/test_fp8/test_fp8_reduce_scatter.py
@@ -6,7 +6,7 @@
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import reduce_scatter_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn


 @clear_cache_before_run()
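Note on the [pre-commit.ci] commits: each one rewrites `from colossalai.testing import ...` into alphabetical name order, which is what an isort hook does. The repo's actual pre-commit configuration is not part of this series, so the hook and its settings are assumptions here; `isort.code` is isort's real string-formatting API and reproduces the rewrite:

```python
# Assumes the repo's pre-commit runs an isort-style hook; the exact hook
# config is not shown in this series, so this is illustrative. line_length
# is raised so the sorted import stays on one line, as in the bot's output.
import isort

messy = "from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run\n"
print(isort.code(messy, line_length=120), end="")
# from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
```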
From de4f7a1d2542ed025514a90b2dbaf5f63434871d Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 10 Apr 2025 14:34:39 +0800
Subject: [PATCH 28/40] fix

---
 tests/test_booster/test_mixed_precision/test_fp16_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
index f6d6e8303904..341be96fdd74 100644
--- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py
+++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
@@ -3,10 +3,11 @@

 import colossalai
 from colossalai.booster.mixed_precision import FP16TorchMixedPrecision
-from colossalai.testing import rerun_if_address_is_in_use, spawn
+from colossalai.testing import rerun_if_address_is_in_use, spawn, clear_cache_before_run
 from tests.kit.model_zoo import model_zoo


+@clear_cache_before_run()
 def run_torch_amp(rank, world_size, port):
     # init dist env
     colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")

From 0d09c0e80f1a3d65cbf6fbcf434de7e5ad316f0a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 10 Apr 2025 06:36:21 +0000
Subject: [PATCH 29/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_booster/test_mixed_precision/test_fp16_torch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
index 341be96fdd74..808b11d87641 100644
--- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py
+++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
@@ -3,7 +3,7 @@

 import colossalai
 from colossalai.booster.mixed_precision import FP16TorchMixedPrecision
-from colossalai.testing import rerun_if_address_is_in_use, spawn, clear_cache_before_run
+from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import model_zoo

From 914b1794353e78746a3d041b2888b395fa435c1e Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 10 Apr 2025 15:41:54 +0800
Subject: [PATCH 30/40] fix

---
 .github/workflows/build_on_pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index e84240fa55b6..b26ed427d280 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -161,7 +161,7 @@ jobs:
             --ignore tests/test_infer_ops \
             --ignore tests/test_legacy \
             --ignore tests/test_smoothquant \
-            tests/
+            tests/test_booster/test_mixed_precision/test_fp16_torch.py
       env:
         LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
         LLAMA_PATH: /data/scratch/llama-tiny

From 21707a77d3b7ee457a1fe666746b302db0c5b1de Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 10 Apr 2025 16:39:08 +0800
Subject: [PATCH 31/40] fix

---
 .github/workflows/build_on_pr.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index b26ed427d280..12568e8902b8 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -90,7 +90,7 @@ jobs:
     runs-on: gpu-h20-10
     container:
       image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
-      options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch
+      options: --gpus all --shm-size=2g --rm -v /dev/shm -v /data/scratch:/data/scratch
     timeout-minutes: 90
     defaults:
       run:

From 910433f070e6c12830925925716c6250fa7f253b Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 10 Apr 2025 17:28:59 +0800
Subject: [PATCH 32/40] fix

---
 tests/test_booster/test_mixed_precision/test_fp16_torch.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
index 808b11d87641..3fd6b7df111f 100644
--- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py
+++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
@@ -3,11 +3,10 @@

 import colossalai
 from colossalai.booster.mixed_precision import FP16TorchMixedPrecision
-from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
+from colossalai.testing import rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import model_zoo


-@clear_cache_before_run()
 def run_torch_amp(rank, world_size, port):
     # init dist env
     colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
@@ -34,7 +34,6 @@ def run_torch_amp(rank, world_size, port):
     optimizer.step()
     del model, optimizer, criterion, data, output, mixed_precision

-
 @rerun_if_address_is_in_use()
 def test_torch_ddp_plugin():
     spawn(run_torch_amp, 1)

From 0950b07a328809335470812959341c585f1a9e2a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 10 Apr 2025 09:32:53 +0000
Subject: [PATCH 33/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_booster/test_mixed_precision/test_fp16_torch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
index 3fd6b7df111f..f6d6e8303904 100644
--- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py
+++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
@@ -34,6 +34,7 @@ def run_torch_amp(rank, world_size, port):
     optimizer.step()
     del model, optimizer, criterion, data, output, mixed_precision

+
 @rerun_if_address_is_in_use()
 def test_torch_ddp_plugin():
     spawn(run_torch_amp, 1)

From db4c73f643c6a1d6b9a4859a280c59567823775a Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 11 Apr 2025 11:20:35 +0800
Subject: [PATCH 34/40] fix

---
 .github/workflows/build_on_pr.yml                           | 2 +-
 tests/test_booster/test_mixed_precision/test_fp16_torch.py  | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 12568e8902b8..abb5d87b8cc9 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -161,7 +161,7 @@ jobs:
             --ignore tests/test_infer_ops \
             --ignore tests/test_legacy \
             --ignore tests/test_smoothquant \
-            tests/test_booster/test_mixed_precision/test_fp16_torch.py
+            tests/
       env:
         LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
         LLAMA_PATH: /data/scratch/llama-tiny
diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
index 3fd6b7df111f..09ec1b88f766 100644
--- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py
+++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
@@ -5,6 +5,7 @@
 from colossalai.booster.mixed_precision import FP16TorchMixedPrecision
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import model_zoo
+import pytest


 def run_torch_amp(rank, world_size, port):
@@ -34,6 +35,7 @@ def run_torch_amp(rank, world_size, port):
     optimizer.step()
     del model, optimizer, criterion, data, output, mixed_precision

+@pytest.mark.skip(reason="Skip because assertion may fail for CI devices")
 @rerun_if_address_is_in_use()
 def test_torch_ddp_plugin():
     spawn(run_torch_amp, 1)

From dc60efe1545b4eb9fa84ba2816d45af499f22b40 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 11 Apr 2025 03:22:25 +0000
Subject: [PATCH 35/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/test_booster/test_mixed_precision/test_fp16_torch.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
index 09ec1b88f766..1d4a5c0d8768 100644
--- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py
+++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
@@ -1,3 +1,4 @@
+import pytest
 import torch
 from torch.optim import Adam

@@ -5,7 +6,6 @@
 from colossalai.booster.mixed_precision import FP16TorchMixedPrecision
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import model_zoo
-import pytest


 def run_torch_amp(rank, world_size, port):
@@ -35,6 +35,7 @@ def run_torch_amp(rank, world_size, port):
     optimizer.step()
     del model, optimizer, criterion, data, output, mixed_precision

+
 @pytest.mark.skip(reason="Skip because assertion may fail for CI devices")
 @rerun_if_address_is_in_use()
 def test_torch_ddp_plugin():

From a2e623db78777c0b55a04cc4f38a8a98b858da4b Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 17 Apr 2025 16:49:48 +0800
Subject: [PATCH 36/40] fix

---
 .github/workflows/build_on_pr.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index abb5d87b8cc9..50d488f18541 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -34,7 +34,7 @@ jobs:
       anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }}
       changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }}
       anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
-    runs-on: gpu-h20-10
+    runs-on: ubuntu-latest
     concurrency:
       group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change
       cancel-in-progress: true
@@ -87,7 +87,7 @@ jobs:
     name: Build and Test Colossal-AI
     needs: detect
     if: needs.detect.outputs.anyLibraryFileChanged == 'true'
-    runs-on: gpu-h20-10
+    runs-on: ubuntu-latest
     container:
       image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
       options: --gpus all --shm-size=2g --rm -v /dev/shm -v /data/scratch:/data/scratch

From afe07a63aceee8f0d9dfe27dd1763acc8fd26386 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 17 Apr 2025 17:53:48 +0800
Subject: [PATCH 37/40] fix

---
 tests/test_booster/test_mixed_precision/test_fp16_torch.py | 2 --
 .../test_kernels/cuda/test_flash_decoding_attention.py     | 2 --
 2 files changed, 4 deletions(-)

diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
index 1d4a5c0d8768..f6d6e8303904 100644
--- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py
+++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py
@@ -1,4 +1,3 @@
-import pytest
 import torch
 from torch.optim import Adam

@@ -36,7 +35,6 @@ def run_torch_amp(rank, world_size, port):
     del model, optimizer, criterion, data, output, mixed_precision


-@pytest.mark.skip(reason="Skip because assertion may fail for CI devices")
 @rerun_if_address_is_in_use()
 def test_torch_ddp_plugin():
     spawn(run_torch_amp, 1)
diff --git a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
index d656c4834a72..c93055fece9f 100644
--- a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
+++ b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
@@ -11,7 +11,6 @@

 inference_ops = InferenceOpsLoader().load()

-from colossalai.testing import clear_cache_before_run
 from tests.test_infer.test_kernels.triton.kernel_utils import (
     convert_kv_unpad_to_padded,
     create_attention_mask,
@@ -56,7 +56,6 @@ def numpy_allclose(x, y, rtol, atol):
     np.testing.assert_allclose(x_numpy, y_numpy, rtol=rtol, atol=atol)


-@clear_cache_before_run()
 @pytest.mark.parametrize("BATCH_SIZE", [1, 4, 7, 32])
 @pytest.mark.parametrize("BLOCK_SIZE", [8, 16, 32])
 @pytest.mark.parametrize("MAX_NUM_BLOCKS_PER_SEQ", [1, 8, 32, 256, 512])

From 7af46ab6676f14726cd336eef8ea74fc9c3541bd Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 17 Apr 2025 17:59:46 +0800
Subject: [PATCH 38/40] fix

---
 .../test_kernels/cuda/test_flash_decoding_attention.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
index c93055fece9f..e9bf24d53531 100644
--- a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
+++ b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
@@ -197,7 +197,6 @@ def test_flash_decoding_attention(
     print("The subsequent test requires vllm. Please refer to https://github.com/vllm-project/vllm")


-@clear_cache_before_run()
 @pytest.mark.skipif(not HAS_VLLM, reason="requires vllm")
 @pytest.mark.parametrize("BATCH_SIZE", [1, 7, 32])
 @pytest.mark.parametrize("BLOCK_SIZE", [6, 32])

From 52ead00795e567b6f2ce81558aa9297e4863a4d2 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 18 Apr 2025 11:29:24 +0800
Subject: [PATCH 39/40] fix

---
 .github/workflows/build_on_pr.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 50d488f18541..35040451a466 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -34,7 +34,7 @@ jobs:
       anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }}
       changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }}
       anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
-    runs-on: ubuntu-latest
+    runs-on: [self-hosted, gpu]
     concurrency:
       group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change
      cancel-in-progress: true
@@ -87,7 +87,7 @@ jobs:
     name: Build and Test Colossal-AI
     needs: detect
     if: needs.detect.outputs.anyLibraryFileChanged == 'true'
-    runs-on: ubuntu-latest
+    runs-on: [self-hosted, gpu]
     container:
       image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
       options: --gpus all --shm-size=2g --rm -v /dev/shm -v /data/scratch:/data/scratch

From 0c5ed653051b5ac72a73d898103b6bf1ee511db5 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 18 Apr 2025 11:33:44 +0800
Subject: [PATCH 40/40] fix

---
 .github/workflows/build_on_pr.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 35040451a466..50d488f18541 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -34,7 +34,7 @@ jobs:
       anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }}
       changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }}
       anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
-    runs-on: [self-hosted, gpu]
+    runs-on: ubuntu-latest
     concurrency:
       group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change
       cancel-in-progress: true
@@ -87,7 +87,7 @@ jobs:
     name: Build and Test Colossal-AI
     needs: detect
     if: needs.detect.outputs.anyLibraryFileChanged == 'true'
-    runs-on: [self-hosted, gpu]
+    runs-on: ubuntu-latest
     container:
       image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
       options: --gpus all --shm-size=2g --rm -v /dev/shm -v /data/scratch:/data/scratch
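Note on the restored test selection: PATCH 34 returns the workflow's pytest target to `tests/`, and the series ends with the suite running against that full path. The options visible in the build_on_pr.yml hunks can be replayed locally through `pytest.main`; the workflow's remaining `--ignore` flags fall outside the shown diff context and are deliberately not guessed here.

```python
# Replays only the pytest options visible in the build_on_pr.yml hunks above;
# flags elided from the diff context are intentionally omitted.
import sys

import pytest

if __name__ == "__main__":
    sys.exit(pytest.main([
        "--ignore", "tests/test_infer_ops",
        "--ignore", "tests/test_legacy",
        "--ignore", "tests/test_smoothquant",
        "tests/",
    ]))
```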