From f181170048f101e2bd14143841fd749ad5a084d0 Mon Sep 17 00:00:00 2001 From: GuangyaoZhang Date: Mon, 1 Jul 2024 09:35:42 +0000 Subject: [PATCH 1/2] Support Pytorch 2.2.2 --- .github/workflows/build_on_pr.yml | 2 +- colossalai/tensor/d_tensor/layout_converter.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 151454239afe..d2a45cfdbe3b 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -89,7 +89,7 @@ jobs: if: needs.detect.outputs.anyLibraryFileChanged == 'true' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 + image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch timeout-minutes: 90 defaults: diff --git a/colossalai/tensor/d_tensor/layout_converter.py b/colossalai/tensor/d_tensor/layout_converter.py index c2cf73181345..0f0150d90e7a 100644 --- a/colossalai/tensor/d_tensor/layout_converter.py +++ b/colossalai/tensor/d_tensor/layout_converter.py @@ -473,7 +473,7 @@ def _group_alive_check(cached_comm_action_sequence): for process_group in used_process_groups: try: dist.get_rank(process_group) - except RuntimeError as e: + except (ValueError, RuntimeError) as e: # If the group is not registered, it means it has been deleted if str(e) == ( f"Group {process_group} is not registered, please create group with torch.distributed.new_group API" From 11ba1d74a2a95b4e0fa91da76e0bc0e2bb6fca36 Mon Sep 17 00:00:00 2001 From: GuangyaoZhang Date: Wed, 3 Jul 2024 02:51:47 +0000 Subject: [PATCH 2/2] keep build_on_pr file and update .compatibility --- .compatibility | 1 + .github/workflows/build_on_pr.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.compatibility b/.compatibility index d90a74b584d8..7ecced62469e 100644 --- a/.compatibility +++ b/.compatibility @@ -1 +1,2 @@ 2.1.0-12.1.0 +2.2.2-12.1.0 diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index d2a45cfdbe3b..151454239afe 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -89,7 +89,7 @@ jobs: if: needs.detect.outputs.anyLibraryFileChanged == 'true' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch timeout-minutes: 90 defaults: