From bd21cd39d44398071aaf2f0988e17fc96d810afd Mon Sep 17 00:00:00 2001 From: Edenzzzz Date: Mon, 19 Aug 2024 08:31:36 +0000 Subject: [PATCH 1/7] remove triton version --- requirements/requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt index 93a3690fe1d3..3fcf53e1858e 100644 --- a/requirements/requirements-test.txt +++ b/requirements/requirements-test.txt @@ -9,7 +9,7 @@ torchx-nightly==2022.6.29 # torchrec 0.2.0 requires torchx-nightly. This package torchrec==0.2.0 contexttimer einops -triton==2.1.0 +triton requests==2.27.1 # downgrade to avoid huggingface error https://github.com/huggingface/transformers/issues/17611 SentencePiece ninja From a4632943215eb5f68c4e1d0dc14e4993b767a1bb Mon Sep 17 00:00:00 2001 From: Edenzzzz Date: Mon, 19 Aug 2024 09:01:52 +0000 Subject: [PATCH 2/7] remove torch 2.2 --- .compatibility | 1 - 1 file changed, 1 deletion(-) diff --git a/.compatibility b/.compatibility index 62d19faffa9e..3b3d4a378458 100644 --- a/.compatibility +++ b/.compatibility @@ -1,4 +1,3 @@ 2.1.0-12.1.0 -2.2.2-12.1.0 2.3.0-12.1.0 2.4.0-12.4.1 From d00e16bff14ce894770b4efd601276700a054758 Mon Sep 17 00:00:00 2001 From: Edenzzzz Date: Tue, 20 Aug 2024 08:22:51 +0000 Subject: [PATCH 3/7] remove torch 2.1 --- .compatibility | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.compatibility b/.compatibility index 3b3d4a378458..e1836506aae6 100644 --- a/.compatibility +++ b/.compatibility @@ -1,3 +1,3 @@ -2.1.0-12.1.0 +2.2.2-12.1.0 2.3.0-12.1.0 2.4.0-12.4.1 From 94e62f09fcec7872cbc479647d7c1bc3b67a3bb5 Mon Sep 17 00:00:00 2001 From: Edenzzzz Date: Wed, 21 Aug 2024 09:14:10 +0000 Subject: [PATCH 4/7] debug --- .compatibility | 4 +--- .github/workflows/compatiblity_test_on_pr.yml | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.compatibility b/.compatibility index e1836506aae6..d90a74b584d8 100644 --- a/.compatibility +++ b/.compatibility @@ -1,3 +1 @@ -2.2.2-12.1.0 -2.3.0-12.1.0 -2.4.0-12.4.1 +2.1.0-12.1.0 diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index 770f4b933156..303a83558449 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -67,7 +67,7 @@ jobs: - name: Unit Testing run: | - PYTHONPATH=$PWD pytest --durations=0 tests + PYTHONPATH=$PWD CUDA_LAUNCH_BLOCKING=1 pytest --durations=0 tests env: DATA: /data/scratch/cifar-10 LD_LIBRARY_PATH: /github/home/.tensornvme/lib From 8e9033166af5d8cf6582c30464f82b6a7601fe41 Mon Sep 17 00:00:00 2001 From: Edenzzzz Date: Thu, 22 Aug 2024 12:04:03 +0000 Subject: [PATCH 5/7] remove 2.1 build tests --- .compatibility | 4 +++- .github/workflows/build_on_pr.yml | 2 +- .github/workflows/build_on_schedule.yml | 2 +- .github/workflows/doc_test_on_pr.yml | 2 +- .github/workflows/doc_test_on_schedule.yml | 2 +- .github/workflows/example_check_on_dispatch.yml | 2 +- .github/workflows/example_check_on_pr.yml | 2 +- .github/workflows/example_check_on_schedule.yml | 2 +- .github/workflows/run_chatgpt_examples.yml | 2 +- .github/workflows/run_chatgpt_unit_tests.yml | 2 +- .github/workflows/run_colossalqa_unit_tests.yml | 2 +- 11 files changed, 13 insertions(+), 11 deletions(-) diff --git a/.compatibility b/.compatibility index d90a74b584d8..e1836506aae6 100644 --- a/.compatibility +++ b/.compatibility @@ -1 +1,3 @@ -2.1.0-12.1.0 +2.2.2-12.1.0 +2.3.0-12.1.0 +2.4.0-12.4.1 diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 58cd8826809a..ceb33c9ac7a8 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -89,7 +89,7 @@ jobs: if: needs.detect.outputs.anyLibraryFileChanged == 'true' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 + image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch timeout-minutes: 90 defaults: diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index fc688a71bd92..f8ca07d9731e 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -12,7 +12,7 @@ jobs: if: github.repository == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 + image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/ timeout-minutes: 90 steps: diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml index 31c421846e2c..2e0ff6a59c74 100644 --- a/.github/workflows/doc_test_on_pr.yml +++ b/.github/workflows/doc_test_on_pr.yml @@ -56,7 +56,7 @@ jobs: needs: detect-changed-doc runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 + image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm timeout-minutes: 30 defaults: diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml index e2491e4607f5..3ea6481f9980 100644 --- a/.github/workflows/doc_test_on_schedule.yml +++ b/.github/workflows/doc_test_on_schedule.yml @@ -12,7 +12,7 @@ jobs: name: Test the changed Doc runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 + image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm timeout-minutes: 60 steps: diff --git a/.github/workflows/example_check_on_dispatch.yml b/.github/workflows/example_check_on_dispatch.yml index d877b06cee1c..6a65c4ff5462 100644 --- a/.github/workflows/example_check_on_dispatch.yml +++ b/.github/workflows/example_check_on_dispatch.yml @@ -45,7 +45,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 + image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm timeout-minutes: 15 steps: diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml index 56fa006b1633..fac40be80cef 100644 --- a/.github/workflows/example_check_on_pr.yml +++ b/.github/workflows/example_check_on_pr.yml @@ -89,7 +89,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 + image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm timeout-minutes: 30 concurrency: diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml index 6ec1b0591fc3..bc98e0b0ce5b 100644 --- a/.github/workflows/example_check_on_schedule.yml +++ b/.github/workflows/example_check_on_schedule.yml @@ -34,7 +34,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 + image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm timeout-minutes: 30 steps: diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml index d0b5c2164119..fb7acaf27cb9 100644 --- a/.github/workflows/run_chatgpt_examples.yml +++ b/.github/workflows/run_chatgpt_examples.yml @@ -19,7 +19,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 + image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data --shm-size=10.24gb timeout-minutes: 60 defaults: diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml index c0e74ecbbab0..21545098af74 100644 --- a/.github/workflows/run_chatgpt_unit_tests.yml +++ b/.github/workflows/run_chatgpt_unit_tests.yml @@ -19,7 +19,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 + image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data timeout-minutes: 30 defaults: diff --git a/.github/workflows/run_colossalqa_unit_tests.yml b/.github/workflows/run_colossalqa_unit_tests.yml index 00944b92d9b6..326ef4526a43 100644 --- a/.github/workflows/run_colossalqa_unit_tests.yml +++ b/.github/workflows/run_colossalqa_unit_tests.yml @@ -19,7 +19,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 + image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 volumes: - /data/scratch/test_data_colossalqa:/data/scratch/test_data_colossalqa - /data/scratch/llama-tiny:/data/scratch/llama-tiny From c18007f6e7ec35d3ee8bdef5b82a176034b71736 Mon Sep 17 00:00:00 2001 From: Edenzzzz Date: Thu, 22 Aug 2024 12:52:31 +0000 Subject: [PATCH 6/7] require torch >=2.2 --- README.md | 2 +- requirements/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 69506e338f34..22c565b5058d 100644 --- a/README.md +++ b/README.md @@ -420,7 +420,7 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt ## Installation Requirements: -- PyTorch >= 2.1 +- PyTorch >= 2.2 - Python >= 3.7 - CUDA >= 11.0 - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 578122d47072..b77a33b0a151 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -8,7 +8,7 @@ click fabric contexttimer ninja -torch>=2.1.0,<=2.4.0 +torch>=2.2.0,<=2.4.0 safetensors einops pydantic From fb1142e20713eecc7e414ac329e358d446dfb6ee Mon Sep 17 00:00:00 2001 From: Edenzzzz Date: Mon, 26 Aug 2024 03:50:28 +0000 Subject: [PATCH 7/7] remove blocking --- .github/workflows/compatiblity_test_on_pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index 303a83558449..770f4b933156 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -67,7 +67,7 @@ jobs: - name: Unit Testing run: | - PYTHONPATH=$PWD CUDA_LAUNCH_BLOCKING=1 pytest --durations=0 tests + PYTHONPATH=$PWD pytest --durations=0 tests env: DATA: /data/scratch/cifar-10 LD_LIBRARY_PATH: /github/home/.tensornvme/lib