From ffa3d76a6c68c164aaa8c5814b8d746efe81bc5a Mon Sep 17 00:00:00 2001 From: ver217 Date: Mon, 8 Jul 2024 16:41:55 +0800 Subject: [PATCH 1/9] [misc] support torch2.3 --- .compatibility | 1 + requirements/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.compatibility b/.compatibility index 7ecced62469e..4f808740bc02 100644 --- a/.compatibility +++ b/.compatibility @@ -1,2 +1,3 @@ 2.1.0-12.1.0 2.2.2-12.1.0 +2.3.0-12.1.0 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 27bbc3769448..a01445200676 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -8,7 +8,7 @@ click fabric contexttimer ninja -torch>=2.1.0,<2.3.0 +torch>=2.1.0,<=2.3.0 safetensors einops pydantic From b9f17f5a97d8e4de7dbb807f73d272d17b5ad214 Mon Sep 17 00:00:00 2001 From: ver217 Date: Tue, 9 Jul 2024 11:19:13 +0800 Subject: [PATCH 2/9] [devops] update compatibility ci --- .../compatiblity_test_on_dispatch.yml | 18 +++++++----------- .github/workflows/compatiblity_test_on_pr.yml | 19 ++++++++----------- .../compatiblity_test_on_schedule.yml | 17 +++++------------ 3 files changed, 20 insertions(+), 34 deletions(-) diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml index 3eee564c29ea..da118ff2de43 100644 --- a/.github/workflows/compatiblity_test_on_dispatch.yml +++ b/.github/workflows/compatiblity_test_on_dispatch.yml @@ -56,17 +56,7 @@ jobs: - name: Install dependencies run: | pip install -U pip setuptools==68.2.2 wheel --user - - uses: actions/checkout@v2 - with: - repository: hpcaitech/TensorNVMe - ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} - path: TensorNVMe - - name: Install tensornvme - run: | - cd TensorNVMe - apt update && apt install -y cmake - pip install -r requirements.txt - DISABLE_URING=1 pip install -v . + - uses: actions/checkout@v2 with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} @@ -85,6 +75,12 @@ jobs: run: | BUILD_EXT=1 pip install -v . pip install -r requirements/requirements-test.txt + + - name: Install tensornvme + run: | + apt update && apt install -y cmake + DISABLE_URING=1 pip install -v git+https://github.com/hpcaitech/TensorNVMe.git + - name: Unit Testing run: | PYTHONPATH=$PWD pytest --durations=0 tests diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index b418c843e7f6..cde5ef1c62a1 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -50,20 +50,11 @@ jobs: - name: Install dependencies run: | pip install -U pip setuptools==68.2.2 wheel --user - - uses: actions/checkout@v2 - with: - repository: hpcaitech/TensorNVMe - ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} - path: TensorNVMe - - name: Install tensornvme - run: | - cd TensorNVMe - apt update && apt install -y cmake - pip install -r requirements.txt - DISABLE_URING=1 pip install -v . + - uses: actions/checkout@v2 with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} + - name: Download cub for CUDA 10.2 run: | CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}') @@ -80,6 +71,12 @@ jobs: run: | BUILD_EXT=1 pip install -v . pip install -r requirements/requirements-test.txt + + - name: Install tensornvme + run: | + apt update && apt install -y cmake + DISABLE_URING=1 pip install -v git+https://github.com/hpcaitech/TensorNVMe.git + - name: Unit Testing run: | PYTHONPATH=$PWD pytest --durations=0 tests diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml index 8d98e775c828..494392c285b2 100644 --- a/.github/workflows/compatiblity_test_on_schedule.yml +++ b/.github/workflows/compatiblity_test_on_schedule.yml @@ -45,18 +45,6 @@ jobs: run: | pip install -U pip setuptools==68.2.2 wheel --user - - uses: actions/checkout@v2 - with: - repository: hpcaitech/TensorNVMe - ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} - path: TensorNVMe - - - name: Install tensornvme - run: | - cd TensorNVMe - apt update && apt install -y cmake - pip install -r requirements.txt - DISABLE_URING=1 pip install -v . - uses: actions/checkout@v2 with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} @@ -78,6 +66,11 @@ jobs: BUILD_EXT=1 pip install -v . pip install -r requirements/requirements-test.txt + - name: Install tensornvme + run: | + apt update && apt install -y cmake + DISABLE_URING=1 pip install -v git+https://github.com/hpcaitech/TensorNVMe.git + - name: Unit Testing run: | PYTHONPATH=$PWD pytest --durations=0 tests From b7c85966eed9b965bcd0aa3d0ab83b64f4670f5b Mon Sep 17 00:00:00 2001 From: ver217 Date: Tue, 9 Jul 2024 16:38:03 +0800 Subject: [PATCH 3/9] [devops] update compatibility ci --- .../workflows/compatiblity_test_on_dispatch.yml | 12 +----------- .github/workflows/compatiblity_test_on_pr.yml | 14 +------------- .../workflows/compatiblity_test_on_schedule.yml | 14 +------------- 3 files changed, 3 insertions(+), 37 deletions(-) diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml index da118ff2de43..ea97fbf94867 100644 --- a/.github/workflows/compatiblity_test_on_dispatch.yml +++ b/.github/workflows/compatiblity_test_on_dispatch.yml @@ -60,17 +60,7 @@ jobs: - uses: actions/checkout@v2 with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} - - name: Download cub for CUDA 10.2 - run: | - CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}') - # check if it is CUDA 10.2 - # download cub - if [ "$CUDA_VERSION" = "10.2" ]; then - wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip - unzip 1.8.0.zip - cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/ - fi - name: Install Colossal-AI run: | BUILD_EXT=1 pip install -v . @@ -86,6 +76,6 @@ jobs: PYTHONPATH=$PWD pytest --durations=0 tests env: DATA: /data/scratch/cifar-10 - LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 + LD_LIBRARY_PATH: /github/home/.tensornvme/lib LLAMA_PATH: /data/scratch/llama-tiny MOE_TENSOR_PATH: /data/scratch/moe_tensors diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index cde5ef1c62a1..afc755ff006d 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -55,18 +55,6 @@ jobs: with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} - - name: Download cub for CUDA 10.2 - run: | - CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}') - - # check if it is CUDA 10.2 - # download cub - if [ "$CUDA_VERSION" = "10.2" ]; then - wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip - unzip 1.8.0.zip - cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/ - fi - - name: Install Colossal-AI run: | BUILD_EXT=1 pip install -v . @@ -82,6 +70,6 @@ jobs: PYTHONPATH=$PWD pytest --durations=0 tests env: DATA: /data/scratch/cifar-10 - LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 + LD_LIBRARY_PATH: /github/home/.tensornvme/lib LLAMA_PATH: /data/scratch/llama-tiny MOE_TENSOR_PATH: /data/scratch/moe_tensors diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml index 494392c285b2..cba860efa66e 100644 --- a/.github/workflows/compatiblity_test_on_schedule.yml +++ b/.github/workflows/compatiblity_test_on_schedule.yml @@ -49,18 +49,6 @@ jobs: with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} - - name: Download cub for CUDA 10.2 - run: | - CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}') - - # check if it is CUDA 10.2 - # download cub - if [ "$CUDA_VERSION" = "10.2" ]; then - wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip - unzip 1.8.0.zip - cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/ - fi - - name: Install Colossal-AI run: | BUILD_EXT=1 pip install -v . @@ -76,7 +64,7 @@ jobs: PYTHONPATH=$PWD pytest --durations=0 tests env: DATA: /data/scratch/cifar-10 - LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 + LD_LIBRARY_PATH: /github/home/.tensornvme/lib LLAMA_PATH: /data/scratch/llama-tiny MOE_TENSOR_PATH: /data/scratch/moe_tensors From 426d83d15e4150ebd63af6ee514e397a89b36dcb Mon Sep 17 00:00:00 2001 From: ver217 Date: Wed, 10 Jul 2024 15:44:46 +0800 Subject: [PATCH 4/9] [devops] add debug --- .github/workflows/compatiblity_test_on_pr.yml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index afc755ff006d..1cee7d3fb4ff 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -49,7 +49,10 @@ jobs: steps: - name: Install dependencies run: | + apt update && apt install -y cmake pip install -U pip setuptools==68.2.2 wheel --user + pip show torch + nvcc -V - uses: actions/checkout@v2 with: @@ -59,14 +62,22 @@ jobs: run: | BUILD_EXT=1 pip install -v . pip install -r requirements/requirements-test.txt + python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func" + pip show torch + nvcc -V - name: Install tensornvme run: | - apt update && apt install -y cmake DISABLE_URING=1 pip install -v git+https://github.com/hpcaitech/TensorNVMe.git + python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func" + pip show torch + nvcc -V - name: Unit Testing run: | + python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func" + pip show torch + nvcc -V PYTHONPATH=$PWD pytest --durations=0 tests env: DATA: /data/scratch/cifar-10 From 2437abfe9d42841e79f12b3eb944991323a4be08 Mon Sep 17 00:00:00 2001 From: ver217 Date: Wed, 10 Jul 2024 15:49:07 +0800 Subject: [PATCH 5/9] [devops] add debug --- .compatibility | 2 -- 1 file changed, 2 deletions(-) diff --git a/.compatibility b/.compatibility index 4f808740bc02..5cd9ce8680fb 100644 --- a/.compatibility +++ b/.compatibility @@ -1,3 +1 @@ -2.1.0-12.1.0 -2.2.2-12.1.0 2.3.0-12.1.0 From 5a92962aeebb6596df70fcce001a4b65be392e50 Mon Sep 17 00:00:00 2001 From: ver217 Date: Wed, 10 Jul 2024 16:44:11 +0800 Subject: [PATCH 6/9] [devops] add debug --- .github/workflows/compatiblity_test_on_pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index 1cee7d3fb4ff..c8b5522c8d1f 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -61,7 +61,7 @@ jobs: - name: Install Colossal-AI run: | BUILD_EXT=1 pip install -v . - pip install -r requirements/requirements-test.txt + FLASH_ATTENTION_FORCE_BUILD=TRUE pip install -r requirements/requirements-test.txt python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func" pip show torch nvcc -V From 7ae69019b70ef874114343fab08a1479bf6c4cd8 Mon Sep 17 00:00:00 2001 From: ver217 Date: Wed, 10 Jul 2024 19:29:17 +0800 Subject: [PATCH 7/9] [devops] add debug --- .github/workflows/compatiblity_test_on_pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index c8b5522c8d1f..1d514f39f6cf 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -61,7 +61,7 @@ jobs: - name: Install Colossal-AI run: | BUILD_EXT=1 pip install -v . - FLASH_ATTENTION_FORCE_BUILD=TRUE pip install -r requirements/requirements-test.txt + FLASH_ATTENTION_FORCE_BUILD=TRUE pip install --no-cache-dir -r requirements/requirements-test.txt python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func" pip show torch nvcc -V From 4c477ecfc741e7aa59c0246399a56c33b8d9883e Mon Sep 17 00:00:00 2001 From: ver217 Date: Thu, 11 Jul 2024 10:41:42 +0800 Subject: [PATCH 8/9] [devops] remove debug --- .github/workflows/compatiblity_test_on_dispatch.yml | 4 ++-- .github/workflows/compatiblity_test_on_pr.yml | 13 +------------ .github/workflows/compatiblity_test_on_schedule.yml | 4 ++-- 3 files changed, 5 insertions(+), 16 deletions(-) diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml index ea97fbf94867..1a458d7bbc96 100644 --- a/.github/workflows/compatiblity_test_on_dispatch.yml +++ b/.github/workflows/compatiblity_test_on_dispatch.yml @@ -55,6 +55,7 @@ jobs: steps: - name: Install dependencies run: | + apt update && apt install -y cmake pip install -U pip setuptools==68.2.2 wheel --user - uses: actions/checkout@v2 @@ -64,11 +65,10 @@ jobs: - name: Install Colossal-AI run: | BUILD_EXT=1 pip install -v . - pip install -r requirements/requirements-test.txt + pip install --no-cache-dir -r requirements/requirements-test.txt - name: Install tensornvme run: | - apt update && apt install -y cmake DISABLE_URING=1 pip install -v git+https://github.com/hpcaitech/TensorNVMe.git - name: Unit Testing diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index 1d514f39f6cf..770f4b933156 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -51,8 +51,6 @@ jobs: run: | apt update && apt install -y cmake pip install -U pip setuptools==68.2.2 wheel --user - pip show torch - nvcc -V - uses: actions/checkout@v2 with: @@ -61,23 +59,14 @@ jobs: - name: Install Colossal-AI run: | BUILD_EXT=1 pip install -v . - FLASH_ATTENTION_FORCE_BUILD=TRUE pip install --no-cache-dir -r requirements/requirements-test.txt - python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func" - pip show torch - nvcc -V + pip install --no-cache-dir -r requirements/requirements-test.txt - name: Install tensornvme run: | DISABLE_URING=1 pip install -v git+https://github.com/hpcaitech/TensorNVMe.git - python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func" - pip show torch - nvcc -V - name: Unit Testing run: | - python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func" - pip show torch - nvcc -V PYTHONPATH=$PWD pytest --durations=0 tests env: DATA: /data/scratch/cifar-10 diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml index cba860efa66e..c6455604f070 100644 --- a/.github/workflows/compatiblity_test_on_schedule.yml +++ b/.github/workflows/compatiblity_test_on_schedule.yml @@ -43,6 +43,7 @@ jobs: steps: - name: Install dependencies run: | + apt update && apt install -y cmake pip install -U pip setuptools==68.2.2 wheel --user - uses: actions/checkout@v2 @@ -52,11 +53,10 @@ jobs: - name: Install Colossal-AI run: | BUILD_EXT=1 pip install -v . - pip install -r requirements/requirements-test.txt + pip install --no-cache-dir -r requirements/requirements-test.txt - name: Install tensornvme run: | - apt update && apt install -y cmake DISABLE_URING=1 pip install -v git+https://github.com/hpcaitech/TensorNVMe.git - name: Unit Testing From 72e5cc679c7b9aaa09bfbd8e27743fa19a891b35 Mon Sep 17 00:00:00 2001 From: ver217 Date: Thu, 11 Jul 2024 11:55:40 +0800 Subject: [PATCH 9/9] [devops] remove debug --- .compatibility | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.compatibility b/.compatibility index 5cd9ce8680fb..4f808740bc02 100644 --- a/.compatibility +++ b/.compatibility @@ -1 +1,3 @@ +2.1.0-12.1.0 +2.2.2-12.1.0 2.3.0-12.1.0