diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 915493bb3183..346055e2685f 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -19,7 +19,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -29,7 +29,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml index c823c194e83c..2ad336cac4ed 100644 --- a/.github/workflows/nv-ds-chat.yml +++ b/.github/workflows/nv-ds-chat.yml @@ -26,7 +26,7 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -36,7 +36,7 @@ jobs: - name: Install pytorch run: | - pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118 + pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml index f863226bfb95..c0a4275bd2b6 100644 --- a/.github/workflows/nv-inference.yml +++ b/.github/workflows/nv-inference.yml @@ -22,7 +22,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -32,7 +32,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -58,8 +58,8 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - #pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="2.1" --cuda_ver="11.8" - pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="2.1" --cuda_ver="11.8" - pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="2.1" --cuda_ver="11.8" + #pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="2.1" --cuda_ver="12.1" + pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="2.1" --cuda_ver="12.1" + pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="2.1" --cuda_ver="12.1" # run ds_report again to check updated op list ds_report diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml index 6af228f7fb2f..a9a26b7ce816 100644 --- a/.github/workflows/nv-lightning-v100.yml +++ b/.github/workflows/nv-lightning-v100.yml @@ -19,7 +19,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu111, v100] + runs-on: [self-hosted, nvidia, cu121, v100] env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions @@ -31,7 +31,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml index d394b7e24bd6..a576e5933b08 100644 --- a/.github/workflows/nv-mii.yml +++ b/.github/workflows/nv-mii.yml @@ -27,7 +27,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -37,7 +37,7 @@ jobs: - name: Install pytorch run: | - pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml index 8658ff5d2348..e6032941ebb3 100644 --- a/.github/workflows/nv-nightly.yml +++ b/.github/workflows/nv-nightly.yml @@ -18,7 +18,7 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -28,7 +28,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -58,7 +58,7 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.4" --cuda_ver="11.8" + pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.4" --cuda_ver="12.1" - name: Open GitHub issue if nightly CI fails if: ${{ failure() && (github.event_name == 'schedule') }} diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index ebef2d35c278..e888c472638f 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -19,7 +19,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -29,7 +29,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -55,5 +55,5 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.4" --cuda_ver="11.8" - pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.4" --cuda_ver="11.8" + pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.4" --cuda_ver="12.1" + pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.4" --cuda_ver="12.1" diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml index 257040439005..74495812add5 100644 --- a/.github/workflows/nv-torch-nightly-v100.yml +++ b/.github/workflows/nv-torch-nightly-v100.yml @@ -15,7 +15,7 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -25,7 +25,7 @@ jobs: - name: Install pytorch run: | - pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118 + pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml index cfed6d6583e6..18c5e2c98bc6 100644 --- a/.github/workflows/nv-transformers-v100.yml +++ b/.github/workflows/nv-transformers-v100.yml @@ -18,7 +18,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -29,7 +29,7 @@ jobs: - name: Install pytorch run: | # use the same pytorch version as transformers CI - pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu118 --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu121 --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())"