diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index ccc0659409d2..2dec7c4cdfc6 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -97,7 +97,7 @@ jobs: latest-torch-deepspeed-docker: name: "Latest PyTorch + DeepSpeed" runs-on: - group: aws-g4dn-2xlarge-cache + group: aws-general-8-plus steps: - name: Set up Docker Buildx diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index d18428fd0d82..5216410e38af 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -329,7 +329,7 @@ jobs: working-directory: ${{ inputs.working-directory-prefix }}/ run: | python3 -m pip uninstall -y deepspeed - DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check # To avoid unknown test failures - name: Pre build DeepSpeed *again* (for nightly & Past CI) @@ -339,7 +339,7 @@ jobs: python3 -m pip uninstall -y deepspeed rm -rf DeepSpeed git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile index 41611b350def..bb1bc830eeaf 100644 --- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile @@ -21,7 +21,7 @@ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'p # Install latest release PyTorch # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) -RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA +RUN python3 -m pip uninstall -y torch torchvision torchaudio torchcodec && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate @@ -43,7 +43,7 @@ RUN python3 -m pip uninstall -y deepspeed # This has to be run (again) inside the GPU VMs running the tests. # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests. # TODO: Find out why test fail. -RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 +RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check 2>&1 # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs RUN python3 -m pip uninstall -y kernels