From 919253aa0122f295d387871cf8c9df39da7fe85d Mon Sep 17 00:00:00 2001 From: ydshieh Date: Tue, 4 Nov 2025 07:38:43 +0100 Subject: [PATCH 1/2] fix --- .github/workflows/build-docker-images.yml | 2 +- .github/workflows/self-scheduled.yml | 4 ++-- .../transformers-pytorch-deepspeed-latest-gpu/Dockerfile | 9 +++++++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index ccc0659409d2..2dec7c4cdfc6 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -97,7 +97,7 @@ jobs: latest-torch-deepspeed-docker: name: "Latest PyTorch + DeepSpeed" runs-on: - group: aws-g4dn-2xlarge-cache + group: aws-general-8-plus steps: - name: Set up Docker Buildx diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index d18428fd0d82..5216410e38af 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -329,7 +329,7 @@ jobs: working-directory: ${{ inputs.working-directory-prefix }}/ run: | python3 -m pip uninstall -y deepspeed - DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check # To avoid unknown test failures - name: Pre build DeepSpeed *again* (for nightly & Past CI) @@ -339,7 +339,7 @@ jobs: python3 -m pip uninstall -y deepspeed rm -rf DeepSpeed git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check - name: NVIDIA-SMI run: | diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile index 41611b350def..f7181834b30e 100644 --- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile @@ -8,6 +8,9 @@ ARG PYTORCH='2.8.0' # Example: `cu102`, `cu113`, etc. ARG CUDA='cu126' +# This needs to be compatible with the above `PYTORCH`. Let's use cpu version for now. +ARG TORCHCODEC='0.7.0' + RUN apt -y update RUN apt install -y libaio-dev RUN python3 -m pip install --no-cache-dir --upgrade pip @@ -21,7 +24,9 @@ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'p # Install latest release PyTorch # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) -RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA +RUN set -e; \ + python3 -m pip uninstall -y torch torchvision torchaudio torchcodec && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA; \ + python3 -m pip install --no-cache-dir -U torchcodec==${TORCHCODEC}.* --extra-index-url https://download.pytorch.org/whl/cpu; RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate @@ -43,7 +48,7 @@ RUN python3 -m pip uninstall -y deepspeed # This has to be run (again) inside the GPU VMs running the tests. # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests. # TODO: Find out why test fail. -RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 +RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check 2>&1 # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs RUN python3 -m pip uninstall -y kernels From 2162eb60351bee8220e490b6b6595307d4198cc2 Mon Sep 17 00:00:00 2001 From: ydshieh Date: Tue, 4 Nov 2025 11:32:47 +0100 Subject: [PATCH 2/2] delete --- .../transformers-pytorch-deepspeed-latest-gpu/Dockerfile | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile index f7181834b30e..bb1bc830eeaf 100644 --- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile @@ -8,9 +8,6 @@ ARG PYTORCH='2.8.0' # Example: `cu102`, `cu113`, etc. ARG CUDA='cu126' -# This needs to be compatible with the above `PYTORCH`. Let's use cpu version for now. -ARG TORCHCODEC='0.7.0' - RUN apt -y update RUN apt install -y libaio-dev RUN python3 -m pip install --no-cache-dir --upgrade pip @@ -24,9 +21,7 @@ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'p # Install latest release PyTorch # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) -RUN set -e; \ - python3 -m pip uninstall -y torch torchvision torchaudio torchcodec && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA; \ - python3 -m pip install --no-cache-dir -U torchcodec==${TORCHCODEC}.* --extra-index-url https://download.pytorch.org/whl/cpu; +RUN python3 -m pip uninstall -y torch torchvision torchaudio torchcodec && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate