From 15feebbd70fa68c5a80ba2f937442edde0a7a36b Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 30 Jul 2025 04:56:19 +0000 Subject: [PATCH 1/8] feat: dockerfile can build hermetically or from build context Signed-off-by: Terry Kong --- docker/Dockerfile | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 828156d039..c407204fe9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,14 @@ +# Usage: +# Default build (clones from git): docker build -f docker/Dockerfile . +# Local source override: docker build -f docker/Dockerfile --build-context nemo-rl=./path/to/local . + +# Default source stage that clones nemo-rl repo +# This can be overridden with --build-context nemo-rl=./path/to/local ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04 +ARG NRL_GIT_REF=main +FROM scratch AS nemo-rl +ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} / + FROM ${BASE_IMAGE} AS base # It is more convenient for users to run as root @@ -65,8 +75,8 @@ VIRTUAL_ENV=$UV_PROJECT_ENVIRONMENT uv pip install --link-mode symlink flash-att EOF # First copy only the dependency files -COPY pyproject.toml uv.lock ./ -COPY --link 3rdparty/ ./3rdparty/ +COPY --from=nemo-rl pyproject.toml uv.lock ./ +COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/ RUN <<"EOF" bash -exu # uv sync has a more reliable resolver than simple uv pip install which can fail @@ -100,7 +110,8 @@ LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}" ENV NEMO_RL_VENV_DIR=/opt/ray_venvs -# Copy in source and prefetch all virtual environments -COPY . /opt/nemo-rl +# Copy in source from build context (defaults to cloned repo, can be overridden) +COPY --from=nemo-rl . 
/opt/nemo-rl +RUN git fetch --unshallow # Unshallow the repo to get the full history (in the case it was from the scratch layer) RUN UV_LINK_MODE=symlink uv run nemo_rl/utils/prefetch_venvs.py From e8a636936bd8110aae650905952043c5f325d16a Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 30 Jul 2025 05:22:37 +0000 Subject: [PATCH 2/8] update documentation Signed-off-by: Terry Kong --- docker/Dockerfile | 7 +++---- docker/README.md | 4 ++-- docs/docker.md | 29 +++++++++++++++-------------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index c407204fe9..7e425a983c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,9 +1,8 @@ # Usage: -# Default build (clones from git): docker build -f docker/Dockerfile . -# Local source override: docker build -f docker/Dockerfile --build-context nemo-rl=./path/to/local . +# Self-contained build (default: builds from main): docker buildx build -f docker/Dockerfile --tag /nemo-rl:latest --push . +# Self-contained build (specific git ref): docker buildx build -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag /nemo-rl:r0.3.0 --push . +# Local NeMo RL source override: docker buildx build --build-context nemo-rl=. -f docker/Dockerfile --tag /nemo-rl:latest --push . -# Default source stage that clones nemo-rl repo -# This can be overridden with --build-context nemo-rl=./path/to/local ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04 ARG NRL_GIT_REF=main FROM scratch AS nemo-rl diff --git a/docker/README.md b/docker/README.md index b21c3e7401..66b1da6855 100644 --- a/docker/README.md +++ b/docker/README.md @@ -3,8 +3,8 @@ NOTE: *We use `docker buildx` instead of `docker build` for these containers* This directory contains the `Dockerfile` for NeMo-RL Docker images. You can build two types of images: -- A **base image**: A minimal image where Python dependencies can be specified at runtime. 
-- A **hermetic image**: An image that includes default dependencies for offline use. +- A **release image** (recommended): Contains everything from the hermetic image, plus the nemo-rl source code and pre-fetched virtual environments for isolated workers. +- A **hermetic image**: Includes the base image plus pre-fetched NeMo RL python packages in the `uv` cache. For detailed instructions on building these images, please see [docs/docker.md](../docs/docker.md). \ No newline at end of file diff --git a/docs/docker.md b/docs/docker.md index 1157e92ebc..4c4761d157 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -1,23 +1,27 @@ # Build Docker Images -This guide provides three methods for building Docker images: +This guide provides two methods for building Docker images: * **release**: Contains everything from the hermetic image, plus the nemo-rl source code and pre-fetched virtual environments for isolated workers. * **hermetic**: Includes the base image plus pre-fetched NeMo RL python packages in the `uv` cache. -* **base**: A minimal image with CUDA, `ray`, and `uv` installed, ideal for specifying Python dependencies at runtime. Use the: * **release** (recommended): if you want to pre-fetch the NeMo RL [worker virtual environments](./design-docs/uv.md#worker-configuration) and copy in the project source code. * **hermetic**: if you want to pre-fetch NeMo RL python packages into the `uv` cache to eliminate the initial overhead of program start. -* **base**: if you just need a minimal image with CUDA, `ray`, and `uv` installed and are okay with dynamically downloading your requirements at runtime. This option trades off fast container download/startup with slower initial overhead to download python packages. ## Release Image The release image is our recommended option as it provides the most complete environment. It includes everything from the hermetic image, plus the nemo-rl source code and pre-fetched virtual environments for isolated workers. 
This is the ideal choice for production deployments. ```sh -cd docker/ -docker buildx build --target release -t nemo_rl -f Dockerfile .. +# Self-contained build (default: builds from main): +docker buildx build --target release -f docker/Dockerfile --tag /nemo-rl:latest --push . + +# Self-contained build (specific git ref): +docker buildx build --target release -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag /nemo-rl:r0.3.0 --push . + +# Local NeMo RL source override: +docker buildx build --target release --build-context nemo-rl=. -f docker/Dockerfile --tag /nemo-rl:latest --push . ``` ## Hermetic Image @@ -25,15 +29,12 @@ docker buildx build --target release -t nemo_rl -f Dockerfile .. The hermetic image includes all Python dependencies pre-downloaded in the `uv` cache, eliminating the initial overhead of downloading packages at runtime. This is useful when you need a more predictable environment or have limited network connectivity. ```sh -cd docker/ -docker buildx build --target hermetic -t nemo_rl -f Dockerfile .. -``` +# Self-contained build (default: builds from main): +docker buildx build --target hermetic -f docker/Dockerfile --tag /nemo-rl:latest --push . -## Base Image +# Self-contained build (specific git ref): +docker buildx build --target hermetic -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag /nemo-rl:r0.3.0 --push . -The base image provides a minimal environment with CUDA, `ray`, and `uv` installed. While it's the smallest image, it requires downloading Python dependencies at runtime, which may not be ideal for all use cases. - -```sh -cd docker/ -docker buildx build --target base -t nemo_rl -f Dockerfile .. +# Local source override: +docker buildx build --target hermetic --build-context nemo-rl=. -f docker/Dockerfile --tag /nemo-rl:latest --push . 
``` From c240f57dbbc76cae26c017a32dadf95e101eed73 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 30 Jul 2025 16:52:34 +0000 Subject: [PATCH 3/8] comment Signed-off-by: Terry Kong --- .dockerignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index a5aa48cb04..8e4e560ff5 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,6 +1,8 @@ # Adding to .gitignore helps reduce the size of your working_dir -.git +# Note: removing .git from .dockerignore since it is valuable to have the git history to +# know where this container was built +# .git *.out *.log *.tar From d4713e425c3346c17620e2eb418635476dcfe10e Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 30 Jul 2025 19:53:07 +0000 Subject: [PATCH 4/8] try moving around arg Signed-off-by: Terry Kong --- docker/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7e425a983c..98cfaf933e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,11 +1,12 @@ # Usage: # Self-contained build (default: builds from main): docker buildx build -f docker/Dockerfile --tag /nemo-rl:latest --push . # Self-contained build (specific git ref): docker buildx build -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag /nemo-rl:r0.3.0 --push . +# Self-contained build (remote NeMo RL source; no need for clone): docker buildx build -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag /nemo-rl:r0.3.0 --push https://github.com/NVIDIA-NeMo/RL.git # Local NeMo RL source override: docker buildx build --build-context nemo-rl=. -f docker/Dockerfile --tag /nemo-rl:latest --push . 
ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04 -ARG NRL_GIT_REF=main FROM scratch AS nemo-rl +ARG NRL_GIT_REF=main ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} / FROM ${BASE_IMAGE} AS base From 208ffaa3ce2d2e4f3287bf14c5c7e08517fe220e Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 30 Jul 2025 20:02:51 +0000 Subject: [PATCH 5/8] unshallow conditionally Signed-off-by: Terry Kong --- docker/Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 98cfaf933e..e5ee9d622a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -112,6 +112,9 @@ ENV NEMO_RL_VENV_DIR=/opt/ray_venvs # Copy in source from build context (defaults to cloned repo, can be overridden) COPY --from=nemo-rl . /opt/nemo-rl -RUN git fetch --unshallow # Unshallow the repo to get the full history (in the case it was from the scratch layer) +# Unshallow the repo to get the full history (in the case it was from the scratch layer). +# Potentially not necessary if the repo is passed in as a complete repository (w/ full git history), +# so do a quick check before trying to unshallow. +RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow RUN UV_LINK_MODE=symlink uv run nemo_rl/utils/prefetch_venvs.py From 4bf82bff489f46782d500e3cee4998ed33af1486 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 30 Jul 2025 20:08:40 +0000 Subject: [PATCH 6/8] more ways to build nemo rl Signed-off-by: Terry Kong --- docker/Dockerfile | 2 +- docs/docker.md | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index e5ee9d622a..600b0e93fd 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,7 +1,7 @@ # Usage: # Self-contained build (default: builds from main): docker buildx build -f docker/Dockerfile --tag /nemo-rl:latest --push . 
# Self-contained build (specific git ref): docker buildx build -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag /nemo-rl:r0.3.0 --push . -# Self-contained build (remote NeMo RL source; no need for clone): docker buildx build -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag /nemo-rl:r0.3.0 --push https://github.com/NVIDIA-NeMo/RL.git +# Self-contained build (remote NeMo RL source; no need for a local clone of NeMo RL): docker buildx build -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag /nemo-rl:r0.3.0 --push https://github.com/NVIDIA-NeMo/RL.git # Local NeMo RL source override: docker buildx build --build-context nemo-rl=. -f docker/Dockerfile --tag /nemo-rl:latest --push . ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04 diff --git a/docs/docker.md b/docs/docker.md index 4c4761d157..f6f93fc1b8 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -20,10 +20,15 @@ docker buildx build --target release -f docker/Dockerfile --tag /nemo- # Self-contained build (specific git ref): docker buildx build --target release -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag /nemo-rl:r0.3.0 --push . +# Self-contained build (remote NeMo RL source; no need for a local clone of NeMo RL): +docker buildx build --target release -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag /nemo-rl:r0.3.0 --push https://github.com/NVIDIA-NeMo/RL.git + # Local NeMo RL source override: docker buildx build --target release --build-context nemo-rl=. -f docker/Dockerfile --tag /nemo-rl:latest --push . ``` +**Note:** The `--tag /nemo-rl:latest --push` flags are not necessary if you just want to build locally. + ## Hermetic Image The hermetic image includes all Python dependencies pre-downloaded in the `uv` cache, eliminating the initial overhead of downloading packages at runtime. This is useful when you need a more predictable environment or have limited network connectivity. 
@@ -35,6 +40,11 @@ docker buildx build --target hermetic -f docker/Dockerfile --tag /nemo # Self-contained build (specific git ref): docker buildx build --target hermetic -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag /nemo-rl:r0.3.0 --push . -# Local source override: +# Self-contained build (remote NeMo RL source; no need for a local clone of NeMo RL): +docker buildx build --target hermetic -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag /nemo-rl:r0.3.0 --push https://github.com/NVIDIA-NeMo/RL.git + +# Local NeMo RL source override: docker buildx build --target hermetic --build-context nemo-rl=. -f docker/Dockerfile --tag /nemo-rl:latest --push . ``` + +**Note:** The `--tag /nemo-rl:latest --push` flags are not necessary if you just want to build locally. From 4f22834e1ce491b4d922de1e12cefabcddc19bb1 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Thu, 31 Jul 2025 05:19:23 +0000 Subject: [PATCH 7/8] one more Signed-off-by: Terry Kong --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 600b0e93fd..b12e1b929f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -115,6 +115,6 @@ COPY --from=nemo-rl . /opt/nemo-rl # Unshallow the repo to get the full history (in the case it was from the scratch layer). # Potentially not necessary if the repo is passed in as a complete repository (w/ full git history), # so do a quick check before trying to unshallow. 
-RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow +RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow || true RUN UV_LINK_MODE=symlink uv run nemo_rl/utils/prefetch_venvs.py From b7c02d82b4cf0f15c30800528118e7e3ef8f3fe1 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 1 Aug 2025 16:30:28 +0000 Subject: [PATCH 8/8] use build context Signed-off-by: Terry Kong --- .github/workflows/cicd-main.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index f112c9ca26..2e2d178dc5 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -162,13 +162,15 @@ jobs: build-container: if: ${{ needs.pre-flight.outputs.test_level != 'none' }} needs: [pre-flight] - uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_container.yml@v0.30.0 + uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_container.yml@v0.52.0 with: build-ref: ${{ github.sha }} image-name: nemo_rl_container dockerfile: docker/Dockerfile image-label: nemo-rl target: hermetic + build-contexts: | + nemo-rl=. build-args: | MAX_JOBS=32 NEMO_RL_COMMIT=${{ github.sha }}