From 76d60fded9ed381dba884d384c8b68af62ba94e4 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 18:37:44 -0500 Subject: [PATCH 01/20] Replace _placeholder with project name Signed-off-by: Charlie Truong --- .github/workflows/_run_test.yml | 16 ++++++++-------- .github/workflows/build-test-publish-wheel.yml | 2 +- .github/workflows/cicd-main.yml | 2 +- .github/workflows/release-freeze.yml | 4 ++-- .github/workflows/release.yaml | 6 +++--- CONTRIBUTING.md | 4 ++-- nemo__placeholder/__init__.py | 13 ------------- nemo_reinforcer/__init__.py | 13 +++++++++++++ .../package_info.py | 12 ++++++------ tests/unit/test__placeholder.py | 18 ------------------ 10 files changed, 36 insertions(+), 54 deletions(-) delete mode 100644 nemo__placeholder/__init__.py rename {nemo__placeholder => nemo_reinforcer}/package_info.py (70%) delete mode 100644 tests/unit/test__placeholder.py diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index 3a85b30c04..13eea2356c 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -60,19 +60,19 @@ jobs: - name: Docker pull image run: | - docker pull nemoci.azurecr.io/nemo__placeholder_container:${{ github.run_id }} + docker pull nemoci.azurecr.io/nemo_reinforcer_container:${{ github.run_id }} - name: Start container run: | docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g \ --env TRANSFORMERS_OFFLINE=0 \ --env HYDRA_FULL_ERROR=1 \ - --env HF_HOME=/home/TestData/_placeholder/hf_home \ - --env _PLACEHOLDER_CI_DIR=/home/TestData/_placeholder \ - --env _PLACEHOLDER_REPO_DIR=/opt/NeMo-_Placeholder \ - --volume /mnt/datadrive/TestData/_placeholder/checkpoints:/home/TestData/_placeholder/checkpoints:ro \ - --volume /mnt/datadrive/TestData/_placeholder/hf_home/hub:/home/TestData/_placeholder/hf_home/hub:ro \ - nemoci.azurecr.io/nemo__placeholder_container:${{ github.run_id }} \ + --env HF_HOME=/home/TestData/reinforcer/hf_home \ + --env REINFORCER_CI_DIR=/home/TestData/reinforcer \ + --env REINFORCER_REPO_DIR=/opt/NeMo-Reinforcer \ + --volume /mnt/datadrive/TestData/reinforcer/checkpoints:/home/TestData/reinforcer/checkpoints:ro \ + --volume /mnt/datadrive/TestData/reinforcer/hf_home/hub:/home/TestData/reinforcer/hf_home/hub:ro \ + nemoci.azurecr.io/nemo_reinforcer_container:${{ github.run_id }} \ bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))" - id: main @@ -92,7 +92,7 @@ jobs: nvidia-smi # Sanity check the driver/cuda combo cudaCheck - # In case git commands need to be run inside _Placeholder + # In case git commands need to be run inside Reinforcer git config --global --add safe.directory $_PLACHOLDER_REPO_DIR ${{ inputs.SCRIPT }} RUN_TEST_EOF diff --git a/.github/workflows/build-test-publish-wheel.yml b/.github/workflows/build-test-publish-wheel.yml index 1b6c5a3021..ddc2237a72 100644 --- a/.github/workflows/build-test-publish-wheel.yml +++ b/.github/workflows/build-test-publish-wheel.yml @@ -28,7 +28,7 @@ # uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.22.3 # with: # dry-run: true -# python-package: nemo__placeholder +# python-package: nemo_reinforcer # python-version: "3.12" # secrets: # TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 627544fc81..b6229211e5 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -name: "CICD NeMo _Placeholder" +name: "CICD Reinforcer" on: pull_request: diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml index 176b6681e0..5588b97e0f 100644 --- a/.github/workflows/release-freeze.yml +++ b/.github/workflows/release-freeze.yml @@ -36,8 +36,8 @@ jobs: code-freeze: uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_code_freeze.yml@v0.22.5 with: - library-name: NeMo-_Placeholder - python-package: nemo__placeholder + library-name: NeMo-reinforcer + python-package: nemo_reinforcer release-type: ${{ inputs.release-type }} freeze-commit: ${{ inputs.freeze-commit }} dry-run: ${{ inputs.dry-run }} diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 247e821beb..829193c302 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -name: "Release _Placeholder" +name: "Release Reinforcer" on: workflow_dispatch: @@ -35,9 +35,9 @@ jobs: uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.22.6 with: release-ref: ${{ inputs.release-ref }} - python-package: nemo__placeholder + python-package: nemo_reinforcer python-version: "3.11" - library-name: NeMo-_Placeholder + library-name: NeMo-Reinforcer dry-run: ${{ inputs.dry-run }} version-bump-branch: ${{ inputs.version-bump-branch }} secrets: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ba48b680ae..2b02a7fb63 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -37,8 +37,8 @@ We follow a direct clone and branch workflow for now: 1. Clone the repository directly: ```bash - git clone https://github.com/NVIDIA/nemo__placeholder - cd nemo-reinforcer + git clone https://github.com/NVIDIA/reinforcer + cd reinforcer ``` 2. Create a new branch for your changes: diff --git a/nemo__placeholder/__init__.py b/nemo__placeholder/__init__.py deleted file mode 100644 index f5e16b0d09..0000000000 --- a/nemo__placeholder/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from nemo__placeholder.package_info import ( - __contact_emails__, - __contact_names__, - __description__, - __download_url__, - __homepage__, - __keywords__, - __license__, - __package_name__, - __repository_url__, - __shortversion__, - __version__, -) diff --git a/nemo_reinforcer/__init__.py b/nemo_reinforcer/__init__.py index e69de29bb2..76b1dda065 100644 --- a/nemo_reinforcer/__init__.py +++ b/nemo_reinforcer/__init__.py @@ -0,0 +1,13 @@ +from nemo_reinforcer.package_info import ( + __contact_emails__, + __contact_names__, + __description__, + __download_url__, + __homepage__, + __keywords__, + __license__, + __package_name__, + __repository_url__, + __shortversion__, + __version__, +) diff --git a/nemo__placeholder/package_info.py b/nemo_reinforcer/package_info.py similarity index 70% rename from nemo__placeholder/package_info.py rename to nemo_reinforcer/package_info.py index e1daf52f75..ea2977b049 100644 --- a/nemo__placeholder/package_info.py +++ b/nemo_reinforcer/package_info.py @@ -24,12 +24,12 @@ __shortversion__ = ".".join(map(str, VERSION[:3])) __version__ = ".".join(map(str, VERSION[:3])) + "".join(VERSION[3:]) -__package_name__ = "nemo__placeholder" +__package_name__ = "nemo_reinforcer" __contact_names__ = "NVIDIA" -__contact_emails__ = "nemo-_placeholder@nvidia.com" +__contact_emails__ = "nemo-tookit@nvidia.com" __homepage__ = "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/" -__repository_url__ = "https://github.com/nvidia/nemo__placeholder" -__download_url__ = "https://github.com/NVIDIA/NeMo__placeholder" -__description__ = "_placeholder" +__repository_url__ = "https://github.com/NVIDIA/reinforcer" +__download_url__ = "https://github.com/NVIDIA/reinforcer/releases" +__description__ = "NeMo-Reinforcer - a toolkit for model alignment" __license__ = "Apache2" -__keywords__ = "_placeholder" +__keywords__ = "deep learning, machine learning, gpu, NLP, NeMo, nvidia, pytorch, torch, language, reinforcement learning, RLHF, preference modeling, SteerLM, DPO" diff --git a/tests/unit/test__placeholder.py b/tests/unit/test__placeholder.py deleted file mode 100644 index 0ae23d24f7..0000000000 --- a/tests/unit/test__placeholder.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -def test__placeholder(): - """Should be True""" - assert True is True From 48d3a86bcf4a025fcc1fb84a838bc24d21a8a50f Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 18:41:20 -0500 Subject: [PATCH 02/20] Run unit tests on self hosted runner Signed-off-by: Charlie Truong --- .github/workflows/cicd-main.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index b6229211e5..975deb07eb 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -105,7 +105,7 @@ jobs: unit-tests: name: Unit tests needs: [pre-flight] - runs-on: ubuntu-latest + runs-on: self-hosted-azure if: ${{ needs.pre-flight.outputs.run_ci == 'true' }} steps: - name: Checkout repository @@ -114,9 +114,8 @@ jobs: run: | pip install uv uv venv -p python3.10 .venv - uv pip install --force-reinstall . - uv run --group test -- pytest - + uv pip install --force-reinstall .[vllm,test,dev,compile] + uv run --group test -- bash -x tests/run_unit.sh - name: after_script if: always() run: | From f5496db82409f581156bea9e0440259fb740568d Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 20:06:59 -0500 Subject: [PATCH 03/20] Run tests in container Signed-off-by: Charlie Truong --- .github/workflows/cicd-main.yml | 37 ++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 975deb07eb..0abdf94a25 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -101,22 +101,29 @@ jobs: pip install pre-commit pre-commit install pre-commit run --all-files --show-diff-on-failure --color=always + + build-container: + if: ${{ needs.pre-flight.outputs.run_ci == 'true' }} + needs: [pre-flight] + uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_container.yml@v0.22.7 + with: + build-ref: ${{ github.sha }} + image-name: nemo_reinforcer_container + dockerfile: docker/Dockerfile + image-label: nemo-reinforcer + build-args: | + MAX_JOBS=32 + REINFORCER_COMMIT=${{ github.sha }} unit-tests: name: Unit tests - needs: [pre-flight] - runs-on: self-hosted-azure + needs: [build-container, pre-flight] + uses: ./.github/workflows/_run_test.yml if: ${{ needs.pre-flight.outputs.run_ci == 'true' }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Run unit tests - run: | - pip install uv - uv venv -p python3.10 .venv - uv pip install --force-reinstall .[vllm,test,dev,compile] - uv run --group test -- bash -x tests/run_unit.sh - - name: after_script - if: always() - run: | - rm -rf .venv + with: + RUNNER: self-hosted-azure + TIMEOUT: 10 + SCRIPT: | + nvidia-smi + cd ${REINFORCER_REPO_DIR} + bash tests/run_unit_in_docker.sh From d3b35f003a3fbd7343428f887d64863002337199 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 20:28:29 -0500 Subject: [PATCH 04/20] Remove call to cudaCheck Signed-off-by: Charlie Truong --- .github/workflows/_run_test.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index 13eea2356c..f73e9dac6d 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -90,10 +90,9 @@ jobs: cmd=$(cat <<"RUN_TEST_EOF" nvidia-smi - # Sanity check the driver/cuda combo - cudaCheck + # In case git commands need to be run inside Reinforcer - git config --global --add safe.directory $_PLACHOLDER_REPO_DIR + git config --global --add safe.directory $REINFORCER_REPO_DIR ${{ inputs.SCRIPT }} RUN_TEST_EOF ) From 5ad9a5fd324ac8596f3ee3aeab26498c9acf6972 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 21:10:43 -0500 Subject: [PATCH 05/20] Install project in docker image Signed-off-by: Charlie Truong --- docker/Dockerfile | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 9031b0a9be..8884b2bdee 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,7 +1,17 @@ ARG BASE_IMAGE=anyscale/ray:2.43.0-py312-cu125 FROM ${BASE_IMAGE} +WORKDIR /opt/NeMo-Reinforcer + RUN sudo apt-get update && sudo apt-get install -y jq -RUN pip install --no-cache-dir uv RUN echo "unset RAY_RUNTIME_ENV_HOOK" >> /home/ray/.bashrc + +COPY pyproject.toml pyproject.toml + +RUN pip install --no-cache-dir uv && \ + uv pip install . --only-deps[dev,test] + +COPY . . + +RUN uv pip install . From 2ab0a4343e237fdd1ae158daf8c682a94637f846 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 21:12:43 -0500 Subject: [PATCH 06/20] Remove build-test-publish-wheel job for now Signed-off-by: Charlie Truong --- .../workflows/build-test-publish-wheel.yml | 37 ------------------- 1 file changed, 37 deletions(-) delete mode 100644 .github/workflows/build-test-publish-wheel.yml diff --git a/.github/workflows/build-test-publish-wheel.yml b/.github/workflows/build-test-publish-wheel.yml deleted file mode 100644 index ddc2237a72..0000000000 --- a/.github/workflows/build-test-publish-wheel.yml +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# name: Build, test, and publish a PyPi wheel (to testpypi) - -# on: -# push: -# branches: -# - main -# - 'r**' - -# defaults: -# run: -# shell: bash -x -e -u -o pipefail {0} - -# jobs: -# build-test-publish-wheel: -# uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.22.3 -# with: -# dry-run: true -# python-package: nemo_reinforcer -# python-version: "3.12" -# secrets: -# TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} -# TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} -# SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }} -# SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }} From 5a967cbe5559c4ef338a659d72795f8789c4232a Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 21:38:19 -0500 Subject: [PATCH 07/20] Fix docker build Signed-off-by: Charlie Truong --- docker/Dockerfile | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 8884b2bdee..efbfeeefa7 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -7,11 +7,8 @@ RUN sudo apt-get update && sudo apt-get install -y jq RUN echo "unset RAY_RUNTIME_ENV_HOOK" >> /home/ray/.bashrc -COPY pyproject.toml pyproject.toml - -RUN pip install --no-cache-dir uv && \ - uv pip install . --only-deps[dev,test] +RUN pip install --no-cache-dir COPY . . -RUN uv pip install . +RUN uv pip install -e .[dev,test] From 9c0d6358c1b04b478f78539ae8dc0ba247a110d0 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 21:40:41 -0500 Subject: [PATCH 08/20] Separate dependency installs from package install Signed-off-by: Charlie Truong --- docker/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index efbfeeefa7..9532096e58 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -7,7 +7,9 @@ RUN sudo apt-get update && sudo apt-get install -y jq RUN echo "unset RAY_RUNTIME_ENV_HOOK" >> /home/ray/.bashrc -RUN pip install --no-cache-dir +COPY pyproject.toml . + +RUN uv pip install -r pyproject.toml COPY . . From 26fcda7c27fe2b24c44a4a808b6f2189c4053b6f Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 21:43:47 -0500 Subject: [PATCH 09/20] Fix install of uv in docker Signed-off-by: Charlie Truong --- docker/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 9532096e58..73b53a8c58 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -9,7 +9,8 @@ RUN echo "unset RAY_RUNTIME_ENV_HOOK" >> /home/ray/.bashrc COPY pyproject.toml . -RUN uv pip install -r pyproject.toml +RUN pip install --no-cache-dir uv && \ + uv pip install -r pyproject.toml COPY . . From f6b5d6b651289de44e9ba33b04bf8c75abdfd00b Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 22:14:25 -0500 Subject: [PATCH 10/20] Fix install of packages in docker Signed-off-by: Charlie Truong --- docker/Dockerfile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 73b53a8c58..3ac272200e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -9,9 +9,10 @@ RUN echo "unset RAY_RUNTIME_ENV_HOOK" >> /home/ray/.bashrc COPY pyproject.toml . -RUN pip install --no-cache-dir uv && \ - uv pip install -r pyproject.toml +RUN pip install uv && \ + uv venv -p python3.12 && \ + uv pip install -r pyproject.toml --extra dev --extra test COPY . . -RUN uv pip install -e .[dev,test] +RUN uv pip install -e . From 79b72a0abc8324ea2b9c9c9862dd2199dff40aea Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 22:27:51 -0500 Subject: [PATCH 11/20] Fix unit test command Signed-off-by: Charlie Truong --- .github/workflows/cicd-main.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 0abdf94a25..43be5fa92b 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -124,6 +124,5 @@ jobs: RUNNER: self-hosted-azure TIMEOUT: 10 SCRIPT: | - nvidia-smi cd ${REINFORCER_REPO_DIR} - bash tests/run_unit_in_docker.sh + uv run --extra test bash -x ./run_unit.sh From 63fb149908df9398d20ae8093dcb929afe52f8ad Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 22:33:38 -0500 Subject: [PATCH 12/20] Fix unit test command Signed-off-by: Charlie Truong --- .github/workflows/cicd-main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 43be5fa92b..ba293ca09a 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -125,4 +125,4 @@ jobs: TIMEOUT: 10 SCRIPT: | cd ${REINFORCER_REPO_DIR} - uv run --extra test bash -x ./run_unit.sh + uv run --extra test bash -x ./tests/run_unit.sh From e4bc09e42851b3d2f216ea14aaa2d5444c7298ad Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 22:54:15 -0500 Subject: [PATCH 13/20] Pass HF_TOKEN to unit test Signed-off-by: Charlie Truong --- .github/workflows/_run_test.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index f73e9dac6d..c8ecc06c1e 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -78,6 +78,8 @@ jobs: - id: main name: Run main script timeout-minutes: ${{ inputs.TIMEOUT }} + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | # Print the host driver for debugging nvidia-smi @@ -96,7 +98,7 @@ jobs: ${{ inputs.SCRIPT }} RUN_TEST_EOF ) - docker exec nemo_container_${{ github.run_id }} bash -eux -o pipefail -c "$cmd" + docker exec -e HF_TOKEN nemo_container_${{ github.run_id }} bash -eux -o pipefail -c "$cmd" ) 2> >(tee err.log) EXIT_CODE=$? From 9ca951a61ff26f310a47dbb33fad76fe719ebc1a Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 23:11:18 -0500 Subject: [PATCH 14/20] Check if secret passed Signed-off-by: Charlie Truong --- .github/workflows/_run_test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index c8ecc06c1e..9ca788a196 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -85,6 +85,9 @@ jobs: nvidia-smi mkdir -p ${{ github.run_id }} cd ${{ github.run_id }}/ + + length=$(echo -n "$HF_TOKEN" | wc -c) + echo $length set +e ( From 427a7a35e237638902cb2aa6aa0b6e8d30604127 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 23:13:30 -0500 Subject: [PATCH 15/20] Check if HF_TOKEN exists Signed-off-by: Charlie Truong --- .github/workflows/_run_test.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index 9ca788a196..51eef67bb9 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -86,8 +86,9 @@ jobs: mkdir -p ${{ github.run_id }} cd ${{ github.run_id }}/ - length=$(echo -n "$HF_TOKEN" | wc -c) - echo $length + if [ -n "${HF_TOKEN}" ]; then + echo "HF_TOKEN is set and not empty" + fi set +e ( From c613082208d50ea26aab802a1b47bd5c847f2d16 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 23:30:16 -0500 Subject: [PATCH 16/20] Do not reinstall test dependencies Signed-off-by: Charlie Truong --- .github/workflows/cicd-main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index ba293ca09a..3a0b5a509b 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -125,4 +125,4 @@ jobs: TIMEOUT: 10 SCRIPT: | cd ${REINFORCER_REPO_DIR} - uv run --extra test bash -x ./tests/run_unit.sh + uv run bash -x ./tests/run_unit.sh From 68b99a435e65bbd24d89a2a8bda6d333a6a7622b Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Thu, 20 Mar 2025 23:57:17 -0500 Subject: [PATCH 17/20] Fix reference to secret in test Signed-off-by: Charlie Truong --- .github/workflows/_run_test.yml | 7 +++---- .github/workflows/cicd-main.yml | 6 ++++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index 51eef67bb9..d766fffeb8 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -39,6 +39,9 @@ on: description: Failure will cancel all other tests if set to true required: false default: false + secrets: + HF_TOKEN: + required: true outputs: conclusion: description: Conclusion of main test step @@ -85,10 +88,6 @@ jobs: nvidia-smi mkdir -p ${{ github.run_id }} cd ${{ github.run_id }}/ - - if [ -n "${HF_TOKEN}" ]; then - echo "HF_TOKEN is set and not empty" - fi set +e ( diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 3a0b5a509b..27cad4b889 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -106,8 +106,8 @@ jobs: if: ${{ needs.pre-flight.outputs.run_ci == 'true' }} needs: [pre-flight] uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_container.yml@v0.22.7 - with: - build-ref: ${{ github.sha }} + + build-ref: ${{ github.sha }} image-name: nemo_reinforcer_container dockerfile: docker/Dockerfile image-label: nemo-reinforcer @@ -126,3 +126,5 @@ jobs: SCRIPT: | cd ${REINFORCER_REPO_DIR} uv run bash -x ./tests/run_unit.sh + secrets: + HF_TOKEN: ${{ secrets.HF_TOKEN }} From e37dc38bc11fbda242412b5ec0e8ccb533d337a2 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Fri, 21 Mar 2025 00:00:18 -0500 Subject: [PATCH 18/20] Fix build Signed-off-by: Charlie Truong --- .github/workflows/cicd-main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 27cad4b889..d79d0cf61f 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -106,8 +106,8 @@ jobs: if: ${{ needs.pre-flight.outputs.run_ci == 'true' }} needs: [pre-flight] uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_container.yml@v0.22.7 - - build-ref: ${{ github.sha }} + with: + build-ref: ${{ github.sha }} image-name: nemo_reinforcer_container dockerfile: docker/Dockerfile image-label: nemo-reinforcer From 3251d71869d0b61b5b36458da155f646ea964e9e Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Fri, 21 Mar 2025 00:12:14 -0500 Subject: [PATCH 19/20] Have the HF hub cache directory be writeable Signed-off-by: Charlie Truong --- .github/workflows/_run_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index d766fffeb8..d78f741d6d 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -74,7 +74,7 @@ jobs: --env REINFORCER_CI_DIR=/home/TestData/reinforcer \ --env REINFORCER_REPO_DIR=/opt/NeMo-Reinforcer \ --volume /mnt/datadrive/TestData/reinforcer/checkpoints:/home/TestData/reinforcer/checkpoints:ro \ - --volume /mnt/datadrive/TestData/reinforcer/hf_home/hub:/home/TestData/reinforcer/hf_home/hub:ro \ + --volume /mnt/datadrive/TestData/reinforcer/hf_home/hub:/home/TestData/reinforcer/hf_home/hub \ nemoci.azurecr.io/nemo_reinforcer_container:${{ github.run_id }} \ bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))" From 6b02d5e3f6bd77bc5039a7ba64e1435e39a5c635 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Fri, 21 Mar 2025 00:30:13 -0500 Subject: [PATCH 20/20] Run tests as root in container Signed-off-by: Charlie Truong --- .github/workflows/_run_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index d78f741d6d..faf1848cad 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -101,7 +101,7 @@ jobs: ${{ inputs.SCRIPT }} RUN_TEST_EOF ) - docker exec -e HF_TOKEN nemo_container_${{ github.run_id }} bash -eux -o pipefail -c "$cmd" + docker exec -u root -e HF_TOKEN nemo_container_${{ github.run_id }} bash -eux -o pipefail -c "$cmd" ) 2> >(tee err.log) EXIT_CODE=$?