From a6de56057647296b85ae0063d6691bc071006bbf Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Tue, 15 Apr 2025 23:14:43 -0500 Subject: [PATCH 1/9] Refactor docker container to only include dependencies Signed-off-by: Charlie Truong --- docker/Dockerfile | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 347f68e3ca..c984793b1f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -18,14 +18,16 @@ RUN chmod 755 /home/ray/.cache FROM base AS hermetic WORKDIR /opt/reinforcer -# This is less efficient as this invalidates the cache more frequently, but -# creates a smaller image. Adding reinforcer afterwards and doing -# `uv pip install --no-deps --editable .` causes a "sync" of some of the environment, -# which defeats the purpose of pre-installing. -# In the future we may optimize this: https://github.com/NVIDIA/reinforcer/issues/129 -COPY --chown=ray --chmod=755 . /opt/reinforcer + +# First copy only the dependency files +COPY --chown=ray --chmod=755 pyproject.toml uv.lock ./ + +ENV UV_PROJECT_ENVIRONMENT=/opt/reinforcer_venv +ENV VIRTUAL_ENV=/opt/reinforcer_venv + +# Create and activate virtual environment RUN <<"EOF" -uv venv .venv +uv venv /opt/reinforcer_venv # uv sync has a more reliable resolver than simple uv pip install which can fail # Sync each training + inference backend one at a time (since they may conflict) @@ -33,11 +35,11 @@ uv venv .venv # Do everything in one layer to prevent large layers. uv sync --locked --extra vllm --no-install-project -uv sync --locked --all-groups +uv sync --locked --all-groups --no-install-project EOF -ENV VIRTUAL_ENV=/opt/reinforcer/.venv -ENV PATH="/opt/reinforcer/.venv/bin:$PATH" +ENV UV_NO_SYNC=1 +ENV PATH="/opt/reinforcer_venv/bin:$PATH" # The ray images automatically activate the anaconda venv. We will # comment this out of the .bashrc to give the same UX between docker @@ -50,7 +52,4 @@ sed -i '/# >>> conda initialize >>>/,/# <<< conda initialize << Date: Tue, 15 Apr 2025 23:27:33 -0500 Subject: [PATCH 2/9] Mount repo into container during testing Signed-off-by: Charlie Truong --- .github/workflows/_run_test.yml | 4 ++++ .github/workflows/cicd-main.yml | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index 89827efdad..c90460425e 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -70,6 +70,9 @@ jobs: run: | docker pull nemoci.azurecr.io/nemo_reinforcer_container:${{ github.run_id }} + - name: Checkout repository + uses: actions/checkout@v4 + - name: Start container run: | nvidia-smi @@ -81,6 +84,7 @@ jobs: --env HF_DATASETS_CACHE=/home/TestData/reinforcer/hf_datasets_cache \ --env REINFORCER_REPO_DIR=/opt/reinforcer \ --env HF_TOKEN \ + --volume $GITHUB_WORKSPACE:/opt/reinforcer \ --volume $GITHUB_ACTION_DIR:$GITHUB_ACTION_DIR \ --volume /mnt/datadrive/TestData/reinforcer/datasets:/opt/reinforcer/datasets:ro \ --volume /mnt/datadrive/TestData/reinforcer/checkpoints:/home/TestData/reinforcer/checkpoints:ro \ diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 7d2e77028c..be69c25b67 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -149,22 +149,22 @@ jobs: UNIT_TEST_SCRIPT: | cd /opt/reinforcer if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then - uv run --no-sync bash -x ./tests/run_unit.sh + uv run bash -x ./tests/run_unit.sh else echo Skipping unit tests for docs-only level fi DOC_TEST_SCRIPT: | cd /opt/reinforcer/docs if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(docs|L0|L1|L2)$ ]]; then - uv run --no-sync sphinx-build -b doctest . _build/doctest + uv run sphinx-build -b doctest . _build/doctest else echo Skipping doc tests for level ${{ needs.pre-flight.outputs.test_level }} fi FUNCTIONAL_TEST_SCRIPT: | cd /opt/reinforcer if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L1|L2)$ ]]; then - uv run --no-sync bash ./tests/functional/sft.sh - uv run --no-sync bash ./tests/functional/grpo.sh + uv run bash ./tests/functional/sft.sh + uv run bash ./tests/functional/grpo.sh else echo Skipping functional tests for level ${{ needs.pre-flight.outputs.test_level }} fi @@ -174,7 +174,7 @@ jobs: # if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L2)$ ]]; then # echo "Running convergence tests" # # Add your convergence test commands here - # # uv run --no-sync bash ./tests/convergence/test.sh + # # uv run bash ./tests/convergence/test.sh # else # echo "Skipping convergence tests for level ${{ needs.pre-flight.outputs.test_level }}" # fi @@ -182,12 +182,12 @@ jobs: cd /opt/reinforcer cat <- ${{ (needs.pre-flight.outputs.test_level == 'none') || - (needs.pre-flight.outputs.test_level != 'none' && - needs.lint-check.result == 'success' && - needs.sphinx-build.result == 'success' && + (needs.pre-flight.outputs.test_level != 'none' && + needs.lint-check.result == 'success' && + needs.sphinx-build.result == 'success' && needs.tests.result == 'success') }} CI_SKIP: ${{ github.event.label.name == 'Skip CICD' }} From 02c409443d5facae5920624e4c3ad68b352613a2 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Wed, 16 Apr 2025 00:10:08 -0500 Subject: [PATCH 3/9] Revert no-sync env var Signed-off-by: Charlie Truong --- .github/workflows/cicd-main.yml | 10 +++++----- docker/Dockerfile | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index be69c25b67..c9a5958fb9 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -149,22 +149,22 @@ jobs: UNIT_TEST_SCRIPT: | cd /opt/reinforcer if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then - uv run bash -x ./tests/run_unit.sh + uv run --no-sync bash -x ./tests/run_unit.sh else echo Skipping unit tests for docs-only level fi DOC_TEST_SCRIPT: | cd /opt/reinforcer/docs if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(docs|L0|L1|L2)$ ]]; then - uv run sphinx-build -b doctest . _build/doctest + uv run --no-sync sphinx-build -b doctest . _build/doctest else echo Skipping doc tests for level ${{ needs.pre-flight.outputs.test_level }} fi FUNCTIONAL_TEST_SCRIPT: | cd /opt/reinforcer if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L1|L2)$ ]]; then - uv run bash ./tests/functional/sft.sh - uv run bash ./tests/functional/grpo.sh + uv run --no-sync bash ./tests/functional/sft.sh + uv run --no-sync bash ./tests/functional/grpo.sh else echo Skipping functional tests for level ${{ needs.pre-flight.outputs.test_level }} fi @@ -174,7 +174,7 @@ jobs: # if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L2)$ ]]; then # echo "Running convergence tests" # # Add your convergence test commands here - # # uv run bash ./tests/convergence/test.sh + # # uv run --no-sync bash ./tests/convergence/test.sh # else # echo "Skipping convergence tests for level ${{ needs.pre-flight.outputs.test_level }}" # fi diff --git a/docker/Dockerfile b/docker/Dockerfile index c984793b1f..b7d39c841f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -38,7 +38,6 @@ uv sync --locked --extra vllm --no-install-project uv sync --locked --all-groups --no-install-project EOF -ENV UV_NO_SYNC=1 ENV PATH="/opt/reinforcer_venv/bin:$PATH" # The ray images automatically activate the anaconda venv. We will From 790b8ca9b785fe0afb986e1a4ffbc160f8e40b21 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Wed, 16 Apr 2025 00:55:39 -0500 Subject: [PATCH 4/9] Resolve file permissions Signed-off-by: Charlie Truong --- .github/workflows/_run_test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index c90460425e..ec6a75be9e 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -111,6 +111,10 @@ jobs: - name: after_script if: always() && inputs.AFTER_SCRIPT != ':' run: | + # Ensure any added files in the mounted directoryare owned by the runner user to allow it to clean up + docker exec nemo_container_${{ github.run_id }} bash -c "chown -R $(id -u):$(id -g) /opt/reinforcer" + + # Run the after script cmd=$(cat <<"RUN_TEST_EOF" ${{ inputs.AFTER_SCRIPT }} RUN_TEST_EOF From 079295e7f0b740b971b72b8ce5fab362495f34e3 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Wed, 16 Apr 2025 01:28:00 -0500 Subject: [PATCH 5/9] Fix changing /opt/reinforcer permissions in container Signed-off-by: Charlie Truong --- .github/workflows/_run_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index ec6a75be9e..22e54dc834 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -112,7 +112,7 @@ jobs: if: always() && inputs.AFTER_SCRIPT != ':' run: | # Ensure any added files in the mounted directoryare owned by the runner user to allow it to clean up - docker exec nemo_container_${{ github.run_id }} bash -c "chown -R $(id -u):$(id -g) /opt/reinforcer" + docker exec nemo_container_${{ github.run_id }} bash -c "find /opt/reinforcer -path '/opt/reinforcer/datasets' -prune -o -exec chown -R $(id -u):$(id -g) {} +" # Run the after script cmd=$(cat <<"RUN_TEST_EOF" From b903a74b2b5294130d157309d0b4fb505c3618d2 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Wed, 16 Apr 2025 08:31:27 -0500 Subject: [PATCH 6/9] Fix chown of test data in container Signed-off-by: Charlie Truong --- .github/workflows/_run_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index 22e54dc834..c0ca06db8b 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -112,7 +112,7 @@ jobs: if: always() && inputs.AFTER_SCRIPT != ':' run: | # Ensure any added files in the mounted directoryare owned by the runner user to allow it to clean up - docker exec nemo_container_${{ github.run_id }} bash -c "find /opt/reinforcer -path '/opt/reinforcer/datasets' -prune -o -exec chown -R $(id -u):$(id -g) {} +" + docker exec nemo_container_${{ github.run_id }} bash -c "find /opt/reinforcer -path '/opt/reinforcer/datasets' -prune -o -exec chown $(id -u):$(id -g) {} +" # Run the after script cmd=$(cat <<"RUN_TEST_EOF" From 5048d6a0c287f081f36ba365571c4d76c7ec1e20 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Wed, 16 Apr 2025 08:58:06 -0500 Subject: [PATCH 7/9] Move chown step to final clean up step in test script Signed-off-by: Charlie Truong --- .github/workflows/_run_test.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index c0ca06db8b..e2ef796f74 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -111,9 +111,6 @@ jobs: - name: after_script if: always() && inputs.AFTER_SCRIPT != ':' run: | - # Ensure any added files in the mounted directoryare owned by the runner user to allow it to clean up - docker exec nemo_container_${{ github.run_id }} bash -c "find /opt/reinforcer -path '/opt/reinforcer/datasets' -prune -o -exec chown $(id -u):$(id -g) {} +" - # Run the after script cmd=$(cat <<"RUN_TEST_EOF" ${{ inputs.AFTER_SCRIPT }} @@ -133,5 +130,7 @@ jobs: - name: Container shutdown if: always() run: | + # Ensure any added files in the mounted directoryare owned by the runner user to allow it to clean up + docker exec nemo_container_${{ github.run_id }} bash -c "find /opt/reinforcer -path '/opt/reinforcer/datasets' -prune -o -exec chown $(id -u):$(id -g) {} +" docker container stop nemo_container_${{ github.run_id }} || true docker container rm nemo_container_${{ github.run_id }} || true From f64e04097d2b53cf4d3afec4aea9964f92f9427f Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Wed, 16 Apr 2025 09:10:31 -0500 Subject: [PATCH 8/9] Have test container set git in repo as safe directory Signed-off-by: Charlie Truong --- .github/workflows/_run_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index e2ef796f74..f57762f23e 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -95,6 +95,7 @@ jobs: - name: Run unit tests run: | + docker exec nemo_container_${{ github.run_id }} git config --global --add safe.directory /opt/reinforcer docker exec nemo_container_${{ github.run_id }} bash -eux -o pipefail -c "${{ inputs.UNIT_TEST_SCRIPT }}" - name: Run doc tests From 0c16a1c4f449869a7ff042f31740e35e68b2895a Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Wed, 16 Apr 2025 15:35:21 -0500 Subject: [PATCH 9/9] Update .github/workflows/_run_test.yml Co-authored-by: Terry Kong Signed-off-by: Charlie Truong --- .github/workflows/_run_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index f57762f23e..59af0dc80b 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -131,7 +131,7 @@ jobs: - name: Container shutdown if: always() run: | - # Ensure any added files in the mounted directoryare owned by the runner user to allow it to clean up + # Ensure any added files in the mounted directory are owned by the runner user to allow it to clean up docker exec nemo_container_${{ github.run_id }} bash -c "find /opt/reinforcer -path '/opt/reinforcer/datasets' -prune -o -exec chown $(id -u):$(id -g) {} +" docker container stop nemo_container_${{ github.run_id }} || true docker container rm nemo_container_${{ github.run_id }} || true