27 commits
a1fe428
WIP commit
yfw Mar 25, 2025
39cd956
fix: Remove reference of tokenizer from generation backend (#75) (#82)
parthchadha Mar 26, 2025
f8e51a5
feat: unit test metric tracking (#40)
terrykong Mar 26, 2025
778f5b0
fix: unit test error when coverage wasn't specified (#88)
terrykong Mar 26, 2025
c46581e
ci: temporarily disable CI on main since PRs must be up to date befor…
terrykong Mar 26, 2025
dc9d857
Cleanup + expandable_segments
yfw Mar 27, 2025
fd0f613
More cleanup
yfw Mar 27, 2025
d9b7fd4
cpu offload
yfw Mar 31, 2025
d24a9c3
Make everything configurable
yfw Apr 1, 2025
11fbef7
expandable_segments and fsdp offload configs
yfw Apr 2, 2025
ad48c1d
Make port configurable in ray.sub
yfw Apr 2, 2025
83bd55d
fix: error out early if ray cluster does not have resources (#89)
parthchadha Mar 26, 2025
f6a3d91
ci: skip functional until more capacity available and/or tests speed …
terrykong Mar 26, 2025
4eb1d6d
feat: evaluation implement (#16)
yuki-97 Mar 27, 2025
8062086
fix: gradient should be averaged instead of summed across mbs (#86)
parthchadha Mar 27, 2025
1ce2901
fix: Use separate step_metric for GPU Monitoring (#92)
yfw Mar 31, 2025
e93d697
feat: Update sft config to use single GPU (#90)
ashors1 Mar 31, 2025
c0aa989
fix: Grammar nit (#98)
SahilJain314 Mar 31, 2025
e27b5fd
feat: add capability to set min/max eps separately as proposed in the…
parthchadha Mar 31, 2025
a7b2e2b
fix: change format messages to out of place (#77)
KiddoZhu Mar 31, 2025
b17069c
fix: correct version and use setuptools.dynamic metadata for version/…
terrykong Apr 1, 2025
6f2c31e
fix: remove usage of vllm to get device uuid and instead use nvidia-m…
parthchadha Apr 1, 2025
4e20786
fix: Change optional-dependencies to dependency-groups (#81)
hemildesai Apr 1, 2025
d6de793
feat: Add support for hydra style overrides (#80)
hemildesai Apr 1, 2025
f874746
fix: Do not initialize reference model for sft (#71)
ashors1 Apr 1, 2025
0c0aa6d
fix: change grpo default to use 64 prompts per step and 32 generation…
parthchadha Apr 1, 2025
97c5e1b
feat: use cuda_graph by default for vllm (#116)
parthchadha Apr 1, 2025
18 changes: 16 additions & 2 deletions .github/workflows/_run_test.yml
@@ -27,11 +27,16 @@ on:
        default: 10
      SCRIPT:
        type: string
        description: Test script to execute
        description: Test script to execute in container
        required: true
      AFTER_SCRIPT:
        type: string
        description: Script to run after main test
        description: Script to run after main test in container
        required: false
        default: ":"
      FINAL_SCRIPT_EXTERNAL:
        type: string
        description: Script to run after SCRIPT and AFTER_SCRIPT, but outside container (useful for logging)
        required: false
        default: ":"
      IS_OPTIONAL:
@@ -163,6 +168,15 @@ jobs:
          )
          docker exec nemo_container_${{ github.run_id }} bash -eux -o pipefail -c "$cmd"

      - name: final_script_external
        if: always() && inputs.FINAL_SCRIPT_EXTERNAL != ':'
        run: |
          cmd=$(cat <<"RUN_TEST_EOF"
          ${{ inputs.FINAL_SCRIPT_EXTERNAL }}
          RUN_TEST_EOF
          )
          bash -eux -o pipefail -c "$cmd"

      - name: Container shutdown
        if: always()
        run: |
74 changes: 42 additions & 32 deletions .github/workflows/cicd-main.yml
@@ -16,8 +16,8 @@ name: "CICD Reinforcer"
on:
  pull_request:
    branches:
      - 'main'
      - 'r**'
      - "main"
      - "r**"
    types: [labeled]
  merge_group:
    types: [checks_requested]
@@ -28,9 +28,10 @@ on:
        default: all
        type: string
        description: Comma-separated list of tests to run. Use "all" to run the full test suite.
  push:
    branches:
      - 'main'
  # TODO: Due to limited compute, disabling pushes to main. This is okay to do since we force PRs to be up to date and the CI tests on pull/$PR_NUM/merge
  #push:
  #  branches:
  #    - 'main'

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}
@@ -80,14 +81,14 @@ jobs:
          # Some output that's helpful for debugging
          echo "Docs changed: $CHANGED_DOCS"
          echo "Src changed: $CHANGED_SRC"

          # echo "DOCS_ONLY: $DOCS_ONLY"
          echo "LABEL: $LABEL"
          echo "IS_PULLREQUEST: $IS_PULLREQUEST"

          # Run CI only (on main or if label is attached) and if it's not only docs
          echo run_ci=$([[ ("$LABEL" = "true" || "$IS_PULLREQUEST" = "false" || "$MERGE_GROUP" = "true") && "$DOCS_ONLY" = "false" ]] && echo "true" || echo "false") | tee -a "$GITHUB_OUTPUT"

  lint-check:
    name: Lint check
    needs: [pre-flight]
@@ -101,7 +102,7 @@ jobs:
          pip install pre-commit
          pre-commit install
          pre-commit run --all-files --show-diff-on-failure --color=always

  sphinx-build:
    name: Sphinx build
    needs: [pre-flight]
@@ -114,7 +115,7 @@ jobs:
        run: |
          pip install uv
          cd docs/
          uv run --extra docs sphinx-build . _build/html
          uv run --group docs sphinx-build . _build/html

  build-container:
    if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
@@ -139,7 +140,7 @@ jobs:
      TIMEOUT: 10
      SCRIPT: |
        cd ${REINFORCER_REPO_DIR}/docs
        uv run --extra docs sphinx-build -b doctest . _build/doctest
        uv run --group docs sphinx-build -b doctest . _build/doctest
    secrets:
      HF_TOKEN: ${{ secrets.HF_TOKEN }}

@@ -153,27 +154,36 @@ jobs:
      TIMEOUT: 15
      SCRIPT: |
        cd ${REINFORCER_REPO_DIR}
        uv run --extra test bash -x ./tests/run_unit.sh
        uv run --group test bash -x ./tests/run_unit.sh
      FINAL_SCRIPT_EXTERNAL: |
        cat <<EOF | tee -a $GITHUB_STEP_SUMMARY
        # Unit test results
        \`\`\`json
        $(cat tests/unit/unit_results.json || echo "n/a")
        \`\`\`
        EOF
    secrets:
      HF_TOKEN: ${{ secrets.HF_TOKEN }}

  functional-tests:
    name: ${{ matrix.test_case }}
    needs: [build-container, pre-flight]
    uses: ./.github/workflows/_run_test.yml
    if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
    strategy:
      matrix:
        test_case:
          - sft.sh
          - grpo.sh
    with:
      # TODO: For now, allow these to fail since the checks are not robust.
      IS_OPTIONAL: true
      RUNNER: self-hosted-azure
      TIMEOUT: 15
      SCRIPT: |
        cd ${REINFORCER_REPO_DIR}
        uv run bash ./tests/functional/${{ matrix.test_case }}
    secrets:
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
  # TODO: Temporarily disable functional tests until we have more capacity and tests run quicker
  # Related: https://github.com/NVIDIA/reinforcer/pull/27
  #functional-tests:
  #  name: ${{ matrix.test_case }}
  #  needs: [build-container, pre-flight]
  #  uses: ./.github/workflows/_run_test.yml
  #  if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
  #  strategy:
  #    matrix:
  #      test_case:
  #        - sft.sh
  #        - grpo.sh
  #  with:
  #    # TODO: For now, allow these to fail since the checks are not robust.
  #    IS_OPTIONAL: true
  #    RUNNER: self-hosted-azure
  #    TIMEOUT: 15
  #    SCRIPT: |
  #      cd ${REINFORCER_REPO_DIR}
  #      uv run bash ./tests/functional/${{ matrix.test_case }}
  #    secrets:
  #      HF_TOKEN: ${{ secrets.HF_TOKEN }}
1 change: 1 addition & 0 deletions .gitignore
@@ -28,3 +28,4 @@ docker/
wandb/
checkpoints/
results/
code_snapshots/
14 changes: 8 additions & 6 deletions README.md
@@ -60,20 +60,22 @@ We provide a sample SFT experiment that uses the [SQuAD dataset](https://rajpurk

#### Single Node

The experiment is set up to run on 8 GPUs. If using a machine that has access to 8 GPUs, you can launch the experiment as follows:
The default SFT experiment is configured to run on a single GPU. To launch the experiment, run:

```sh
uv run python examples/run_sft.py
```

This trains `Llama3.1-8B` on 8 GPUs. To run on a single GPU, we'll have to override a few of the experiment settings. We replace the 8B model with a smaller 1B model, decrease the batch size, and update the cluster configuration to use a single gpu:
This trains `Llama3.2-1B` on one GPU using the SQuAD dataset.

If you have access to more GPUs, you can update the experiment accordingly. To run on 8 GPUs, we update the cluster configuration. We also switch to an 8B Llama base model and increase the batch size:

```sh
uv run python examples/run_sft.py \
    policy.model_name="meta-llama/Llama-3.2-1B" \
    policy.train_global_batch_size=16 \
    sft.val_global_batch_size=16 \
    cluster.gpus_per_node=1
    policy.model_name="meta-llama/Meta-Llama-3-8B" \
    policy.train_global_batch_size=128 \
    sft.val_global_batch_size=128 \
    cluster.gpus_per_node=8
```

Refer to [sft.yaml](examples/configs/sft.yaml) for a full list of parameters that can be overridden.
2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -21,7 +21,7 @@ RUN chmod 755 /home/ray/.cache
WORKDIR /opt/reinforcer
RUN uv venv .venv
# uv sync has a more reliable resolver than simple uv pip install which can fail
RUN uv sync --extra test --extra dev --extra docs --no-install-project
RUN uv sync --group test --group dev --group docs --no-install-project

ENV VIRTUAL_ENV=/opt/reinforcer/.venv
ENV PATH="/opt/reinforcer/.venv/bin:$PATH"
14 changes: 12 additions & 2 deletions docs/design_docs/generation.md
@@ -95,20 +95,30 @@ The {py:class}`UpdatableVllmInternalWorker <nemo_reinforcer.models.generation.vl
To use a generation backend:

```python
from transformers import AutoTokenizer

from nemo_reinforcer.models.generation.vllm import VllmGeneration, VllmConfig
from nemo_reinforcer.distributed.virtual_cluster import RayVirtualCluster
from nemo_reinforcer.distributed.batched_data_dict import BatchedDataDict

# Set up the configuration
tokenizer = AutoTokenizer.from_pretrained(policy_config["model_name"])
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

config = VllmConfig(
    backend="vllm",
    model_name="Qwen/Qwen2.5-1.5B",
    max_new_tokens=100,
    temperature=0.7,
    top_p=1,
    top_k=None,
    stop_token_ids=[tokenizer.eos_token_id],
    pad_token=tokenizer.pad_token_id,
    skip_tokenizer_init=True,
    vllm_cfg={
        "tensor_parallel_size": 1,
        "gpu_memory_utilization": 0.8
        "gpu_memory_utilization": 0.8,
        "max_model_len": 2048,
    }
)
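
# The lines below are a hypothetical usage sketch, not the documented API:
# the RayVirtualCluster arguments, the VllmGeneration constructor, and the
# exact shape of the BatchedDataDict input are assumptions.
# cluster = RayVirtualCluster(...)                     # allocate GPU workers
# generation = VllmGeneration(cluster, config)         # bind the backend to the cluster
# outputs = generation.generate(BatchedDataDict(...))  # batched token ids in, generations out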

6 changes: 3 additions & 3 deletions docs/documentation.md
@@ -15,7 +15,7 @@ Switch to the documentation source folder and generate HTML output.

```sh
cd docs/
uv run --extra docs sphinx-build . _build/html
uv run --group docs sphinx-build . _build/html
```

* The resulting HTML files are generated in a `_build/html` folder that is created under the project `docs/` folder.
@@ -29,7 +29,7 @@ To do so run:

```sh
cd docs/
uv run --extra docs sphinx-autobuild . _build/html --port 12345 --host 0.0.0.0
uv run --group docs sphinx-autobuild . _build/html --port 12345 --host 0.0.0.0
```

Open a web browser and go to `http://${HOST_WHERE_SPHINX_COMMAND_RUN}:12345` to view the output.
@@ -41,7 +41,7 @@ We also run tests in our python docstrings. You can run them with:

```sh
cd docs/
uv run --extra docs sphinx-build -b doctest . _build/doctest
uv run --group docs sphinx-build -b doctest . _build/doctest
```

## Writing Tests in Python Docstrings
33 changes: 33 additions & 0 deletions docs/guides/eval.md
@@ -0,0 +1,33 @@
# Evaluation

## Start Evaluation

### Start Script
```sh
# To run the evaluation with default config (examples/configs/eval.yaml)
uv run python examples/run_eval.py

# Specify a custom config file
uv run python examples/run_eval.py --config path/to/custom_config.yaml

# Override specific config values via command line
uv run python examples/run_eval.py generation.model_name="Qwen/Qwen2.5-Math-7B-Instruct"
```

### Example Output

```
============================================================
model_name='Qwen2.5-Math-1.5B-Instruct' dataset_name='aime_2024'
score=0.10 (3.0/30)
============================================================
```

## Configuration

An example evaluation configuration file can be found [here](../../examples/configs/eval.yaml).

### Prompt Template Configuration
Always remember to use the same `prompt_file` and `system_prompt_file` that were used during training.

For open-source models, we recommend setting `prompt_file=null` and `system_prompt_file=null` to allow them to use their native chat templates.
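
For example, here is a minimal override sketch; the `prompt_file` and `system_prompt_file` key names come from the guidance above, and their exact placement in the config may differ:

```sh
# Hypothetical invocation: let an open-source model use its native chat
# template by disabling the training-time prompt templates.
uv run python examples/run_eval.py \
    generation.model_name="Qwen/Qwen2.5-Math-7B-Instruct" \
    prompt_file=null \
    system_prompt_file=null
```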
1 change: 1 addition & 0 deletions docs/index.md
@@ -18,6 +18,7 @@ cluster.md
adding_new_models.md
guides/sft.md
guides/grpo.md
guides/eval.md
```

```{toctree}
65 changes: 65 additions & 0 deletions docs/testing.md
@@ -32,6 +32,71 @@ CONTAINER=... bash tests/run_unit_in_docker.sh

The required `CONTAINER` can be built by following the instructions in the [docker documentation](docker.md).

### Tracking metrics in unit tests

Unit tests may also log metrics to a fixture. The fixture is called `tracker` and has the following API:
```python
# Track an arbitrary metric (must be json serializable)
tracker.track(metric_name, metric_value)
# Log the maximum memory across the entire cluster. Okay for tests since they are run serially.
tracker.log_max_mem(metric_name)
# Returns the maximum memory. Useful if you are measuring changes in memory.
tracker.get_max_mem()
```

Including the `tracker` fixture also tracks the elapsed time for the test implicitly.

Here is an example test:
```python
def test_exponentiate(tracker):
    starting_mem = tracker.get_max_mem()
    base = 2
    exponent = 4
    result = base ** exponent
    tracker.track("result", result)
    tracker.log_max_mem("memory_after_exponentiating")
    change_in_mem = tracker.get_max_mem() - starting_mem
    tracker.track("change_in_mem", change_in_mem)
    assert result == 16
```

Running a tracked test produces a file like the following in `tests/unit/unit_results.json`:
```json
{
  "exit_status": 0,
  "git_commit": "f1062bd3fd95fc64443e2d9ee4a35fc654ba897e",
  "start_time": "2025-03-24 23:34:12",
  "metrics": {
    "test_hf_ray_policy::test_hf_policy_generation": {
      "avg_prob_mult_error": 1.0000039339065552,
      "mean_lps": -1.5399343967437744,
      "_elapsed": 17.323044061660767
    }
  },
  "gpu_types": [
    "NVIDIA H100 80GB HBM3"
  ],
  "coverage": 24.55897613282601
}
```

:::{tip}
Past unit test results are logged in `tests/unit/unit_results/`. These are helpful for viewing trends across time and commits.

Here's an example `jq` command to view trends:

```sh
jq -r '[.start_time, .git_commit, .metrics["test_hf_ray_policy::test_hf_policy_generation"].avg_prob_mult_error] | @tsv' tests/unit/unit_results/*

# Example output:
#2025-03-24 23:35:39 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552
#2025-03-24 23:36:37 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552
#2025-03-24 23:37:37 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552
#2025-03-24 23:38:14 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552
#2025-03-24 23:38:50 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552
```
:::

## Functional tests

:::{important}