27 commits
a1fe428
WIP commit
yfw Mar 25, 2025
39cd956
fix: Remove reference of tokenizer from generation backend (#75) (#82)
parthchadha Mar 26, 2025
f8e51a5
feat: unit test metric tracking (#40)
terrykong Mar 26, 2025
778f5b0
fix: unit test error when coverage wasn't specified (#88)
terrykong Mar 26, 2025
c46581e
ci: temporarily disable CI on main since PRs must be up to date befor…
terrykong Mar 26, 2025
dc9d857
Cleanup + expandable_segments
yfw Mar 27, 2025
fd0f613
More cleanup
yfw Mar 27, 2025
d9b7fd4
cpu offload
yfw Mar 31, 2025
d24a9c3
Make everything configurable
yfw Apr 1, 2025
11fbef7
expandable_segments and fsdp offload configs
yfw Apr 2, 2025
ad48c1d
Make port configurable in ray.sub
yfw Apr 2, 2025
83bd55d
fix: error out early if ray cluster does not have resources (#89)
parthchadha Mar 26, 2025
f6a3d91
ci: skip functional until more capacity available and/or tests speed …
terrykong Mar 26, 2025
4eb1d6d
feat: evaluation implement (#16)
yuki-97 Mar 27, 2025
8062086
fix: gradient should be averaged instead of summed across mbs (#86)
parthchadha Mar 27, 2025
1ce2901
fix: Use separate step_metric for GPU Monitoring (#92)
yfw Mar 31, 2025
e93d697
feat: Update sft config to use single GPU (#90)
ashors1 Mar 31, 2025
c0aa989
fix: Grammar nit (#98)
SahilJain314 Mar 31, 2025
e27b5fd
feat: add capability to set min/max eps separately as proposed in the…
parthchadha Mar 31, 2025
a7b2e2b
fix: change format messages to out of place (#77)
KiddoZhu Mar 31, 2025
b17069c
fix: correct version and use setuptools.dynamic metadata for version/…
terrykong Apr 1, 2025
6f2c31e
fix: remove usage of vllm to get device uuid and instead use nvidia-m…
parthchadha Apr 1, 2025
4e20786
fix: Change optional-dependencies to dependency-groups (#81)
hemildesai Apr 1, 2025
d6de793
feat: Add support for hydra style overrides (#80)
hemildesai Apr 1, 2025
f874746
fix: Do not initialize reference model for sft (#71)
ashors1 Apr 1, 2025
0c0aa6d
fix: change grpo default to use 64 prompts per step and 32 generation…
parthchadha Apr 1, 2025
97c5e1b
feat: use cuda_graph by default for vllm (#116)
parthchadha Apr 1, 2025
18 changes: 16 additions & 2 deletions .github/workflows/_run_test.yml
@@ -27,11 +27,16 @@ on:
        default: 10
      SCRIPT:
        type: string
        description: Test script to execute
        description: Test script to execute in container
        required: true
      AFTER_SCRIPT:
        type: string
        description: Script to run after main test
        description: Script to run after main test in container
        required: false
        default: ":"
      FINAL_SCRIPT_EXTERNAL:
        type: string
        description: Script to run after SCRIPT and AFTER_SCRIPT, but outside container (useful for logging)
        required: false
        default: ":"
      IS_OPTIONAL:
@@ -163,6 +168,15 @@ jobs:
          )
          docker exec nemo_container_${{ github.run_id }} bash -eux -o pipefail -c "$cmd"

      - name: final_script_external
        if: always() && inputs.FINAL_SCRIPT_EXTERNAL != ':'
        run: |
          cmd=$(cat <<"RUN_TEST_EOF"
          ${{ inputs.FINAL_SCRIPT_EXTERNAL }}
          RUN_TEST_EOF
          )
          bash -eux -o pipefail -c "$cmd"

      - name: Container shutdown
        if: always()
        run: |
74 changes: 42 additions & 32 deletions .github/workflows/cicd-main.yml
@@ -16,8 +16,8 @@ name: "CICD Reinforcer"
on:
  pull_request:
    branches:
      - 'main'
      - 'r**'
      - "main"
      - "r**"
    types: [labeled]
  merge_group:
    types: [checks_requested]
@@ -28,9 +28,10 @@ on:
        default: all
        type: string
        description: Comma-separated list of tests to run. Use "all" to run the full test suite.
  push:
    branches:
      - 'main'
  # TODO: Due to limited compute, disabling pushes to main. This is okay to do since we force PRs to be up to date and the CI tests on pull/$PR_NUM/merge
  #push:
  #  branches:
  #    - 'main'

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}
@@ -80,14 +81,14 @@ jobs:
          # Some output that's helpful for debugging
          echo "Docs changed: $CHANGED_DOCS"
          echo "Src changed: $CHANGED_SRC"

          # echo "DOCS_ONLY: $DOCS_ONLY"
          echo "LABEL: $LABEL"
          echo "IS_PULLREQUEST: $IS_PULLREQUEST"

          # Run CI only (on main or if label is attached) and if it's not only docs
          echo run_ci=$([[ ("$LABEL" = "true" || "$IS_PULLREQUEST" = "false" || "$MERGE_GROUP" = "true") && "$DOCS_ONLY" = "false" ]] && echo "true" || echo "false") | tee -a "$GITHUB_OUTPUT"

  lint-check:
    name: Lint check
    needs: [pre-flight]
@@ -101,7 +102,7 @@ jobs:
          pip install pre-commit
          pre-commit install
          pre-commit run --all-files --show-diff-on-failure --color=always

  sphinx-build:
    name: Sphinx build
    needs: [pre-flight]
@@ -114,7 +115,7 @@ jobs:
        run: |
          pip install uv
          cd docs/
          uv run --extra docs sphinx-build . _build/html
          uv run --group docs sphinx-build . _build/html

  build-container:
    if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
@@ -139,7 +140,7 @@ jobs:
      TIMEOUT: 10
      SCRIPT: |
        cd ${REINFORCER_REPO_DIR}/docs
        uv run --extra docs sphinx-build -b doctest . _build/doctest
        uv run --group docs sphinx-build -b doctest . _build/doctest
    secrets:
      HF_TOKEN: ${{ secrets.HF_TOKEN }}

@@ -153,27 +154,36 @@ jobs:
      TIMEOUT: 15
      SCRIPT: |
        cd ${REINFORCER_REPO_DIR}
        uv run --extra test bash -x ./tests/run_unit.sh
        uv run --group test bash -x ./tests/run_unit.sh
      FINAL_SCRIPT_EXTERNAL: |
        cat <<EOF | tee -a $GITHUB_STEP_SUMMARY
        # Unit test results
        \`\`\`json
        $(cat tests/unit/unit_results.json || echo "n/a")
        \`\`\`
        EOF
    secrets:
      HF_TOKEN: ${{ secrets.HF_TOKEN }}

  functional-tests:
    name: ${{ matrix.test_case }}
    needs: [build-container, pre-flight]
    uses: ./.github/workflows/_run_test.yml
    if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
    strategy:
      matrix:
        test_case:
          - sft.sh
          - grpo.sh
    with:
      # TODO: For now, allow these to fail since the checks are not robust.
      IS_OPTIONAL: true
      RUNNER: self-hosted-azure
      TIMEOUT: 15
      SCRIPT: |
        cd ${REINFORCER_REPO_DIR}
        uv run bash ./tests/functional/${{ matrix.test_case }}
    secrets:
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
  # TODO: Temporarily disable functional tests until we have more capacity and tests run quicker
  # Related: https://github.com/NVIDIA/reinforcer/pull/27
  #functional-tests:
  #  name: ${{ matrix.test_case }}
  #  needs: [build-container, pre-flight]
  #  uses: ./.github/workflows/_run_test.yml
  #  if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
  #  strategy:
  #    matrix:
  #      test_case:
  #        - sft.sh
  #        - grpo.sh
  #  with:
  #    # TODO: For now, allow these to fail since the checks are not robust.
  #    IS_OPTIONAL: true
  #    RUNNER: self-hosted-azure
  #    TIMEOUT: 15
  #    SCRIPT: |
  #      cd ${REINFORCER_REPO_DIR}
  #      uv run bash ./tests/functional/${{ matrix.test_case }}
  #    secrets:
  #      HF_TOKEN: ${{ secrets.HF_TOKEN }}
1 change: 1 addition & 0 deletions .gitignore
@@ -28,3 +28,4 @@ docker/
wandb/
checkpoints/
results/
code_snapshots/
14 changes: 8 additions & 6 deletions README.md
@@ -60,20 +60,22 @@ We provide a sample SFT experiment that uses the [SQuAD dataset](https://rajpurk

#### Single Node

The experiment is set up to run on 8 GPUs. If using a machine that has access to 8 GPUs, you can launch the experiment as follows:
The default SFT experiment is configured to run on a single GPU. To launch the experiment, run:

```sh
uv run python examples/run_sft.py
```

This trains `Llama3.1-8B` on 8 GPUs. To run on a single GPU, we'll have to override a few of the experiment settings. We replace the 8B model with a smaller 1B model, decrease the batch size, and update the cluster configuration to use a single gpu:
This trains `Llama3.2-1B` on one GPU using the SQuAD dataset.

If you have access to more GPUs, you can update the experiment accordingly. To run on 8 GPUs, we update the cluster configuration. We also switch to an 8B Llama base model and increase the batch size:

```sh
uv run python examples/run_sft.py \
    policy.model_name="meta-llama/Llama-3.2-1B" \
    policy.train_global_batch_size=16 \
    sft.val_global_batch_size=16 \
    cluster.gpus_per_node=1
    policy.model_name="meta-llama/Meta-Llama-3-8B" \
    policy.train_global_batch_size=128 \
    sft.val_global_batch_size=128 \
    cluster.gpus_per_node=8
```

Refer to [sft.yaml](examples/configs/sft.yaml) for a full list of parameters that can be overridden.
2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -21,7 +21,7 @@ RUN chmod 755 /home/ray/.cache
WORKDIR /opt/reinforcer
RUN uv venv .venv
# uv sync has a more reliable resolver than simple uv pip install which can fail
RUN uv sync --extra test --extra dev --extra docs --no-install-project
RUN uv sync --group test --group dev --group docs --no-install-project

ENV VIRTUAL_ENV=/opt/reinforcer/.venv
ENV PATH="/opt/reinforcer/.venv/bin:$PATH"
14 changes: 12 additions & 2 deletions docs/design_docs/generation.md
@@ -95,20 +95,30 @@ The {py:class}`UpdatableVllmInternalWorker <nemo_reinforcer.models.generation.vl
To use a generation backend:

```python
from transformers import AutoTokenizer

from nemo_reinforcer.models.generation.vllm import VllmGeneration, VllmConfig
from nemo_reinforcer.distributed.virtual_cluster import RayVirtualCluster
from nemo_reinforcer.distributed.batched_data_dict import BatchedDataDict

# Set up the configuration
tokenizer = AutoTokenizer.from_pretrained(policy_config["model_name"])
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

config = VllmConfig(
    backend="vllm",
    model_name="Qwen/Qwen2.5-1.5B",
    max_new_tokens=100,
    temperature=0.7,
    top_p=1,
    top_k=None,
    stop_token_ids=[tokenizer.eos_token_id],
    pad_token=tokenizer.pad_token_id,
    skip_tokenizer_init=True,
    vllm_cfg={
        "tensor_parallel_size": 1,
        "gpu_memory_utilization": 0.8
        "gpu_memory_utilization": 0.8,
        "max_model_len": 2048,
    }
)
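
# The lines below are a hypothetical usage sketch, not the documented API:
# the RayVirtualCluster arguments, the VllmGeneration constructor, and the
# exact shape of the BatchedDataDict input are assumptions.
# cluster = RayVirtualCluster(...)                     # allocate GPU workers
# generation = VllmGeneration(cluster, config)         # bind the backend to the cluster
# outputs = generation.generate(BatchedDataDict(...))  # batched token ids in, generations out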

6 changes: 3 additions & 3 deletions docs/documentation.md
@@ -15,7 +15,7 @@ Switch to the documentation source folder and generate HTML output.

```sh
cd docs/
uv run --extra docs sphinx-build . _build/html
uv run --group docs sphinx-build . _build/html
```

* The resulting HTML files are generated in a `_build/html` folder that is created under the project `docs/` folder.
@@ -29,7 +29,7 @@ To do so run:

```sh
cd docs/
uv run --extra docs sphinx-autobuild . _build/html --port 12345 --host 0.0.0.0
uv run --group docs sphinx-autobuild . _build/html --port 12345 --host 0.0.0.0
```

Open a web browser and go to `http://${HOST_WHERE_SPHINX_COMMAND_RUN}:12345` to view the output.
@@ -41,7 +41,7 @@ We also run tests in our python docstrings. You can run them with:

```sh
cd docs/
uv run --extra docs sphinx-build -b doctest . _build/doctest
uv run --group docs sphinx-build -b doctest . _build/doctest
```

## Writing Tests in Python Docstrings
33 changes: 33 additions & 0 deletions docs/guides/eval.md
@@ -0,0 +1,33 @@
# Evaluation

## Start Evaluation

### Start Script
```sh
# To run the evaluation with default config (examples/configs/eval.yaml)
uv run python examples/run_eval.py

# Specify a custom config file
uv run python examples/run_eval.py --config path/to/custom_config.yaml

# Override specific config values via command line
uv run python examples/run_eval.py generation.model_name="Qwen/Qwen2.5-Math-7B-Instruct"
```

### Example Output

```
============================================================
model_name='Qwen2.5-Math-1.5B-Instruct' dataset_name='aime_2024'
score=0.10 (3.0/30)
============================================================
```

## Configuration

An example evaluation configuration file can be found [here](../../examples/configs/eval.yaml).

### Prompt Template Configuration
Always remember to use the same `prompt_file` and `system_prompt_file` that were used during training.

For open-source models, we recommend setting `prompt_file=null` and `system_prompt_file=null` to allow them to use their native chat templates.
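
For example, here is a minimal override sketch; the `prompt_file` and `system_prompt_file` key names come from the guidance above, and their exact placement in the config may differ:

```sh
# Hypothetical invocation: let an open-source model use its native chat
# template by disabling the training-time prompt templates.
uv run python examples/run_eval.py \
    generation.model_name="Qwen/Qwen2.5-Math-7B-Instruct" \
    prompt_file=null \
    system_prompt_file=null
```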
1 change: 1 addition & 0 deletions docs/index.md
@@ -18,6 +18,7 @@ cluster.md
adding_new_models.md
guides/sft.md
guides/grpo.md
guides/eval.md
```

```{toctree}
65 changes: 65 additions & 0 deletions docs/testing.md
@@ -32,6 +32,71 @@ CONTAINER=... bash tests/run_unit_in_docker.sh

The required `CONTAINER` can be built by following the instructions in the [docker documentation](docker.md).

### Tracking metrics in unit tests

Unit tests may also log metrics to a fixture. The fixture is called `tracker` and has the following API:
```python
# Track an arbitrary metric (must be json serializable)
tracker.track(metric_name, metric_value)
# Log the maximum memory across the entire cluster. Okay for tests since they are run serially.
tracker.log_max_mem(metric_name)
# Returns the maximum memory. Useful if you are measuring changes in memory.
tracker.get_max_mem()
```

Including the `tracker` fixture also tracks the elapsed time for the test implicitly.

Here is an example test:
```python
def test_exponentiate(tracker):
    starting_mem = tracker.get_max_mem()
    base = 2
    exponent = 4
    result = base ** exponent
    tracker.track("result", result)
    tracker.log_max_mem("memory_after_exponentiating")
    change_in_mem = tracker.get_max_mem() - starting_mem
    tracker.track("change_in_mem", change_in_mem)
    assert result == 16
```

Running a tracked test produces a file like the following in `tests/unit/unit_results.json`:
```json
{
  "exit_status": 0,
  "git_commit": "f1062bd3fd95fc64443e2d9ee4a35fc654ba897e",
  "start_time": "2025-03-24 23:34:12",
  "metrics": {
    "test_hf_ray_policy::test_hf_policy_generation": {
      "avg_prob_mult_error": 1.0000039339065552,
      "mean_lps": -1.5399343967437744,
      "_elapsed": 17.323044061660767
    }
  },
  "gpu_types": [
    "NVIDIA H100 80GB HBM3"
  ],
  "coverage": 24.55897613282601
}
```

:::{tip}
Past unit test results are logged in `tests/unit/unit_results/`. These are helpful for viewing trends across time and commits.

Here's an example `jq` command to view trends:

```sh
jq -r '[.start_time, .git_commit, .metrics["test_hf_ray_policy::test_hf_policy_generation"].avg_prob_mult_error] | @tsv' tests/unit/unit_results/*

# Example output:
#2025-03-24 23:35:39 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552
#2025-03-24 23:36:37 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552
#2025-03-24 23:37:37 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552
#2025-03-24 23:38:14 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552
#2025-03-24 23:38:50 778d288bb5d2edfd3eec4d07bb7dffffad5ef21b 1.0000039339065552
```
:::

## Functional tests

:::{important}