From 2636f7ab6afd63c907e9edca528ea44933224b5d Mon Sep 17 00:00:00 2001
From: ashors1
Date: Fri, 21 Mar 2025 16:06:58 -0700
Subject: [PATCH 1/6] add sft quickstart

Signed-off-by: ashors1
---
 README.md | 59 +++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 46 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 447c20c668..d5df79410c 100644
--- a/README.md
+++ b/README.md
@@ -54,30 +54,30 @@ uv pip install -e '.[dev,test]'
 
 **Reminder**: Don't forget to set your HF_HOME and WANDB_API_KEY (if needed). You'll need to do a `huggingface-cli login` as well for Llama models.
 
-### GRPO
+### SFT
 
-We have a reference GRPO experiment config set up trained for math benchmarks using the [OpenInstructMath2](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2) dataset.
+We provide a sample SFT experiment that uses the [SQuAD dataset](https://rajpurkar.github.io/SQuAD-explorer/).
 
 #### Single GPU
 
-To run GRPO on a single GPU for `Llama-3.2-1B-Instruct`:
+The experiment is set up to run on 8 GPUs. If using a machine that has access to 8 GPUs, you can launch the experiment as follows:
 
 ```sh
-# Run the GRPO math example using a 1B parameter model
-uv run python examples/run_grpo_math.py
+uv run python examples/run_sft.py
 ```
 
-By default, this uses the configuration in `examples/configs/grpo_math_1B.yaml`. You can customize parameters with command-line overrides:
+This trains `Llama3.1-8B` on 8 GPUs. To run on a single GPU, we'll have to override a few of the experiment settings. We replace the 8B model with a smaller 1B model
+ and update the cluster configuration to use a single gpu:
+TODO: do we need to decrease the batch size?
 
 ```sh
-uv run python examples/run_grpo_math.py \
-  policy.model_name="Qwen/Qwen2-1.5B" \
-  checkpointing.checkpoint_dir="results/qwen1_5b_math" \
-  logger.wandb_enabled=True \
-  logger.wandb.name="grpo-qwen1_5b_math" \
-  logger.num_val_samples_to_print=10
+uv run python examples/run_sft.py \
+  policy.model_name="meta-llama/Llama-3.2-1B" \
+  cluster.gpus_per_node=1
 ```
 
+Refer to [sft.yaml](examples/configs/sft.yaml) for a full list of parameters that can be overridden.
+
 #### Multi-node
 
 For distributed training across multiple nodes:
@@ -94,7 +94,7 @@ NUM_ACTOR_NODES=2
 TIMESTAMP=$(date +%Y%m%d_%H%M%S)
 
 # grpo_math_8b uses Llama-3.1-8B-Instruct model
-COMMAND="uv pip install -e .; uv run ./examples/run_grpo_math.py --config examples/configs/grpo_math_8B.yaml cluster.num_nodes=2 checkpointing.checkpoint_dir='results/llama8b_2nodes' policy.train_global_batch_size=64 logger.wandb_enabled=True logger.wandb.name='grpo-llama8b_math'" \
+COMMAND="bash -c \"uv pip install -e .; uv run ./examples/run_sft.py --config examples/configs/sft.yaml cluster.num_nodes=2 cluster.gpus_per_node=8 checkpointing.checkpoint_dir='results/sft_llama8b_2nodes' logger.wandb_enabled=True logger.wandb.name='sft-llama8b'\"" \
 RAY_DEDUP_LOGS=0 \
 UV_CACHE_DIR=YOUR_UV_CACHE_DIR \
 CONTAINER=YOUR_CONTAINER \
@@ -109,6 +109,39 @@ sbatch \
   ray.sub
 ```
 
+### GRPO
+
+We have a reference GRPO experiment config set up trained for math benchmarks using the [OpenInstructMath2](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2) dataset.
+
+#### Single GPU
+
+To run GRPO on a single GPU for `Llama-3.2-1B-Instruct`:
+
+```sh
+# Run the GRPO math example using a 1B parameter model
+uv run python examples/run_grpo_math.py
+```
+
+By default, this uses the configuration in `examples/configs/grpo_math_1B.yaml`. You can customize parameters with command-line overrides:
+
+```sh
+uv run python examples/run_grpo_math.py \
+  policy.model_name="Qwen/Qwen2-1.5B" \
+  checkpointing.checkpoint_dir="results/qwen1_5b_math" \
+  logger.wandb_enabled=True \
+  logger.wandb.name="grpo-qwen1_5b_math" \
+  logger.num_val_samples_to_print=10
+```
+
+#### Multi-node
+
+For the general multi-node setup, refer to the [SFT multi-node](#multi-node) documentation. The only thing that differs from SFT is the `COMMAND`, which is replaced with
+
+```sh
+# grpo_math_8b uses Llama-3.1-8B-Instruct model
+COMMAND="uv pip install -e .; uv run ./examples/run_grpo_math.py --config examples/configs/grpo_math_8B.yaml cluster.num_nodes=2 checkpointing.checkpoint_dir='results/llama8b_2nodes' policy.train_global_batch_size=64 logger.wandb_enabled=True logger.wandb.name='grpo-llama8b_math'" \
+```
+
 ## Cluster Start
 
 Please visit [Cluster Start](docs/cluster.md) for how to get started on Slurm or Kubernetes.

From e3d84903a16e4161f9a45dcdde384ffce1c5e53e Mon Sep 17 00:00:00 2001
From: ashors1
Date: Fri, 21 Mar 2025 16:12:56 -0700
Subject: [PATCH 2/6] fix section title, scale down gbs

Signed-off-by: ashors1
---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index d5df79410c..dc7625419b 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ uv pip install -e '.[dev,test]'
 
 We provide a sample SFT experiment that uses the [SQuAD dataset](https://rajpurkar.github.io/SQuAD-explorer/).
 
-#### Single GPU
+#### Single Node
 
 The experiment is set up to run on 8 GPUs. If using a machine that has access to 8 GPUs, you can launch the experiment as follows:
@@ -66,13 +66,13 @@ The experiment is set up to run on 8 GPUs. If using a machine that has access to
 uv run python examples/run_sft.py
 ```
 
-This trains `Llama3.1-8B` on 8 GPUs. To run on a single GPU, we'll have to override a few of the experiment settings. We replace the 8B model with a smaller 1B model
- and update the cluster configuration to use a single gpu:
-TODO: do we need to decrease the batch size?
+This trains `Llama3.1-8B` on 8 GPUs. To run on a single GPU, we'll have to override a few of the experiment settings. We replace the 8B model with a smaller 1B model, decrease the batch size, and update the cluster configuration to use a single gpu:
 
 ```sh
 uv run python examples/run_sft.py \
   policy.model_name="meta-llama/Llama-3.2-1B" \
+  policy.train_global_batch_size=16 \
+  sft.val_global_batch_size=16 \
   cluster.gpus_per_node=1
 ```

From 01bb52d32656b1b70af32c7a3bbeed6e44b9c98d Mon Sep 17 00:00:00 2001
From: ashors1
Date: Fri, 21 Mar 2025 16:20:38 -0700
Subject: [PATCH 3/6] updates

Signed-off-by: ashors1
---
 README.md | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index dc7625419b..41fc52bfb3 100644
--- a/README.md
+++ b/README.md
@@ -113,7 +113,7 @@ sbatch \
 
 We have a reference GRPO experiment config set up trained for math benchmarks using the [OpenInstructMath2](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2) dataset.
 
-#### Single GPU
+#### Single Node
 
 To run GRPO on a single GPU for `Llama-3.2-1B-Instruct`:
 
@@ -122,7 +122,15 @@ To run GRPO on a single GPU for `Llama-3.2-1B-Instruct`:
 uv run python examples/run_grpo_math.py
 ```
 
-By default, this uses the configuration in `examples/configs/grpo_math_1B.yaml`. You can customize parameters with command-line overrides:
+By default, this uses the configuration in `examples/configs/grpo_math_1B.yaml`. You can customize parameters with command-line overrides. For example, to run on 8 gpus,
+
+```sh
+# Run the GRPO math example using a 1B parameter model
+uv run python examples/run_grpo_math.py \
+  cluster.gpus_per_node=8
+```
+
+You can override any of the parameters listed in the yaml configuration file. For example,
 
 ```sh
 uv run python examples/run_grpo_math.py \
   policy.model_name="Qwen/Qwen2-1.5B" \
   checkpointing.checkpoint_dir="results/qwen1_5b_math" \
   logger.wandb_enabled=True \
   logger.wandb.name="grpo-qwen1_5b_math" \
-  logger.num_val_samples_to_print=10
+  logger.num_val_samples_to_print=10 \
 ```
 
 #### Multi-node

From b40cce734f9de29d45d9162932923395355284d7 Mon Sep 17 00:00:00 2001
From: ashors1
Date: Fri, 21 Mar 2025 16:26:21 -0700
Subject: [PATCH 4/6] add full grpo command

Signed-off-by: ashors1
---
 README.md | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 41fc52bfb3..2289428b90 100644
--- a/README.md
+++ b/README.md
@@ -143,11 +143,28 @@ uv run python examples/run_grpo_math.py \
 
 #### Multi-node
 
-For the general multi-node setup, refer to the [SFT multi-node](#multi-node) documentation. The only thing that differs from SFT is the `COMMAND`, which is replaced with
+For the general multi-node setup, refer to the [SFT multi-node](#multi-node) documentation. The only thing that differs from SFT is the `COMMAND`:
 
 ```sh
+# Run from the root of NeMo-Reinforcer repo
+NUM_ACTOR_NODES=2
+# Add a timestamp to make each job name unique
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+
 # grpo_math_8b uses Llama-3.1-8B-Instruct model
 COMMAND="uv pip install -e .; uv run ./examples/run_grpo_math.py --config examples/configs/grpo_math_8B.yaml cluster.num_nodes=2 checkpointing.checkpoint_dir='results/llama8b_2nodes' policy.train_global_batch_size=64 logger.wandb_enabled=True logger.wandb.name='grpo-llama8b_math'" \
+RAY_DEDUP_LOGS=0 \
+UV_CACHE_DIR=YOUR_UV_CACHE_DIR \
+CONTAINER=YOUR_CONTAINER \
+MOUNTS="$PWD:$PWD" \
+sbatch \
+  --nodes=${NUM_ACTOR_NODES} \
+  --account=YOUR_ACCOUNT \
+  --job-name=YOUR_JOBNAME \
+  --partition=YOUR_PARTITION \
+  --time=4:0:0 \
+  --gres=gpu:8 \
+  ray.sub
 ```
 
 ## Cluster Start

From 154672c6a88a1b2fc0a82fc01c1dd1ffcf6f08a3 Mon Sep 17 00:00:00 2001
From: ashors1
Date: Fri, 21 Mar 2025 16:27:50 -0700
Subject: [PATCH 5/6] address comments

Signed-off-by: ashors1
---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 2289428b90..1b06e101be 100644
--- a/README.md
+++ b/README.md
@@ -94,7 +94,7 @@ NUM_ACTOR_NODES=2
 TIMESTAMP=$(date +%Y%m%d_%H%M%S)
 
 # grpo_math_8b uses Llama-3.1-8B-Instruct model
-COMMAND="bash -c \"uv pip install -e .; uv run ./examples/run_sft.py --config examples/configs/sft.yaml cluster.num_nodes=2 cluster.gpus_per_node=8 checkpointing.checkpoint_dir='results/sft_llama8b_2nodes' logger.wandb_enabled=True logger.wandb.name='sft-llama8b'\"" \
+COMMAND="uv pip install -e .; uv run ./examples/run_sft.py --config examples/configs/sft.yaml cluster.num_nodes=2 cluster.gpus_per_node=8 checkpointing.checkpoint_dir='results/sft_llama8b_2nodes' logger.wandb_enabled=True logger.wandb.name='sft-llama8b'\"" \
 RAY_DEDUP_LOGS=0 \
 UV_CACHE_DIR=YOUR_UV_CACHE_DIR \
 CONTAINER=YOUR_CONTAINER \
@@ -125,7 +125,7 @@ uv run python examples/run_grpo_math.py
 By default, this uses the configuration in `examples/configs/grpo_math_1B.yaml`. You can customize parameters with command-line overrides. For example, to run on 8 gpus,
 
 ```sh
-# Run the GRPO math example using a 1B parameter model
+# Run the GRPO math example using a 1B parameter model using 8 GPUs
 uv run python examples/run_grpo_math.py \
   cluster.gpus_per_node=8
 ```

From d238a3392860bb03b147fd073e74d30a1e279da3 Mon Sep 17 00:00:00 2001
From: ashors1
Date: Fri, 21 Mar 2025 16:47:45 -0700
Subject: [PATCH 6/6] fix typo

Signed-off-by: ashors1
---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 1b06e101be..5561fbcdb1 100644
--- a/README.md
+++ b/README.md
@@ -93,8 +93,8 @@ NUM_ACTOR_NODES=2
 # Add a timestamp to make each job name unique
 TIMESTAMP=$(date +%Y%m%d_%H%M%S)
 
-# grpo_math_8b uses Llama-3.1-8B-Instruct model
-COMMAND="uv pip install -e .; uv run ./examples/run_sft.py --config examples/configs/sft.yaml cluster.num_nodes=2 cluster.gpus_per_node=8 checkpointing.checkpoint_dir='results/sft_llama8b_2nodes' logger.wandb_enabled=True logger.wandb.name='sft-llama8b'\"" \
+# SFT experiment uses Llama-3.1-8B model
+COMMAND="uv pip install -e .; uv run ./examples/run_sft.py --config examples/configs/sft.yaml cluster.num_nodes=2 cluster.gpus_per_node=8 checkpointing.checkpoint_dir='results/sft_llama8b_2nodes' logger.wandb_enabled=True logger.wandb.name='sft-llama8b'" \
 RAY_DEDUP_LOGS=0 \
 UV_CACHE_DIR=YOUR_UV_CACHE_DIR \
 CONTAINER=YOUR_CONTAINER \