From 2059fb00ae5c0257bfca6ec7ee9db79dfaede818 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Wed, 26 Mar 2025 11:31:46 -0700
Subject: [PATCH 1/3] update sft config to use single GPU

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 README.md                 | 14 ++++++++------
 examples/configs/sft.yaml |  8 ++++----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index ded81200f7..2619956b0c 100644
--- a/README.md
+++ b/README.md
@@ -60,20 +60,22 @@ We provide a sample SFT experiment that uses the [SQuAD dataset](https://rajpurk
 
 #### Single Node
 
-The experiment is set up to run on 8 GPUs. If using a machine that has access to 8 GPUs, you can launch the experiment as follows:
+The default SFT experiment is configured to run on a single GPU. To launch the experiment,
 
 ```sh
 uv run python examples/run_sft.py
 ```
 
-This trains `Llama3.1-8B` on 8 GPUs. To run on a single GPU, we'll have to override a few of the experiment settings. We replace the 8B model with a smaller 1B model, decrease the batch size, and update the cluster configuration to use a single gpu:
+This trains `Llama3.2-1B` on one GPU.
+
+If you have access to more GPUs, you can update the experiment accordingly. To run on 8 GPUs, we update the cluster configuration. We also switch to an 8B Llama base model and increase the bacth size:
 
 ```sh
 uv run python examples/run_sft.py \
-  policy.model_name="meta-llama/Llama-3.2-1B" \
-  policy.train_global_batch_size=16 \
-  sft.val_global_batch_size=16 \
-  cluster.gpus_per_node=1
+  policy.model_name="meta-llama/Meta-Llama-3-8B" \
+  policy.train_global_batch_size=128 \
+  sft.val_global_batch_size=128 \
+  cluster.gpus_per_node=8
 ```
 
 Refer to [sft.yaml](examples/configs/sft.yaml) for a full list of parameters that can be overridden.
diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml
index 1282285fc3..91e4b451bd 100644
--- a/examples/configs/sft.yaml
+++ b/examples/configs/sft.yaml
@@ -3,7 +3,7 @@ sft:
   max_num_steps: 1000
   val_period: 10
   val_batches: 8
-  val_global_batch_size: 128
+  val_global_batch_size: 32
   val_micro_batch_size: 1
   val_at_start: true
   seed: 42
@@ -17,8 +17,8 @@ checkpointing:
   save_period: 10
 
 policy:
-  model_name: "meta-llama/Meta-Llama-3-8B"
-  train_global_batch_size: 128
+  model_name: "meta-llama/Llama-3.2-1B"
+  train_global_batch_size: 32
   train_micro_batch_size: 1
   max_total_sequence_length: 2048
   precision: "float32"
@@ -57,5 +57,5 @@ logger:
     flush_interval: 10  # How often to flush GPU usage metrics to the loggers (in seconds)
 
 cluster:
-  gpus_per_node: 8
+  gpus_per_node: 1
   num_nodes: 1

From ca23baf163dceb8f8b655fd64ae7bdec6c30cf62 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Wed, 26 Mar 2025 11:33:12 -0700
Subject: [PATCH 2/3] fix typo

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2619956b0c..e46e3718d8 100644
--- a/README.md
+++ b/README.md
@@ -68,7 +68,7 @@ uv run python examples/run_sft.py
 
 This trains `Llama3.2-1B` on one GPU.
 
-If you have access to more GPUs, you can update the experiment accordingly. To run on 8 GPUs, we update the cluster configuration. We also switch to an 8B Llama base model and increase the bacth size:
+If you have access to more GPUs, you can update the experiment accordingly. To run on 8 GPUs, we update the cluster configuration. We also switch to an 8B Llama base model and increase the batch size:
 
 ```sh
 uv run python examples/run_sft.py \

From dc4f0512325804cdab7ad4720791e69b4ea08856 Mon Sep 17 00:00:00 2001
From: Oleksii Kuchaiev <okuchaiev@nvidia.com>
Date: Wed, 26 Mar 2025 18:59:01 -0700
Subject: [PATCH 3/3] Different defaults for 1 GPU SFT

Signed-off-by: Oleksii Kuchaiev <okuchaiev@nvidia.com>
---
 README.md                 |  2 +-
 examples/configs/sft.yaml | 19 ++++++-------------
 2 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index e46e3718d8..aeab19f33c 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ The default SFT experiment is configured to run on a single GPU. To launch the e
 uv run python examples/run_sft.py
 ```
 
-This trains `Llama3.2-1B` on one GPU.
+This trains `Llama3.2-1B` on one GPU using the SQuAD dataset.
 
 If you have access to more GPUs, you can update the experiment accordingly. To run on 8 GPUs, we update the cluster configuration. We also switch to an 8B Llama base model and increase the batch size:
 
diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml
index 91e4b451bd..793ebb6285 100644
--- a/examples/configs/sft.yaml
+++ b/examples/configs/sft.yaml
@@ -1,6 +1,6 @@
 # SFT Algorithm Configuration
 sft:
-  max_num_steps: 1000
+  max_num_steps: 60
   val_period: 10
   val_batches: 8
   val_global_batch_size: 32
@@ -20,7 +20,7 @@ policy:
   model_name: "meta-llama/Llama-3.2-1B"
   train_global_batch_size: 32
   train_micro_batch_size: 1
-  max_total_sequence_length: 2048
+  max_total_sequence_length: 1024
   precision: "float32"
 
   optimizer:
@@ -30,13 +30,6 @@ policy:
       weight_decay: 0.1
       betas: [0.9, 0.98]
       eps: 1e-5
-
-  scheduler:
-    name: "torch.optim.lr_scheduler.LinearLR"
-    kwargs:
-      start_factor: 0.0196078
-      end_factor: 1.0
-      total_iters: 50
     
 data:
   max_input_seq_length: ${policy.max_total_sequence_length}
@@ -44,14 +37,14 @@ data:
 
 logger:
   log_dir: "logs"  # Base directory for all logs
-  wandb_enabled: false
-  tensorboard_enabled: false
+  wandb_enabled: true  # Run `wandb login [Your API key]` before launching
+  tensorboard_enabled: true
   monitor_gpus: false  # If true, will monitor GPU usage and log to wandb and/or tensorboard
   wandb:
     project: "sft-dev"
-    name: "sft-dev-logger"
+    name: "sft-dev-${data.dataset_name}"
   tensorboard:
-    log_dir: "tb_logs"
+    log_dir: "tb_logs-sft-dev-${data.dataset_name}"
   gpu_monitoring:
     collection_interval: 10  # How often to collect GPU usage metrics (in seconds)
     flush_interval: 10  # How often to flush GPU usage metrics to the loggers (in seconds)