From 2059fb00ae5c0257bfca6ec7ee9db79dfaede818 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 26 Mar 2025 11:31:46 -0700 Subject: [PATCH 1/3] update sft config to use single GPU Signed-off-by: ashors1 --- README.md | 14 ++++++++------ examples/configs/sft.yaml | 8 ++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index ded81200f7..2619956b0c 100644 --- a/README.md +++ b/README.md @@ -60,20 +60,22 @@ We provide a sample SFT experiment that uses the [SQuAD dataset](https://rajpurk #### Single Node -The experiment is set up to run on 8 GPUs. If using a machine that has access to 8 GPUs, you can launch the experiment as follows: +The default SFT experiment is configured to run on a single GPU. To launch the experiment, ```sh uv run python examples/run_sft.py ``` -This trains `Llama3.1-8B` on 8 GPUs. To run on a single GPU, we'll have to override a few of the experiment settings. We replace the 8B model with a smaller 1B model, decrease the batch size, and update the cluster configuration to use a single gpu: +This trains `Llama3.2-1B` on one GPU. + +If you have access to more GPUs, you can update the experiment accordingly. To run on 8 GPUs, we update the cluster configuration. We also switch to an 8B Llama base model and increase the bacth size: ```sh uv run python examples/run_sft.py \ - policy.model_name="meta-llama/Llama-3.2-1B" \ - policy.train_global_batch_size=16 \ - sft.val_global_batch_size=16 \ - cluster.gpus_per_node=1 + policy.model_name="meta-llama/Meta-Llama-3-8B" \ + policy.train_global_batch_size=128 \ + sft.val_global_batch_size=128 \ + cluster.gpus_per_node=8 ``` Refer to [sft.yaml](examples/configs/sft.yaml) for a full list of parameters that can be overridden. 
diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 1282285fc3..91e4b451bd 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -3,7 +3,7 @@ sft: max_num_steps: 1000 val_period: 10 val_batches: 8 - val_global_batch_size: 128 + val_global_batch_size: 32 val_micro_batch_size: 1 val_at_start: true seed: 42 @@ -17,8 +17,8 @@ checkpointing: save_period: 10 policy: - model_name: "meta-llama/Meta-Llama-3-8B" - train_global_batch_size: 128 + model_name: "meta-llama/Llama-3.2-1B" + train_global_batch_size: 32 train_micro_batch_size: 1 max_total_sequence_length: 2048 precision: "float32" @@ -57,5 +57,5 @@ logger: flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) cluster: - gpus_per_node: 8 + gpus_per_node: 1 num_nodes: 1 From ca23baf163dceb8f8b655fd64ae7bdec6c30cf62 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 26 Mar 2025 11:33:12 -0700 Subject: [PATCH 2/3] fix typo Signed-off-by: ashors1 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2619956b0c..e46e3718d8 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ uv run python examples/run_sft.py This trains `Llama3.2-1B` on one GPU. -If you have access to more GPUs, you can update the experiment accordingly. To run on 8 GPUs, we update the cluster configuration. We also switch to an 8B Llama base model and increase the bacth size: +If you have access to more GPUs, you can update the experiment accordingly. To run on 8 GPUs, we update the cluster configuration. 
We also switch to an 8B Llama base model and increase the batch size: ```sh uv run python examples/run_sft.py \ From dc4f0512325804cdab7ad4720791e69b4ea08856 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Wed, 26 Mar 2025 18:59:01 -0700 Subject: [PATCH 3/3] Different defaults for 1 GPU SFT Signed-off-by: Oleksii Kuchaiev --- README.md | 2 +- examples/configs/sft.yaml | 19 ++++++------------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index e46e3718d8..aeab19f33c 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ The default SFT experiment is configured to run on a single GPU. To launch the e uv run python examples/run_sft.py ``` -This trains `Llama3.2-1B` on one GPU. +This trains `Llama3.2-1B` on one GPU using the SQuAD dataset. If you have access to more GPUs, you can update the experiment accordingly. To run on 8 GPUs, we update the cluster configuration. We also switch to an 8B Llama base model and increase the batch size: diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 91e4b451bd..793ebb6285 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -1,6 +1,6 @@ # SFT Algorithm Configuration sft: - max_num_steps: 1000 + max_num_steps: 60 val_period: 10 val_batches: 8 val_global_batch_size: 32 @@ -20,7 +20,7 @@ policy: model_name: "meta-llama/Llama-3.2-1B" train_global_batch_size: 32 train_micro_batch_size: 1 - max_total_sequence_length: 2048 + max_total_sequence_length: 1024 precision: "float32" optimizer: @@ -30,13 +30,6 @@ policy: weight_decay: 0.1 betas: [0.9, 0.98] eps: 1e-5 - - scheduler: - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 0.0196078 - end_factor: 1.0 - total_iters: 50 data: max_input_seq_length: ${policy.max_total_sequence_length} @@ -44,14 +37,14 @@ data: logger: log_dir: "logs" # Base directory for all logs - wandb_enabled: false - tensorboard_enabled: false + wandb_enabled: true # Make sure you do ``wandb login [Your API key]'' 
before running + tensorboard_enabled: true monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard wandb: project: "sft-dev" - name: "sft-dev-logger" + name: "sft-dev-${data.dataset_name}" tensorboard: - log_dir: "tb_logs" + log_dir: "tb_logs-sft-dev-${data.dataset_name}" gpu_monitoring: collection_interval: 10 # How often to collect GPU usage metrics (in seconds) flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)