NVIDIA-NeMo · terrykong · Apr 29, 2025 · Apr 16, 2025 · Apr 24, 2025 · Apr 24, 2025
@@ -150,7 +150,7 @@ jobs:
     if: ${{ needs.pre-flight.outputs.test_level != 'none' }}
     with:
       RUNNER: self-hosted-azure
-      TIMEOUT: 60
+      TIMEOUT: 75
       UNIT_TEST_SCRIPT: |
         cd /opt/nemo-rl
         if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then
@@ -168,10 +168,10 @@ jobs:
       FUNCTIONAL_TEST_SCRIPT: |
         cd /opt/nemo-rl
         if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L1|L2)$ ]]; then
-          uv run --no-sync bash ./tests/functional/sft.sh
-          uv run --no-sync bash ./tests/functional/grpo.sh
-          uv run --no-sync bash ./tests/functional/grpo_multiturn.sh
-          uv run --no-sync bash ./tests/functional/dpo.sh
+          time uv run --no-sync bash ./tests/functional/sft.sh
+          time uv run --no-sync bash ./tests/functional/grpo.sh
+          time uv run --no-sync bash ./tests/functional/grpo_multiturn.sh
+          time uv run --no-sync bash ./tests/functional/dpo.sh
         else
           echo Skipping functional tests for level ${{ needs.pre-flight.outputs.test_level }}
         fi

@@ -15,6 +15,8 @@ apidocs/
 dist/
 *.egg-info/
 *.vscode/
+release_run*
+ckpts/
 
 # Test
 coverage.json

@@ -17,7 +17,7 @@ RUN chmod 755 /home/ray/.cache
 
 FROM base AS hermetic
 
-WORKDIR /opt/reinforcer
+WORKDIR /opt/nemo-rl
 
 # First copy only the dependency files
 COPY --chown=ray --chmod=755 pyproject.toml uv.lock ./

@@ -0,0 +1,109 @@
+grpo:  # GRPO loop settings for the Llama-3.1-8B-Instruct run configured in this file
+  num_prompts_per_step: 64
+  num_generations_per_prompt: 32  # 64 x 32 = 2048, i.e. 4 x policy.train_global_batch_size (512)
+  max_rollout_turns: 1
+  max_num_steps: 500
+  normalize_rewards: true
+  use_leave_one_out_baseline: true
+  val_period: 10  # same interval as checkpointing.save_period
+  val_at_start: false
+  max_val_samples: 256
+  val_batch_size: 256  # equal to max_val_samples
+loss_fn:  # loss hyperparameters
+  reference_policy_kl_penalty: 0.01
+  ratio_eps_min: 0.2
+  ratio_eps_max: 0.2  # same value as ratio_eps_min; presumably symmetric ratio bounds (confirm in the loss code)
+  use_on_policy_kl_approximation: false
+  use_importance_sampling_correction: false
+checkpointing:
+  enabled: true
+  checkpoint_dir: results/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long  # same run tag as logger.log_dir and logger.wandb.name
+  metric_name: val_reward
+  higher_is_better: true  # presumably how metric_name is ranked for keep_top_k (TODO confirm)
+  keep_top_k: 3
+  save_period: 10  # same interval as grpo.val_period
+policy:  # model, parallelism, optimizer, LR schedule and generation settings
+  model_name: meta-llama/Llama-3.1-8B-Instruct
+  tokenizer:
+    name: meta-llama/Llama-3.1-8B-Instruct  # same identifier as model_name
+  train_global_batch_size: 512
+  train_micro_batch_size: 1
+  generation_batch_size: 32
+  logprob_batch_size: 2
+  max_total_sequence_length: 4096  # same value as generation.max_new_tokens, vllm_cfg.max_model_len and data.max_input_seq_length
+  precision: bfloat16
+  fsdp_offload_enabled: false
+  activation_checkpointing_enabled: false
+  refit_buffer_size_gb: 4
+  dtensor_cfg:
+    enabled: true
+    cpu_offload: false
+    sequence_parallel: false
+    activation_checkpointing: false
+    tensor_parallel_size: 1  # matches "tp1" in the run tag
+  make_sequence_length_divisible_by: 1
+  max_grad_norm: 1
+  optimizer:
+    name: torch.optim.AdamW
+    kwargs:
+      lr: 3.0e-07  # decimal point added so YAML 1.1 loaders (e.g. plain PyYAML) resolve a float, not the string "3e-07"
+      weight_decay: 0.01
+      betas:
+        - 0.9
+        - 0.999
+      eps: 1.0e-08  # decimal point added for the same reason as lr
+      foreach: false
+      fused: false
+  scheduler:
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1
+        total_iters: 50  # same value as the milestone below
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1
+        total_iters: 10000000000
+    - milestones:  # NOTE(review): entry has no `name`; presumably the step at which the consumer switches schedulers (confirm)
+        - 50
+  generation:
+    backend: vllm
+    max_new_tokens: 4096
+    temperature: 1
+    top_p: 1
+    top_k: null
+    stop_token_ids:
+      - 128009  # same id as pad_token_id below
+    stop_strings: null
+    vllm_cfg:
+      tensor_parallel_size: 1
+      gpu_memory_utilization: 0.6
+      max_model_len: 4096
+      load_format: dummy
+      skip_tokenizer_init: true
+    pad_token_id: 128009
+    model_name: meta-llama/Llama-3.1-8B-Instruct  # repeats policy.model_name
+data:
+  max_input_seq_length: 4096  # same value as policy.max_total_sequence_length
+  prompt_file: examples/prompts/cot.txt
+  system_prompt_file: null  # explicit null: no system prompt file is configured
+  dataset_name: OpenMathInstruct-2
+env:
+  math:  # the only environment configured in this file
+    num_workers: 8
+logger:
+  log_dir: logs/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long  # same run tag as checkpointing.checkpoint_dir
+  num_val_samples_to_print: 0
+  wandb_enabled: true  # settings live under `wandb:` below
+  tensorboard_enabled: true  # `tensorboard:` below is an empty mapping
+  monitor_gpus: true  # settings live under `gpu_monitoring:` below
+  wandb:
+    project: nemo-rl
+    name: grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long
+  tensorboard: {}
+  gpu_monitoring:
+    collection_interval: 10  # NOTE(review): unit is not stated here; presumably seconds (confirm)
+    flush_interval: 10
+cluster:
+  gpus_per_node: 8
+  num_nodes: 4  # 4 nodes x 8 GPUs = 32 GPUs; matches "4n8g" in the run tag
@@ -0,0 +1,109 @@
+grpo:  # GRPO loop settings for the Llama-3.2-1B-Instruct run configured in this file
+  num_prompts_per_step: 32
+  num_generations_per_prompt: 16  # 32 x 16 = 512, equal to policy.train_global_batch_size
+  max_rollout_turns: 1
+  max_num_steps: 500
+  normalize_rewards: true
+  use_leave_one_out_baseline: true
+  val_period: 10  # same interval as checkpointing.save_period
+  val_at_start: false
+  max_val_samples: 256
+  val_batch_size: 256  # equal to max_val_samples
+loss_fn:  # loss hyperparameters
+  reference_policy_kl_penalty: 0.01
+  ratio_eps_min: 0.2
+  ratio_eps_max: 0.2  # same value as ratio_eps_min; presumably symmetric ratio bounds (confirm in the loss code)
+  use_on_policy_kl_approximation: false
+  use_importance_sampling_correction: false
+checkpointing:
+  enabled: true
+  checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1  # same run tag as logger.log_dir and logger.wandb.name
+  metric_name: val_reward
+  higher_is_better: true  # presumably how metric_name is ranked for keep_top_k (TODO confirm)
+  keep_top_k: 3
+  save_period: 10  # same interval as grpo.val_period
+policy:  # model, parallelism, optimizer, LR schedule and generation settings
+  model_name: meta-llama/Llama-3.2-1B-Instruct
+  tokenizer:
+    name: meta-llama/Llama-3.2-1B-Instruct  # same identifier as model_name
+  train_global_batch_size: 512
+  train_micro_batch_size: 4
+  generation_batch_size: 32
+  logprob_batch_size: 4
+  max_total_sequence_length: 512  # same value as generation.max_new_tokens, vllm_cfg.max_model_len and data.max_input_seq_length
+  precision: bfloat16
+  fsdp_offload_enabled: false
+  activation_checkpointing_enabled: false
+  refit_buffer_size_gb: 4
+  dtensor_cfg:
+    enabled: true
+    cpu_offload: false
+    sequence_parallel: false
+    activation_checkpointing: false
+    tensor_parallel_size: 1  # matches "tp1" in the run tag
+  make_sequence_length_divisible_by: 1
+  max_grad_norm: 1
+  optimizer:
+    name: torch.optim.AdamW
+    kwargs:
+      lr: 5.0e-06  # decimal point added so YAML 1.1 loaders (e.g. plain PyYAML) resolve a float, not the string "5e-06"
+      weight_decay: 0.01
+      betas:
+        - 0.9
+        - 0.999
+      eps: 1.0e-08  # decimal point added for the same reason as lr
+      foreach: false
+      fused: false
+  scheduler:
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1
+        total_iters: 50  # same value as the milestone below
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1
+        total_iters: 10000000000
+    - milestones:  # NOTE(review): entry has no `name`; presumably the step at which the consumer switches schedulers (confirm)
+        - 50
+  generation:
+    backend: vllm
+    max_new_tokens: 512
+    temperature: 1
+    top_p: 1
+    top_k: null
+    stop_token_ids:
+      - 128009  # same id as pad_token_id below
+    stop_strings: null
+    vllm_cfg:
+      tensor_parallel_size: 1
+      gpu_memory_utilization: 0.6
+      max_model_len: 512
+      load_format: dummy
+      skip_tokenizer_init: true
+    pad_token_id: 128009
+    model_name: meta-llama/Llama-3.2-1B-Instruct  # repeats policy.model_name
+data:
+  max_input_seq_length: 512  # same value as policy.max_total_sequence_length
+  prompt_file: examples/prompts/cot.txt
+  system_prompt_file: null  # explicit null: no system prompt file is configured
+  dataset_name: OpenMathInstruct-2
+env:
+  math:  # the only environment configured in this file
+    num_workers: 8
+logger:
+  log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1  # same run tag as checkpointing.checkpoint_dir
+  num_val_samples_to_print: 0
+  wandb_enabled: true  # settings live under `wandb:` below
+  tensorboard_enabled: true  # `tensorboard:` below is an empty mapping
+  monitor_gpus: true  # settings live under `gpu_monitoring:` below
+  wandb:
+    project: nemo-rl
+    name: grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1
+  tensorboard: {}
+  gpu_monitoring:
+    collection_interval: 10  # NOTE(review): unit is not stated here; presumably seconds (confirm)
+    flush_interval: 10
+cluster:
+  gpus_per_node: 8
+  num_nodes: 1  # 1 node x 8 GPUs; matches "1n8g" in the run tag
@@ -0,0 +1,109 @@
+grpo:  # GRPO loop settings for the Qwen2.5-32B run configured in this file
+  num_prompts_per_step: 64
+  num_generations_per_prompt: 32  # 64 x 32 = 2048, i.e. 4 x policy.train_global_batch_size (512)
+  max_rollout_turns: 1
+  max_num_steps: 20
+  normalize_rewards: true
+  use_leave_one_out_baseline: true
+  val_period: 10  # same interval as checkpointing.save_period
+  val_at_start: false
+  max_val_samples: 256
+  val_batch_size: 256  # equal to max_val_samples
+loss_fn:  # loss hyperparameters
+  reference_policy_kl_penalty: 0.01
+  ratio_eps_min: 0.2
+  ratio_eps_max: 0.2  # same value as ratio_eps_min; presumably symmetric ratio bounds (confirm in the loss code)
+  use_on_policy_kl_approximation: false
+  use_importance_sampling_correction: false
+checkpointing:
+  enabled: true
+  checkpoint_dir: results/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long  # same run tag as logger.log_dir and logger.wandb.name
+  metric_name: val_reward
+  higher_is_better: true  # presumably how metric_name is ranked for keep_top_k (TODO confirm)
+  keep_top_k: 3
+  save_period: 10  # same interval as grpo.val_period
+policy:  # model, parallelism, optimizer, LR schedule and generation settings
+  model_name: Qwen/Qwen2.5-32B
+  tokenizer:
+    name: Qwen/Qwen2.5-32B  # same identifier as model_name
+  train_global_batch_size: 512
+  train_micro_batch_size: 1
+  generation_batch_size: 32
+  logprob_batch_size: 2
+  max_total_sequence_length: 16384  # same value as generation.max_new_tokens, vllm_cfg.max_model_len and data.max_input_seq_length
+  precision: bfloat16
+  fsdp_offload_enabled: false
+  activation_checkpointing_enabled: false  # NOTE(review): differs from dtensor_cfg.activation_checkpointing (true); confirm which flag the dtensor path reads
+  refit_buffer_size_gb: 4
+  dtensor_cfg:
+    enabled: true
+    cpu_offload: false
+    sequence_parallel: true  # matches "sp" in the run tag
+    activation_checkpointing: true  # matches "actckpt" in the run tag
+    tensor_parallel_size: 8  # matches "tp8" in the run tag
+  make_sequence_length_divisible_by: 8  # same value as dtensor_cfg.tensor_parallel_size
+  max_grad_norm: 1
+  optimizer:
+    name: torch.optim.AdamW
+    kwargs:
+      lr: 3.0e-07  # decimal point added so YAML 1.1 loaders (e.g. plain PyYAML) resolve a float, not the string "3e-07"
+      weight_decay: 0.01
+      betas:
+        - 0.9
+        - 0.999
+      eps: 1.0e-08  # decimal point added for the same reason as lr
+      foreach: false
+      fused: false
+  scheduler:
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1
+        total_iters: 50  # same value as the milestone below
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1
+        total_iters: 10000000000
+    - milestones:  # NOTE(review): entry has no `name`; presumably the step at which the consumer switches schedulers (confirm)
+        - 50
+  generation:
+    backend: vllm
+    max_new_tokens: 16384
+    temperature: 1
+    top_p: 1
+    top_k: null
+    stop_token_ids:
+      - 151643  # same id as pad_token_id below
+    stop_strings: null
+    vllm_cfg:
+      tensor_parallel_size: 4  # generation uses 4 here while dtensor_cfg.tensor_parallel_size is 8
+      gpu_memory_utilization: 0.6
+      max_model_len: 16384
+      load_format: dummy
+      skip_tokenizer_init: true
+    pad_token_id: 151643
+    model_name: Qwen/Qwen2.5-32B  # repeats policy.model_name
+data:
+  max_input_seq_length: 16384  # same value as policy.max_total_sequence_length
+  prompt_file: examples/prompts/cot.txt
+  system_prompt_file: null  # explicit null: no system prompt file is configured
+  dataset_name: OpenMathInstruct-2
+env:
+  math:  # the only environment configured in this file
+    num_workers: 8
+logger:
+  log_dir: logs/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long  # same run tag as checkpointing.checkpoint_dir
+  num_val_samples_to_print: 0
+  wandb_enabled: true  # settings live under `wandb:` below
+  tensorboard_enabled: true  # `tensorboard:` below is an empty mapping
+  monitor_gpus: true  # settings live under `gpu_monitoring:` below
+  wandb:
+    project: nemo-rl
+    name: grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long
+  tensorboard: {}
+  gpu_monitoring:
+    collection_interval: 10  # NOTE(review): unit is not stated here; presumably seconds (confirm)
+    flush_interval: 10
+cluster:
+  gpus_per_node: 8
+  num_nodes: 16  # 16 nodes x 8 GPUs = 128 GPUs; matches "16n8g" in the run tag