Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ jobs:
if: ${{ needs.pre-flight.outputs.test_level != 'none' }}
with:
RUNNER: self-hosted-azure
TIMEOUT: 60
TIMEOUT: 75
UNIT_TEST_SCRIPT: |
cd /opt/nemo-rl
if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then
Expand All @@ -168,10 +168,10 @@ jobs:
FUNCTIONAL_TEST_SCRIPT: |
cd /opt/nemo-rl
if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L1|L2)$ ]]; then
uv run --no-sync bash ./tests/functional/sft.sh
uv run --no-sync bash ./tests/functional/grpo.sh
uv run --no-sync bash ./tests/functional/grpo_multiturn.sh
uv run --no-sync bash ./tests/functional/dpo.sh
time uv run --no-sync bash ./tests/functional/sft.sh
time uv run --no-sync bash ./tests/functional/grpo.sh
time uv run --no-sync bash ./tests/functional/grpo_multiturn.sh
time uv run --no-sync bash ./tests/functional/dpo.sh
else
echo Skipping functional tests for level ${{ needs.pre-flight.outputs.test_level }}
fi
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ apidocs/
dist/
*.egg-info/
*.vscode/
release_run*
ckpts/

# Test
coverage.json
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ RUN chmod 755 /home/ray/.cache

FROM base AS hermetic

WORKDIR /opt/reinforcer
WORKDIR /opt/nemo-rl

# First copy only the dependency files
COPY --chown=ray --chmod=755 pyproject.toml uv.lock ./
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# GRPO recipe for meta-llama/Llama-3.1-8B-Instruct on 4 nodes x 8 GPUs.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm against the consumer's schema.
grpo:
  num_prompts_per_step: 64
  # presumably 64 prompts x 32 generations = 2048 rollouts per step -- confirm
  num_generations_per_prompt: 32
  max_rollout_turns: 1
  max_num_steps: 500
  normalize_rewards: true
  use_leave_one_out_baseline: true
  val_period: 10
  val_at_start: false
  max_val_samples: 256
  val_batch_size: 256

loss_fn:
  reference_policy_kl_penalty: 0.01
  ratio_eps_min: 0.2
  ratio_eps_max: 0.2
  use_on_policy_kl_approximation: false
  use_importance_sampling_correction: false

checkpointing:
  enabled: true
  checkpoint_dir: results/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long
  # keep the 3 checkpoints with the highest val_reward
  metric_name: val_reward
  higher_is_better: true
  keep_top_k: 3
  save_period: 10

policy:
  model_name: meta-llama/Llama-3.1-8B-Instruct
  tokenizer:
    name: meta-llama/Llama-3.1-8B-Instruct
  train_global_batch_size: 512
  train_micro_batch_size: 1
  generation_batch_size: 32
  logprob_batch_size: 2
  max_total_sequence_length: 4096
  precision: bfloat16
  fsdp_offload_enabled: false
  activation_checkpointing_enabled: false
  refit_buffer_size_gb: 4
  dtensor_cfg:
    enabled: true
    cpu_offload: false
    sequence_parallel: false
    activation_checkpointing: false
    tensor_parallel_size: 1
  make_sequence_length_divisible_by: 1
  max_grad_norm: 1
  optimizer:
    name: torch.optim.AdamW
    kwargs:
      # Scientific-notation floats carry an explicit decimal point so that
      # YAML 1.1 loaders (e.g. PyYAML) resolve them as floats, not strings.
      lr: 3.0e-07
      weight_decay: 0.01
      betas:
        - 0.9
        - 0.999
      eps: 1.0e-08
      foreach: false
      fused: false
  scheduler:
    # linear warmup from 0.1x to 1x of lr over the first 50 iterations
    - name: torch.optim.lr_scheduler.LinearLR
      kwargs:
        start_factor: 0.1
        end_factor: 1
        total_iters: 50
    - name: torch.optim.lr_scheduler.ConstantLR
      kwargs:
        factor: 1
        total_iters: 10000000000
    # presumably the step at which the second scheduler takes over
    # (matches the warmup length above) -- verify against the consumer
    - milestones:
        - 50
  generation:
    backend: vllm
    max_new_tokens: 4096
    temperature: 1
    top_p: 1
    top_k: null
    stop_token_ids:
      - 128009
    stop_strings: null
    vllm_cfg:
      tensor_parallel_size: 1
      gpu_memory_utilization: 0.6
      max_model_len: 4096
      load_format: dummy
      skip_tokenizer_init: true
    # same id as the single stop token above
    pad_token_id: 128009
    model_name: meta-llama/Llama-3.1-8B-Instruct

data:
  max_input_seq_length: 4096
  prompt_file: examples/prompts/cot.txt
  system_prompt_file: null
  dataset_name: OpenMathInstruct-2

env:
  math:
    num_workers: 8

logger:
  log_dir: logs/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long
  num_val_samples_to_print: 0
  wandb_enabled: true
  tensorboard_enabled: true
  monitor_gpus: true
  wandb:
    project: nemo-rl
    name: grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long
  tensorboard: {}
  gpu_monitoring:
    collection_interval: 10
    flush_interval: 10

# 4 nodes x 8 GPUs = 32 GPUs
cluster:
  gpus_per_node: 8
  num_nodes: 4
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# GRPO recipe for meta-llama/Llama-3.2-1B-Instruct on 1 node x 8 GPUs.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm against the consumer's schema.
grpo:
  num_prompts_per_step: 32
  # presumably 32 prompts x 16 generations = 512 rollouts per step, which
  # equals policy.train_global_batch_size below -- confirm
  num_generations_per_prompt: 16
  max_rollout_turns: 1
  max_num_steps: 500
  normalize_rewards: true
  use_leave_one_out_baseline: true
  val_period: 10
  val_at_start: false
  max_val_samples: 256
  val_batch_size: 256

loss_fn:
  reference_policy_kl_penalty: 0.01
  ratio_eps_min: 0.2
  ratio_eps_max: 0.2
  use_on_policy_kl_approximation: false
  use_importance_sampling_correction: false

checkpointing:
  enabled: true
  checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1
  # keep the 3 checkpoints with the highest val_reward
  metric_name: val_reward
  higher_is_better: true
  keep_top_k: 3
  save_period: 10

policy:
  model_name: meta-llama/Llama-3.2-1B-Instruct
  tokenizer:
    name: meta-llama/Llama-3.2-1B-Instruct
  train_global_batch_size: 512
  train_micro_batch_size: 4
  generation_batch_size: 32
  logprob_batch_size: 4
  max_total_sequence_length: 512
  precision: bfloat16
  fsdp_offload_enabled: false
  activation_checkpointing_enabled: false
  refit_buffer_size_gb: 4
  dtensor_cfg:
    enabled: true
    cpu_offload: false
    sequence_parallel: false
    activation_checkpointing: false
    tensor_parallel_size: 1
  make_sequence_length_divisible_by: 1
  max_grad_norm: 1
  optimizer:
    name: torch.optim.AdamW
    kwargs:
      # Scientific-notation floats carry an explicit decimal point so that
      # YAML 1.1 loaders (e.g. PyYAML) resolve them as floats, not strings.
      lr: 5.0e-06
      weight_decay: 0.01
      betas:
        - 0.9
        - 0.999
      eps: 1.0e-08
      foreach: false
      fused: false
  scheduler:
    # linear warmup from 0.1x to 1x of lr over the first 50 iterations
    - name: torch.optim.lr_scheduler.LinearLR
      kwargs:
        start_factor: 0.1
        end_factor: 1
        total_iters: 50
    - name: torch.optim.lr_scheduler.ConstantLR
      kwargs:
        factor: 1
        total_iters: 10000000000
    # presumably the step at which the second scheduler takes over
    # (matches the warmup length above) -- verify against the consumer
    - milestones:
        - 50
  generation:
    backend: vllm
    max_new_tokens: 512
    temperature: 1
    top_p: 1
    top_k: null
    stop_token_ids:
      - 128009
    stop_strings: null
    vllm_cfg:
      tensor_parallel_size: 1
      gpu_memory_utilization: 0.6
      max_model_len: 512
      load_format: dummy
      skip_tokenizer_init: true
    # same id as the single stop token above
    pad_token_id: 128009
    model_name: meta-llama/Llama-3.2-1B-Instruct

data:
  max_input_seq_length: 512
  prompt_file: examples/prompts/cot.txt
  system_prompt_file: null
  dataset_name: OpenMathInstruct-2

env:
  math:
    num_workers: 8

logger:
  log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1
  num_val_samples_to_print: 0
  wandb_enabled: true
  tensorboard_enabled: true
  monitor_gpus: true
  wandb:
    project: nemo-rl
    name: grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1
  tensorboard: {}
  gpu_monitoring:
    collection_interval: 10
    flush_interval: 10

# single node with 8 GPUs
cluster:
  gpus_per_node: 8
  num_nodes: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# GRPO recipe for Qwen/Qwen2.5-32B on 16 nodes x 8 GPUs.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm against the consumer's schema.
grpo:
  num_prompts_per_step: 64
  # presumably 64 prompts x 32 generations = 2048 rollouts per step -- confirm
  num_generations_per_prompt: 32
  max_rollout_turns: 1
  # NOTE(review): 20 steps is shorter than the 50-iteration warmup and the
  # milestone at 50 in policy.scheduler, so the warmup would never finish.
  # Confirm this is intentional for a run named "-long".
  max_num_steps: 20
  normalize_rewards: true
  use_leave_one_out_baseline: true
  val_period: 10
  val_at_start: false
  max_val_samples: 256
  val_batch_size: 256

loss_fn:
  reference_policy_kl_penalty: 0.01
  ratio_eps_min: 0.2
  ratio_eps_max: 0.2
  use_on_policy_kl_approximation: false
  use_importance_sampling_correction: false

checkpointing:
  enabled: true
  checkpoint_dir: results/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long
  # keep the 3 checkpoints with the highest val_reward
  metric_name: val_reward
  higher_is_better: true
  keep_top_k: 3
  save_period: 10

policy:
  model_name: Qwen/Qwen2.5-32B
  tokenizer:
    name: Qwen/Qwen2.5-32B
  train_global_batch_size: 512
  train_micro_batch_size: 1
  generation_batch_size: 32
  logprob_batch_size: 2
  max_total_sequence_length: 16384
  precision: bfloat16
  fsdp_offload_enabled: false
  activation_checkpointing_enabled: false
  refit_buffer_size_gb: 4
  dtensor_cfg:
    enabled: true
    cpu_offload: false
    sequence_parallel: true
    activation_checkpointing: true
    # training uses tensor parallel size 8; generation.vllm_cfg below uses 4
    tensor_parallel_size: 8
  make_sequence_length_divisible_by: 8
  max_grad_norm: 1
  optimizer:
    name: torch.optim.AdamW
    kwargs:
      # Scientific-notation floats carry an explicit decimal point so that
      # YAML 1.1 loaders (e.g. PyYAML) resolve them as floats, not strings.
      lr: 3.0e-07
      weight_decay: 0.01
      betas:
        - 0.9
        - 0.999
      eps: 1.0e-08
      foreach: false
      fused: false
  scheduler:
    # linear warmup from 0.1x to 1x of lr over the first 50 iterations
    - name: torch.optim.lr_scheduler.LinearLR
      kwargs:
        start_factor: 0.1
        end_factor: 1
        total_iters: 50
    - name: torch.optim.lr_scheduler.ConstantLR
      kwargs:
        factor: 1
        total_iters: 10000000000
    # presumably the step at which the second scheduler takes over
    # (matches the warmup length above) -- verify against the consumer
    - milestones:
        - 50
  generation:
    backend: vllm
    max_new_tokens: 16384
    temperature: 1
    top_p: 1
    top_k: null
    stop_token_ids:
      - 151643
    stop_strings: null
    vllm_cfg:
      tensor_parallel_size: 4
      gpu_memory_utilization: 0.6
      max_model_len: 16384
      load_format: dummy
      skip_tokenizer_init: true
    # same id as the single stop token above
    pad_token_id: 151643
    model_name: Qwen/Qwen2.5-32B

data:
  max_input_seq_length: 16384
  prompt_file: examples/prompts/cot.txt
  system_prompt_file: null
  dataset_name: OpenMathInstruct-2

env:
  math:
    num_workers: 8

logger:
  log_dir: logs/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long
  num_val_samples_to_print: 0
  wandb_enabled: true
  tensorboard_enabled: true
  monitor_gpus: true
  wandb:
    project: nemo-rl
    name: grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long
  tensorboard: {}
  gpu_monitoring:
    collection_interval: 10
    flush_interval: 10

# 16 nodes x 8 GPUs = 128 GPUs
cluster:
  gpus_per_node: 8
  num_nodes: 16
Loading