diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index e0f0a6532b..c38cc2dd87 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -150,7 +150,7 @@ jobs: if: ${{ needs.pre-flight.outputs.test_level != 'none' }} with: RUNNER: self-hosted-azure - TIMEOUT: 60 + TIMEOUT: 75 UNIT_TEST_SCRIPT: | cd /opt/nemo-rl if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then @@ -168,10 +168,10 @@ jobs: FUNCTIONAL_TEST_SCRIPT: | cd /opt/nemo-rl if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L1|L2)$ ]]; then - uv run --no-sync bash ./tests/functional/sft.sh - uv run --no-sync bash ./tests/functional/grpo.sh - uv run --no-sync bash ./tests/functional/grpo_multiturn.sh - uv run --no-sync bash ./tests/functional/dpo.sh + time uv run --no-sync bash ./tests/functional/sft.sh + time uv run --no-sync bash ./tests/functional/grpo.sh + time uv run --no-sync bash ./tests/functional/grpo_multiturn.sh + time uv run --no-sync bash ./tests/functional/dpo.sh else echo Skipping functional tests for level ${{ needs.pre-flight.outputs.test_level }} fi diff --git a/.gitignore b/.gitignore index 478990ddc8..12121a4155 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,8 @@ apidocs/ dist/ *.egg-info/ *.vscode/ +release_run* +ckpts/ # Test coverage.json diff --git a/docker/Dockerfile b/docker/Dockerfile index b1977a4ac9..2baf5d4ea3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -17,7 +17,7 @@ RUN chmod 755 /home/ray/.cache FROM base AS hermetic -WORKDIR /opt/reinforcer +WORKDIR /opt/nemo-rl # First copy only the dependency files COPY --chown=ray --chmod=755 pyproject.toml uv.lock ./ diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml new file mode 100644 index 0000000000..ba6ba255f3 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 500 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.1-8B-Instruct + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 4096 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + 
total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 4096 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 128009 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 4096 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 128009 + model_name: meta-llama/Llama-3.1-8B-Instruct +data: + max_input_seq_length: 4096 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml new file mode 100644 index 0000000000..96e8e023cb --- /dev/null +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 32 + num_generations_per_prompt: 16 + max_rollout_turns: 1 + max_num_steps: 500 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1 + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.2-1B-Instruct + tokenizer: + name: meta-llama/Llama-3.2-1B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 4 + generation_batch_size: 32 + logprob_batch_size: 4 + max_total_sequence_length: 512 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 512 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 128009 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 512 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 128009 + model_name: meta-llama/Llama-3.2-1B-Instruct +data: + max_input_seq_length: 512 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1 + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + 
project: nemo-rl + name: grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1 + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml new file mode 100644 index 0000000000..3693ac4677 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 20 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-32B + tokenizer: + name: Qwen/Qwen2.5-32B + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 16384 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: true + activation_checkpointing: true + tensor_parallel_size: 8 + make_sequence_length_divisible_by: 8 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 16384 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151643 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 4 + gpu_memory_utilization: 0.6 + max_model_len: 16384 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 151643 + model_name: Qwen/Qwen2.5-32B +data: + max_input_seq_length: 16384 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 16 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml new file mode 100644 index 0000000000..aed12183a8 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 2 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + 
val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-32B + tokenizer: + name: Qwen/Qwen2.5-32B + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 16384 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: true + activation_checkpointing: true + tensor_parallel_size: 8 + make_sequence_length_divisible_by: 8 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 16384 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151643 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 4 + gpu_memory_utilization: 0.6 + max_model_len: 16384 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 151643 + model_name: Qwen/Qwen2.5-32B +data: + max_input_seq_length: 16384 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 16 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml new file mode 100644 index 0000000000..27211ddc7e --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 30 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-qwen2.5-7b-instruct-4n8g-fsdp1 + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-7B-Instruct + tokenizer: + name: Qwen/Qwen2.5-7B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 4096 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: false + cpu_offload: false + sequence_parallel: false + 
activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 4096 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151645 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 4096 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 151643 + model_name: Qwen/Qwen2.5-7B-Instruct +data: + max_input_seq_length: 4096 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen2.5-7b-instruct-4n8g-fsdp1 + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-7b-instruct-4n8g-fsdp1 + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml new file mode 100644 index 0000000000..87e2c592c0 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 30 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-7B-Instruct + tokenizer: + name: Qwen/Qwen2.5-7B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 4096 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: true + activation_checkpointing: false + tensor_parallel_size: 4 + make_sequence_length_divisible_by: 4 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 4096 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151645 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 4 + gpu_memory_utilization: 0.6 + max_model_len: 4096 + load_format: dummy + skip_tokenizer_init: 
true + pad_token_id: 151643 + model_name: Qwen/Qwen2.5-7B-Instruct +data: + max_input_seq_length: 4096 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml new file mode 100644 index 0000000000..9f5762f173 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 32 + num_generations_per_prompt: 16 + max_rollout_turns: 1 + max_num_steps: 450 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1 + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-Math-1.5B-Instruct + tokenizer: + name: Qwen/Qwen2.5-Math-1.5B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 4 + generation_batch_size: 32 + logprob_batch_size: 4 + max_total_sequence_length: 512 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 512 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151645 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 512 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 151643 + model_name: Qwen/Qwen2.5-Math-1.5B-Instruct +data: + max_input_seq_length: 512 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1 + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1 + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.yaml 
b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.yaml new file mode 100644 index 0000000000..da0140a73e --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.yaml @@ -0,0 +1,67 @@ +sft: + max_num_epochs: 1 + max_num_steps: 250 + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp1 + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.1-8B-Instruct + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' + train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 1024 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: false + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + eps: 1e-05 + foreach: false + fused: false +data: + max_input_seq_length: 1024 + dataset_name: squad + add_bos: true + add_eos: true +logger: + log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp1 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.1-8b-instruct-1n8g-fsdp1 + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.yaml new file mode 100644 index 0000000000..288f365c1a --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.yaml @@ -0,0 +1,67 @@ +sft: + max_num_epochs: 1 + max_num_steps: 2730 + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.1-8B-Instruct + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' + train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 1024 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + 
max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + eps: 1e-05 + foreach: false + fused: false +data: + max_input_seq_length: 1024 + dataset_name: squad + add_bos: true + add_eos: true +logger: + log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.yaml new file mode 100644 index 0000000000..f065b5cd34 --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.yaml @@ -0,0 +1,67 @@ +sft: + max_num_epochs: 1 + max_num_steps: 350 + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.1-8B-Instruct + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' + train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 1024 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: true + activation_checkpointing: false + tensor_parallel_size: 2 + make_sequence_length_divisible_by: 2 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + eps: 1e-05 + foreach: false + fused: false +data: + max_input_seq_length: 1024 + dataset_name: squad + add_bos: true + add_eos: true +logger: + log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.yaml new file mode 100644 index 0000000000..7c4bd357ed --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.yaml @@ -0,0 +1,67 @@ +sft: + max_num_epochs: 1 + max_num_steps: 500 + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.2-1b-1n8g-fsdp2tp1 + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.2-1B + tokenizer: + name: meta-llama/Llama-3.2-1B + chat_template: '{% for message in messages %}{%- if 
message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' + train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 1024 + precision: float32 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + eps: 1e-05 + foreach: false + fused: false +data: + max_input_seq_length: 1024 + dataset_name: squad + add_bos: true + add_eos: true +logger: + log_dir: logs/sft-llama3.2-1b-1n8g-fsdp2tp1 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.2-1b-1n8g-fsdp2tp1 + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.yaml b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.yaml new file mode 100644 index 0000000000..4cd1a5387c --- /dev/null +++ b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.yaml @@ -0,0 +1,67 @@ +sft: + max_num_epochs: 1 + max_num_steps: 20 + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-32B + tokenizer: + name: Qwen/Qwen2.5-32B + chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' + train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 16000 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: true + activation_checkpointing: true + tensor_parallel_size: 8 + make_sequence_length_divisible_by: 8 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + eps: 1e-05 + foreach: false + fused: false +data: + max_input_seq_length: 16000 + dataset_name: squad + add_bos: true + add_eos: true +logger: + log_dir: logs/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/nemo_rl/__init__.py b/nemo_rl/__init__.py index 1606956b87..c755e5ed0f 100644 --- a/nemo_rl/__init__.py +++ b/nemo_rl/__init__.py @@ -1,3 +1,16 @@ +# Copyright (c) 2025, 
NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import os from nemo_rl.package_info import ( __contact_emails__, diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000000..4e51a6efad --- /dev/null +++ b/tests/README.md @@ -0,0 +1,20 @@ +# Tests + +## Launching Release Tests + +```sh +# Assuming you are in the NeMo RL project root + +cd tools/ + +IS_RELEASE=1 CONTAINER=... ACCOUNT=... PARTITION=... ./launch ... + +# DRYRUN=1 to get a rough estimate of compute +DRYRUN=1 IS_RELEASE=1 CONTAINER=... ACCOUNT=... PARTITION=... ./launch ... + +# DRYRUN=2 will create a code snapshot with a fully hermetic example +DRYRUN=2 IS_RELEASE=1 CONTAINER=... ACCOUNT=... PARTITION=... ./launch ... + +# Run all (Caution: this will use a lot of compute; consider listing out the jobs) +IS_RELEASE=1 CONTAINER=... ACCOUNT=... PARTITION=... ./launch ../../recipes/**/*.sh +``` diff --git a/tests/functional/check_metrics.py b/tests/check_metrics.py similarity index 100% rename from tests/functional/check_metrics.py rename to tests/check_metrics.py diff --git a/tests/functional/dpo.sh b/tests/functional/dpo.sh index 2421c5da6a..200a08cdd7 100755 --- a/tests/functional/dpo.sh +++ b/tests/functional/dpo.sh @@ -7,18 +7,19 @@ git config --global --add safe.directory $PROJECT_ROOT set -eou pipefail -LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs -JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json -RUN_LOG=$LOG_DIR/$(basename $0 .sh).log -export RAY_DEDUP_LOGS=0 +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache} export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} -rm -rf $LOG_DIR -mkdir -p $LOG_DIR +rm -rf $EXP_DIR $LOG_DIR +mkdir -p $EXP_DIR $LOG_DIR cd $PROJECT_ROOT -python -u $PROJECT_ROOT/examples/run_dpo.py \ +uv run $PROJECT_ROOT/examples/run_dpo.py \ cluster.gpus_per_node=2 \ dpo.max_num_steps=3 \ dpo.val_batches=1 \ @@ -31,9 +32,8 @@ python -u $PROJECT_ROOT/examples/run_dpo.py \ $@ \ 2>&1 | tee $RUN_LOG -cd $SCRIPT_DIR -python json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS -python check_metrics.py $JSON_METRICS \ +uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["2"] < 0.694' \ diff --git a/tests/functional/grpo.sh b/tests/functional/grpo.sh index b61442227b..bbbbd44a11 100755 --- a/tests/functional/grpo.sh +++ b/tests/functional/grpo.sh @@ -2,22 +2,24 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo +# Mark the current repo as safe, since wandb fetches metadata about the repo git config --global --add safe.directory $PROJECT_ROOT set -eou pipefail -LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs -JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json -RUN_LOG=$LOG_DIR/$(basename $0 .sh).log +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache} export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} -rm -rf $LOG_DIR -mkdir -p $LOG_DIR +rm -rf $EXP_DIR $LOG_DIR +mkdir -p $EXP_DIR $LOG_DIR cd $PROJECT_ROOT -python -u $PROJECT_ROOT/examples/run_grpo_math.py \ +uv run $PROJECT_ROOT/examples/run_grpo_math.py \ cluster.gpus_per_node=2 \ grpo.max_num_steps=3 \ logger.tensorboard_enabled=true \ @@ -27,9 +29,8 @@ python -u $PROJECT_ROOT/examples/run_grpo_math.py \ $@ \ 2>&1 | tee $RUN_LOG -cd $SCRIPT_DIR -python json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS -python check_metrics.py $JSON_METRICS \ +uv run tests/check_metrics.py $JSON_METRICS \ 'max(data["train/token_mult_prob_error"]) < 1.1' \ diff --git a/tests/functional/grpo_multiturn.sh b/tests/functional/grpo_multiturn.sh index ff9befcdd7..a22153c729 100755 --- a/tests/functional/grpo_multiturn.sh +++ b/tests/functional/grpo_multiturn.sh @@ -7,17 +7,19 @@ git config --global --add safe.directory $PROJECT_ROOT set -eou pipefail -LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs -JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json -RUN_LOG=$LOG_DIR/$(basename $0 .sh).log +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache} export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} -rm -rf $LOG_DIR -mkdir -p $LOG_DIR +rm -rf $EXP_DIR $LOG_DIR +mkdir -p $EXP_DIR $LOG_DIR cd $PROJECT_ROOT -python -u $PROJECT_ROOT/examples/run_grpo_sliding_puzzle.py \ +uv run $PROJECT_ROOT/examples/run_grpo_sliding_puzzle.py \ cluster.gpus_per_node=2 \ grpo.max_rollout_turns=10 \ grpo.max_num_steps=3 \ @@ -32,9 +34,8 @@ python -u $PROJECT_ROOT/examples/run_grpo_sliding_puzzle.py \ $@ \ 2>&1 | tee $RUN_LOG -cd $SCRIPT_DIR -python json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS -python check_metrics.py $JSON_METRICS \ +uv run tests/check_metrics.py $JSON_METRICS \ 'max(data["train/token_mult_prob_error"]) < 1.1' \ diff --git a/tests/functional/sft.sh b/tests/functional/sft.sh index f3474fb0fd..90985ae2c1 100755 --- a/tests/functional/sft.sh +++ b/tests/functional/sft.sh @@ -1,26 +1,28 @@ #!/bin/bash -## clean up checkpoint directory on exit +# clean up checkpoint directory on exit trap "rm -rf /tmp/sft_checkpoints" EXIT SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo +# Mark the current repo as safe, since wandb fetches metadata about the repo git config --global --add safe.directory $PROJECT_ROOT set -eou pipefail -LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs -JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json -RUN_LOG=$LOG_DIR/$(basename $0 .sh).log +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache} export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} -rm -rf $LOG_DIR -mkdir -p $LOG_DIR +rm -rf $EXP_DIR $LOG_DIR +mkdir -p $EXP_DIR $LOG_DIR cd $PROJECT_ROOT -python -u $PROJECT_ROOT/examples/run_sft.py \ +uv run $PROJECT_ROOT/examples/run_sft.py \ policy.model_name=meta-llama/Llama-3.2-1B \ cluster.gpus_per_node=2 \ sft.max_num_steps=10 \ @@ -34,10 +36,9 @@ python -u $PROJECT_ROOT/examples/run_sft.py \ $@ \ 2>&1 | tee $RUN_LOG -cd $SCRIPT_DIR -python json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # TODO: loss is very noisy, this check is mainly for sanity of immediate divergence -python check_metrics.py $JSON_METRICS \ +uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["9"] < 1500' \ diff --git a/tests/functional/json_dump_tb_logs.py b/tests/json_dump_tb_logs.py similarity index 100% rename from tests/functional/json_dump_tb_logs.py rename to tests/json_dump_tb_logs.py diff --git a/tests/test_suites/README.md b/tests/test_suites/README.md new file mode 100644 index 0000000000..3ccf0d75c9 --- /dev/null +++ b/tests/test_suites/README.md @@ -0,0 +1,67 @@ +# Recipes + +## Naming + +Each test is named: +``` +<algo>-<model>-<num_nodes>n<gpus_per_node>g-<parallelism>[-<variant>].sh +``` + +Examples: +* sft-llama3.2-1b-1n8g-fsdp2tp1.sh +* grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2.sh +* grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2-long.sh + +## Running manually + +Each recipe can be run on the head node: + +```sh +uv run ./llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh +``` + +and the result directory can be found at the same level as the script (w/o the `.sh` suffix): + +```sh +ls -lh llm/sft-llama3.2-1b-1n8g-fsdp2tp1/ +# drwxr-xr-x 2 terryk dip 4.0K Apr 23 18:07 ckpts +# drwxr-xr-x 3 terryk dip 4.0K Apr 23 18:07 logs +# -rw-r--r-- 1 terryk dip 142K Apr 23 18:23 metrics.json +# -rw-r--r-- 1 terryk dip 94K Apr 23 18:23 run.log +``` + +## Launching with code snapshots + +We provide a convenience script that will create a code snapshot and launch `NUM_RUNS` slurm jobs (`NUM_RUNS` is defined in the script itself). We create a code snapshot to +ensure that even as the master repo changes its code, you can always run your experiment with +the snapshot of the code at the time the experiment was initially launched. + +```sh +# Launch +CONTAINER=... ACCOUNT=... PARTITION=... ../tools/launch ./llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh + +# Prints Estimated GPUhrs and then exits +DRYRUN=1 CONTAINER=... ACCOUNT=... PARTITION=... ../tools/launch ./llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh + +# Prints Estimated GPUhrs, creates code snapshot, then exits +DRYRUN=2 CONTAINER=... ACCOUNT=... PARTITION=... ../tools/launch ./llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh +``` 
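+ +For reference, each recipe script declares what the launcher needs in a config block near the top of the script. The sketch below is copied from the GRPO Llama 3.1 8B recipe in this repo; the inline comments are informal descriptions of each field, not guaranteed launcher semantics: + +```sh +# ===== BEGIN CONFIG ===== +NUM_NODES=4 # nodes used by each slurm job +STEPS_PER_RUN=100 # training steps expected to complete within one job +MAX_STEPS=500 # total steps across all runs +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 # walltime requested per run +# ===== END CONFIG ===== +``` 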
+ +After the launch completes, you can find the result under + +```sh +ls -lh ../code_snapshots/sft-llama3.2-1b-1n8g-fsdp2tp1/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1/ +# drwxr-xr-x 2 terryk dip 4.0K Apr 23 18:07 ckpts +# drwxr-xr-x 3 terryk dip 4.0K Apr 23 18:07 logs +# -rw-r--r-- 1 terryk dip 142K Apr 23 18:23 metrics.json +# -rw-r--r-- 1 terryk dip 94K Apr 23 18:23 run.log +``` + +As a convenience, there's also a `continue.sh` script under the code snapshot +directory that will launch another run using the same arguments. This is helpful +if your job was unexpectedly cancelled or you want to run it for a little longer. + +```sh +# This launches one more run of the same experiment +../code_snapshots/sft-llama3.2-1b-1n8g-fsdp2tp1/continue.sh +``` diff --git a/tests/test_suites/llm/common.env b/tests/test_suites/llm/common.env new file mode 100644 index 0000000000..c2008292b9 --- /dev/null +++ b/tests/test_suites/llm/common.env @@ -0,0 +1,46 @@ +#!/bin/bash +# Source this file before running a test to set up the environment: +# +# source ./common.env +set -eou pipefail + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +# Mark all repos as safe in the test context, since wandb fetches metadata about the repo and it's a +# catch-22 to get the project root (via git) and mark it safe when you don't yet know the project root +git config --global --add safe.directory "*" +PROJECT_ROOT=$(git rev-parse --show-toplevel) + +exit_if_max_steps_reached() { + # Early stopping to save compute if max step has been reached; + # the jq query extracts the highest step recorded under train/loss (falls back to 0 if the metrics file is missing) + STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) + if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 + fi + echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" +} + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log + +# Test script has path: tests/test_suites/llm/${EXP_NAME}.sh +# where config has path: examples/configs/recipes/llm/${EXP_NAME}.yaml +# We will assume/check the path matches this pattern +CONFIG_PATH=$(echo $SCRIPT_DIR/${EXP_NAME}.yaml | sed 's#tests/test_suites#examples/configs/recipes#') +if [[ ! 
-f $CONFIG_PATH ]]; then + echo "[ERROR] Config file $CONFIG_PATH not found" + exit 1 +fi + +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +if [[ -n "${TEST_DRYRUN:-}" ]]; then + echo "[INFO] TEST_DRYRUN mode: used for testing" + exit +fi + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR diff --git a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh new file mode 100755 index 0000000000..6e64876058 --- /dev/null +++ b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=100 +MAX_STEPS=500 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["100"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..45cfad6e83 --- /dev/null +++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=500 +MAX_STEPS=500 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["500"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh new file mode 100755 
index 0000000000..69c9899ccd --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=16 +STEPS_PER_RUN=10 +MAX_STEPS=20 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["20"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh new file mode 100755 index 0000000000..ccdef1b2bd --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=16 +STEPS_PER_RUN=2 # 40min: step_time: [1341, 801] +MAX_STEPS=2 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=60 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["2"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh new file mode 100755 index 0000000000..49c96a6f58 --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=30 +MAX_STEPS=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=90 +# ===== END CONFIG ===== + 
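+# exit_if_max_steps_reached comes from common.env: it reads the highest train/loss step recorded in $JSON_METRICS and exits early if $MAX_STEPS has already been reached 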
+exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["30"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh new file mode 100755 index 0000000000..b3071fb58e --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=30 +MAX_STEPS=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=180 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["30"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..98df00c25c --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=450 +MAX_STEPS=450 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# 
Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["450"] < 1.1' +fi + diff --git a/tests/test_suites/llm/performance/.gitkeep b/tests/test_suites/llm/performance/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh new file mode 100755 index 0000000000..1e51c2a78f --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh @@ -0,0 +1,41 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=250 +MAX_STEPS=250 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=30 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 4' \ + 'data["train/loss"]["250"] < 0.5' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 60000' +fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh new file mode 100755 index 0000000000..32bb6dacb7 --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh @@ -0,0 +1,43 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# TODO: @ashors real convergence run (dataset only has 2737) +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=2730 +MAX_STEPS=2730 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# TODO: the memory check is known to OOM. 
see https://github.com/NVIDIA/nemo-rl/issues/263 +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 5' \ + 'data["train/loss"]["2730"] < 0.3' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 45000' +fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh new file mode 100755 index 0000000000..ac441240fc --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh @@ -0,0 +1,43 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=350 +MAX_STEPS=350 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=45 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# TODO: memory check will fail due to OOM tracked here https://github.com/NVIDIA/nemo-rl/issues/263 + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 5' \ + 'data["train/loss"]["350"] < 0.5' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 45000' +fi diff --git a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..24b966c2af --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh @@ -0,0 +1,41 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=500 +MAX_STEPS=500 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=15 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 2.4' \ 
+ 'data["train/loss"]["500"] < 0.5' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 25000' +fi + diff --git a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh new file mode 100755 index 0000000000..9fb5f7839b --- /dev/null +++ b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh @@ -0,0 +1,43 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# TODO: this config can crash on OOM +# https://github.com/NVIDIA/nemo-rl/issues/263 + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=20 # step_time ~ 29sec +MAX_STEPS=20 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=30 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 1.5' \ + 'data["train/loss"]["20"] < 0.3' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 35000' +fi diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt new file mode 100644 index 0000000000..4c609d5bff --- /dev/null +++ b/tests/test_suites/nightly.txt @@ -0,0 +1,28 @@ +######## +# GRPO # +######## + +# Short 1N/1B runs (go past 200 steps - usually divergence happens by now) -- going to 4 nodes doesn't help that much +tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh +tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh + +# FSDP1 vs Dtensor (Qwen/Qwen2.5-7B-Instruct) +tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh +tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh + +# Functional 32b run +tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh + +####### +# SFT # +####### + +# 1N 1B/8B runs +tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh + +# Dtensor vs fsdp1 (8B) +tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh +tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh + +# Functional 32b test +tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh diff --git a/tests/test_suites/nightly_performance.txt b/tests/test_suites/nightly_performance.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt new file mode 100644 index 0000000000..69735cb0cb --- /dev/null +++ b/tests/test_suites/release.txt @@ -0,0 +1,16 @@ +######## +# GRPO # +######## + +# Long 8b run +tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh + +# Long 32b run +tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh + +####### +# SFT # +####### + +# Long 8b convergence +tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh \ No newline at end of file diff 
--git a/tests/test_suites/release_performance.txt b/tests/test_suites/release_performance.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py new file mode 100644 index 0000000000..edceba3649 --- /dev/null +++ b/tests/unit/test_recipes_and_test_suites.py @@ -0,0 +1,216 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import glob
+import os
+import subprocess
+
+import pytest
+
+dir_path = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.abspath(os.path.join(dir_path, "..", ".."))
+test_suites_dir = os.path.join(project_root, "tests", "test_suites")
+
+nightly_test_suite_path = os.path.join(test_suites_dir, "nightly.txt")
+release_test_suite_path = os.path.join(test_suites_dir, "release.txt")
+nightly_performance_test_suite_path = os.path.join(
+    test_suites_dir, "nightly_performance.txt"
+)
+release_performance_test_suite_path = os.path.join(
+    test_suites_dir, "release_performance.txt"
+)
+
+
+def _load_test_suite(path):
+    """Return the non-empty, non-comment lines of a test-suite manifest."""
+    suite = []
+    with open(path, "r") as f:
+        for line in f:
+            line = line.strip()
+            if line and not line.startswith("#"):
+                suite.append(line)
+    return suite
+
+
+@pytest.fixture
+def nightly_test_suite():
+    return _load_test_suite(nightly_test_suite_path)
+
+
+@pytest.fixture
+def release_test_suite():
+    return _load_test_suite(release_test_suite_path)
+
+
+@pytest.fixture
+def nightly_performance_test_suite():
+    return _load_test_suite(nightly_performance_test_suite_path)
+
+
+@pytest.fixture
+def release_performance_test_suite():
+    return _load_test_suite(release_performance_test_suite_path)
+
+
+@pytest.fixture
+def all_test_suites(
+    nightly_test_suite,
+    release_test_suite,
+    nightly_performance_test_suite,
+    release_performance_test_suite,
+):
+    return (
+        nightly_test_suite
+        + release_test_suite
+        + nightly_performance_test_suite
+        + release_performance_test_suite
+    )
+
+
+@pytest.mark.parametrize(
+    "test_suite_path",
+    [
+        nightly_test_suite_path,
+        release_test_suite_path,
+        nightly_performance_test_suite_path,
+        release_performance_test_suite_path,
+    ],
+    ids=[
+        "nightly_test_suite",
+        "release_test_suite",
+        "nightly_performance_test_suite",
+        "release_performance_test_suite",
+    ],
+)
+def test_test_suites_exist(test_suite_path):
+    assert os.path.exists(test_suite_path), (
+        f"Test suite {test_suite_path} does not exist"
+    )
+
+
+def test_no_overlap_across_test_suites(all_test_suites):
+    recipes = set(all_test_suites)
+    duplicates = sorted(r for r in recipes if all_test_suites.count(r) > 1)
+    assert len(recipes) == len(all_test_suites), (
+        f"Test suites contain duplicate entries: {duplicates}"
+    )
+
+
+def test_all_recipes_accounted_for_in_test_suites(all_test_suites):
+    all_recipes_in_test_suites = set(all_test_suites)
+
+    all_tests_in_test_suites_dir = set()
+    for recipe_path in glob.glob(
+        os.path.join(test_suites_dir, "**", "*.sh"), recursive=True
+    ):
+        # Strip off the project root and leading slash
+        recipe_name = recipe_path[len(project_root) + 1 :]
+        all_tests_in_test_suites_dir.add(recipe_name)
+
+    assert all_recipes_in_test_suites == all_tests_in_test_suites_dir, (
+        "Not all recipes are accounted for in the test suites; mismatched entries: "
+        f"{all_recipes_in_test_suites ^ all_tests_in_test_suites_dir}"
+    )
+
+
+def test_nightly_compute_stays_below_1024_hours(nightly_test_suite, tracker):
+    command = f"DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=... CONTAINER= ACCOUNT= PARTITION= ./tools/launch {' '.join(nightly_test_suite)}"
+
+    print(f"Running command: {command}")
+
+    # Run the command from the project root directory
+    result = subprocess.run(
+        command,
+        shell=True,
+        cwd=project_root,
+        capture_output=True,
+        text=True,
+        check=False,  # Don't raise exception on non-zero exit code
+    )
+
+    # Print stdout and stderr for debugging if the test fails
+    print("STDOUT:")
+    print(result.stdout)
+    print("STDERR:")
+    print(result.stderr)
+
+    # Assert that the command exited successfully
+    assert result.returncode == 0, f"Command failed with exit code {result.returncode}"
+
+    # Assert that the last line of stdout contains the expected prefix
+    stdout_lines = result.stdout.strip().splitlines()
+    assert len(stdout_lines) > 0, "Command produced no output"
+    last_line = stdout_lines[-1]
+    assert last_line.startswith("[INFO]: Total GPU hours:"), (
+        f"Last line of output was not as expected: '{last_line}'"
+    )
+    total_gpu_hours = float(last_line.split(":")[-1].strip())
+    assert total_gpu_hours <= 1024, (
+        f"Total GPU hours exceeded 1024: {last_line}. We should revisit the test suites to reduce the total GPU hours."
+    )
+    tracker.track("total_nightly_gpu_hours", total_gpu_hours)
+
+
+def test_dry_run_does_not_fail_and_prints_total_gpu_hours():
+    command = "DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=... CONTAINER= ACCOUNT= PARTITION= ./tools/launch ./tests/test_suites/**/*.sh"
+
+    # Run the command from the project root directory
+    result = subprocess.run(
+        command,
+        shell=True,
+        cwd=project_root,
+        capture_output=True,
+        text=True,
+        check=False,  # Don't raise exception on non-zero exit code
+    )
+
+    # Print stdout and stderr for debugging if the test fails
+    print("STDOUT:")
+    print(result.stdout)
+    print("STDERR:")
+    print(result.stderr)
+
+    # Assert that the command exited successfully
+    assert result.returncode == 0, f"Command failed with exit code {result.returncode}"
+
+    # Assert that the last line of stdout contains the expected prefix
+    stdout_lines = result.stdout.strip().splitlines()
+    assert len(stdout_lines) > 0, "Command produced no output"
+    last_line = stdout_lines[-1]
+    assert last_line.startswith("[INFO]: Total GPU hours:"), (
+        f"Last line of output was not as expected: '{last_line}'"
+    )
+
+
+def test_all_tests_can_find_config_if_dryrun(all_test_suites):
+    for test_suite in all_test_suites:
+        command = f"TEST_DRYRUN=1 {test_suite}"
+        result = subprocess.run(
+            command,
+            shell=True,
+            cwd=project_root,
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        assert result.returncode == 0, (
+            f"Command failed with exit code {result.returncode}"
+        )
diff --git a/tools/autoformat.sh b/tools/autoformat.sh old mode 100644 new mode 100755 diff --git a/tools/code_snapshot.sh b/tools/code_snapshot.sh new file mode 100644 index 0000000000..62136a8632 --- /dev/null +++ b/tools/code_snapshot.sh @@ -0,0 +1,40 @@
+#!/bin/bash
+
+set -euo pipefail
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+PROJECT_ROOT=${SCRIPT_DIR}/..
+cd ${PROJECT_ROOT}
+
+echo2() {
+    echo "$@" >&2
+}
+
+if [[ ! -e "$PROJECT_ROOT/.git" ]]; then
+    echo2 "[Error]: This script was not run from the root of the NeMo RL git repo. Please clone it first."
+    exit 1
+elif [[ $# -lt 1 ]]; then
+    echo2 "[Error]: This script requires one argument: the name of the experiment to be used as the snapshot directory name"
+    echo2 "Usage: bash tools/code_snapshot.sh <exp_name>"
+    exit 1
+fi
+
+EXP_NAME=$1
+
+SNAPSHOT_DIR="$PROJECT_ROOT/code_snapshots/${EXP_NAME}"
+if [[ ! -d "$SNAPSHOT_DIR" ]]; then
+    echo2 "Creating new code snapshot in $SNAPSHOT_DIR"
+    mkdir -p $SNAPSHOT_DIR
+else
+    echo2 "Using existing code snapshot in $SNAPSHOT_DIR"
+    # Echo the snapshot directory so the caller can use it to `cd` into it
+    echo ${SNAPSHOT_DIR}
+    exit
+fi
+
+echo2 "Copying git-tracked files..."
+rsync -a --files-from=<(git ls-files) ./ $SNAPSHOT_DIR/
+
+
+# Echo the snapshot directory so the caller can use it to `cd` into it
+echo ${SNAPSHOT_DIR}
\ No newline at end of file
diff --git a/tools/launch b/tools/launch new file mode 100755 index 0000000000..4c76cee78d --- /dev/null +++ b/tools/launch @@ -0,0 +1,175 @@
+#!/bin/bash
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
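+
+# Budget note: one submitted script accounts for
+#   NUM_RUNS * NUM_NODES * 8 * (NUM_MINUTES / 60)
+# GPU hours, with 8 GPUs per node assumed throughout (matching --gres=gpu:8 below).
+# For example, a single run on 4 nodes with a 30-minute time limit costs
+#   1 * 4 * 8 * 30 / 60 = 16 GPU hours.
+# The dry-run modes sum this estimate over every script passed in.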
+################################################################################
+# This is a helper script to launch a release test on slurm.
+# It reads a demarcated section of the script to extract the config,
+# and uses that to determine how many nodes and how many chained jobs to launch.
+#
+# It also creates a code snapshot to ensure that the code is reproducible and that
+# subsequent jobs can be launched with the same code. It also creates a continue.sh
+# in the code snapshot directory to continue launching the job even if the original
+# invocation was forgotten.
+#
+# Usage:
+#   CONTAINER=... ACCOUNT=... PARTITION=... ./launch <script> [<script> ...]
+#
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+PROJECT_ROOT=$(realpath $SCRIPT_DIR/..)
+
+# Function to extract config from a script
+extract_config() {
+    local script_path="$1"
+    local config=$(sed -n '/^# =\+ BEGIN CONFIG =\+/,/^# =\+ END CONFIG =\+/p' "$script_path" |
+        grep -v "^#" |
+        grep "=" )
+    if [[ -z "$config" ]]; then
+        echo "[ERROR]: No config section found in script_path=$script_path"
+        echo "[ERROR]: Please add and update a section in the script with these variables:"
+        echo
+        echo "# ===== BEGIN CONFIG ====="
+        echo "NUM_NODES=1 # How many nodes this job uses"
+        echo "STEPS_PER_RUN=60 # Approximately how many steps reached in one job"
+        echo "MAX_STEPS=60 # Max training steps"
+        echo 'NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up'
+        echo "NUM_MINUTES=240 # How many minutes one job is (SLURM specific)"
+        echo "# ===== END CONFIG ====="
+        return 1
+    fi 1>&2
+    echo "$config"
+}
+
+check_file_in_version_control_and_get_relpath_from_git_root() {
+    local script_path="$1"
+    local rel_path_from_git_root
+    # Check if the script is tracked in git (assumes we're in the repo already).
+    # Guarding the assignment with `if !` keeps `set -e` from aborting the script
+    # before the error message below can be printed.
+    if ! rel_path_from_git_root=$(git ls-files --full-name --error-unmatch "$script_path"); then
+        echo "[ERROR]: Script '$script_path' is not tracked in version control." >&2
+        echo "[ERROR]: This may cause reproducibility issues. Add it to git to continue." >&2
+        return 1
+    fi
+    echo "$rel_path_from_git_root"
+}
+
+set -eou pipefail
+
+if [[ $# -eq 0 ]]; then
+    echo "Error: No script provided."
+    echo "Usage: CONTAINER=... ACCOUNT=... PARTITION=... $0 <script> [<script> ...]"
+    exit 1
+fi
+
+# Check for mandatory environment variables
+for VAR in "HF_HOME" "HF_DATASETS_CACHE"; do
+    if [[ -z "${!VAR:-}" ]]; then
+        echo "[ERROR]: $VAR environment variable is not set."
+        echo "[ERROR]: Please set $VAR to specify the appropriate Hugging Face directory."
+        echo "Example: export $VAR=/path/to/appropriate/directory"
+        exit 1
+    fi
+done
+
+CONTAINER=$CONTAINER
+ACCOUNT=$ACCOUNT
+PARTITION=$PARTITION
+MOUNTS=${MOUNTS:-}
+# DRYRUN=1 prints the runs and how much compute they use
+# DRYRUN=2 additionally creates the snapshots (helpful to run a hermetic example manually or share a repro)
+DRYRUN=${DRYRUN:-}
+IS_RELEASE=${IS_RELEASE:-} # Adds extra configuration for wandb to track this in the right project
+NOW=$(date '+%y%m%d-%H%M%S')
+
+if [[ -n "$MOUNTS" ]]; then
+    # Comma needed since we always mount PWD
+    MOUNTS=",$MOUNTS"
+fi
+
+SCRIPTS=""
+for SCRIPT in $@; do
+    if [[ ! -f "$SCRIPT" ]]; then
+        echo "Error: Script '$SCRIPT' does not exist or is not a file."
+        echo "Please provide a valid script path."
+        exit 1
+    fi
+    SCRIPTS+=" $SCRIPT"
+done
+
+total_gpu_hours=0
+
+for SCRIPT in $SCRIPTS; do
+    # Extract and evaluate the config
+    if ! config=$(extract_config $SCRIPT); then
+        # Error message is already printed by extract_config
+        exit 1
+    fi
+    eval "$config"
+
+    # NUM_RUNS * NUM_NODES * NUM_GPUS * (NUM_MINUTES / 60)
+    gpu_hours=$((NUM_RUNS * NUM_NODES * 8 * NUM_MINUTES / 60))
+    total_gpu_hours=$((total_gpu_hours + gpu_hours))
+    echo "[INFO]: $gpu_hours GPU hours to run $SCRIPT"
+    if [[ "${DRYRUN}" -eq 1 ]]; then
+        echo "[DRY_RUN]: Skipping creation of snapshot and submission of $SCRIPT."
+        continue
+    fi
+
+    rel_script=$(check_file_in_version_control_and_get_relpath_from_git_root $SCRIPT)
+
+    EXP_NAME=$(basename $SCRIPT .sh)
+    SNAPSHOT_DIR=$(bash $PROJECT_ROOT/tools/code_snapshot.sh $EXP_NAME)
+
+    # Now use the variables
+    for i in $(seq 1 $NUM_RUNS); do
+        echo "Submitting $i/$NUM_RUNS job with ${NUM_NODES} nodes for $(basename $SCRIPT)"
+        JOB_NAME=$(basename $SCRIPT .sh)
+
+        RELEASE_ARGS=()
+        if [[ -n "${IS_RELEASE}" ]]; then
+            RELEASE_ARGS=(
+                logger.wandb.project=nemo-rl-release
+                logger.wandb.name=$(basename $SCRIPT .sh)-$(git rev-parse --short HEAD)
+            )
+        fi
+
+        # TODO: jq install is just to be backward compatible with older containers. Should eventually remove.
+        cat <<EOF >$SNAPSHOT_DIR/continue.sh
+#!/bin/bash
+SCRIPT_DIR=\$( cd -- "\$( dirname -- "\${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+cd \$SCRIPT_DIR
+
+HF_HOME=$HF_HOME \\
+HF_DATASETS_CACHE=$HF_DATASETS_CACHE \\
+COMMAND="apt install -y jq && uv run $rel_script ${RELEASE_ARGS[@]}" \\
+CONTAINER=$CONTAINER \\
+MOUNTS="$SNAPSHOT_DIR:$SNAPSHOT_DIR${MOUNTS}" \\
+sbatch \\
+    --nodes=$NUM_NODES \\
+    --account=$ACCOUNT \\
+    --job-name=$ACCOUNT:$JOB_NAME \\
+    --partition=$PARTITION \\
+    --time=0:${NUM_MINUTES}:0 \\
+    --gres=gpu:8 \\
+    --output=slurm-${NOW}-%j-${JOB_NAME}-${i}.${NUM_RUNS}.out \\
+    ray.sub
+EOF
+        if [[ "${DRYRUN}" -eq 2 ]]; then
+            echo "[DRY_RUN]: Skipping submission of $SCRIPT. Find the snapshot at $SNAPSHOT_DIR and manually launch with 'bash continue.sh'"
+        else
+            bash $SNAPSHOT_DIR/continue.sh
+        fi
+    done
+done
+echo "[INFO]: Total GPU hours: $total_gpu_hours"
diff --git a/tools/package_release_runs.sh b/tools/package_release_runs.sh new file mode 100755 index 0000000000..357c9ad618 --- /dev/null +++ b/tools/package_release_runs.sh @@ -0,0 +1,53 @@
+#!/bin/bash
+
+# This script packages all release runs into a tarball tagged with a git SHA so that we can
+# upload it to our release page. The SHA avoids conflicts with previous runs; when we upload,
+# we should strip it so that users can rely on the name release_runs.tar.gz (the rename can
+# be done in the GitHub Release UI).
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+PROJECT_ROOT=$(realpath $SCRIPT_DIR/..)
+cd $PROJECT_ROOT
+
+set -eou pipefail
+# Enable recursive globbing
+shopt -s globstar
+
+OUTPUT_TAR="release_runs-$(git rev-parse --short HEAD).tar.gz"
+
+TB_EVENTS=$(ls code_snapshots/*/tests/test_suites/**/logs/*/tensorboard/events* || true)
+
+# Check if the glob expanded to any files
+if [[ -z "$TB_EVENTS" ]]; then
+  echo "Error: No tensorboard event files found matching the pattern."
+  exit 1
+elif [[ -f $OUTPUT_TAR ]]; then
+  echo "Error: $OUTPUT_TAR already exists. Clean it up before continuing."
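+  # (e.g. rm "$OUTPUT_TAR", or move it aside first if you still need it)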
+  exit 1
+fi
+
+TMP_DIR=$(mktemp -d)
+echo "Created temporary directory: $TMP_DIR"
+
+# Set up trap to clean up temporary directory on exit
+trap "echo 'Cleaning up temporary directory $TMP_DIR'; rm -rf $TMP_DIR" EXIT
+
+# Loop over all the recipe runs and package them into a tarball
+for tbevent in $TB_EVENTS; do
+  exp_name=$(basename -- $(cut -d/ -f2 <<<$tbevent) -logs)
+  # Obfuscate the hostname, keeping the timestamp and trailing fields, e.g.
+  # events.out.tfevents.1744822578.<hostname>.780899.0 -> events.out.tfevents.1744822578.HOSTNAME.780899.0
+  obfuscated_event_path=$(basename $tbevent | awk -F. '{print $1"."$2"."$3"."$4".HOSTNAME."$(NF-1)"."$NF}')
+
+  # Create subdirectory for experiment if it doesn't exist
+  mkdir -p "$TMP_DIR/$exp_name"
+
+  # Copy the event file with obfuscated name to the experiment subdirectory
+  cp "$tbevent" "$TMP_DIR/$exp_name/$obfuscated_event_path"
+
+  echo "[$exp_name] Copied $tbevent to $TMP_DIR/$exp_name/$obfuscated_event_path"
+done
+
+# Create a tarball of all the processed event files
+tar -czf "$OUTPUT_TAR" -C "$TMP_DIR" .
+echo "Created tarball: $OUTPUT_TAR"
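
Note on the step gate shared by the test-suite scripts above: check_metrics.py is only meaningful once training has actually reached MAX_STEPS, so each script first asks jq for the largest step recorded under "train/loss". A minimal sketch of that filter, assuming json_dump_tb_logs.py emits a {metric: {step: value}} mapping (the file below is fabricated for illustration):

# Fabricated metrics file matching the schema the jq filter expects
cat > /tmp/metrics.json <<'EOF'
{
  "train/loss": {"1": 2.1, "15": 0.9, "30": 0.4},
  "train/token_mult_prob_error": {"1": 1.02, "30": 1.05}
}
EOF

# Pick the "train/loss" entry, treat its keys (steps) as numbers,
# and report the largest step that was actually logged
jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' /tmp/metrics.json
# prints 30, so a script with MAX_STEPS=30 would proceed to run check_metrics.py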