diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_cudagraphs/env_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_cudagraphs/env_config.yaml new file mode 100644 index 00000000000..329246987bf --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_cudagraphs/env_config.yaml @@ -0,0 +1,5 @@ +- agent_type: examples.rl.environments.countdown.countdown_agent.CountdownAgent + agent_args: + dataset_file: "/mnt/artifacts/rl_environments/Jiayi-Pan___countdown-tasks-3to4" + split: "train" + weight: 1.0 diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_cudagraphs/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_cudagraphs/golden_values_dev_dgx_h100.json new file mode 100644 index 00000000000..78e76d55735 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_cudagraphs/golden_values_dev_dgx_h100.json @@ -0,0 +1,287 @@ +{ + "lm loss": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 0.00762, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.05397, + "7": 0.01964, + "8": 0.0, + "9": 0.0, + "10": 0.0, + "11": 0.0, + "12": 0.0, + "13": 0.0, + "14": 0.0, + "15": 0.0, + "16": 0.02209, + "17": 0.0, + "18": 0.0, + "19": 0.0, + "20": 0.0, + "21": 0.0, + "22": 0.0, + "23": 0.0, + "24": 0.0, + "25": 0.0, + "26": 0.0, + "27": 0.0, + "28": 0.0, + "29": 0.0, + "30": 0.0, + "31": 0.0, + "32": 0.0, + "33": 0.0, + "34": 0.0, + "35": 0.0, + "36": 0.0, + "37": 0.0, + "38": 0.0, + "39": 0.0, + "40": 0.0, + "41": 0.0, + "42": 0.0, + "43": 0.0, + "44": 0.0, + "45": 0.0, + "46": 0.0, + "47": 0.0, + "48": 0.0, + "49": 0.0, + "50": 0.04447 + } + }, + "num-zeros": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 58.0, + "2": 583687296.0, + "3": 
583687296.0, + "4": 583687296.0, + "5": 583687296.0, + "6": 35.0, + "7": 52.0, + "8": 583687296.0, + "9": 583687296.0, + "10": 583687296.0, + "11": 583687296.0, + "12": 583687296.0, + "13": 583687296.0, + "14": 583687296.0, + "15": 583687296.0, + "16": 52.0, + "17": 583687296.0, + "18": 583687296.0, + "19": 583687296.0, + "20": 583687296.0, + "21": 583687296.0, + "22": 583687296.0, + "23": 583687296.0, + "24": 583687296.0, + "25": 583687296.0, + "26": 583687296.0, + "27": 583687296.0, + "28": 583687296.0, + "29": 583687296.0, + "30": 583687296.0, + "31": 583687296.0, + "32": 583687296.0, + "33": 583687296.0, + "34": 583687296.0, + "35": 583687296.0, + "36": 583687296.0, + "37": 583687296.0, + "38": 583687296.0, + "39": 583687296.0, + "40": 583687296.0, + "41": 583687296.0, + "42": 583687296.0, + "43": 583687296.0, + "44": 583687296.0, + "45": 583687296.0, + "46": 583687296.0, + "47": 583687296.0, + "48": 583687296.0, + "49": 583687296.0, + "50": 45.0 + } + }, + "mem-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 55289954304.0, + "2": 55292747776.0, + "3": 55292731392.0, + "4": 55292891136.0, + "5": 55292878848.0, + "6": 55292878848.0, + "7": 55292878848.0, + "8": 55292788736.0, + "9": 55292788736.0, + "10": 55292788736.0, + "11": 55292792832.0, + "12": 55292792832.0, + "13": 55292792832.0, + "14": 55292792832.0, + "15": 55292792832.0, + "16": 55292796928.0, + "17": 55292796928.0, + "18": 55292801024.0, + "19": 55292805120.0, + "20": 55292801024.0, + "21": 55292801024.0, + "22": 55292796928.0, + "23": 55292801024.0, + "24": 55292796928.0, + "25": 55292801024.0, + "26": 55292796928.0, + "27": 55292796928.0, + "28": 55292801024.0, + "29": 55292801024.0, + "30": 55292805120.0, + "31": 55292805120.0, + "32": 55292805120.0, + "33": 55292805120.0, + "34": 55292805120.0, + "35": 55292805120.0, + "36": 55292805120.0, + "37": 55292801024.0, + "38": 55292801024.0, + "39": 55292801024.0, + "40": 55292805120.0, + "41": 
55292805120.0, + "42": 55292805120.0, + "43": 55292801024.0, + "44": 55292796928.0, + "45": 55292801024.0, + "46": 55292801024.0, + "47": 55292801024.0, + "48": 55292801024.0, + "49": 55292805120.0, + "50": 55292805120.0 + } + }, + "mem-max-allocated-bytes": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 55289958400.0, + "2": 57103880192.0, + "3": 57104392192.0, + "4": 57104416768.0, + "5": 57104416768.0, + "6": 57104416768.0, + "7": 57104416768.0, + "8": 57104416768.0, + "9": 57104416768.0, + "10": 57104416768.0, + "11": 57104416768.0, + "12": 57104416768.0, + "13": 57104416768.0, + "14": 57104416768.0, + "15": 57104416768.0, + "16": 57104416768.0, + "17": 57104416768.0, + "18": 57104416768.0, + "19": 57104416768.0, + "20": 57104416768.0, + "21": 57104416768.0, + "22": 57104416768.0, + "23": 57104416768.0, + "24": 57104416768.0, + "25": 57104416768.0, + "26": 57104416768.0, + "27": 57104416768.0, + "28": 57104416768.0, + "29": 57104416768.0, + "30": 57104416768.0, + "31": 57104416768.0, + "32": 57104416768.0, + "33": 57104416768.0, + "34": 57104416768.0, + "35": 57104416768.0, + "36": 57104416768.0, + "37": 57104416768.0, + "38": 57104416768.0, + "39": 57104416768.0, + "40": 57104416768.0, + "41": 57104416768.0, + "42": 57104416768.0, + "43": 57104416768.0, + "44": 57104416768.0, + "45": 57104416768.0, + "46": 57104416768.0, + "47": 57104416768.0, + "48": 57104416768.0, + "49": 57104416768.0, + "50": 57104416768.0 + } + }, + "iteration-time": { + "start_step": 1, + "end_step": 50, + "step_interval": 1, + "values": { + "1": 38.24908, + "2": 4.52458, + "3": 3.69393, + "4": 3.38577, + "5": 3.41862, + "6": 3.27421, + "7": 3.32023, + "8": 3.83723, + "9": 4.07373, + "10": 3.47799, + "11": 3.27499, + "12": 3.37017, + "13": 3.3918, + "14": 3.25114, + "15": 3.29905, + "16": 3.29943, + "17": 3.50383, + "18": 3.56844, + "19": 3.30276, + "20": 3.34553, + "21": 3.29165, + "22": 3.30348, + "23": 3.33814, + "24": 3.31525, + "25": 3.29337, + 
"26": 3.26119, + "27": 3.5167, + "28": 3.2312, + "29": 3.45063, + "30": 3.3088, + "31": 3.32522, + "32": 3.28154, + "33": 3.23551, + "34": 3.20003, + "35": 3.25844, + "36": 3.67071, + "37": 3.1881, + "38": 3.30757, + "39": 3.32895, + "40": 3.29602, + "41": 3.25522, + "42": 3.28932, + "43": 3.32204, + "44": 3.26419, + "45": 3.75371, + "46": 3.23126, + "47": 3.25929, + "48": 3.19512, + "49": 3.32815, + "50": 3.25617 + } + } +} diff --git a/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_cudagraphs/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_cudagraphs/model_config.yaml new file mode 100644 index 00000000000..c8f355db2b4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_cudagraphs/model_config.yaml @@ -0,0 +1,79 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Ring + CUBLAS_WORKSPACE_CONFIG: :4096:8 +TEST_TYPE: frozen-start +MODE: rl +MODEL_ARGS: + --tiktoken-pattern: v2 + --use-mcore-models: true + --tokenizer-type: TikTokenizer + --tokenizer-model: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json + --load: ${CHECKPOINT_LOAD_PATH}/model/mcore_mistral/nemo_minitron-0.5b/v1/ + --auto-detect-ckpt-format: true + --max-tokens-to-oom: 3600000 + --inference-max-seq-length: 1024 + --attention-backend: flash + --mock-data: true + --micro-batch-size: 1 + --no-load-optim: true + --no-use-tokenizer-model-from-checkpoint-args: true + --timing-log-level: 0 + --distributed-backend: nccl + --log-interval: 1 + --log-progress: true + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --ckpt-format: torch_dist + --bf16: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + 
--num-layers: 24 + --hidden-size: 1152 + --num-attention-heads: 16 + --max-position-embeddings: 1024 + --seq-length: 1024 + --timing-log-option: minmax + --log-throughput: true + --no-create-attention-mask-in-dataloader: true + --straggler-minmax-count: 16 + --tensorboard-log-interval: 1 + --empty-unused-memory-level: 2 + --langrl-inference-server-type: inplace_megatron + --seed: 42 + --calculate-per-token-loss: true + --rl-use-sequence-packing: true + --rl-sequence-packing-algo: fifo + --rl-offload-optimizer-during-inference: true + --timing-log-level: 1 + --log-timers-to-tensorboard: true + --cuda-graph-impl: local + --micro-batch-size: 1 + --global-batch-size: 16 + --grpo-group-size: 2 + --grpo-prompts-per-step: 8 + --grpo-iterations: 1 + --grpo-clamp-eps-lower: 0.2 + --grpo-clamp-eps-upper: 0.2 + --grpo-kl-beta: 0.0 + --grpo-entropy-term-weight: 0.0 + --langrl-env-config: tests/functional_tests/test_cases/gpt/gpt_grpo_tp1_pp1_dp8_583m_throughputtest_cudagraphs/env_config.yaml + --rl-partial-rollouts: true + --lr: 0.000001 + --lr-warmup-samples: 0 + --clip-grad: 1.0 + --use-checkpoint-args: true + --dist-ckpt-strictness: log_unexpected + --perform-rl-step: true + --train-samples: 48828125 + --exit-interval: 10 + --tensorboard-dir: ${TENSORBOARD_PATH} + --save-interval: 1000000 + --eval-interval: 1000000 + --finetune: true + --inference-logging-step-interval: 1 + --rl-training-cuda-graphs: true diff --git a/tests/test_utils/recipes/gpt-grpo.yaml b/tests/test_utils/recipes/gpt-grpo.yaml index 11e8eadea9b..13bc5bf3a29 100644 --- a/tests/test_utils/recipes/gpt-grpo.yaml +++ b/tests/test_utils/recipes/gpt-grpo.yaml @@ -69,6 +69,11 @@ products: - environment: [dev] scope: [mr-broken] platforms: [dgx_h100] + - test_case: [gpt_grpo_tp1_pp1_dp8_583m_throughputtest_cudagraphs] + products: + - environment: [dev] + scope: [mr] + platforms: [dgx_h100] - test_case: [gpt_grpo_tp1tp2_pp1_dp8_583m_throughputtest_github] products: - environment: [dev]