From 2442083bda4d2469a57839d22640e3a02fba53ee Mon Sep 17 00:00:00 2001 From: Parth Mannan Date: Sun, 22 Feb 2026 13:18:10 -0800 Subject: [PATCH 1/2] Update moe_token_dispatcher_type default to alltoall Signed-off-by: Parth Mannan --- examples/configs/distillation_math.yaml | 2 +- examples/configs/distillation_math_megatron.yaml | 2 +- examples/configs/dpo.yaml | 2 +- examples/configs/grpo_math_1B.yaml | 2 +- examples/configs/grpo_math_1B_megatron.yaml | 2 +- examples/configs/sft.yaml | 2 +- examples/configs/sft_openmathinstruct2_megatron.yaml | 2 +- examples/configs/vlm_grpo_3B.yaml | 2 +- examples/configs/vlm_grpo_3B_megatron.yaml | 2 +- .../grpo_workplace_assistant_nemotron_nano_v2_9b.yaml | 2 +- nemo_rl/models/policy/__init__.py | 2 +- tests/unit/models/megatron/test_megatron_setup.py | 4 ++-- tests/unit/models/policy/test_megatron_worker.py | 2 +- 13 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/configs/distillation_math.yaml b/examples/configs/distillation_math.yaml index bb3743a4fc..b32664fa63 100644 --- a/examples/configs/distillation_math.yaml +++ b/examples/configs/distillation_math.yaml @@ -109,7 +109,7 @@ policy: &POLICY_BASE defer_fp32_logits: False moe_per_layer_logging: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false optimizer: diff --git a/examples/configs/distillation_math_megatron.yaml b/examples/configs/distillation_math_megatron.yaml index 76151678f1..e6eafa67e3 100644 --- a/examples/configs/distillation_math_megatron.yaml +++ b/examples/configs/distillation_math_megatron.yaml @@ -60,7 +60,7 @@ policy: &POLICY_BASE moe_per_layer_logging: False defer_fp32_logits: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false optimizer: diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml index fec83a6199..ddeb0fa7b8 100755 --- a/examples/configs/dpo.yaml +++ b/examples/configs/dpo.yaml @@ -133,7 +133,7 @@ policy: defer_fp32_logits: False moe_per_layer_logging: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false optimizer: diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 740f9ad24b..3e5ada5761 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -144,7 +144,7 @@ policy: defer_fp32_logits: False moe_per_layer_logging: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false optimizer: diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml index b240c6519c..f328fe016c 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -96,7 +96,7 @@ policy: moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo moe_permute_fusion: false moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 7b90a90c38..821da4e530 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -115,7 +115,7 @@ policy: defer_fp32_logits: False moe_per_layer_logging: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false peft: diff --git a/examples/configs/sft_openmathinstruct2_megatron.yaml b/examples/configs/sft_openmathinstruct2_megatron.yaml index 40f62473ac..e0748a1479 100644 --- a/examples/configs/sft_openmathinstruct2_megatron.yaml +++ b/examples/configs/sft_openmathinstruct2_megatron.yaml @@ -94,7 +94,7 @@ policy: bias_activation_fusion: True moe_per_layer_logging: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false env_vars: diff --git a/examples/configs/vlm_grpo_3B.yaml b/examples/configs/vlm_grpo_3B.yaml index 4cad631c85..3e2d3b5597 100644 --- a/examples/configs/vlm_grpo_3B.yaml +++ b/examples/configs/vlm_grpo_3B.yaml @@ -114,7 +114,7 @@ policy: defer_fp32_logits: False moe_per_layer_logging: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false optimizer: diff --git a/examples/configs/vlm_grpo_3B_megatron.yaml b/examples/configs/vlm_grpo_3B_megatron.yaml index 336d97d79b..63a12a226f 100644 --- a/examples/configs/vlm_grpo_3B_megatron.yaml +++ b/examples/configs/vlm_grpo_3B_megatron.yaml @@ -156,7 +156,7 @@ policy: defer_fp32_logits: False moe_per_layer_logging: False moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false optimizer: optimizer: adam diff --git a/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml b/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml index a923f842b7..5de09a4a3a 100644 --- a/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml +++ b/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml @@ -111,7 +111,7 @@ policy: defer_fp32_logits: false moe_permute_fusion: false moe_enable_deepep: false - moe_token_dispatcher_type: "allgather" + moe_token_dispatcher_type: "alltoall" moe_shared_expert_overlap: false optimizer: diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index 363399cbca..f1b405347d 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -187,7 +187,7 @@ class MegatronConfig(TypedDict): # Must set moe_token_dispatcher_type to 'flex' # Must set moe_shared_expert_overlap to False moe_enable_deepep: bool - # The type of token dispatcher to use. The default is 'allgather'. + # The type of token dispatcher to use. The default is 'alltoall'. # Options are 'allgather','alltoall' and 'flex' # Use 'flex' when using DeepEP moe_token_dispatcher_type: str diff --git a/tests/unit/models/megatron/test_megatron_setup.py b/tests/unit/models/megatron/test_megatron_setup.py index 16d77389a6..7b2a5d3622 100644 --- a/tests/unit/models/megatron/test_megatron_setup.py +++ b/tests/unit/models/megatron/test_megatron_setup.py @@ -187,7 +187,7 @@ def test_moe_configuration(self): "moe_router_bias_update_rate": 0.0, "moe_permute_fusion": True, "moe_enable_deepep": False, - "moe_token_dispatcher_type": "allgather", + "moe_token_dispatcher_type": "alltoall", "moe_shared_expert_overlap": True, } } @@ -201,7 +201,7 @@ def test_moe_configuration(self): assert model_cfg.moe_router_bias_update_rate == 0.0 assert model_cfg.moe_permute_fusion is True assert model_cfg.moe_enable_deepep is False - assert model_cfg.moe_token_dispatcher_type == "allgather" + assert model_cfg.moe_token_dispatcher_type == "alltoall" assert model_cfg.moe_shared_expert_overlap is True diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py index 7d329ab411..2fbd16fc3c 100644 --- a/tests/unit/models/policy/test_megatron_worker.py +++ b/tests/unit/models/policy/test_megatron_worker.py @@ -135,7 +135,7 @@ def create_megatron_test_config( "bias_activation_fusion": True, "moe_per_layer_logging": False, "moe_enable_deepep": False, - "moe_token_dispatcher_type": "allgather", + "moe_token_dispatcher_type": "alltoall", "moe_shared_expert_overlap": False, "defer_fp32_logits": defer_fp32_logits, "train_iters": 100, # Required for Megatron training From 493ddefe319073b24e757660125ee7facadabc35 Mon Sep 17 00:00:00 2001 From: Parth Mannan Date: Sun, 22 Feb 2026 13:21:00 -0800 Subject: [PATCH 2/2] Update in vllm test Signed-off-by: Parth Mannan --- tests/unit/models/generation/test_vllm_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index c27a183b5c..3c06df8a80 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -194,7 +194,7 @@ def get_basic_megatron_test_config( "moe_router_bias_update_rate": 0.0, "moe_permute_fusion": False, "moe_enable_deepep": False, - "moe_token_dispatcher_type": "allgather", + "moe_token_dispatcher_type": "alltoall", "moe_shared_expert_overlap": False, "apply_rope_fusion": True, "bias_activation_fusion": True,