From 686bea0ae6fdddc2a225337a4b13c40575a2910b Mon Sep 17 00:00:00 2001
From: gaoziyuan
Date: Tue, 2 Sep 2025 14:08:28 +0800
Subject: [PATCH 1/3] add moe noaux_tc tactics in triton backend

---
 .../layers/moe/fused_moe_triton_backend.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
index 69920649afa..079effe2fea 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
@@ -829,13 +829,25 @@ def apply(
         E, N1, _ = getattr(layer, self.added_weight_attrs[0]).shape
         N2 = getattr(layer, self.added_weight_attrs[1]).shape[1]
 
-        topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
-            gate_out,
-            layer.gate_correction_bias,
-            layer.top_k,
-            True,  # apply_norm_weight
-            False,
-        )
+        if layer.topk_method == "noaux_tc":
+            from .ep import get_moe_scores
+
+            _, topk_weights, topk_ids = get_moe_scores(
+                gate_out,
+                layer.n_group,
+                layer.topk_group,
+                layer.top_k,
+                layer.routed_scaling_factor,
+                layer.gate_correction_bias,
+            )
+        else:
+            topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
+                gate_out,
+                layer.gate_correction_bias,
+                layer.top_k,
+                True,  # apply_norm_weight
+                False,
+            )
 
         config = {
             "BLOCK_SIZE_M": 64,

From 4d1df7aacc0fc7c05657b16ca41c92a9334814 Mon Sep 17 00:00:00 2001
From: gaoziyuan
Date: Tue, 2 Sep 2025 19:27:38 +0800
Subject: [PATCH 2/3] fix: hoist get_moe_scores import, extend noaux_tc
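
Hoist the get_moe_scores import to module level so all Triton backends in
this file share it, and extend the noaux_tc branch added in the previous
commit to the remaining quantized apply() paths.

For context: "noaux_tc" is the aux-loss-free, group-limited top-k routing
used by DeepSeek-V3-style MoE gates. Below is a minimal NumPy sketch of
that selection rule under a hypothetical helper name (noaux_tc_topk); the
real get_moe_scores kernel in .ep may differ in score activation,
tie-breaking, and normalization details:

    import numpy as np

    def noaux_tc_topk(gate_out, n_group, topk_group, top_k,
                      routed_scaling_factor, correction_bias):
        # Sigmoid gate scores; the correction bias steers selection only,
        # which is what makes the method aux-loss-free.
        scores = 1.0 / (1.0 + np.exp(-gate_out))
        biased = scores + correction_bias
        num_tokens, num_experts = biased.shape
        groups = biased.reshape(num_tokens, n_group, -1)
        # Rank expert groups by the sum of each group's two strongest
        # experts, then keep only the topk_group best groups.
        group_scores = np.sort(groups, axis=-1)[..., -2:].sum(axis=-1)
        keep = np.argsort(-group_scores, axis=-1)[:, :topk_group]
        mask = np.full((num_tokens, n_group), -np.inf)
        np.put_along_axis(mask, keep, 0.0, axis=-1)
        masked = (groups + mask[:, :, None]).reshape(num_tokens, num_experts)
        # Final top-k over the experts that survived group selection.
        topk_ids = np.argsort(-masked, axis=-1)[:, :top_k]
        # Mixing weights come from the unbiased scores, renormalized and
        # rescaled by the routed scaling factor.
        topk_weights = np.take_along_axis(scores, topk_ids, axis=-1)
        topk_weights /= topk_weights.sum(axis=-1, keepdims=True)
        return masked, topk_weights * routed_scaling_factor, topk_ids

The three-value return mirrors how the call sites in the diff unpack
get_moe_scores; the first value is discarded there.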
---
 .../layers/moe/fused_moe_triton_backend.py | 54 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 16 deletions(-)

diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
index 079effe2fea..1b0e3a7cb7e 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
@@ -24,6 +24,7 @@
 from fastdeploy.utils import ceil_div
 
 from ..quantization.quant_base import QuantMethodBase
+from .ep import get_moe_scores
 
 try:
     from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess_func
@@ -167,13 +168,24 @@ def apply(
         moe_intermediate_size = layer.moe_intermediate_size
         hidden_size = layer.hidden_size
 
-        topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
-            gate_out,
-            layer.gate_correction_bias,
-            top_k,
-            True,  # apply_norm_weight,
-            False,
-        )
+        if layer.topk_method == "noaux_tc":
+            _, topk_weights, topk_ids = get_moe_scores(
+                gate_out,
+                layer.n_group,
+                layer.topk_group,
+                layer.top_k,
+                layer.routed_scaling_factor,
+                layer.gate_correction_bias,
+            )
+        else:
+            topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
+                gate_out,
+                layer.gate_correction_bias,
+                layer.top_k,
+                True,  # apply_norm_weight
+                False,
+            )
+
         up_gate_proj_out = paddle.empty(
             [token_num * top_k, moe_intermediate_size * 2],
             dtype=x.dtype,
@@ -419,13 +431,25 @@ def apply(
         moe_intermediate_size = layer.moe_intermediate_size
         hidden_size = layer.hidden_size
 
-        topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
-            gate_out,
-            layer.gate_correction_bias,
-            top_k,
-            True,  # apply_norm_weight,
-            False,
-        )
+        if layer.topk_method == "noaux_tc":
+
+            _, topk_weights, topk_ids = get_moe_scores(
+                gate_out,
+                layer.n_group,
+                layer.topk_group,
+                layer.top_k,
+                layer.routed_scaling_factor,
+                layer.gate_correction_bias,
+            )
+        else:
+
+            topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
+                gate_out,
+                layer.gate_correction_bias,
+                top_k,
+                True,  # apply_norm_weight,
+                False,
+            )
 
         up_gate_proj_out = paddle.empty(
             [token_num * top_k, moe_intermediate_size * 2],
@@ -830,8 +854,6 @@ def apply(
         N2 = getattr(layer, self.added_weight_attrs[1]).shape[1]
 
         if layer.topk_method == "noaux_tc":
-            from .ep import get_moe_scores
-
             _, topk_weights, topk_ids = get_moe_scores(
                 gate_out,
                 layer.n_group,

From 71b3288849b6502f1effafdf7e13d92aabe5bb28 Mon Sep 17 00:00:00 2001
From: gaoziyuan
Date: Tue, 2 Sep 2025 21:35:09 +0800
Subject: [PATCH 3/3] add data_parallel_size to rollout config

---
 fastdeploy/rl/rollout_config.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fastdeploy/rl/rollout_config.py b/fastdeploy/rl/rollout_config.py
index 82074b70c43..1fe7978681c 100644
--- a/fastdeploy/rl/rollout_config.py
+++ b/fastdeploy/rl/rollout_config.py
@@ -60,6 +60,7 @@ def __init__(
         early_stop_config: str = None,
         local_rank: int = 0,
         moba_attention_config: str = None,
+        data_parallel_size: int = 1,
     ):
         # Required parameters
         self.model = model_name_or_path
@@ -95,6 +96,7 @@ def __init__(
         self.splitwise_role = splitwise_role
         self.expert_parallel_size = expert_parallel_size
         self.enable_expert_parallel = enable_expert_parallel
+        self.data_parallel_size = data_parallel_size
         self.ori_vocab_size = ori_vocab_size
         self.quantization = quantization
         self.guided_decoding_backend = guided_decoding_backend
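
A usage note on the new knob, as a minimal sketch: it assumes the
constructor's remaining arguments are optional (only the tail of the
signature is visible in this hunk) and uses a placeholder model path:

    from fastdeploy.rl.rollout_config import RolloutModelConfig

    # Spread rollout across two data-parallel replicas; every other
    # argument keeps the default from the signature above.
    cfg = RolloutModelConfig(
        model_name_or_path="/path/to/model",  # placeholder path
        data_parallel_size=2,
    )
    assert cfg.data_parallel_size == 2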