From 6f6ddc203738e1831af9bc31174e2f884e1dda97 Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Fri, 23 Jan 2026 08:33:35 +0000
Subject: [PATCH 1/2] Updated reduce sum calculation to use einsum

Signed-off-by: asmigosw
---
 QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py
index 57bcb842d..cb6f9e5a7 100644
--- a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py
+++ b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py
@@ -402,9 +402,8 @@ def forward(self, hidden_states):
 
         # Apply routing weights AFTER expert computation
         experts_out = experts_out * router_top_value.unsqueeze(-1)
-        experts_out = experts_out.sum(dim=1)
-
-        return experts_out, router_logits
+        experts_out_sum = torch.einsum('bnd->bd', experts_out)
+        return experts_out_sum, router_logits
 
     def optimized_moe_forward(self, hidden_states: torch.Tensor):
         B, S, H = hidden_states.shape

From b1647357936e3278e8f6ea70fa771b76264206e8 Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Fri, 23 Jan 2026 08:47:48 +0000
Subject: [PATCH 2/2] Ruff format

Signed-off-by: asmigosw
---
 QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py
index cb6f9e5a7..96ea8055c 100644
--- a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py
+++ b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py
@@ -402,7 +402,7 @@ def forward(self, hidden_states):
 
         # Apply routing weights AFTER expert computation
         experts_out = experts_out * router_top_value.unsqueeze(-1)
-        experts_out_sum = torch.einsum('bnd->bd', experts_out)
+        experts_out_sum = torch.einsum("bnd->bd", experts_out)
         return experts_out_sum, router_logits
 
     def optimized_moe_forward(self, hidden_states: torch.Tensor):