diff --git a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py
index d63882215609..4dc2ad96c091 100644
--- a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py
+++ b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py
@@ -278,7 +278,7 @@ def __init__(self, config: Qwen3MoeConfig):
         self.experts = Qwen3MoeExperts(config)
         self.gate = Qwen3MoeTopKRouter(config)
 
-    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states_reshaped = hidden_states.view(-1, hidden_dim)
         _, routing_weights, selected_experts = self.gate(hidden_states_reshaped)
diff --git a/src/transformers/models/qwen3_moe/modular_qwen3_moe.py b/src/transformers/models/qwen3_moe/modular_qwen3_moe.py
index cf8741aafe2d..0fd5b451959c 100644
--- a/src/transformers/models/qwen3_moe/modular_qwen3_moe.py
+++ b/src/transformers/models/qwen3_moe/modular_qwen3_moe.py
@@ -66,7 +66,7 @@ def __init__(self, config: Qwen3MoeConfig):
         self.experts = Qwen3MoeExperts(config)
         self.gate = Qwen3MoeTopKRouter(config)
 
-    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states_reshaped = hidden_states.view(-1, hidden_dim)
         _, routing_weights, selected_experts = self.gate(hidden_states_reshaped)
diff --git a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py
index 5141ffc388c8..22529635689e 100644
--- a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py
+++ b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py
@@ -1415,7 +1415,7 @@ def __init__(self, config: Qwen3OmniMoeThinkerConfig):
         self.experts = Qwen3OmniMoeThinkerTextExperts(config)
         self.gate = Qwen3OmniMoeThinkerTextTopKRouter(config)
 
-    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states_reshaped = hidden_states.view(-1, hidden_dim)
         _, routing_weights, selected_experts = self.gate(hidden_states_reshaped)
diff --git a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
index 6d4c68c1a752..7170645a45aa 100644
--- a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
+++ b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
@@ -136,7 +136,7 @@ def __init__(self, config: Qwen3VLMoeTextConfig):
         self.experts = Qwen3VLMoeTextExperts(config)
         self.gate = Qwen3VLMoeTextTopKRouter(config)
 
-    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states_reshaped = hidden_states.view(-1, hidden_dim)
         _, routing_weights, selected_experts = self.gate(hidden_states_reshaped)
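
Note on the change: the corrected annotation matches the method body, which combines the expert outputs into a single hidden-states tensor and returns only that. Below is a minimal, self-contained sketch of the same routing pattern. The sizes (hidden_dim=8, num_experts=4, top_k=2) are hypothetical stand-ins for values the real blocks read from their configs, and a plain softmax/top-k linear router stands in for Qwen3MoeTopKRouter, which additionally returns the raw router logits.

import torch
import torch.nn as nn
import torch.nn.functional as F

class TinySparseMoeBlock(nn.Module):
    """Sketch of the Qwen3-style sparse MoE forward; names and sizes are hypothetical."""

    def __init__(self, hidden_dim: int = 8, num_experts: int = 4, top_k: int = 2):
        super().__init__()
        self.top_k = top_k
        # Stand-in for Qwen3MoeTopKRouter: a plain linear router.
        self.router = nn.Linear(hidden_dim, num_experts, bias=False)
        # Stand-in for Qwen3MoeExperts: one linear layer per expert.
        self.experts = nn.ModuleList(
            nn.Linear(hidden_dim, hidden_dim, bias=False) for _ in range(num_experts)
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Flatten (batch, seq, hidden) to (tokens, hidden), as in the diffed code.
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        flat = hidden_states.view(-1, hidden_dim)
        probs = F.softmax(self.router(flat), dim=-1)
        routing_weights, selected_experts = torch.topk(probs, self.top_k, dim=-1)
        out = torch.zeros_like(flat)
        for k in range(self.top_k):
            for e, expert in enumerate(self.experts):
                mask = selected_experts[:, k] == e
                if mask.any():
                    out[mask] += routing_weights[mask, k, None] * expert(flat[mask])
        # A single combined tensor is returned, matching `-> torch.Tensor`.
        return out.view(batch_size, sequence_length, hidden_dim)

# Usage: the output shape equals the input shape, and the return value is one tensor.
block = TinySparseMoeBlock()
x = torch.randn(2, 5, 8)
assert block(x).shape == x.shape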