diff --git a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py index 3ccebbd34231..c63beb73fac9 100644 --- a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py @@ -925,7 +925,7 @@ def __init__(self, config: Qwen2_5OmniVisionEncoderConfig = None) -> None: self.k = nn.Linear(self.dim, self.dim, bias=True) self.v = nn.Linear(self.dim, self.dim, bias=True) self.proj = nn.Linear(self.dim, self.dim) - self.scaling = math.sqrt(self.head_dim) + self.scaling = self.head_dim**-0.5 self.num_key_value_groups = 1 # needed for eager attention self.config = config diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index ac134bd48372..9acc76c9afa0 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -1903,7 +1903,7 @@ def __init__(self, config: Qwen2_5OmniVisionEncoderConfig = None) -> None: self.k = nn.Linear(self.dim, self.dim, bias=True) self.v = nn.Linear(self.dim, self.dim, bias=True) self.proj = nn.Linear(self.dim, self.dim) - self.scaling = math.sqrt(self.head_dim) + self.scaling = self.head_dim**-0.5 self.num_key_value_groups = 1 # needed for eager attention self.config = config diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 0122aa37e025..ab318d955ffa 100644 --- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -24,7 +24,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math from dataclasses import dataclass from typing import Any, Callable, Optional, Union @@ -205,7 +204,7 @@ def __init__(self, config: Qwen2_5_VLVisionConfig) -> None: self.num_key_value_groups = 1 # needed for eager attention self.qkv = nn.Linear(self.dim, self.dim * 3, bias=True) self.proj = nn.Linear(self.dim, self.dim) - self.scaling = math.sqrt(self.head_dim) + self.scaling = self.head_dim**-0.5 self.config = config def forward( diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 3b3c460c0c6d..a799e7328e5d 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -19,7 +19,6 @@ # limitations under the License. """PyTorch Qwen2-VL model.""" -import math from dataclasses import dataclass from typing import Any, Callable, Optional, Union @@ -323,7 +322,7 @@ def __init__(self, config: Qwen2VLVisionConfig) -> None: self.num_key_value_groups = 1 # needed for eager attention self.qkv = nn.Linear(self.dim, self.dim * 3, bias=True) self.proj = nn.Linear(self.dim, self.dim) - self.scaling = math.sqrt(self.head_dim) + self.scaling = self.head_dim**-0.5 self.config = config def forward(