From 925936b9cd271bab1289bb1f3a84b227cb2c80ba Mon Sep 17 00:00:00 2001
From: DuyguA
Date: Thu, 18 Sep 2025 13:35:26 +0200
Subject: [PATCH 1/2] Add quick regression test for the issue

---
 tests/models/bert/test_modeling_bert.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py
index 8d33d9dc1b22..69909c435aea 100644
--- a/tests/models/bert/test_modeling_bert.py
+++ b/tests/models/bert/test_modeling_bert.py
@@ -723,6 +723,29 @@ def test_sdpa_ignored_mask(self):
             torch.allclose(res_eager.last_hidden_state, res_sdpa.last_hidden_state, atol=1e-5, rtol=1e-4)
         )
 
+    @slow
+    def test_sdpa_padding_to_nan(self):
+        # Test that the PyTorch SDPA implementation doesn't produce NaN for padded tokens in bfloat16
+        # See: https://github.com/huggingface/transformers/issues/31035
+        model_sdpa = BertModel.from_pretrained("hf-internal-testing/tiny-random-BertModel", attn_implementation="sdpa", dtype=torch.bfloat16)
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel")
+
+        model_sdpa = model_sdpa.eval()
+
+        input_texts = [
+            "I visited him yesterday.", # this one will get many pad tokens
+            "Red, blue, and more colors " + "red " * 40,
+        ]
+
+        inputs = tokenizer(input_texts, return_tensors="pt", padding=True)
+
+
+        with torch.no_grad():
+            outputs = model_sdpa(**inputs)
+
+        hidden_states = outputs.last_hidden_state
+        self.assertFalse(torch.isnan(hidden_states).any())
+
     @slow
     @pytest.mark.torch_export_test
     def test_export(self):

From 6950238c8e143c1a8cbf927233674df7665957ab Mon Sep 17 00:00:00 2001
From: DuyguA
Date: Thu, 18 Sep 2025 13:45:44 +0200
Subject: [PATCH 2/2] Style fix

---
 tests/models/bert/test_modeling_bert.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py
index 69909c435aea..3f29db17755f 100644
--- a/tests/models/bert/test_modeling_bert.py
+++ b/tests/models/bert/test_modeling_bert.py
@@ -726,22 +726,23 @@ def test_sdpa_ignored_mask(self):
     @slow
     def test_sdpa_padding_to_nan(self):
         # Test that the PyTorch SDPA implementation doesn't produce NaN for padded tokens in bfloat16
-        # See: https://github.com/huggingface/transformers/issues/31035 
-        model_sdpa = BertModel.from_pretrained("hf-internal-testing/tiny-random-BertModel", attn_implementation="sdpa", dtype=torch.bfloat16)
+        # See: https://github.com/huggingface/transformers/issues/31035
+        model_sdpa = BertModel.from_pretrained(
+            "hf-internal-testing/tiny-random-BertModel", attn_implementation="sdpa", dtype=torch.bfloat16
+        )
         tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel")
 
         model_sdpa = model_sdpa.eval()
 
         input_texts = [
-            "I visited him yesterday.", # this one will get many pad tokens
-            "Red, blue, and more colors " + "red " * 40, 
+            "I visited him yesterday.",  # this one will get many pad tokens
+            "Red, blue, and more colors " + "red " * 40,
         ]
 
         inputs = tokenizer(input_texts, return_tensors="pt", padding=True)
 
-
         with torch.no_grad():
-            outputs = model_sdpa(**inputs) 
+            outputs = model_sdpa(**inputs)
 
         hidden_states = outputs.last_hidden_state
         self.assertFalse(torch.isnan(hidden_states).any())
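
Note for reviewers: the failure mode this test guards against can be reproduced outside BertModel. Below is a minimal sketch, not part of the patch, using torch.nn.functional.scaled_dot_product_attention directly; the tensor shapes and the boolean mask are illustrative assumptions, not values taken from the model.

# Sketch of how a fully masked attention row can turn into NaN under SDPA:
# for a pad token whose row attends to nothing, the softmax inside
# scaled_dot_product_attention is taken over all -inf scores and yields NaN.
import torch
import torch.nn.functional as F

# (batch, heads, seq_len, head_dim); bfloat16 as in issue #31035
q = torch.randn(1, 1, 4, 8, dtype=torch.bfloat16)
k = torch.randn(1, 1, 4, 8, dtype=torch.bfloat16)
v = torch.randn(1, 1, 4, 8, dtype=torch.bfloat16)

# Boolean mask: True = attend. The last query position attends to nothing,
# mimicking a pad token whose entire row is masked out.
mask = torch.ones(1, 1, 4, 4, dtype=torch.bool)
mask[..., 3, :] = False

out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
# On backends that don't special-case fully masked rows, the padded row is NaN.
print(torch.isnan(out[..., 3, :]).any())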