From 925936b9cd271bab1289bb1f3a84b227cb2c80ba Mon Sep 17 00:00:00 2001
From: DuyguA
Date: Thu, 18 Sep 2025 13:35:26 +0200
Subject: [PATCH 1/2] Add quick regression test for the issue

---
 tests/models/bert/test_modeling_bert.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py
index 8d33d9dc1b22..69909c435aea 100644
--- a/tests/models/bert/test_modeling_bert.py
+++ b/tests/models/bert/test_modeling_bert.py
@@ -723,6 +723,29 @@ def test_sdpa_ignored_mask(self):
             torch.allclose(res_eager.last_hidden_state, res_sdpa.last_hidden_state, atol=1e-5, rtol=1e-4)
         )
 
+    @slow
+    def test_sdpa_padding_to_nan(self):
+        # Test that the PyTorch SDPA implementation doesn't produce NaN for padded tokens in bfloat16
+        # See: https://github.com/huggingface/transformers/issues/31035
+        model_sdpa = BertModel.from_pretrained("hf-internal-testing/tiny-random-BertModel", attn_implementation="sdpa", dtype=torch.bfloat16)
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel")
+
+        model_sdpa = model_sdpa.eval()
+
+        input_texts = [
+            "I visited him yesterday.", # this one will get many pad tokens
+            "Red, blue, and more colors " + "red " * 40,
+        ]
+
+        inputs = tokenizer(input_texts, return_tensors="pt", padding=True)
+
+
+        with torch.no_grad():
+            outputs = model_sdpa(**inputs)
+
+        hidden_states = outputs.last_hidden_state
+        self.assertFalse(torch.isnan(hidden_states).any())
+
     @slow
     @pytest.mark.torch_export_test
     def test_export(self):

From 6950238c8e143c1a8cbf927233674df7665957ab Mon Sep 17 00:00:00 2001
From: DuyguA
Date: Thu, 18 Sep 2025 13:45:44 +0200
Subject: [PATCH 2/2] Style fix

---
 tests/models/bert/test_modeling_bert.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py
index 69909c435aea..3f29db17755f 100644
--- a/tests/models/bert/test_modeling_bert.py
+++ b/tests/models/bert/test_modeling_bert.py
@@ -726,22 +726,23 @@ def test_sdpa_ignored_mask(self):
     @slow
     def test_sdpa_padding_to_nan(self):
         # Test that the PyTorch SDPA implementation doesn't produce NaN for padded tokens in bfloat16
-        # See: https://github.com/huggingface/transformers/issues/31035 
-        model_sdpa = BertModel.from_pretrained("hf-internal-testing/tiny-random-BertModel", attn_implementation="sdpa", dtype=torch.bfloat16)
+        # See: https://github.com/huggingface/transformers/issues/31035
+        model_sdpa = BertModel.from_pretrained(
+            "hf-internal-testing/tiny-random-BertModel", attn_implementation="sdpa", dtype=torch.bfloat16
+        )
         tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel")
 
         model_sdpa = model_sdpa.eval()
 
         input_texts = [
-            "I visited him yesterday.", # this one will get many pad tokens
-            "Red, blue, and more colors " + "red " * 40, 
+            "I visited him yesterday.",  # this one will get many pad tokens
+            "Red, blue, and more colors " + "red " * 40,
         ]
 
         inputs = tokenizer(input_texts, return_tensors="pt", padding=True)
 
-
         with torch.no_grad():
-            outputs = model_sdpa(**inputs) 
+            outputs = model_sdpa(**inputs)
 
         hidden_states = outputs.last_hidden_state
         self.assertFalse(torch.isnan(hidden_states).any())
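
Note for reviewers: the failure mode this test guards against can be reproduced outside BertModel. Below is a minimal sketch, not part of the patch, using torch.nn.functional.scaled_dot_product_attention directly; the tensor shapes and the boolean mask are illustrative assumptions, not values taken from the model.

# Sketch of how a fully masked attention row can turn into NaN under SDPA:
# for a pad token whose row attends to nothing, the softmax inside
# scaled_dot_product_attention is taken over all -inf scores and yields NaN.
import torch
import torch.nn.functional as F

# (batch, heads, seq_len, head_dim); bfloat16 as in issue #31035
q = torch.randn(1, 1, 4, 8, dtype=torch.bfloat16)
k = torch.randn(1, 1, 4, 8, dtype=torch.bfloat16)
v = torch.randn(1, 1, 4, 8, dtype=torch.bfloat16)

# Boolean mask: True = attend. The last query position attends to nothing,
# mimicking a pad token whose entire row is masked out.
mask = torch.ones(1, 1, 4, 4, dtype=torch.bool)
mask[..., 3, :] = False

out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
# On backends that don't special-case fully masked rows, the padded row is NaN.
print(torch.isnan(out[..., 3, :]).any())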