From 1aced1a8a63438e9e2e15aa186fe5b48137edf1e Mon Sep 17 00:00:00 2001
From: ydshieh <ydshieh@users.noreply.github.com>
Date: Thu, 28 Nov 2024 12:52:31 +0100
Subject: [PATCH 1/9] Set norm epsilons to 1.0 in test_generate_with_static_cache to reduce flakiness

---
 tests/generation/test_utils.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index 76ab793e3a36..92a03128cb9c 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -1927,6 +1927,22 @@ def test_generate_with_static_cache(self):
             config, inputs_dict = self.prepare_config_and_inputs_for_generate()
             main_input = inputs_dict[model_class.main_input_name]
 
+            config.rms_norm_eps = 1.0
+            config.layer_norm_eps = 1.0
+            config.norm_eps = 1.0
+            config.norm_epsilon = 1.0
+            config.layer_norm_epsilon = 1.0
+
+            # norm layers (layer/group norm, etc.) could cause flaky tests when the tensors have very small variance.
+            # (We don't need the original epsilon values to check eager/sdpa matches)
+            for attr in ["text_config", "vision_config", "text_encoder", "audio_encoder", "decoder"]:
+                if hasattr(config, attr):
+                    getattr(config, attr).rms_norm_eps = 1.0
+                    getattr(config, attr).layer_norm_eps = 1.0
+                    getattr(config, attr).norm_eps = 1.0
+                    getattr(config, attr).norm_epsilon = 1.0
+                    getattr(config, attr).layer_norm_epsilon = 1.0
+
             if config.is_encoder_decoder:
                 self.skipTest(reason="This model is encoder-decoder and has Encoder-Decoder Cache")
 
@@ -1937,6 +1953,13 @@ def test_generate_with_static_cache(self):
 
             for dtype in (torch.float32, torch.float16):
                 model = model_class(config).to(torch_device).to(dtype).eval()
+
+                # Another way to make sure norm layers have desired epsilon. (Some models don't set it from its config.)
+                for x in model.modules():
+                    from torch import nn
+                    if isinstance(x, (nn.LayerNorm, nn.GroupNorm)) or type(x).__name__ == "GemmaRMSNorm":
+                        x.eps = 1.0
+
                 generation_kwargs = {
                     "max_new_tokens": max_new_tokens,
                     "return_dict_in_generate": True,  # Required to return `past_key_values`

From 1d4545b3a9ac19aa3ed3cb5752abe45e09874b37 Mon Sep 17 00:00:00 2001
From: ydshieh <ydshieh@users.noreply.github.com>
Date: Thu, 28 Nov 2024 13:30:04 +0100
Subject: [PATCH 2/9] Move less-flaky-test setup into shared helpers in testing_utils

---
 src/transformers/testing_utils.py | 37 +++++++++++++++++++++++++++++
 tests/generation/test_utils.py    | 28 +++++-----------------
 tests/test_modeling_common.py     | 39 ++++++-------------------------
 3 files changed, 50 insertions(+), 54 deletions(-)

diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 30f7b5a68fb2..0b6e176efc6f 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -14,6 +14,7 @@
 
 import collections
 import contextlib
+import copy
 import doctest
 import functools
 import gc
@@ -1387,6 +1388,42 @@ def assert_screenout(out, what):
     assert match_str != -1, f"expecting to find {what} in output: f{out_pr}"
 
 
+def set_model_tester_for_less_flaky_test(test_case):
+    if hasattr(test_case.model_tester, "num_hidden_layers"):
+        test_case.model_tester.num_hidden_layers = 1
+    if hasattr(test_case.model_tester, "vision_config") and "num_hidden_layers" in test_case.model_tester.vision_config:
+        test_case.model_tester.vision_config = copy.deepcopy(test_case.model_tester.vision_config)
+        test_case.model_tester.vision_config["num_hidden_layers"] = 1
+    if hasattr(test_case.model_tester, "text_config") and "num_hidden_layers" in test_case.model_tester.text_config:
+        test_case.model_tester.text_config = copy.deepcopy(test_case.model_tester.text_config)
+        test_case.model_tester.text_config["num_hidden_layers"] = 1
+
+
+def set_config_for_less_flaky_test(config):
+    config.rms_norm_eps = 1.0
+    config.layer_norm_eps = 1.0
+    config.norm_eps = 1.0
+    config.norm_epsilon = 1.0
+    config.layer_norm_epsilon = 1.0
+
+    # norm layers (layer/group norm, etc.) could cause flaky tests when the tensors have very small variance.
+    # (We don't need the original epsilon values to check eager/sdpa matches)
+    for attr in ["text_config", "vision_config", "text_encoder", "audio_encoder", "decoder"]:
+        if hasattr(config, attr):
+            getattr(config, attr).rms_norm_eps = 1.0
+            getattr(config, attr).layer_norm_eps = 1.0
+            getattr(config, attr).norm_eps = 1.0
+            getattr(config, attr).norm_epsilon = 1.0
+            getattr(config, attr).layer_norm_epsilon = 1.0
+
+
+def set_model_for_less_flaky_test(model):
+    # Another way to make sure norm layers have desired epsilon. (Some models don't set it from its config.)
+    for module in model.modules():
+        if type(module).__name__ in ["GemmaRMSNorm", "LayerNorm", "GroupNorm"]:
+            module.eps = 1.0
+
+
 class CaptureStd:
     """
     Context manager to capture:
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index 92a03128cb9c..13f527f88d5a 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -37,6 +37,9 @@
     require_torch_multi_accelerator,
     require_torch_multi_gpu,
     require_torch_sdpa,
+    set_config_for_less_flaky_test,
+    set_model_for_less_flaky_test,
+    set_model_tester_for_less_flaky_test,
     slow,
     torch_device,
 )
@@ -1920,29 +1923,15 @@ def test_generate_with_static_cache(self):
         Tests that generating with static cache give almost same results as with dynamic cache, and the output cache
         has the expected shapes
         """
+        set_model_tester_for_less_flaky_test(self)
         for model_class in self.all_generative_model_classes:
             if not model_class._supports_static_cache:
                 self.skipTest(reason="This model does not support the static cache format")
 
             config, inputs_dict = self.prepare_config_and_inputs_for_generate()
+            set_config_for_less_flaky_test(config)
             main_input = inputs_dict[model_class.main_input_name]
 
-            config.rms_norm_eps = 1.0
-            config.layer_norm_eps = 1.0
-            config.norm_eps = 1.0
-            config.norm_epsilon = 1.0
-            config.layer_norm_epsilon = 1.0
-
-            # norm layers (layer/group norm, etc.) could cause flaky tests when the tensors have very small variance.
-            # (We don't need the original epsilon values to check eager/sdpa matches)
-            for attr in ["text_config", "vision_config", "text_encoder", "audio_encoder", "decoder"]:
-                if hasattr(config, attr):
-                    getattr(config, attr).rms_norm_eps = 1.0
-                    getattr(config, attr).layer_norm_eps = 1.0
-                    getattr(config, attr).norm_eps = 1.0
-                    getattr(config, attr).norm_epsilon = 1.0
-                    getattr(config, attr).layer_norm_epsilon = 1.0
-
             if config.is_encoder_decoder:
                 self.skipTest(reason="This model is encoder-decoder and has Encoder-Decoder Cache")
 
@@ -1953,12 +1942,7 @@ def test_generate_with_static_cache(self):
 
             for dtype in (torch.float32, torch.float16):
                 model = model_class(config).to(torch_device).to(dtype).eval()
-
-                # Another way to make sure norm layers have desired epsilon. (Some models don't set it from its config.)
-                for x in model.modules():
-                    from torch import nn
-                    if isinstance(x, (nn.LayerNorm, nn.GroupNorm)) or type(x).__name__ == "GemmaRMSNorm":
-                        x.eps = 1.0
+                set_model_for_less_flaky_test(model)
 
                 generation_kwargs = {
                     "max_new_tokens": max_new_tokens,
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 99d0a8058c67..2c6e4198eb39 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -89,6 +89,9 @@
     require_torch_multi_accelerator,
     require_torch_multi_gpu,
     require_torch_sdpa,
+    set_config_for_less_flaky_test,
+    set_model_for_less_flaky_test,
+    set_model_tester_for_less_flaky_test,
     slow,
     torch_device,
 )
@@ -4006,34 +4009,11 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str):
         def get_mean_reldiff(failcase, x, ref, atol, rtol):
             return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"
 
-        if hasattr(self.model_tester, "num_hidden_layers"):
-            self.model_tester.num_hidden_layers = 1
-        if hasattr(self.model_tester, "vision_config") and "num_hidden_layers" in self.model_tester.vision_config:
-            self.model_tester.vision_config = copy.deepcopy(self.model_tester.vision_config)
-            self.model_tester.vision_config["num_hidden_layers"] = 1
-        if hasattr(self.model_tester, "text_config") and "num_hidden_layers" in self.model_tester.text_config:
-            self.model_tester.text_config = copy.deepcopy(self.model_tester.text_config)
-            self.model_tester.text_config["num_hidden_layers"] = 1
+        set_model_tester_for_less_flaky_test(self)
 
         for model_class in self.all_model_classes:
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-            config.rms_norm_eps = 1.0
-            config.layer_norm_eps = 1.0
-            config.norm_eps = 1.0
-            config.norm_epsilon = 1.0
-            config.layer_norm_epsilon = 1.0
-
-            # norm layers (layer/group norm, etc.) could cause flaky tests when the tensors have very small variance.
-            # (We don't need the original epsilon values to check eager/sdpa matches)
-            for attr in ["text_config", "vision_config", "text_encoder", "audio_encoder", "decoder"]:
-                if hasattr(config, attr):
-                    getattr(config, attr).rms_norm_eps = 1.0
-                    getattr(config, attr).layer_norm_eps = 1.0
-                    getattr(config, attr).norm_eps = 1.0
-                    getattr(config, attr).norm_epsilon = 1.0
-                    getattr(config, attr).layer_norm_epsilon = 1.0
-
+            set_config_for_less_flaky_test(config)
             model = model_class(config)
             # FIXME: we deactivate boolean mask for models using "use_mask_token" in their constructors.
             # These models support masking only in the case `use_mask_token=True`. Otherwise they cannot consume an input mask.
@@ -4054,13 +4034,8 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol):
                 )
                 model_eager = model_eager.eval().to(torch_device, dtype=torch_dtype)
 
-                # Another way to make sure norm layers have desired epsilon. (Some models don't set it from its config.)
-                for x in model_eager.modules():
-                    if isinstance(x, (nn.LayerNorm, nn.GroupNorm)):
-                        x.eps = 1.0
-                for x in model_sdpa.modules():
-                    if isinstance(x, (nn.LayerNorm, nn.GroupNorm)):
-                        x.eps = 1.0
+                set_model_for_less_flaky_test(model_eager)
+                set_model_for_less_flaky_test(model_sdpa)
 
                 # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 16 times the model,
                 # but it would be nicer to have an efficient way to use parameterized.expand

From 343201b38cd9c963a059315c3bcedf3ea3f2e5bc Mon Sep 17 00:00:00 2001
From: ydshieh <ydshieh@users.noreply.github.com>
Date: Thu, 28 Nov 2024 13:47:27 +0100
Subject: [PATCH 3/9] Wrap long vision_config condition in set_model_tester_for_less_flaky_test

---
 src/transformers/testing_utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 0b6e176efc6f..a4955ad8e9a5 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -1391,7 +1391,10 @@ def assert_screenout(out, what):
 def set_model_tester_for_less_flaky_test(test_case):
     if hasattr(test_case.model_tester, "num_hidden_layers"):
         test_case.model_tester.num_hidden_layers = 1
-    if hasattr(test_case.model_tester, "vision_config") and "num_hidden_layers" in test_case.model_tester.vision_config:
+    if (
+        hasattr(test_case.model_tester, "vision_config")
+        and "num_hidden_layers" in test_case.model_tester.vision_config
+    ):
         test_case.model_tester.vision_config = copy.deepcopy(test_case.model_tester.vision_config)
         test_case.model_tester.vision_config["num_hidden_layers"] = 1
     if hasattr(test_case.model_tester, "text_config") and "num_hidden_layers" in test_case.model_tester.text_config:

From ad72abb31335f218c16f764b6dbfa3073397ef1d Mon Sep 17 00:00:00 2001
From: ydshieh <ydshieh@users.noreply.github.com>
Date: Thu, 28 Nov 2024 15:50:00 +0100
Subject: [PATCH 4/9] Apply less-flaky-test helpers in SeamlessM4Tv2 test_speech_generation

---
 .../seamless_m4t_v2/test_modeling_seamless_m4t_v2.py     | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
index 451fff0b35fb..01f513f97843 100644
--- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
+++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
@@ -835,7 +835,12 @@ def test_generation_languages(self):
     def test_speech_generation(self):
         config, input_speech, input_text = self.prepare_speech_and_text_input()
 
+        from transformers.testing_utils import set_config_for_less_flaky_test, set_model_for_less_flaky_test
+        set_config_for_less_flaky_test(config)
+
         model = SeamlessM4Tv2Model(config=config)
+        set_model_for_less_flaky_test(model)
+
         self.update_generation(model)
         model.save_pretrained(self.tmpdirname)
         model.to(torch_device)
@@ -847,6 +852,8 @@ def test_speech_generation(self):
         state_dict = model.state_dict()
 
         text_model = SeamlessM4Tv2ForTextToSpeech.from_pretrained(self.tmpdirname)
+        set_model_for_less_flaky_test(text_model)
+
         self.update_generation(text_model)
         text_model.to(torch_device)
         text_model.eval()
@@ -854,6 +861,8 @@ def test_speech_generation(self):
         output_text = self.factory_generation_speech_test(model, input_text)
 
         speech_model = SeamlessM4Tv2ForSpeechToSpeech.from_pretrained(self.tmpdirname)
+        set_model_for_less_flaky_test(speech_model)
+
         self.update_generation(speech_model)
         speech_model.to(torch_device)
         speech_model.eval()

From c0ece15ca7fceb461b56604bdd5e6ebfc9aa8c05 Mon Sep 17 00:00:00 2001
From: ydshieh <ydshieh@users.noreply.github.com>
Date: Thu, 28 Nov 2024 16:24:03 +0100
Subject: [PATCH 5/9] Explain why reloaded SeamlessM4Tv2 models need set_model_for_less_flaky_test

---
 .../models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
index 01f513f97843..d7bae6562a0f 100644
--- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
+++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
@@ -852,6 +852,9 @@ def test_speech_generation(self):
         state_dict = model.state_dict()
 
         text_model = SeamlessM4Tv2ForTextToSpeech.from_pretrained(self.tmpdirname)
+        # Even if this component is loaded after `model.save_pretrained` which is after
+        # `set_model_for_less_flaky_test(model)`, we still need to apply `set_model_for_less_flaky_test` here as the
+        # `eps` attribute in the model's norm layers is not set from the config.
         set_model_for_less_flaky_test(text_model)
 
         self.update_generation(text_model)
@@ -861,6 +864,9 @@ def test_speech_generation(self):
         output_text = self.factory_generation_speech_test(model, input_text)
 
         speech_model = SeamlessM4Tv2ForSpeechToSpeech.from_pretrained(self.tmpdirname)
+        # Even if this component is loaded after `model.save_pretrained` which is after
+        # `set_model_for_less_flaky_test(model)`, we still need to apply `set_model_for_less_flaky_test` here as the
+        # `eps` attribute in the model's norm layers is not set from the config.
         set_model_for_less_flaky_test(speech_model)
 
         self.update_generation(speech_model)

From d360907825d41cbf14aace572b7d83b2b292be87 Mon Sep 17 00:00:00 2001
From: ydshieh <ydshieh@users.noreply.github.com>
Date: Thu, 28 Nov 2024 16:25:41 +0100
Subject: [PATCH 6/9] Add blank line after local import in test_speech_generation

---
 tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
index d7bae6562a0f..bba195ed1322 100644
--- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
+++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
@@ -836,6 +836,7 @@ def test_speech_generation(self):
         config, input_speech, input_text = self.prepare_speech_and_text_input()
 
         from transformers.testing_utils import set_config_for_less_flaky_test, set_model_for_less_flaky_test
+
         set_config_for_less_flaky_test(config)
 
         model = SeamlessM4Tv2Model(config=config)

From 9911f647502c12fc64d93a625d415122b715d829 Mon Sep 17 00:00:00 2001
From: ydshieh <ydshieh@users.noreply.github.com>
Date: Thu, 28 Nov 2024 16:44:34 +0100
Subject: [PATCH 7/9] Apply less-flaky-test helpers in MusicgenMelody eager/sdpa tests

---
 .../test_modeling_musicgen_melody.py              | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
index bc8baa2746ad..98b554be65fb 100644
--- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
+++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
@@ -41,6 +41,9 @@
     require_torch_gpu,
     require_torch_sdpa,
     require_torchaudio,
+    set_config_for_less_flaky_test,
+    set_model_for_less_flaky_test,
+    set_model_tester_for_less_flaky_test,
     slow,
     torch_device,
 )
@@ -516,8 +519,11 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str):
         def get_mean_reldiff(failcase, x, ref, atol, rtol):
             return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"
 
+        set_model_tester_for_less_flaky_test(self)
+
         for model_class in self.all_model_classes:
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            set_config_for_less_flaky_test(config)
             model = model_class(config)
 
             is_encoder_decoder = model.config.is_encoder_decoder
@@ -534,6 +540,9 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol):
                 )
                 model_eager = model_eager.eval().to(torch_device)
 
+                set_model_for_less_flaky_test(model_eager)
+                set_model_for_less_flaky_test(model_sdpa)
+
                 # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model,
                 # but it would be nicer to have an efficient way to use parameterized.expand
                 fail_cases = []
@@ -1528,8 +1537,11 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str):
         def get_mean_reldiff(failcase, x, ref, atol, rtol):
             return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"
 
+        set_model_tester_for_less_flaky_test(self)
+
         for model_class in self.all_model_classes:
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            set_config_for_less_flaky_test(config)
             model = model_class(config)
 
             is_encoder_decoder = model.config.is_encoder_decoder
@@ -1546,6 +1558,9 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol):
                 )
                 model_eager = model_eager.eval().to(torch_device)
 
+                set_model_for_less_flaky_test(model_eager)
+                set_model_for_less_flaky_test(model_sdpa)
+
                 # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model,
                 # but it would be nicer to have an efficient way to use parameterized.expand
                 fail_cases = []

From a8eddaa36b95e3677c1e27b0e1181865080d9f27 Mon Sep 17 00:00:00 2001
From: ydshieh <ydshieh@users.noreply.github.com>
Date: Tue, 10 Dec 2024 18:22:27 +0100
Subject: [PATCH 8/9] Generalize less-flaky-test helpers to more norm layers and epsilon names

---
 src/transformers/testing_utils.py | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index a4955ad8e9a5..aa078c3c40e3 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -1403,28 +1403,29 @@ def set_model_tester_for_less_flaky_test(test_case):
 
 
 def set_config_for_less_flaky_test(config):
-    config.rms_norm_eps = 1.0
-    config.layer_norm_eps = 1.0
-    config.norm_eps = 1.0
-    config.norm_epsilon = 1.0
-    config.layer_norm_epsilon = 1.0
+    target_attrs = ["rms_norm_eps", "layer_norm_eps", "norm_eps", "norm_epsilon", "layer_norm_epsilon", "batch_norm_eps"]
+    for target_attr in target_attrs:
+        setattr(config, target_attr, 1.0)
 
     # norm layers (layer/group norm, etc.) could cause flaky tests when the tensors have very small variance.
     # (We don't need the original epsilon values to check eager/sdpa matches)
-    for attr in ["text_config", "vision_config", "text_encoder", "audio_encoder", "decoder"]:
+    attrs = ["text_config", "vision_config", "text_encoder", "audio_encoder", "decoder"]
+    for attr in attrs:
         if hasattr(config, attr):
-            getattr(config, attr).rms_norm_eps = 1.0
-            getattr(config, attr).layer_norm_eps = 1.0
-            getattr(config, attr).norm_eps = 1.0
-            getattr(config, attr).norm_epsilon = 1.0
-            getattr(config, attr).layer_norm_epsilon = 1.0
+            for target_attr in target_attrs:
+                setattr(getattr(config, attr), target_attr, 1.0)
 
 
 def set_model_for_less_flaky_test(model):
     # Another way to make sure norm layers have desired epsilon. (Some models don't set it from its config.)
-    for module in model.modules():
-        if type(module).__name__ in ["GemmaRMSNorm", "LayerNorm", "GroupNorm"]:
-            module.eps = 1.0
+    target_names = ("LayerNorm", "GroupNorm", "BatchNorm", "RMSNorm", "BatchNorm2d", "BatchNorm1d")
+    target_attrs = ["eps", "epsilon", "variance_epsilon"]
+    if is_torch_available() and isinstance(model, torch.nn.Module):
+        for module in model.modules():
+            if type(module).__name__.endswith(target_names):
+                for attr in target_attrs:
+                    if hasattr(module, attr):
+                        setattr(module, attr, 1.0)
 
 
 class CaptureStd:

From 39a8850658a2d54f78fc84763918e8776f4398ee Mon Sep 17 00:00:00 2001
From: ydshieh <ydshieh@users.noreply.github.com>
Date: Tue, 10 Dec 2024 18:24:34 +0100
Subject: [PATCH 9/9] Reformat target_attrs list in set_config_for_less_flaky_test

---
 src/transformers/testing_utils.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index aa078c3c40e3..1c4c40212969 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -1403,7 +1403,14 @@ def set_model_tester_for_less_flaky_test(test_case):
 
 
 def set_config_for_less_flaky_test(config):
-    target_attrs = ["rms_norm_eps", "layer_norm_eps", "norm_eps", "norm_epsilon", "layer_norm_epsilon", "batch_norm_eps"]
+    target_attrs = [
+        "rms_norm_eps",
+        "layer_norm_eps",
+        "norm_eps",
+        "norm_epsilon",
+        "layer_norm_epsilon",
+        "batch_norm_eps",
+    ]
     for target_attr in target_attrs:
         setattr(config, target_attr, 1.0)