diff --git a/docs/source/ja/perf_train_gpu_many.md b/docs/source/ja/perf_train_gpu_many.md index 6721ba69a925..862171fecec4 100644 --- a/docs/source/ja/perf_train_gpu_many.md +++ b/docs/source/ja/perf_train_gpu_many.md @@ -472,8 +472,6 @@ FlexFlowは、サンプル-オペレータ-属性-パラメータの4D並列化 したがって、このフレームワークの約束は非常に魅力的です。選択したクラスタで30分間のシミュレーションを実行し、この特定の環境を最適に利用するための最良の戦略を提供します。部分を追加/削除/置換すると、それに対して実行して再最適化プランを作成します。その後、トレーニングできます。異なるセットアップには独自の最適化があります。 -🤗 Transformersの現在の状況: まだ統合されていません。すでに[transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py)を使用してモデルがFXトレース可能であるため、FlexFlowを動作させるために必要な手順を誰かが見つける必要があります。 - ## Which Strategy To Use When ここでは、どの並列化戦略をいつ使用するかの非常におおまかなアウトラインを示します。各リストの最初が通常よりも速いことが一般的です。 diff --git a/docs/source/ko/perf_train_gpu_many.md b/docs/source/ko/perf_train_gpu_many.md index 801e06e276e4..6f01eb22a344 100644 --- a/docs/source/ko/perf_train_gpu_many.md +++ b/docs/source/ko/perf_train_gpu_many.md @@ -476,8 +476,6 @@ https://huggingface.co/papers/2201.11990) 따라서 이 프레임워크의 장점은 선택한 클러스터에서 30분 동안 시뮬레이션을 실행하고 이 특정 환경을 최적으로 활용하기 위한 최상의 전략을 제안한다는 것입니다. 부품을 추가/제거/교체하면 실행하고 그에 대한 계획을 다시 최적화한 후 훈련할 수 있습니다. 다른 설정은 자체적인 사용자 정의 최적화를 가질 수 있습니다. -🤗 Transformers 현황: 아직 통합되지 않음. 이미 [transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py)를 통해 모델을 FX-추적할 수 있으며, 이는 FlexFlow의 선행 조건입니다. 따라서 어떤 작업을 수행해야 FlexFlow가 우리의 모델과 함께 작동할 수 있는지 파악해야 합니다. - ## 어떤 전략을 사용해야 할까요? [[which-strategy-to-use-when]] diff --git a/src/transformers/masking_utils.py b/src/transformers/masking_utils.py index c0aa98cdce0c..8f0a8a04fd60 100644 --- a/src/transformers/masking_utils.py +++ b/src/transformers/masking_utils.py @@ -23,7 +23,7 @@ from .configuration_utils import PreTrainedConfig from .utils import is_torch_xpu_available, logging from .utils.generic import GeneralInterface -from .utils.import_utils import is_torch_flex_attn_available, is_torch_greater_or_equal, is_torchdynamo_compiling +from .utils.import_utils import is_torch_flex_attn_available, is_torch_greater_or_equal, is_tracing if is_torch_flex_attn_available(): @@ -239,7 +239,6 @@ def _ignore_causal_mask_sdpa( allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is passed). """ - is_tracing = torch.jit.is_tracing() or isinstance(padding_mask, torch.fx.Proxy) or is_torchdynamo_compiling() if padding_mask is not None and padding_mask.shape[-1] > kv_length: mask_indices = torch.arange(kv_length, device=padding_mask.device) mask_indices += kv_offset @@ -250,7 +249,7 @@ def _ignore_causal_mask_sdpa( # which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108). Thus, we only set # `ignore_causal_mask = True` if we are not tracing if ( - not is_tracing + not is_tracing(padding_mask) # only cases when lower and upper diags are the same, see https://github.com/pytorch/pytorch/issues/108108 and (query_length == 1 or (kv_length == query_length or _is_torch_xpu_available)) # in this case we need to add special patterns to the mask so cannot be skipped otherwise @@ -275,11 +274,9 @@ def _ignore_bidirectional_mask_sdpa(padding_mask: Optional[torch.Tensor]) -> boo Detects whether the bidirectional mask can be ignored in case PyTorch's SDPA is used, i.e. when there is full attention with no padding. 
""" - is_tracing = torch.jit.is_tracing() or isinstance(padding_mask, torch.fx.Proxy) or is_torchdynamo_compiling() - # When using `torch.export` or `torch.onnx.dynamo_export`, we need to avoid to check the contents of the mask; # otherwise, we will encounter dynamic control flows - if not is_tracing and (padding_mask is None or padding_mask.all()): + if not is_tracing(padding_mask) and (padding_mask is None or padding_mask.all()): return True return False diff --git a/src/transformers/modeling_attn_mask_utils.py b/src/transformers/modeling_attn_mask_utils.py index 0be1b3ed2553..c2b92c2e5eed 100755 --- a/src/transformers/modeling_attn_mask_utils.py +++ b/src/transformers/modeling_attn_mask_utils.py @@ -22,7 +22,7 @@ import torch -from .utils.import_utils import is_torchdynamo_compiling +from .utils.import_utils import is_torchdynamo_compiling, is_tracing @dataclass @@ -267,7 +267,7 @@ def _ignore_causal_mask_sdpa( _, query_length = inputs_embeds.shape[0], inputs_embeds.shape[1] key_value_length = query_length + past_key_values_length - is_tracing = torch.jit.is_tracing() or isinstance(inputs_embeds, torch.fx.Proxy) or is_torchdynamo_compiling() + is_tracing_ = is_tracing(inputs_embeds) ignore_causal_mask = False @@ -283,7 +283,7 @@ def _ignore_causal_mask_sdpa( # Besides, jit.trace can not handle the `q_len > 1` condition for `is_causal` # ("TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor"). if ( - (is_training or not is_tracing) + (is_training or not is_tracing_) and (query_length == 1 or key_value_length == query_length) and (sliding_window is None or key_value_length < sliding_window) ): @@ -291,7 +291,7 @@ def _ignore_causal_mask_sdpa( elif sliding_window is None or key_value_length < sliding_window: if len(attention_mask.shape) == 4: return False - elif not is_tracing and torch.all(attention_mask == 1): + elif not is_tracing_ and torch.all(attention_mask == 1): if query_length == 1 or key_value_length == query_length: # For query_length == 1, causal attention and bi-directional attention are the same. ignore_causal_mask = True @@ -379,7 +379,7 @@ def _prepare_4d_causal_attention_mask_for_sdpa( # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True are unable to capture the controlflow `is_causal=attention_mask is None and q_len > 1` # used as an SDPA argument. We keep compatibility with these tracing tools by always using SDPA's `attn_mask` argument in case we are tracing. # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400). - is_tracing = torch.jit.is_tracing() or isinstance(inputs_embeds, torch.fx.Proxy) or is_torchdynamo_compiling() + is_tracing_ = is_tracing(inputs_embeds) ignore_causal_mask = AttentionMaskConverter._ignore_causal_mask_sdpa( attention_mask=attention_mask, @@ -408,7 +408,7 @@ def _prepare_4d_causal_attention_mask_for_sdpa( # Attend to all tokens in masked rows from the causal_mask, for example the relevant first rows when # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
# Details: https://github.com/pytorch/pytorch/issues/110213 - if not is_tracing and expanded_4d_mask.device.type == "cuda": + if not is_tracing_ and expanded_4d_mask.device.type == "cuda": expanded_4d_mask = AttentionMaskConverter._unmask_unattended( expanded_4d_mask, min_dtype=torch.finfo(inputs_embeds.dtype).min ) @@ -448,10 +448,8 @@ def _prepare_4d_attention_mask_for_sdpa(mask: torch.Tensor, dtype: torch.dtype, _, key_value_length = mask.shape tgt_len = tgt_len if tgt_len is not None else key_value_length - is_tracing = torch.jit.is_tracing() or isinstance(mask, torch.fx.Proxy) or is_torchdynamo_compiling() - # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True are unable to capture data-dependent controlflows. - if not is_tracing and torch.all(mask == 1): + if not is_tracing(mask) and torch.all(mask == 1): return None else: return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 7ab90f1433fc..4e9db43e1b66 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -120,8 +120,7 @@ ENV_VARS_TRUE_VALUES, is_huggingface_hub_greater_or_equal, is_sagemaker_mp_enabled, - is_torch_fx_proxy, - is_torchdynamo_compiling, + is_tracing, ) from .utils.quantization_config import QuantizationMethod @@ -4946,7 +4945,7 @@ def warn_if_padding_and_no_attention_mask(self, input_ids, attention_mask): """ # Skip the check during tracing. - if is_torch_fx_proxy(input_ids) or torch.jit.is_tracing() or is_torchdynamo_compiling(): + if is_tracing(input_ids): return if (attention_mask is not None) or (self.config.pad_token_id is None): diff --git a/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py index b7b8fdb4ce87..a0aa6c8b5c17 100644 --- a/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py +++ b/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py @@ -22,7 +22,7 @@ from ....cache_utils import Cache from ....modeling_outputs import MoECausalLMOutputWithPast, MoEModelOutputWithPastAndCrossAttentions from ....modeling_utils import PreTrainedModel -from ....utils import DUMMY_INPUTS, DUMMY_MASK, auto_docstring, is_torch_fx_proxy +from ....utils import DUMMY_INPUTS, DUMMY_MASK, auto_docstring from .configuration_gptsan_japanese import GPTSanJapaneseConfig @@ -593,15 +593,9 @@ def _shift_right(self, input_ids): "See T5 docs for more information." ) - # shift inputs to the right - if is_torch_fx_proxy(input_ids): - # Item assignment is not supported natively for proxies. 
- shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) - shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) - else: - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() - shifted_input_ids[..., 0] = decoder_start_token_id + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id if pad_token_id is None: raise ValueError("self.model.config.pad_token_id has to be defined.") diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index a046938c52e5..d758b0529d86 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -23,7 +23,7 @@ from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache from ...generation import GenerationMixin -from ...modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask +from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import ( @@ -54,11 +54,6 @@ from ...modeling_flash_attention_utils import _flash_attention_forward -# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. -# It means that the function will not be traced through and simply appear as a node in the graph. -_prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) - - logger = logging.get_logger(__name__) diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index ae04d2435fc4..24d3322ad658 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -17,7 +17,6 @@ from typing import Optional, Union import torch -import torch.fx from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss @@ -34,12 +33,7 @@ SequenceClassifierOutputWithPast, ) from ...modeling_utils import PreTrainedModel -from ...utils import ( - auto_docstring, - is_torch_flex_attn_available, - is_torch_fx_proxy, - logging, -) +from ...utils import auto_docstring, is_torch_flex_attn_available, logging from .configuration_gptj import GPTJConfig @@ -62,7 +56,6 @@ def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor: return torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1) -@torch.fx.wrap def get_embed_positions(embed_positions, position_ids): return embed_positions.to(position_ids.device).repeat(position_ids.shape[0], 1, 1) @@ -198,12 +191,7 @@ def forward( key = self._split_heads(key, self.num_attention_heads, self.head_dim, True) value = self._split_heads(value, self.num_attention_heads, self.head_dim, False) - if is_torch_fx_proxy(position_ids) or torch.jit.is_tracing(): - # The logic to conditionally copy to GPU could not be traced, so we do this - # every time in the torch.fx case - embed_positions = get_embed_positions(self.embed_positions, position_ids) - else: - embed_positions = self._get_embed_positions(position_ids) + embed_positions = self._get_embed_positions(position_ids) repeated_position_ids = position_ids.unsqueeze(-1).repeat(1, 1, embed_positions.shape[-1]) sincos = 
torch.gather(embed_positions, 1, repeated_position_ids).to(key.dtype) @@ -283,12 +271,7 @@ def forward( key = self._split_heads(key, self.num_attention_heads, self.head_dim, True) value = self._split_heads(value, self.num_attention_heads, self.head_dim, False) - if is_torch_fx_proxy(position_ids) or torch.jit.is_tracing(): - # The logic to conditionally copy to GPU could not be traced, so we do this - # every time in the torch.fx case - embed_positions = get_embed_positions(self.embed_positions, position_ids) - else: - embed_positions = self._get_embed_positions(position_ids) + embed_positions = self._get_embed_positions(position_ids) repeated_position_ids = position_ids.unsqueeze(-1).repeat(1, 1, embed_positions.shape[-1]) sincos = torch.gather(embed_positions, 1, repeated_position_ids).to(key.dtype) diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py index 0316d87ab6c3..af245b86220b 100644 --- a/src/transformers/models/hiera/modeling_hiera.py +++ b/src/transformers/models/hiera/modeling_hiera.py @@ -1041,7 +1041,6 @@ def apply_fusion_head(self, head: nn.Module, hidden_states: torch.Tensor) -> tor if isinstance(head, nn.Identity): return hidden_states - # Doing explicit to avoid problems with torch.fx batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size = hidden_states.shape # From: [batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size] # To: head([batch_size * num_mask_units, hidden_size, mask_unit_height, mask_unit_width]) diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py index f9c3ef714680..c347f905d5da 100644 --- a/src/transformers/models/longt5/modeling_longt5.py +++ b/src/transformers/models/longt5/modeling_longt5.py @@ -39,7 +39,6 @@ DUMMY_MASK, auto_docstring, is_torch_flex_attn_available, - is_torch_fx_proxy, is_torchdynamo_compiling, logging, ) @@ -1259,15 +1258,9 @@ def _shift_right(self, input_ids): "See LongT5 docs for more information." ) - # shift inputs to the right - if is_torch_fx_proxy(input_ids): - # Item assignment is not supported natively for proxies. - shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) - shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) - else: - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() - shifted_input_ids[..., 0] = decoder_start_token_id + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id if pad_token_id is None: raise ValueError("self.model.config.pad_token_id has to be defined.") diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index bc5de0b65966..d1268c609446 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -42,7 +42,6 @@ DUMMY_MASK, auto_docstring, is_torch_flex_attn_available, - is_torch_fx_proxy, is_torchdynamo_compiling, logging, ) @@ -631,15 +630,9 @@ def _shift_right(self, input_ids): "See MT5 docs for more information." ) - # shift inputs to the right - if is_torch_fx_proxy(input_ids): - # Item assignment is not supported natively for proxies. 
- shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) - shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) - else: - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() - shifted_input_ids[..., 0] = decoder_start_token_id + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id if pad_token_id is None: raise ValueError("self.model.config.pad_token_id has to be defined.") diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index e38176b3f1ed..2d2ae7b7c16b 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -38,7 +38,6 @@ DUMMY_MASK, auto_docstring, is_torch_flex_attn_available, - is_torch_fx_proxy, is_torchdynamo_compiling, logging, ) @@ -440,15 +439,9 @@ def _shift_right(self, input_ids): "See Pix2Struct docs for more information." ) - # shift inputs to the right - if is_torch_fx_proxy(input_ids): - # Item assignment is not supported natively for proxies. - shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) - shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) - else: - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() - shifted_input_ids[..., 0] = decoder_start_token_id + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id if pad_token_id is None: raise ValueError("self.model.config.pad_token_id has to be defined.") diff --git a/src/transformers/models/pop2piano/modeling_pop2piano.py b/src/transformers/models/pop2piano/modeling_pop2piano.py index 20f34c8d4d80..8386e7c47727 100644 --- a/src/transformers/models/pop2piano/modeling_pop2piano.py +++ b/src/transformers/models/pop2piano/modeling_pop2piano.py @@ -31,7 +31,7 @@ from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, Seq2SeqLMOutput from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, is_torch_flex_attn_available, is_torch_fx_proxy, is_torchdynamo_compiling, logging +from ...utils import auto_docstring, is_torch_flex_attn_available, is_torchdynamo_compiling, logging from .configuration_pop2piano import Pop2PianoConfig @@ -593,15 +593,9 @@ def _shift_right(self, input_ids): "self.model.config.decoder_start_token_id has to be defined. In Pop2Piano it is usually set to the pad_token_id." ) - # shift inputs to the right - if is_torch_fx_proxy(input_ids): - # Item assignment is not supported natively for proxies. 
- shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) - shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) - else: - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() - shifted_input_ids[..., 0] = decoder_start_token_id + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id if pad_token_id is None: raise ValueError("self.model.config.pad_token_id has to be defined.") diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py index 3b6041c9d046..6abf3a0599ca 100644 --- a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py @@ -30,7 +30,7 @@ from ...modeling_rope_utils import dynamic_rope_update from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, logging -from ...utils.import_utils import is_torchdynamo_compiling +from ...utils.import_utils import is_tracing from .configuration_recurrent_gemma import RecurrentGemmaConfig @@ -362,8 +362,7 @@ def forward( # Apply gamma normalization to the input. We need to clip the derivatives of # `sqrt` in order to prevent NaNs during training in bfloat16. TODO a bit annoying multiplier = 1 - tracing = isinstance(activations, torch.fx.Proxy) or is_torchdynamo_compiling() - if not torch.jit.is_tracing() and not tracing: + if not is_tracing(activations): multiplier = SqrtBoundDerivative.apply(1 - a_square) multiplier = reset + ~reset * multiplier normalized_x = gated_inputs * multiplier.type(activations.dtype) diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index 008cd020f528..29f5e9c2c99a 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -44,7 +44,6 @@ TransformersKwargs, auto_docstring, is_torch_flex_attn_available, - is_torch_fx_proxy, is_torchdynamo_compiling, logging, ) @@ -636,15 +635,9 @@ def _shift_right(self, input_ids): " to the pad_token_id. See SwitchTransformers docs for more information" ) - # shift inputs to the right - if is_torch_fx_proxy(input_ids): - # Item assignment is not supported natively for proxies. 
- shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) - shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) - else: - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() - shifted_input_ids[..., 0] = decoder_start_token_id + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id if pad_token_id is None: raise ValueError("self.model.config.pad_token_id has to be defined.") diff --git a/src/transformers/models/switch_transformers/modular_switch_transformers.py b/src/transformers/models/switch_transformers/modular_switch_transformers.py index ccb1fe739bd0..274dc6ca44b7 100644 --- a/src/transformers/models/switch_transformers/modular_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modular_switch_transformers.py @@ -37,7 +37,6 @@ TransformersKwargs, auto_docstring, is_torch_flex_attn_available, - is_torch_fx_proxy, is_torchdynamo_compiling, logging, ) @@ -392,15 +391,9 @@ def _shift_right(self, input_ids): " to the pad_token_id. See SwitchTransformers docs for more information" ) - # shift inputs to the right - if is_torch_fx_proxy(input_ids): - # Item assignment is not supported natively for proxies. - shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) - shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) - else: - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() - shifted_input_ids[..., 0] = decoder_start_token_id + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id if pad_token_id is None: raise ValueError("self.model.config.pad_token_id has to be defined.") diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 366337dae99f..2494964d296b 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -42,7 +42,6 @@ DUMMY_MASK, auto_docstring, is_torch_flex_attn_available, - is_torch_fx_proxy, is_torchdynamo_compiling, logging, ) @@ -636,15 +635,9 @@ def _shift_right(self, input_ids): "See T5 docs for more information." ) - # shift inputs to the right - if is_torch_fx_proxy(input_ids): - # Item assignment is not supported natively for proxies. 
- shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) - shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) - else: - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() - shifted_input_ids[..., 0] = decoder_start_token_id + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id if pad_token_id is None: raise ValueError("self.model.config.pad_token_id has to be defined.") diff --git a/src/transformers/models/umt5/modeling_umt5.py b/src/transformers/models/umt5/modeling_umt5.py index a3be7ad08468..a1873b99f5cd 100644 --- a/src/transformers/models/umt5/modeling_umt5.py +++ b/src/transformers/models/umt5/modeling_umt5.py @@ -42,7 +42,6 @@ DUMMY_MASK, auto_docstring, is_torch_flex_attn_available, - is_torch_fx_proxy, is_torchdynamo_compiling, logging, ) @@ -579,15 +578,9 @@ def _shift_right(self, input_ids): "See UMT5 docs for more information." ) - # shift inputs to the right - if is_torch_fx_proxy(input_ids): - # Item assignment is not supported natively for proxies. - shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) - shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) - else: - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() - shifted_input_ids[..., 0] = decoder_start_token_id + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id if pad_token_id is None: raise ValueError("self.model.config.pad_token_id has to be defined.") diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py index 32f3cdab3363..c5a59fe8b3d9 100755 --- a/src/transformers/models/xglm/modeling_xglm.py +++ b/src/transformers/models/xglm/modeling_xglm.py @@ -92,7 +92,6 @@ def forward(self, position_ids: Optional[torch.Tensor] = None, past_key_values_l bsz, seq_len = position_ids.size() position_ids += self.offset - # Expand embeddings if needed. `position_ids.max()` is NOT used to keep torch.fx compatibility. max_pos = 2 + seq_len + past_key_values_length if max_pos > self.weights.size(0): self.make_weights(max_pos, self.embedding_dim, self.padding_idx) diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 82a9e3a85bd1..ad98aa9e2189 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -242,6 +242,7 @@ is_torchdynamo_exporting, is_torchvision_available, is_torchvision_v2_available, + is_tracing, is_training_run_on_sagemaker, is_triton_available, is_uroman_available, diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py deleted file mode 100755 index 011930f72f48..000000000000 --- a/src/transformers/utils/fx.py +++ /dev/null @@ -1,1503 +0,0 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import builtins -import collections -import contextlib -import functools -import inspect -import math -import operator -import os -import random -import sys -import warnings -from collections.abc import Callable -from typing import Any, Literal - -import torch -import torch.utils._pytree as pytree -from torch import nn -from torch.fx import Graph, GraphModule, Node, Proxy, Tracer -from torch.fx._compatibility import compatibility -from torch.fx._symbolic_trace import is_fx_tracing -from torch.fx.proxy import ParameterProxy - -from .. import logging -from ..cache_utils import Cache, DynamicCache, StaticCache -from ..modeling_utils import PreTrainedConfig, PreTrainedModel -from ..models.auto import get_values -from ..models.auto.modeling_auto import ( - MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_BACKBONE_MAPPING_NAMES, - MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, - MODEL_FOR_CTC_MAPPING_NAMES, - MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_IMAGE_MAPPING_NAMES, - MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES, - MODEL_FOR_MASKED_LM_MAPPING_NAMES, - MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES, - MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES, - MODEL_FOR_PRETRAINING_MAPPING_NAMES, - MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, - MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, - MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES, - MODEL_MAPPING_NAMES, -) -from .import_utils import ( - ENV_VARS_TRUE_VALUES, - is_peft_available, -) - - -if is_peft_available(): - from peft import PeftModel - - -logger = logging.get_logger(__name__) -_IS_IN_DEBUG_MODE = os.environ.get("FX_DEBUG_MODE", "").upper() in ENV_VARS_TRUE_VALUES - - -def _generate_supported_model_class_names( - model_name: type[PreTrainedConfig], - supported_tasks: str | list[str] | None = None, -) -> list[str]: - task_mapping = { - "default": MODEL_MAPPING_NAMES, - "pretraining": MODEL_FOR_PRETRAINING_MAPPING_NAMES, - "next-sentence-prediction": MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES, - "masked-lm": MODEL_FOR_MASKED_LM_MAPPING_NAMES, - "causal-lm": MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, - "seq2seq-lm": MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, - "speech-seq2seq": MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, - "multiple-choice": MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES, - "document-question-answering": MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, - "question-answering": MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, - "sequence-classification": MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, - "token-classification": MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, - "masked-image-modeling": MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES, - "image-classification": MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, - "zero-shot-image-classification": MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES, - "ctc": 
MODEL_FOR_CTC_MAPPING_NAMES, - "audio-classification": MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, - "semantic-segmentation": MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, - "backbone": MODEL_FOR_BACKBONE_MAPPING_NAMES, - "image-feature-extraction": MODEL_FOR_IMAGE_MAPPING_NAMES, - } - - if supported_tasks is None: - supported_tasks = task_mapping.keys() - if isinstance(supported_tasks, str): - supported_tasks = [supported_tasks] - - model_class_names = [] - for task in supported_tasks: - class_name = task_mapping[task].get(model_name, None) - if class_name: - model_class_names.append(class_name) - - return model_class_names - - -_REGULAR_SUPPORTED_MODEL_NAMES_AND_TASKS = [ - "altclip", - "albert", - "bart", - "bert", - "bitnet", - "blenderbot", - "blenderbot-small", - "bloom", - "clip", - "convnext", - "deberta", - "deberta-v2", - "dinov2", - "dinov3_convnext", - "dinov3_vit", - "distilbert", - "donut-swin", - "electra", - "gpt2", - "gpt_neo", - "gptj", - "hiera", - "hubert", - "ijepa", - "layoutlm", - "llama", - "cohere", - "lxmert", - "m2m_100", - "marian", - "mbart", - "megatron-bert", - "ministral", - "mistral", - "mixtral", - "mobilebert", - "mt5", - "nezha", - "opt", - "pegasus", - "plbart", - "qwen2", - "qwen2_moe", - "qwen3", - "qwen3_next", - "qwen3_moe", - "resnet", - "roberta", - "segformer", - "speech_to_text", - "speech_to_text_2", - "swin", - "t5", - "trocr", - "vit", - "vjepa2", - "xglm", - "wav2vec2", - # "xlnet", -] - -_FX_SUPPORTED_MODELS_WITH_KV_CACHE = ["llama", "opt"] - -_REGULAR_SUPPORTED_MODELS = [] -for item in _REGULAR_SUPPORTED_MODEL_NAMES_AND_TASKS: - if isinstance(item, dict): - _REGULAR_SUPPORTED_MODELS.extend(_generate_supported_model_class_names(**item)) - else: - _REGULAR_SUPPORTED_MODELS.extend(_generate_supported_model_class_names(item)) - -_SPECIAL_SUPPORTED_MODELS = [ - "CLIPTextModel", - "CLIPTextModelWithProjection", - "CLIPVisionModel", - "CLIPVisionModelWithProjection", - "AltCLIPTextModel", - "AltCLIPVisionModel", - "GitVisionModel", - "GPT2DoubleHeadsModel", - "Speech2Text2Decoder", - "TrOCRDecoder", - "PeftModelForCausalLM", - "PeftModelForSeq2SeqLM", - "VJEPA2ForVideoClassification", - # TODO: add support for them as it should be quite easy to do so (small blocking issues). 
- # XLNetForQuestionAnswering, -] -_SUPPORTED_MODELS = tuple(sorted(set(_REGULAR_SUPPORTED_MODELS + _SPECIAL_SUPPORTED_MODELS))) - -_CURRENT_TRACER = None - - -def torch_nn_embedding(self, input): - return torch.empty(*input.shape, self.weight.shape[-1], device="meta", dtype=self.weight.dtype) - - -def torch_nn_functional_embedding( - input, weight, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False -): - return torch.empty(*input.shape, weight.shape[-1], device="meta", dtype=weight.dtype) - - -def torch_nn_layernorm(self, input): - return input - - -def torch_nn_groupnorm(self, input): - return input - - -def torch_nn_linear(self, input): - return torch.empty(input.shape[:-1] + (self.out_features,), device="meta") - - -def torch_relu(x): - return x - - -def torch_nn_relu(self, x): - return x - - -def torch_nn_functional_relu(x, inplace=False): - if not inplace: - raise ValueError("Don't support in-place functional.relu for MetaTensor analysis") - return x - - -def torch_where(condition, x, y): - # torch.where returns the broadcasted tensor of condition, x, and y, - # so hack it by using addition - return condition.to(device="meta") + x.to(device="meta") + y.to(device="meta") - - -def torch_abs(input, *, out=None): - if out is not None: - raise ValueError("Don't support in-place abs for MetaTensor analysis") - return input - - -def torch_arange(*args, **kwargs): - n = len(args) - step = 1 - if n == 1: - start = 0 - end = args[0] - elif n == 2: - start, end = args - else: - start, end, step = args - if isinstance(start, float): - start = int(start) - if isinstance(end, float): - start = int(end) - if isinstance(step, float): - step = int(step) - step = kwargs.get("step", step) - dtype = kwargs.get("dtype") - return torch.empty((end - start) // step, dtype=dtype, device="meta") - - -def torch_full(*args, **kwargs): - args = list(args) - # We set the fill value to 1 as its value is not important as long as it's not a tensor on the `meta` device. 
- if len(args) > 1: - args[1] = 1 - else: - kwargs["fill_value"] = 1 - kwargs_without_device = dict(kwargs) - kwargs_without_device.pop("device", None) - return torch.full(*args, **kwargs_without_device, device="meta") - - -def torch_cat(tensors, dim=None, axis=None, *, out=None): - if dim is None and axis is None: - dim = 0 - if dim is None and axis is not None: - dim = axis - if dim < 0: - dim = tensors[0].dim() + dim - shapes = [t.shape for t in tensors] - shape = list(shapes[0]) - concatenated_dim = sum(shape[dim] for shape in shapes) - final_shape = shape[:dim] + [concatenated_dim] + shape[dim + 1 :] - return torch.empty(final_shape, device="meta") - - -def torch_stack(tensors, dim=None, axis=None, *, out=None): - if dim is None and axis is None: - dim = 0 - if dim is None and axis is not None: - dim = axis - if dim < 0: - dim = tensors[0].dim() + 1 + dim - shape = list(tensors[0].shape) - shape.insert(dim, len(tensors)) - return torch.empty(shape, device="meta") - - -def torch_add(input, other, *, alpha=1, out=None): - if not isinstance(input, torch.Tensor): - return torch.empty_like(other, device="meta") - if not isinstance(other, torch.Tensor): - return torch.empty_like(input, device="meta") - max_length = max(input.dim(), other.dim()) - input_shape = list(input.shape) + [1] * (max_length - input.dim()) - other_shape = list(other.shape) + [1] * (max_length - other.dim()) - shape = [] - for i in range(max_length): - shape.append(max(input_shape[i], other_shape[i])) - return torch.empty(shape, device="meta") - - -def torch_mul(input, other, *, out=None): - return torch_add(input, other, out=out) - - -def torch_tensor_mul(self, other): - return torch_mul(self, other) - - -def torch_matmul(input, other, *, out=None): - d1 = input.dim() - d2 = other.dim() - shape = None - if d1 == 1 and d2 == 1: - shape = None - elif d1 == 2 and d2 == 2: - shape = (input.size(0), other.size(1)) - elif d1 == 1 and d2 == 2: - shape = (other.size(1),) - elif d1 == 2 and d1 == 1: - shape = (input.size(0),) - else: - max_length = max(input.dim(), other.dim()) - shape1 = list(input.shape) - shape2 = list(other.shape) - if d1 == 1: - shape1 = [1] + shape1 - if d2 == 1: - shape2.append(1) - shape1 = [-1] * (max_length - d1) + list(input.shape) - shape2 = [-1] * (max_length - d2) + list(other.shape) - shape = [] - for i in range(max_length): - shape.append(max(shape1[i], shape2[i])) - shape[-2] = shape1[-2] - shape[-1] = shape2[-1] - if d1 == 1: - shape.pop(-2) - if d2 == 1: - shape.pop(-1) - if shape is None: - return torch.tensor(0.0, device="meta") - return torch.empty(*shape, device="meta") - - -def torch_bmm(input, mat2, *, out=None): - if out is not None: - raise ValueError("Don't support in-place bmm for MetaTensor analysis") - batch_size, n, m = input.shape - _, _, p = mat2.shape - return torch.empty(batch_size, n, p, device="meta") - - -def torch_baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None): - if out is not None: - raise ValueError("Don't support in-place baddbmm for MetaTensor analysis") - return torch_bmm(batch1, batch2) - - -def torch_tensor_baddbmm(self, batch1, batch2, *, beta=1, alpha=1, out=None): - return torch_baddbmm(self, batch1, batch2, beta=beta, alpha=alpha, out=out) - - -def torch_einsum(equation, *operands): - # TODO: infer shape without performing the computation, this might be quite hard. 
- concrete_operands = (torch.empty_like(operand, device="cpu") for operand in operands) - return torch.einsum(equation, *concrete_operands).to("meta") - - -def torch_tensor_repeat(self, *sizes): - shape = list(self.shape) - for i, x in enumerate(sizes): - shape[i] *= x - return torch.empty(shape, device="meta") - - -def torch_repeat_interleave(*args, dim=None, output_size=None): - num_args = len(args) - if num_args == 1: - shape = [output_size if output_size is not None else args[0].sum()] - else: - shape = list(args[0].shape) - if dim is None: - if num_args > 2: - dim = args[2] - else: - shape = [sum(shape)] - dim = 0 - repeats = args[1] - if isinstance(repeats, int) or torch.numel(repeats) == 1: - shape[dim] *= int(repeats) - else: - shape[dim] = output_size if output_size is not None else repeats.sum() - return torch.empty(*shape, device="meta") - - -def torch_index_select(input, dim, index, *, out=None): - shape = list(input.shape) - shape[dim] = len(index) - return torch.empty(*shape, device="meta") - - -def torch_tensor_index_select(self, dim, index): - return torch_index_select(self, dim, index) - - -def torch_gather(input, dim, index, *, sparse_grad=False, out=None): - shape = list(input.shape) - shape[dim] = index.shape[dim] - return torch.empty(*shape, device="meta") - - -def torch_tensor_gather(self, dim, index): - return torch_gather(self, dim, index) - - -def torch_roll(input, shifts, dims=None): - return input - - -def torch_flip(input, dims): - return input - - -def torch_tensor_flip(self, dims): - return self - - -def torch_nn_conv1d(self, input): - l_in = input.shape[-1] - shape = None - padding = self.padding - if padding == "valid": - padding = (0, 0) - if padding == "same": - shape = list(input.shape) - if shape is None: - shape = list(input.shape) - l_out = math.floor( - (l_in + 2 * padding[0] - self.dilation[0] * (self.kernel_size[0] - 1) - 1) / self.stride[0] + 1 - ) - shape[-1] = l_out - shape[-2] = self.out_channels - return torch.empty(shape, device="meta") - - -def torch_nn_conv2d(self, input): - h_in, w_in = input.shape[-2:] - shape = None - padding = self.padding - if padding == "valid": - padding = (0, 0) - if padding == "same": - shape = list(input.shape) - if shape is None: - shape = list(input.shape) - h_out = math.floor( - (h_in + 2 * padding[0] - self.dilation[0] * (self.kernel_size[0] - 1) - 1) / self.stride[0] + 1 - ) - w_out = math.floor( - (w_in + 2 * padding[1] - self.dilation[1] * (self.kernel_size[1] - 1) - 1) / self.stride[1] + 1 - ) - shape[-2:] = [h_out, w_out] - shape[-3] = self.out_channels - return torch.empty(shape, device="meta") - - -def torch_squeeze(input, dim=None): - shape = list(input.shape) - if dim is not None: - if dim < 0: - dim = input.dim() + dim - if shape[dim] == 1: - shape.pop(dim) - else: - new_shape = [] - for dim_value in shape: - if dim_value == 1: - continue - new_shape.append(dim_value) - shape = new_shape - return torch.empty(shape, device="meta") - - -def torch_tensor_squeeze(self, dim=None): - return torch_squeeze(self, dim) - - -def torch_unsqueeze(input, dim): - shape = list(input.shape) - if dim < 0: - dim = input.dim() + 1 + dim - shape.insert(dim, 1) - return torch.empty(shape, device="meta") - - -def torch_tensor_unsqueeze(self, dim): - return torch_unsqueeze(self, dim) - - -def torch_unique_consecutive(input, **kwargs): - output = torch.unique_consecutive(torch.zeros_like(input, device="cpu"), **kwargs) - if isinstance(output, torch.Tensor): - return output.to("meta") - else: - return tuple(map(output, lambda 
x: x.to("meta"))) - - -def torch_nn_functional_one_hot(tensor, num_classes=-1): - if num_classes < 0: - raise ValueError("Don't support automatic num_classes inference for MetaTensor analysis") - shape = list(tensor.shape) + [num_classes] - return torch.empty(shape, device="meta") - - -def torch_nn_functional_scaled_dot_product_attention( - query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None -): - target_length = query.shape[-2] - head_dim = value.shape[-1] - return torch.empty((*query.shape[:-2], target_length, head_dim), device="meta") - - -def torch_nn_mseloss(self, input, target): - if self.reduction == "none": - shape = target.shape - else: - shape = (1,) - return torch.empty(shape, device="meta") - - -def torch_nn_crossentropyloss(self, input, target): - if self.reduction == "none": - shape = target.shape - else: - shape = (1,) - return torch.empty(shape, device="meta") - - -def torch_nn_bcewithlogitsloss(self, input, target): - if self.reduction == "none": - shape = target.shape - else: - shape = (1,) - return torch.empty(shape, device="meta") - - -def operator_getitem(a, b): - def to_concrete(t): - if isinstance(t, torch.Tensor): - concrete = torch.ones_like(t, device="cpu") - if concrete.dtype in [torch.float16, torch.float32, torch.float64, torch.int32]: - concrete = concrete.to(torch.int64) - return concrete - return t - - if isinstance(a, torch.Tensor): - # TODO: infer shape without performing the computation. - if isinstance(b, tuple): - b = tuple(map(to_concrete, b)) - else: - b = to_concrete(b) - return operator.getitem(torch.empty_like(a, device="cpu"), b).to("meta") - return operator.getitem(a, b) - - -_MANUAL_META_OVERRIDES: dict[Callable, Callable] = { - torch.nn.Embedding: torch_nn_embedding, - torch.nn.functional.embedding: torch_nn_functional_embedding, - torch.nn.LayerNorm: torch_nn_layernorm, - torch.nn.GroupNorm: torch_nn_groupnorm, - torch.nn.Linear: torch_nn_linear, - torch.relu: torch_relu, - torch.nn.functional.relu: torch_nn_functional_relu, - torch.nn.ReLU: torch_nn_relu, - torch.where: torch_where, - torch.abs: torch_abs, - torch.arange: torch_arange, - torch.full: torch_full, - torch.cat: torch_cat, - torch.stack: torch_stack, - torch.add: torch_add, - torch.mul: torch_mul, - torch.Tensor.mul: torch_tensor_mul, - torch.matmul: torch_matmul, - torch.bmm: torch_bmm, - torch.baddbmm: torch_baddbmm, - torch.Tensor.baddbmm: torch_tensor_baddbmm, - torch.einsum: torch_einsum, - torch.Tensor.repeat: torch_tensor_repeat, - torch.repeat_interleave: torch_repeat_interleave, - torch.roll: torch_roll, - torch.flip: torch_flip, - torch.Tensor.flip: torch_tensor_flip, - torch.index_select: torch_index_select, - torch.Tensor.index_select: torch_tensor_index_select, - torch.gather: torch_gather, - torch.Tensor.gather: torch_tensor_gather, - torch.nn.Conv1d: torch_nn_conv1d, - torch.nn.Conv2d: torch_nn_conv2d, - torch.squeeze: torch_squeeze, - torch.Tensor.squeeze: torch_tensor_squeeze, - torch.unsqueeze: torch_unsqueeze, - torch.Tensor.unsqueeze: torch_tensor_unsqueeze, - torch.unique_consecutive: torch_unique_consecutive, - torch.nn.functional.one_hot: torch_nn_functional_one_hot, - torch.nn.MSELoss: torch_nn_mseloss, - torch.nn.CrossEntropyLoss: torch_nn_crossentropyloss, - torch.nn.BCEWithLogitsLoss: torch_nn_bcewithlogitsloss, - operator.getitem: operator_getitem, -} - -_MANUAL_META_OVERRIDES[torch.nn.functional.scaled_dot_product_attention] = ( - torch_nn_functional_scaled_dot_product_attention -) - - -class HFProxy(Proxy): - """ - Proxy that 
uses metadata to handle data-dependent control-flow. - """ - - def install_metadata(self, metadata): - self._metadata = metadata - - @property - def shape(self): - return self.tracer.create_proxy("call_method", "size", (self,), {}) - - @property - def device(self): - # Hack so we can track when devices are used. During meta-tensor propagation, - # replace these values with a constant 'meta' - return MetaDeviceAttribute(self, "device") - - def __len__(self): - if hasattr(self, "_metadata") and self._metadata is not None: - return len(self._metadata) - return super().__len__() - - def __bool__(self): - if hasattr(self, "_metadata") and self._metadata is not None: - return self._metadata - return super().__bool__() - - def __getattr__(self, k): - if k == "_metadata": - return self.__getattribute__(k) - # note: not added to the graph yet, if this is a method call - # we peephole optimize to the method invocation - return HFAttribute(self, k) - - def __setitem__(self, indices, values): - return self.tracer.create_proxy("call_function", operator.setitem, (self, indices, values), {}) - - def __contains__(self, key): - if hasattr(self, "_metadata") and self._metadata is not None: - return key in self._metadata - return super().__contains__(key) - - -class HFAttribute(HFProxy): - def __init__(self, root, attr: str): - self.root = root - self.attr = attr - self.tracer = root.tracer - self._node = None - - if hasattr(self.root, "_metadata"): - self.install_metadata(getattr(self.root._metadata, attr)) - - @property - def node(self): - # the node for attributes is added lazily, since most will just be method calls - # which do not rely on the getitem call - if self._node is None: - self._node = self.tracer.create_proxy("call_function", builtins.getattr, (self.root, self.attr), {}).node - return self._node - - def __call__(self, *args, **kwargs): - return self.tracer.create_proxy("call_method", self.attr, (self.root,) + args, kwargs) - - -class MetaDeviceAttribute(HFAttribute): - pass - - -class HFCacheProxy(HFProxy): - """ - Proxy that represents an instance of `transformers.cache_utils.Cache`. - """ - - def install_orig_cache_cls(self, orig_cache_cls: type[Cache]): - self._orig_cache_cls = orig_cache_cls - - @property - def __class__(self): - if not hasattr(self, "_orig_cache_cls"): - raise RuntimeError("The original Cache class must be installed to the HFCacheProxy.") - return self.tracer._CLASSES_TO_PATCH[self._orig_cache_cls] - - -def create_wrapper( - function: Callable, - op_type: Literal["call_function"] | Literal["call_method"] | Literal["get_attr"], - proxy_factory_fn: Callable[[Node], Proxy] | None = None, -) -> Callable: - @functools.wraps(function) - def wrapper(*args, **kwargs): - if not is_fx_tracing(): - return function(*args, **kwargs) - - found_proxies = [] - - def check_proxy(a): - if isinstance(a, Proxy): - found_proxies.append(a) - - torch.fx.node.map_aggregate(args, check_proxy) - torch.fx.node.map_aggregate(kwargs, check_proxy) - - if len(found_proxies) > 0: - tracer = found_proxies[0].tracer - if op_type == "call_function": - target = function - elif op_type == "call_method" or op_type == "get_attr": - target = function.__name__ - else: - raise ValueError(f"op_type {op_type} not supported.") - return tracer.create_proxy(op_type, target, args, kwargs, proxy_factory_fn=proxy_factory_fn) - else: - return function(*args, **kwargs) - - return wrapper - - -class HFProxyableClassMeta(type): - """ - Metaclass that creates a class with its main methods wrapped to be proxyable. 
- """ - - def __new__( - cls, - name: str, - bases: tuple[type, ...], - attrs: dict[str, Any], - proxy_factory_fn: Callable[[Node], Proxy] | None = None, - ): - instance = super().__new__(cls, name, bases, attrs) - for attr_name in dir(instance): - attr = getattr(instance, attr_name, None) - if attr is None: - continue - if attr_name == "__init__": - op_type = "call_function" - elif attr_name.startswith("__"): - op_type = None - elif inspect.ismethod(attr): - op_type = "call_function" - elif inspect.isfunction(attr): - op_type = "call_method" - else: - op_type = None - if op_type is not None: - setattr(instance, attr_name, create_wrapper(attr, op_type, proxy_factory_fn=proxy_factory_fn)) - return instance - - -def gen_constructor_wrapper(target: Callable) -> tuple[Callable, Callable]: - """ - Wraps `target` to be proxyable. Used for tensor creators like `torch.ones`, `torch.arange` and so on. - """ - wrapper = create_wrapper(target, "call_function") - return wrapper, target - - -def _proxies_to_metas(v): - """Returns the underlying metadata for HFProxies, and behaves like the identity for the others.""" - if isinstance(v, MetaDeviceAttribute): - return "meta" - if isinstance(v, torch.fx.Proxy): - if not (isinstance(v, HFProxy) and hasattr(v, "_metadata")): - raise RuntimeError(f"No metadata was found for {v}") - return v._metadata - return v - - -def create_cache_proxy_factory_fn(orig_cache_cls: type[Cache]) -> Callable[[Node], HFCacheProxy]: - def cache_proxy_factory_fn(n: Node) -> HFCacheProxy: - if not isinstance(_CURRENT_TRACER, HFTracer): - raise RuntimeError("Cannot create HFCacheProxy because there is no HFTracer currently tracing.") - cache_proxy = HFCacheProxy(n, _CURRENT_TRACER) - cache_proxy.install_orig_cache_cls(orig_cache_cls) - return cache_proxy - - return cache_proxy_factory_fn - - -# Proxyable equivalent of the cache classes defined in `transformers.cache_utils`. -ProxyableCache = HFProxyableClassMeta( - "ProxyableCache", (Cache,), {}, proxy_factory_fn=create_cache_proxy_factory_fn(Cache) -) -ProxyableDynamicCache = HFProxyableClassMeta( - "ProxyableDynamicCache", - (DynamicCache,), - {}, - proxy_factory_fn=create_cache_proxy_factory_fn(DynamicCache), -) -ProxyableStaticCache = HFProxyableClassMeta( - "ProxyableStaticCache", - (StaticCache,), - {}, - proxy_factory_fn=create_cache_proxy_factory_fn(StaticCache), -) - - -def _generate_random_int(low: int = 10, high: int = 20, forbidden_values: list[int] | None = None): - if forbidden_values is None: - forbidden_values = [] - value = random.randint(low, high) - while value in forbidden_values: - value = random.randint(low, high) - return value - - -class HFTracer(Tracer): - """ - Tracer that is able to symbolically trace models from the library. To do that, it uses the HFProxy instead of the - regular PyTorch torch.fx.Proxy. 
- """ - - # Feature flag for proxying accesses to buffer values - proxy_buffer_attributes: bool = True - allow_insert_stateless_mods: bool = True - _TORCH_METHODS_TO_PATCH = [ - "arange", - "zeros", - "ones", - "full", - "full_like", - "eye", - "empty", - "tensor", - "clamp", - "finfo", - "tril", - ] - _CLASSES_TO_PATCH = { - Cache: ProxyableCache, - DynamicCache: ProxyableDynamicCache, - StaticCache: ProxyableStaticCache, - } - - supported_archs = (PreTrainedModel,) if not is_peft_available() else (PreTrainedModel, PeftModel) - - def __init__(self, autowrap_modules=(math,), autowrap_functions=()): - super().__init__(autowrap_modules=autowrap_modules, autowrap_functions=autowrap_functions) - - def _generate_dummy_input( - self, model: "PreTrainedModel", input_name: str, shape: list[int], input_names: list[str] - ) -> dict[str, torch.Tensor]: - """Generates dummy input for model inference recording.""" - # Retrieving the model class, either from the "class_for_deserialization" attribute if the model was restored - # from pickle, or from the "__class__" attribute in the general case. - model_class_name = getattr(model, "class_for_deserialization", model.__class__).__name__ - device = model.device - inputs_dict = {} - - # when tracing a model with KV cache, we simply need to unsure that the KV cache length is larger than one to - # rightfully pass certain controlflows (Example: https://github.com/huggingface/transformers/blob/5c8d941d66734811d2ef6f57f15b44f7fb7a98c4/src/transformers/modeling_attn_mask_utils.py#L162). - # After tracing, the model can then still be used with arbitrary lengths different than the one used during tracing. - kv_cache_length = 5 - - if input_name in ["labels", "start_positions", "end_positions"]: - batch_size = shape[0] - if model_class_name in [ - *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES), - *get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES), - *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES), - *get_values(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES), - *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES), - *get_values(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES), - ]: - inputs_dict["labels"] = torch.zeros(batch_size, dtype=torch.long, device=device) - elif model_class_name in [ - *get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES), - *get_values(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES), - "XLNetForQuestionAnswering", - ]: - inputs_dict["start_positions"] = torch.zeros(batch_size, dtype=torch.long, device=device) - inputs_dict["end_positions"] = torch.zeros(batch_size, dtype=torch.long, device=device) - elif model_class_name in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES): - if not hasattr(model.config, "problem_type") or model.config.problem_type is None: - raise ValueError( - "Could not retrieve the problem type for the sequence classification task, please set " - 'model.config.problem_type to one of the following values: "regression", ' - '"single_label_classification", or "multi_label_classification".' 
- ) - - if model.config.problem_type == "regression": - labels_shape = (batch_size, model.config.num_labels) - labels_dtype = torch.float32 - elif model.config.problem_type == "single_label_classification": - labels_shape = (batch_size,) - labels_dtype = torch.long - elif model.config.problem_type == "multi_label_classification": - labels_shape = (batch_size, model.config.num_labels) - labels_dtype = torch.float32 - else: - raise ValueError( - 'Expected model.config.problem_type to be either: "regression", "single_label_classification"' - f', or "multi_label_classification", but "{model.config.problem_type}" was provided.' - ) - inputs_dict["labels"] = torch.zeros(*labels_shape, dtype=labels_dtype, device=device) - - elif model_class_name in [ - *get_values(MODEL_FOR_PRETRAINING_MAPPING_NAMES), - *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES), - *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), - *get_values(MODEL_FOR_MASKED_LM_MAPPING_NAMES), - *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES), - *get_values(MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES), - "GPT2DoubleHeadsModel", - "PeftModelForCausalLM", - "PeftModelForSeq2SeqLM", - ]: - inputs_dict["labels"] = torch.zeros(shape, dtype=torch.long, device=device) - elif model_class_name in [*get_values(MODEL_FOR_CTC_MAPPING_NAMES)]: - inputs_dict["labels"] = torch.zeros(shape, dtype=torch.float32, device=device) - else: - raise NotImplementedError( - f"Generating the dummy input named {input_name} for {model_class_name} is not supported yet." - ) - elif "pixel_values" in input_name: - batch_size = shape[0] - image_size = getattr(model.config, "image_size", None) - if image_size is None: - if hasattr(model.config, "vision_config"): - image_size = model.config.vision_config.image_size - elif hasattr(model.config, "encoder"): - image_size = model.config.encoder.image_size - else: - image_size = (_generate_random_int(), _generate_random_int()) - - # If no num_channels is in the config, use some arbitrary value. 
-            num_channels = getattr(model.config, "num_channels", 3)
-            if not isinstance(image_size, collections.abc.Iterable):
-                image_size = (image_size, image_size)
-            height, width = image_size
-            inputs_dict[input_name] = torch.zeros(
-                batch_size, num_channels, height, width, dtype=torch.float32, device=device
-            )
-        elif "bbox" in input_name:
-            inputs_dict[input_name] = torch.zeros(*shape, 4, dtype=torch.float, device=device)
-        elif "input_features" in input_name:
-            inputs_dict[input_name] = torch.zeros(
-                *shape, model.config.input_feat_per_channel, dtype=torch.float, device=device
-            )
-        elif "inputs_embeds" in input_name:
-            batch_size = shape[0]
-
-            if (
-                getattr(model.config, "embedding_size", None) is not None
-                and model.config.model_type != "megatron-bert"
-            ):
-                embedding_size = model.config.embedding_size
-            else:
-                embedding_size = model.config.hidden_size
-
-            if len(shape) == 3:
-                # (batch_size, num_choices, sequence_length, embedding_size)
-                embedding_shape = (batch_size, shape[1], shape[2], embedding_size)
-            else:
-                # (batch_size, sequence_length, embedding_size)
-                embedding_shape = (batch_size, shape[1], embedding_size)
-
-            inputs_dict[input_name] = torch.zeros(embedding_shape, dtype=torch.float, device=device)
-        elif "visual_feats" in input_name:
-            inputs_dict[input_name] = torch.zeros(
-                shape
-                + [
-                    model.config.visual_feat_dim,
-                ],
-                dtype=torch.float,
-                device=device,
-            )
-        elif "visual_pos" in input_name:
-            inputs_dict[input_name] = torch.zeros(
-                shape
-                + [
-                    model.config.visual_pos_dim,
-                ],
-                dtype=torch.float,
-                device=device,
-            )
-        elif "inputs" in input_name:
-            inputs_dict[input_name] = torch.zeros(*shape, dtype=torch.float, device=device)
-        elif "input_values" in input_name:
-            batch_size, _ = shape
-            # Generating big sequence length for audio inputs.
-            seq_length = _generate_random_int(low=10000, high=20000)
-            inputs_dict[input_name] = torch.zeros(batch_size, seq_length, dtype=torch.float, device=device)
-        elif "mask" in input_name:
-            if "past_key_values" in input_names:
-                mask_shape = [shape[0], shape[1] + kv_cache_length]
-            else:
-                mask_shape = shape
-
-            inputs_dict[input_name] = torch.zeros(mask_shape, dtype=torch.long, device=device)
-        elif "ids" in input_name:
-            inputs_dict[input_name] = torch.zeros(shape, dtype=torch.long, device=device)
-        elif "past_key_values" in input_name:
-            if model.config.model_type not in _FX_SUPPORTED_MODELS_WITH_KV_CACHE:
-                raise NotImplementedError(
-                    f"Symbolic trace with past_key_values input is not supported yet for the model {model.config.model_type}. Please open an issue or a PR in Transformers repository if you would like to see the support added."
-                )
-            num_heads = model.config.num_attention_heads
-            head_dim = model.config.hidden_size // model.config.num_attention_heads
-
-            cache_shape = (shape[0], num_heads, kv_cache_length, head_dim)
-            pkv = tuple(
-                (
-                    torch.rand(cache_shape, dtype=torch.float, device=device),
-                    torch.rand(cache_shape, dtype=torch.float, device=device),
-                )
-                for i in range(model.config.num_hidden_layers)
-            )
-            inputs_dict[input_name] = pkv
-        else:
-            shape_with_hidden_size = shape + [model.config.hidden_size]
-            inputs_dict[input_name] = torch.zeros(shape_with_hidden_size, dtype=torch.float, device=device)
-
-        return inputs_dict
-
-    def create_proxy(self, kind, target, args, kwargs, name=None, type_expr=None, proxy_factory_fn=None):
-        rv = super().create_proxy(kind, target, args, kwargs, name, type_expr, proxy_factory_fn)
-
-        if kind == "placeholder" and target in self.meta_args:
-            rv.install_metadata(self.meta_args[target])
-            return rv
-
-        if target in self.orig_fns:
-            # NOTE: tensor constructors in PyTorch define the `device` argument as
-            # *kwargs-only*. That is why this works. If you add methods to
-            # _TORCH_METHODS_TO_PATCH that do not define `device` as kwarg-only,
-            # this will break and you will likely see issues where we cannot infer
-            # the size of the output.
-            if "device" in kwargs:
-                kwargs["device"] = "meta"
-
-        try:
-            args_metas = torch.fx.node.map_aggregate(args, _proxies_to_metas)
-            kwargs_metas = torch.fx.node.map_aggregate(kwargs, _proxies_to_metas)
-
-            should_install_metadata = True
-
-            self._disable_module_getattr = True
-            self._disable_call_module = True
-
-            if kind == "call_function":
-                meta_target = _MANUAL_META_OVERRIDES.get(target, target)
-                meta_out = meta_target(*args_metas, **kwargs_metas)
-                if isinstance(meta_out, torch.Tensor):
-                    meta_out = meta_out.to(device="meta")
-            elif kind == "call_method":
-                method = getattr(args_metas[0].__class__, target)
-                meta_target = _MANUAL_META_OVERRIDES.get(method, method)
-                meta_out = meta_target(*args_metas, **kwargs_metas)
-            elif kind == "call_module":
-                if not hasattr(self, "orig_forward"):
-                    raise AttributeError(f"{self} does not have an attribute called orig_forward")
-                mod = self.root.get_submodule(target)
-                mod_type = type(mod)
-                if mod_type in _MANUAL_META_OVERRIDES:
-                    meta_out = _MANUAL_META_OVERRIDES[mod_type](mod, *args_metas, **kwargs_metas)
-                else:
-                    meta_out = self.orig_forward(*args_metas, **kwargs_metas)
-            elif kind == "get_attr":
-                attr_itr = self.root
-                atoms = target.split(".")
-                for atom in atoms:
-                    attr_itr = getattr(attr_itr, atom)
-                if isinstance(attr_itr, torch.Tensor):
-                    meta_out = attr_itr.to(device="meta")
-                else:
-                    meta_out = attr_itr
-            else:
-                should_install_metadata = False
-
-            if should_install_metadata:
-                if not isinstance(rv, Proxy):
-                    raise ValueError("Don't support composite output yet")
-                rv.install_metadata(meta_out)
-
-        except Exception as e:
-            if _IS_IN_DEBUG_MODE:
-                warnings.warn(f"Could not compute metadata for {kind} target {target}: {e}")
-
-        self._disable_module_getattr = False
-        self._disable_call_module = False
-
-        return rv
-
-    # Replaced by .getattr from PyTorch 1.13
-    def _module_getattr(self, attr, attr_val, parameter_proxy_cache):
-        if getattr(self, "_disable_module_getattr", False):
-            return attr_val
-        else:
-
-            def maybe_get_proxy_for_attr(attr_val, collection_to_search, parameter_proxy_cache):
-                for n, p in collection_to_search:
-                    if attr_val is p:
-                        if n not in parameter_proxy_cache:
-                            kwargs = {}
-                            if "proxy_factory_fn" in inspect.signature(self.create_proxy).parameters:
-                                kwargs["proxy_factory_fn"] = (
-                                    None
-                                    if not self.param_shapes_constant
-                                    else lambda node: ParameterProxy(self, node, n, attr_val)
-                                )
-                            val_proxy = self.create_proxy("get_attr", n, (), {}, **kwargs)  # type: ignore[arg-type]
-                            parameter_proxy_cache[n] = val_proxy
-                        return parameter_proxy_cache[n]
-                return None
-
-            if isinstance(attr_val, torch.nn.Parameter):
-                maybe_parameter_proxy = maybe_get_proxy_for_attr(
-                    attr_val, self.root.named_parameters(), parameter_proxy_cache
-                )
-                if maybe_parameter_proxy is not None:
-                    return maybe_parameter_proxy
-
-            if self.proxy_buffer_attributes and isinstance(attr_val, torch.Tensor):
-                maybe_buffer_proxy = maybe_get_proxy_for_attr(
-                    attr_val, self.root.named_buffers(), parameter_proxy_cache
-                )
-                if maybe_buffer_proxy is not None:
-                    return maybe_buffer_proxy
-
-        return attr_val
-
-    # Needed for PyTorch 1.13+
-    def getattr(self, attr: str, attr_val: Any, parameter_proxy_cache: dict[str, Any]):
-        return self._module_getattr(attr, attr_val, parameter_proxy_cache)
-
-    def call_module(self, m, forward, args, kwargs):
-        if getattr(self, "_disable_call_module", False):
-            return forward(*args, **kwargs)
-        self.orig_forward = forward
-        return super().call_module(m, forward, args, kwargs)
-
-    def proxy(self, node):
-        return HFProxy(node, self)
-
-    @contextlib.contextmanager
-    def patch_for_tracing(self, root: torch.nn.Module | Callable[..., Any]):
-        # Patching torch functions
-        self.patched_torch_methods = {
-            target: gen_constructor_wrapper(getattr(torch, target)) for target in self._TORCH_METHODS_TO_PATCH
-        }
-        self.orig_fns = set()
-
-        for name, (wrapper, orig) in self.patched_torch_methods.items():
-            setattr(torch, name, wrapper)
-            self.orig_fns.add(orig)
-
-        # Patching classes
-        patched = []
-        module_of_model = inspect.getmodule(root)
-        for name, mod in sys.modules.items():
-            if module_of_model is not None and mod is not module_of_model:
-                continue
-            if not name.startswith("transformers"):
-                continue
-            for orig_cls, patched_cls in self._CLASSES_TO_PATCH.items():
-                for attr_name, attr in mod.__dict__.items():
-                    if attr is orig_cls:
-                        patched.append((mod, attr_name, orig_cls))
-                        setattr(mod, attr_name, patched_cls)
-
-        yield
-
-        # Restoring patched functions and classes.
-        for name, (_, orig) in self.patched_torch_methods.items():
-            setattr(torch, name, orig)
-        self.patched_torch_methods = {}
-        self.orig_fns = set()
-
-        for mod, attr_name, orig_cls in patched:
-            setattr(mod, attr_name, orig_cls)
-
-    def trace(
-        self,
-        root: torch.nn.Module | Callable[..., Any],
-        concrete_args: dict[str, Any] | None = None,
-        dummy_inputs: dict[str, Any] | None = None,
-        complete_concrete_args_with_inputs_not_in_dummy_inputs: bool = True,
-    ) -> Graph:
-        """
-        Traces `root` and returns the corresponding FX `torch.fx.Graph` representation. `root` can either be a
-        `torch.nn.Module` instance or a Python callable. Note that after this call, `self.root` may be different from
-        the `root` passed in here. For example, when a free function is passed to `trace()`, we will create a
-        `torch.nn.Module` instance to use as the root and add embedded constants to it.
-
-        Args:
-            root (`torch.nn.Module` or `Callable`):
-                Either a `torch.nn.Module` or a function to be traced through. If root is not a
-                [`~transformers.PreTrainedModel`], then `dummy_inputs` must be passed, otherwise tracing will fail.
-            concrete_args (`dict[str, Any]`, *optional*):
-                Concrete arguments that should not be treated as Proxies.
-            dummy_inputs (`dict[str, Any]`, *optional*):
-                The dummy inputs needed to handle data-dependent control-flow if `root` is not a
-                [`~transformers.PreTrainedModel`]. It can also be used when `root` is a
-                [`~transformers.PreTrainedModel`] to specify custom dummy inputs for a subset or all the model inputs.
-            complete_concrete_args_with_inputs_not_in_dummy_inputs (`bool`, *optional*, defaults to `True`):
-                If `True`, and `dummy_inputs` is specified, every argument that `root` can take that is not in
-                `dummy_inputs` and not in `concrete_args` will be added to `concrete_args`, otherwise does nothing.
-
-        Returns:
-            `torch.fx.Graph`:
-                An FX `torch.fx.Graph` representing the semantics of the passed-in `root`.
-
-        """
-        sig = inspect.signature(root.forward if isinstance(root, torch.nn.Module) else root)
-
-        if concrete_args is None:
-            concrete_args = {}
-
-        if dummy_inputs is not None and complete_concrete_args_with_inputs_not_in_dummy_inputs:
-            for param in sig.parameters.values():
-                if param.name in dummy_inputs:
-                    continue
-                if param.default is inspect.Parameter.empty:
-                    raise ValueError(f"You need to specify a default value for the parameter {param.name}.")
-            concrete_args.update(
-                {
-                    p.name: p.default
-                    for p in sig.parameters.values()
-                    if (p.name not in dummy_inputs and p.name not in concrete_args)
-                }
-            )
-
-        input_names = sig.parameters.keys() - concrete_args.keys()
-
-        # Creating a random input shape to generate dummy inputs.
-        batch_size = _generate_random_int()
-        sequence_length = _generate_random_int()
-        shape = [batch_size, sequence_length]
-
-        if root.__class__.__name__ in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES):
-            num_choices = _generate_random_int(low=2, high=5)
-            shape.insert(1, num_choices)
-
-        inputs = dict(dummy_inputs) if dummy_inputs is not None else {}
-        for input_name in input_names:
-            if input_name in inputs:
-                continue
-            # We enforce that root must either be a PreTrainedModel or deserialized from a serialized traced model to
-            # be able to use HFTracer._generate_dummy_input.
-            if isinstance(root, self.supported_archs) or type(root).__qualname__.startswith(
-                ("_deserialize_graph_module", "_CodeOnlyModule")
-            ):
-                inputs.update(self._generate_dummy_input(root, input_name, shape, input_names=input_names))
-            else:
-                raise RuntimeError(
-                    f"Could not generate input named {input_name} because root is not a"
-                    " transformers.PreTrainedModel."
-                )
-
-        def to_meta(value):
-            if isinstance(value, torch.Tensor):
-                return value.to("meta")
-            return value
-
-        concrete_metas = pytree.tree_map(to_meta, inputs)
-
-        for param in sig.parameters.values():
-            if param.kind == inspect.Parameter.VAR_KEYWORD and param.name not in input_names:
-                concrete_metas[f"**{param.name}"] = {}
-        self.meta_args = concrete_metas
-
-        global _CURRENT_TRACER
-        _CURRENT_TRACER = self
-        with self.patch_for_tracing(root):
-            try:
-                self.graph = super().trace(root, concrete_args=concrete_args)
-            finally:
-                _CURRENT_TRACER = None
-
-        # This is necessary because concrete args are added as input to the traced module since
-        # https://github.com/pytorch/pytorch/pull/55888.
-        for node in self.graph.nodes:
-            if node.op == "placeholder":
-                # Removing default values for inputs as the forward pass will fail with them.
-                if node.target in input_names:
-                    node.args = ()
-                    # Without this, torch.jit.script fails because the inputs type is Optional[torch.Tensor].
-                    # It cannot infer on the attributes and methods the input should have, and fails.
-                    node.type = torch.Tensor
-                # It is a concrete arg so it is not used and should be removed.
-                else:
-                    to_visit = [node]
-                    to_delete = collections.OrderedDict()
-                    while to_visit:
-                        n = to_visit.pop(0)
-                        to_delete[n] = None
-                        to_visit += list(n.users.keys())
-
-                    for user in reversed(to_delete.keys()):
-                        self.graph.erase_node(user)
-
-            # TODO: solve GraphModule creation.
-            # Without this, return type annotation "Tuple" is causing code execution failure.
-            if node.op == "output":
-                node.type = None
-
-        return self.graph
-
-    def _stateless_mod_instantiation_depends_on_proxies(self, mod: nn.Module) -> bool:
-        """
-        Whether the module was instantiated with Proxies. If that is the case, such module cannot be a leaf module
-        because its attributes are input-dependent.
-        """
-        return any(isinstance(attr, Proxy) for attr in mod.__dict__.values())
-
-    def _insert_module_as_submodule(self, mod: nn.Module) -> str:
-        """
-        Helper method which tries to insert a module that was not declared as submodule.
-        """
-        # If one of the module attributes is a Proxy, it means that its instantiation is input-dependent.
-        # It is not possible to insert such modules, those should be traced through.
-        if self._stateless_mod_instantiation_depends_on_proxies(mod):
-            return ""
-        idx = 0
-        mod_name = mod.__class__.__name__.lower()
-        path = f"{mod_name}_{idx}"
-        already_inserted = False
-        while hasattr(self.root, path):
-            if getattr(self.root, path) is mod:
-                already_inserted = True
-                break
-            idx += 1
-            path = f"{mod_name}_{idx}"
-
-        # No need to add multiple instances of the same module.
-        if not already_inserted:
-            self.root.add_module(path, mod)
-        return path
-
-    def path_of_module(self, mod: nn.Module) -> str:
-        """
-        Helper method to find the qualified name of `mod` in the Module hierarchy of `root`. For example, if `root` has
-        a submodule named `foo`, which has a submodule named `bar`, passing `bar` into this function will return the
-        string "foo.bar".
-
-        Args:
-            mod (`torch.nn.Module`): The `Module` to retrieve the qualified name for.
-        """
-        try:
-            return super().path_of_module(mod)
-        except NameError as e:
-            if self.allow_insert_stateless_mods and len(list(mod.parameters())) == 0 and len(list(mod.buffers())) == 0:
-                path = self._insert_module_as_submodule(mod)
-                return path
-            raise e
-
-    def is_leaf_module(self, m: torch.nn.Module, module_qualified_name: str) -> bool:
-        return (not self._stateless_mod_instantiation_depends_on_proxies(m)) and super().is_leaf_module(
-            m, module_qualified_name
-        )
-
-    @compatibility(is_backward_compatible=True)
-    def keys(self, obj: "Proxy") -> Any:
-        """Called when the keys() method is called on a proxy object.
-        This is what happens when ** is called on a proxy. This should return an iterator if ** is supposed to work in
-        your custom tracer.
- """ - attribute = HFAttribute(obj, "keys")() - if obj.node.target.startswith("**"): - return attribute._metadata - return attribute - - -def get_concrete_args(model: nn.Module, input_names: list[str]): - sig = inspect.signature(model.forward) - - if not (set(input_names) <= set(sig.parameters.keys())): - formatted_input_names = input_names[0] if len(input_names) == 1 else ", ".join(input_names) - formatted_allowed_input_names = ", ".join(sig.parameters.keys()) - raise ValueError( - f"The model does not have input(s) named: {formatted_input_names}, expected a subset of the following:" - f" {formatted_allowed_input_names}" - ) - - return {p.name: p.default for p in sig.parameters.values() if p.name not in input_names} - - -def is_model_supported(model: "PreTrainedModel"): - return model.__class__.__name__ in _SUPPORTED_MODELS - - -def check_if_model_is_supported(model: "PreTrainedModel"): - if not is_model_supported(model): - supported_model_names = ", ".join(_SUPPORTED_MODELS) - raise NotImplementedError( - f"Model {model.__class__.__name__} is not supported yet, supported models: {supported_model_names}" - ) - - -def symbolic_trace( - model: "PreTrainedModel", - input_names: list[str] | None = None, - disable_check: bool = False, - tracer_cls: type[HFTracer] = HFTracer, -) -> GraphModule: - """ - Performs symbolic tracing on the model. - - Args: - model ([`PretrainedModel`]): - The model to trace. - input_names (`list[str]`, *optional*): - The names of the inputs of the traced model. If unset, model.dummy_inputs.keys() are used instead. - disable_check (`bool`, *optional*, defaults to `False`): - If `True`, no check is done before trying to trace the model, this is mostly usesul for debugging purposes. - tracer_cls (`Type[HFTracer]`, *optional*, defaults to `HFTracer`): - The tracer class to use for instantiating the tracer. If unset, `HFTracer` is used instead. - - Returns: - `torch.fx.GraphModule`: A GraphModule constructed by recording operations seen while tracing the model. - - Example: - - ```python - from transformers.utils.fx import symbolic_trace - - traced_model = symbolic_trace(model, input_names=["input_ids", "attention_mask", "token_type_ids"]) - ``` - """ - if input_names is None: - input_names = model.dummy_inputs.keys() - - input_names = list(input_names) - concrete_args = get_concrete_args(model, input_names) - - if not disable_check: - check_if_model_is_supported(model) - - if "past_key_values" in input_names and not getattr(model.config, "use_cache", False): - logger.warning( - "`past_key_values` were specified as input names, but model.config.use_cache = False, this might lead to " - "unexpected behavior." - ) - if "past_key_values" not in input_names and getattr(model.config, "use_cache", False): - logger.warning( - "`past_key_values` were not specified as input names, but model.config.use_cache = True. Setting " - "model.config.use_cache = False." - ) - model.config.use_cache = False - - # Tracing. - tracer = tracer_cls() - traced_graph = tracer.trace(model, concrete_args=concrete_args) - traced = torch.fx.GraphModule(model, traced_graph) - - traced.config = model.config - # The model class must be stored as an attribute to allow model deserialization, which uses trace, and thus - # _generate_dummy_input, where the model class is needed. 
-    traced.class_for_deserialization = model.__class__
-    traced.device = model.device
-
-    return traced
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index 06ff0e5cac75..e2b50ae7054a 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -1258,6 +1258,25 @@ def is_torch_fx_proxy(x):
     return False
 
 
+def is_jit_tracing() -> bool:
+    try:
+        import torch
+
+        return torch.jit.is_tracing()
+    except Exception:
+        return False
+
+
+def is_tracing(tensor=None) -> bool:
+    """Checks whether we are tracing a graph with dynamo (compile or export), torch.jit, or torch.fx"""
+    # Note that `is_torchdynamo_compiling` checks both compiling and exporting (the export check is stricter and
+    # only checks export)
+    _is_tracing = is_torchdynamo_compiling() or is_jit_tracing()
+    if tensor is not None:
+        _is_tracing |= is_torch_fx_proxy(tensor)
+    return _is_tracing
+
+
 @lru_cache
 def is_in_notebook() -> bool:
     try:
diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py
index 01d9b210d0b7..698f8bd361c1 100644
--- a/tests/models/aimv2/test_modeling_aimv2.py
+++ b/tests/models/aimv2/test_modeling_aimv2.py
@@ -179,7 +179,6 @@ class Aimv2VisionModelTest(Aimv2ModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (Aimv2VisionModel,) if is_torch_available() else ()
-    fx_compatible = False
 
     test_resize_embeddings = False
 
@@ -308,7 +307,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class Aimv2TextModelTest(Aimv2ModelTesterMixin, unittest.TestCase):
     all_model_classes = (Aimv2TextModel,) if is_torch_available() else ()
-    fx_compatible = False
 
     test_resize_embeddings = False
 
@@ -388,7 +386,6 @@ class Aimv2ModelTest(Aimv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
     test_resize_embeddings = False
     test_attention_outputs = False
diff --git a/tests/models/albert/test_modeling_albert.py b/tests/models/albert/test_modeling_albert.py
index 6e0d5ef5603c..287522aeddde 100644
--- a/tests/models/albert/test_modeling_albert.py
+++ b/tests/models/albert/test_modeling_albert.py
@@ -260,7 +260,6 @@ class AlbertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         if is_torch_available()
         else {}
     )
-    fx_compatible = False  # will not be maintained
 
     # special case for ForPreTraining model
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py
index 9a9399775cb8..9b4e877c30f4 100644
--- a/tests/models/align/test_modeling_align.py
+++ b/tests/models/align/test_modeling_align.py
@@ -130,7 +130,6 @@ class AlignVisionModelTest(ModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (AlignVisionModel,) if is_torch_available() else ()
-    fx_compatible = False
     test_resize_embeddings = False
     has_attentions = False
 
@@ -333,7 +332,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class AlignTextModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (AlignTextModel,) if is_torch_available() else ()
-    fx_compatible = False
 
     def setUp(self):
         self.model_tester = AlignTextModelTester(self)
@@ -437,7 +435,6 @@ def prepare_config_and_inputs_for_common(self):
 class AlignModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (AlignModel,) if is_torch_available() else ()
     pipeline_model_mapping = {"feature-extraction": AlignModel} if is_torch_available() else {}
-    fx_compatible = False
     test_resize_embeddings = False
     test_attention_outputs = False
diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py
index 31a4a08d7147..4b7e8bc0ad01 100755
--- a/tests/models/altclip/test_modeling_altclip.py
+++ b/tests/models/altclip/test_modeling_altclip.py
@@ -131,7 +131,6 @@ class AltCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (AltCLIPVisionModel,) if is_torch_available() else ()
-    fx_compatible = False
 
     test_resize_embeddings = False
 
@@ -293,7 +292,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class AltCLIPTextModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (AltCLIPTextModel,) if is_torch_available() else ()
-    fx_compatible = False  # Cannot support if `can_return_tuple`
 
     # TODO (@SunMarc): Fix me
     @unittest.skip(reason="It's broken.")
@@ -407,7 +405,6 @@ def prepare_img():
 class AltCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (AltCLIPModel,) if is_torch_available() else ()
     pipeline_model_mapping = {"feature-extraction": AltCLIPModel} if is_torch_available() else {}
-    fx_compatible = False  # Cannot support if `can_return_tuple`
     test_resize_embeddings = False
     test_attention_outputs = False
diff --git a/tests/models/arcee/test_modeling_arcee.py b/tests/models/arcee/test_modeling_arcee.py
index ffd914882e8a..22304e4c8913 100644
--- a/tests/models/arcee/test_modeling_arcee.py
+++ b/tests/models/arcee/test_modeling_arcee.py
@@ -45,7 +45,6 @@ class ArceeModelTester(CausalLMModelTester):
 
 @require_torch
 class ArceeModelTest(CausalLMModelTest, unittest.TestCase):
-    fx_compatible = False
     model_tester_class = ArceeModelTester
 
     # Need to use `0.8` instead of `0.9` for `test_cpu_offload`
diff --git a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py
index ba7f6866ff94..f8560bd47f0f 100644
--- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py
+++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py
@@ -160,7 +160,6 @@ class ASTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
 
     test_resize_embeddings = False
 
diff --git a/tests/models/aya_vision/test_modeling_aya_vision.py b/tests/models/aya_vision/test_modeling_aya_vision.py
index 80f94e4e4c15..acab03ad5b46 100644
--- a/tests/models/aya_vision/test_modeling_aya_vision.py
+++ b/tests/models/aya_vision/test_modeling_aya_vision.py
@@ -170,7 +170,6 @@ class AyaVisionModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
     _is_composite = True
 
diff --git a/tests/models/bamba/test_modeling_bamba.py b/tests/models/bamba/test_modeling_bamba.py
index bc634af7829a..832911168b26 100644
--- a/tests/models/bamba/test_modeling_bamba.py
+++ b/tests/models/bamba/test_modeling_bamba.py
@@ -288,8 +288,6 @@ class BambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
         else {}
     )
 
-    fx_compatible = False
-
     # Need to use `0.8` instead of `0.9` for `test_cpu_offload`
     # This is because we are hitting edge cases with the causal_mask buffer
     model_split_percents = [0.5, 0.7, 0.8]
diff --git a/tests/models/bark/test_modeling_bark.py b/tests/models/bark/test_modeling_bark.py
index e438107d3dfb..40991788e346 100644
--- a/tests/models/bark/test_modeling_bark.py
+++ b/tests/models/bark/test_modeling_bark.py
@@ -520,7 +520,6 @@ class BarkSemanticModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Te
     all_generative_model_classes = (BarkCausalModel,) if is_torch_available() else ()
 
     is_encoder_decoder = False
-    fx_compatible = False
     test_missing_keys = False
     test_resize_embeddings = True
@@ -607,7 +606,6 @@ class BarkCoarseModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
     all_generative_model_classes = (BarkCausalModel,) if is_torch_available() else ()
 
     is_encoder_decoder = False
-    fx_compatible = False
     test_missing_keys = False
     test_resize_embeddings = True
@@ -691,7 +689,6 @@ class BarkFineModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (BarkFineModel,) if is_torch_available() else ()
 
     is_encoder_decoder = False
-    fx_compatible = False
     test_missing_keys = False
     test_resize_embeddings = True
diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py
index 811b9c178e93..8bee5d9555d3 100644
--- a/tests/models/bart/test_modeling_bart.py
+++ b/tests/models/bart/test_modeling_bart.py
@@ -421,7 +421,6 @@ class BartModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         else {}
     )
     is_encoder_decoder = True
-    fx_compatible = False  # Fix me Michael
 
     def setUp(self):
         self.model_tester = BartModelTester(self)
@@ -1504,8 +1503,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class BartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
     all_model_classes = (BartDecoder, BartForCausalLM) if is_torch_available() else ()
-    fx_comptatible = True
-
     is_encoder_decoder = False
     test_missing_keys = False
diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py
index ad876e4e54ef..c1473fcb0496 100644
--- a/tests/models/bert/test_modeling_bert.py
+++ b/tests/models/bert/test_modeling_bert.py
@@ -462,7 +462,6 @@ class BertModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         if is_torch_available()
         else {}
     )
-    fx_compatible = False  # won't be maintained
     model_split_percents = [0.5, 0.8, 0.9]
 
     # special case for ForPreTraining model
diff --git a/tests/models/bit/test_modeling_bit.py b/tests/models/bit/test_modeling_bit.py
index b24fac437f64..17c934351784 100644
--- a/tests/models/bit/test_modeling_bit.py
+++ b/tests/models/bit/test_modeling_bit.py
@@ -164,8 +164,6 @@ class BitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         else {}
     )
 
-    fx_compatible = False
-
     test_resize_embeddings = False
     has_attentions = False
     test_torch_exportable = True
diff --git a/tests/models/bitnet/test_modeling_bitnet.py b/tests/models/bitnet/test_modeling_bitnet.py
index 3392d5c55943..f55597eddb43 100644
--- a/tests/models/bitnet/test_modeling_bitnet.py
+++ b/tests/models/bitnet/test_modeling_bitnet.py
@@ -142,8 +142,6 @@ class BitNetModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
         else {}
     )
 
-    fx_compatible = False  # Broken by attention refactor cc @Cyrilvallez
-
     # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146
     def is_pipeline_test_to_skip(
         self,
diff --git a/tests/models/blenderbot/test_modeling_blenderbot.py b/tests/models/blenderbot/test_modeling_blenderbot.py
index fbc2febb8af2..4e906a4dceb8 100644
--- a/tests/models/blenderbot/test_modeling_blenderbot.py
+++ b/tests/models/blenderbot/test_modeling_blenderbot.py
@@ -224,7 +224,6 @@ class BlenderbotModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
         else {}
     )
     is_encoder_decoder = True
-    fx_compatible = False
     test_missing_keys = False
 
     def setUp(self):
diff --git a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py
index f107cbefcdd5..aef6aaa70318 100644
--- a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py
+++ b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py
@@ -216,7 +216,6 @@ class BlenderbotSmallModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
         else {}
     )
     is_encoder_decoder = True
-    fx_compatible = False
     test_missing_keys = False
 
     # TODO: Fix the failed tests when this model gets more usage
diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py
index 6754cc21ad2e..5a81b45f71a7 100644
--- a/tests/models/blip/test_modeling_blip.py
+++ b/tests/models/blip/test_modeling_blip.py
@@ -149,7 +149,6 @@ class BlipVisionModelTest(ModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (BlipVisionModel,) if is_torch_available() else ()
-    fx_compatible = False
 
     test_resize_embeddings = False
 
@@ -310,7 +309,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class BlipTextModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (BlipTextModel,) if is_torch_available() else ()
-    fx_compatible = False
 
     def setUp(self):
         self.model_tester = BlipTextModelTester(self)
@@ -418,7 +416,6 @@ class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
     test_resize_embeddings = True
     test_attention_outputs = False
 
@@ -691,7 +688,6 @@ class BlipVQAModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (BlipForQuestionAnswering,) if is_torch_available() else ()
     # Doesn't run generation tests due to custom generation logic -- won't fix
    all_generative_model_classes = ()
-    fx_compatible = False
     test_resize_embeddings = True
     test_attention_outputs = False
 
@@ -768,7 +764,6 @@ def test_model_get_set_embeddings(self):
 @require_torch
 class BlipTextRetrievalModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (BlipForImageTextRetrieval,) if is_torch_available() else ()
-    fx_compatible = False
     test_resize_embeddings = True
     test_attention_outputs = False
 
@@ -897,7 +892,6 @@ class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (BlipForConditionalGeneration,) if is_torch_available() else ()
     # Doesn't run generation tests due to custom generation logic -- wont fix
     all_generative_model_classes = ()
-    fx_compatible = False
     test_resize_embeddings = True
     test_attention_outputs = False
diff --git a/tests/models/blip/test_modeling_blip_text.py b/tests/models/blip/test_modeling_blip_text.py
index 6fd7121a5d0f..52e597b32ecc 100644
--- a/tests/models/blip/test_modeling_blip_text.py
+++ b/tests/models/blip/test_modeling_blip_text.py
@@ -125,7 +125,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class BlipTextModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (BlipTextModel,) if is_torch_available() else ()
-    fx_compatible = False
 
     def setUp(self):
         self.model_tester = BlipTextModelTester(self)
diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py
index 3b30d8c857ed..9d4ba7b2e733 100644
--- a/tests/models/blip_2/test_modeling_blip_2.py
+++ b/tests/models/blip_2/test_modeling_blip_2.py
@@ -156,7 +156,6 @@ class Blip2VisionModelTest(ModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (Blip2VisionModel,) if is_torch_available() else ()
-    fx_compatible = False
 
     test_resize_embeddings = False
 
@@ -461,7 +460,6 @@ def prepare_config_and_inputs_for_common(self):
 class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
     all_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else ()
     additional_model_inputs = ["input_ids"]
-    fx_compatible = False
     test_resize_embeddings = False
     test_attention_outputs = False
 
@@ -791,7 +789,6 @@ class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixi
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
     test_resize_embeddings = True
     test_attention_outputs = False
 
@@ -1070,7 +1067,6 @@ def create_and_check_model(self, config, input_ids, attention_mask):
 @require_torch
 class Blip2TextModelWithProjectionTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (Blip2TextModelWithProjection,) if is_torch_available() else ()
-    fx_compatible = False
     test_resize_embeddings = True
     test_attention_outputs = False
 
@@ -1227,7 +1223,6 @@ def create_and_check_model(self, config, pixel_values):
 @require_torch
 class Blip2VisionModelWithProjectionTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (Blip2VisionModelWithProjection,) if is_torch_available() else ()
-    fx_compatible = False
 
     test_resize_embeddings = False
 
@@ -1375,7 +1370,6 @@ def prepare_config_and_inputs_for_common(self):
 class Blip2TextRetrievalModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (Blip2ForImageTextRetrieval,) if is_torch_available() else ()
     additional_model_inputs = ["input_ids"]
-    fx_compatible = False
     test_resize_embeddings = True
     test_attention_outputs = False
diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py
index 4ce8e8332865..7c05340cec81 100644
--- a/tests/models/bloom/test_modeling_bloom.py
+++ b/tests/models/bloom/test_modeling_bloom.py
@@ -168,7 +168,6 @@ def create_and_check_bloom_weight_initialization(self, config, *args):
 @require_torch
 class BloomModelTest(CausalLMModelTest, unittest.TestCase):
     model_tester_class = BloomModelTester
-    fx_compatible = True
     test_missing_keys = False
 
     def test_bloom_model_past(self):
diff --git a/tests/models/blt/test_modeling_blt.py b/tests/models/blt/test_modeling_blt.py
index 5f53bd2d4cfb..c7ca7099582d 100644
--- a/tests/models/blt/test_modeling_blt.py
+++ b/tests/models/blt/test_modeling_blt.py
@@ -168,7 +168,6 @@ def get_config(self):
 
 @require_torch
 class BltModelTest(CausalLMModelTest, unittest.TestCase):
-    fx_compatible = False
     model_tester_class = BltModelTester
 
     # Need to use `0.8` instead of `0.9` for `test_cpu_offload`
diff --git a/tests/models/chameleon/test_modeling_chameleon.py b/tests/models/chameleon/test_modeling_chameleon.py
index 35ede458045d..4237db1107d9 100644
--- a/tests/models/chameleon/test_modeling_chameleon.py
+++ b/tests/models/chameleon/test_modeling_chameleon.py
@@ -205,8 +205,6 @@ class ChameleonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
         else {}
     )
 
-    fx_compatible = False
-
     def setUp(self):
         self.model_tester = ChameleonModelTester(self)
         self.config_tester = ConfigTester(self, config_class=ChameleonConfig, hidden_size=37)
@@ -258,8 +256,6 @@ class ChameleonVision2SeqModelTest(ModelTesterMixin, GenerationTesterMixin, unit
         else {}
     )
 
-    fx_compatible = False
-
     def setUp(self):
         self.model_tester = ChameleonVision2SeqModelTester(self)
         self.config_tester = ConfigTester(self, config_class=ChameleonConfig, hidden_size=37)
diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py
index eb1692099522..f48e8c39e02b 100644
--- a/tests/models/chinese_clip/test_modeling_chinese_clip.py
+++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py
@@ -312,7 +312,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class ChineseCLIPTextModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (ChineseCLIPTextModel,) if is_torch_available() else ()
-    fx_compatible = False
 
     # special case for ForPreTraining model
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
@@ -405,7 +404,6 @@ class ChineseCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (ChineseCLIPVisionModel,) if is_torch_available() else ()
-    fx_compatible = False
 
     test_resize_embeddings = False
 
@@ -538,7 +536,6 @@ def prepare_config_and_inputs_for_common(self):
 class ChineseCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (ChineseCLIPModel,) if is_torch_available() else ()
     pipeline_model_mapping = {"feature-extraction": ChineseCLIPModel} if is_torch_available() else {}
-    fx_compatible = False
     test_resize_embeddings = False
     test_attention_outputs = False
diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py
index 553f7444e8a4..3075a0df1e58 100644
--- a/tests/models/clap/test_modeling_clap.py
+++ b/tests/models/clap/test_modeling_clap.py
@@ -158,7 +158,6 @@ class ClapAudioModelTest(ModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (ClapAudioModel, ClapAudioModelWithProjection) if is_torch_available() else ()
-    fx_compatible = False
 
     test_resize_embeddings = False
 
@@ -377,7 +376,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class ClapTextModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (ClapTextModel, ClapTextModelWithProjection) if is_torch_available() else ()
-    fx_compatible = False
 
     def setUp(self):
         self.model_tester = ClapTextModelTester(self)
@@ -487,7 +485,6 @@ def prepare_config_and_inputs_for_common(self):
 class ClapModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (ClapModel,) if is_torch_available() else ()
     pipeline_model_mapping = {"feature-extraction": ClapModel} if is_torch_available() else {}
-    fx_compatible = False
     test_resize_embeddings = False
     test_attention_outputs = False
diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py
index 9a3d0eb2beca..4ce1b9faa6aa 100644
--- a/tests/models/clip/test_modeling_clip.py
+++ b/tests/models/clip/test_modeling_clip.py
@@ -208,7 +208,6 @@ class CLIPVisionModelTest(CLIPModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (CLIPVisionModel, CLIPVisionModelWithProjection) if is_torch_available() else ()
-    fx_compatible = True
 
     test_resize_embeddings = False
 
@@ -396,7 +395,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class CLIPTextModelTest(CLIPModelTesterMixin, unittest.TestCase):
     all_model_classes = (CLIPTextModel, CLIPTextModelWithProjection) if is_torch_available() else ()
-    fx_compatible = True
     model_split_percents = [0.5, 0.8, 0.9]
 
@@ -524,7 +522,6 @@ class CLIPModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         {"feature-extraction": CLIPModel, "image-feature-extraction": CLIPVisionModel} if is_torch_available() else {}
     )
     additional_model_inputs = ["pixel_values"]
-    fx_compatible = True
     test_resize_embeddings = False
     test_attention_outputs = False
 
@@ -624,7 +621,6 @@ def prepare_config_and_inputs_for_common(self):
 class CLIPForImageClassificationModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (CLIPForImageClassification,) if is_torch_available() else ()
     pipeline_model_mapping = {"image-classification": CLIPForImageClassification} if is_torch_available() else {}
-    fx_compatible = False
     test_resize_embeddings = False
     test_attention_outputs = False
diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py
index 1ecd3fcab4be..c3dcf643966c 100644
--- a/tests/models/clipseg/test_modeling_clipseg.py
+++ b/tests/models/clipseg/test_modeling_clipseg.py
@@ -136,7 +136,6 @@ class CLIPSegVisionModelTest(ModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (CLIPSegVisionModel,) if is_torch_available() else ()
-    fx_compatible = False
 
     test_resize_embeddings = False
 
@@ -293,7 +292,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class CLIPSegTextModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (CLIPSegTextModel,) if is_torch_available() else ()
-    fx_compatible = False
     model_split_percents = [0.5, 0.8, 0.9]
 
@@ -420,7 +418,6 @@ def prepare_config_and_inputs_for_common(self):
 class CLIPSegModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (CLIPSegModel, CLIPSegForImageSegmentation) if is_torch_available() else ()
     pipeline_model_mapping = {"feature-extraction": CLIPSegModel} if is_torch_available() else {}
-    fx_compatible = False
     test_resize_embeddings = False
     test_attention_outputs = False
diff --git a/tests/models/codegen/test_modeling_codegen.py b/tests/models/codegen/test_modeling_codegen.py
index 6ad9ac4eee74..815c509b73b0 100644
--- a/tests/models/codegen/test_modeling_codegen.py
+++ b/tests/models/codegen/test_modeling_codegen.py
@@ -313,7 +313,6 @@ class CodeGenModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
     pipeline_model_mapping = (
         {"feature-extraction": CodeGenModel, "text-generation": CodeGenForCausalLM} if is_torch_available() else {}
     )
-    fx_compatible = False
     test_missing_keys = False
 
diff --git a/tests/models/cohere/test_modeling_cohere.py b/tests/models/cohere/test_modeling_cohere.py
index 2312f7ac0698..e76d0aa8c6a1 100644
--- a/tests/models/cohere/test_modeling_cohere.py
+++ b/tests/models/cohere/test_modeling_cohere.py
@@ -171,8 +171,6 @@ class CohereModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
         else {}
     )
 
-    fx_compatible = False
-
     # Need to use `0.8` instead of `0.9` for `test_cpu_offload`
     # This is because we are hitting edge cases with the causal_mask buffer
     model_split_percents = [0.5, 0.7, 0.8]
@@ -188,9 +186,6 @@ def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
 
-    def test_torch_fx_output_loss(self):
-        super().test_torch_fx_output_loss()
-
 
 @require_torch
 @slow
diff --git a/tests/models/cohere2_vision/test_modeling_cohere2_vision.py b/tests/models/cohere2_vision/test_modeling_cohere2_vision.py
index ba219d0614cc..eaffd4368735 100644
--- a/tests/models/cohere2_vision/test_modeling_cohere2_vision.py
+++ b/tests/models/cohere2_vision/test_modeling_cohere2_vision.py
@@ -157,7 +157,6 @@ class Cohere2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
     _is_composite = True
 
diff --git a/tests/models/colpali/test_modeling_colpali.py b/tests/models/colpali/test_modeling_colpali.py
index b457dfbf0eb8..54e45ed7f31e 100644
--- a/tests/models/colpali/test_modeling_colpali.py
+++ b/tests/models/colpali/test_modeling_colpali.py
@@ -185,8 +185,6 @@ class ColPaliForRetrievalModelTest(ModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (ColPaliForRetrieval,) if is_torch_available() else ()
-    fx_compatible = False
-
     test_resize_embeddings = True
     additional_model_inputs = ["token_type_ids"]
 
diff --git a/tests/models/colqwen2/test_modeling_colqwen2.py b/tests/models/colqwen2/test_modeling_colqwen2.py
index 20fdb42a3890..790cf639c985 100644
--- a/tests/models/colqwen2/test_modeling_colqwen2.py
+++ b/tests/models/colqwen2/test_modeling_colqwen2.py
@@ -200,7 +200,6 @@ class ColQwen2ForRetrievalModelTest(ModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (ColQwen2ForRetrieval,) if is_torch_available() else ()
-    fx_compatible = False
 
     test_resize_embeddings = True
 
diff --git a/tests/models/convnext/test_modeling_convnext.py b/tests/models/convnext/test_modeling_convnext.py
index 1caa94b3eece..ce90bb537794 100644
--- a/tests/models/convnext/test_modeling_convnext.py
+++ b/tests/models/convnext/test_modeling_convnext.py
@@ -175,8 +175,6 @@ class ConvNextModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         else {}
     )
 
-    fx_compatible = False
-
     test_resize_embeddings = False
     has_attentions = False
     test_torch_exportable = True
diff --git a/tests/models/convnextv2/test_modeling_convnextv2.py b/tests/models/convnextv2/test_modeling_convnextv2.py
index a98206a21601..79fd07b098fc 100644
--- a/tests/models/convnextv2/test_modeling_convnextv2.py
+++ b/tests/models/convnextv2/test_modeling_convnextv2.py
@@ -154,8 +154,6 @@ class ConvNextV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
         else {}
     )
 
-    fx_compatible = False
-
     test_resize_embeddings = False
     has_attentions = False
     test_torch_exportable = True
diff --git a/tests/models/cwm/test_modeling_cwm.py b/tests/models/cwm/test_modeling_cwm.py
index eaed2878bb33..8ed6e335c62d 100644
--- a/tests/models/cwm/test_modeling_cwm.py
+++ b/tests/models/cwm/test_modeling_cwm.py
@@ -76,7 +76,6 @@ class CwmModelTest(CausalLMModelTest, unittest.TestCase):
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
     model_tester_class = CwmModelTester
     model_split_percents = [0.5, 0.7, 0.8]
 
diff --git a/tests/models/deberta/test_modeling_deberta.py b/tests/models/deberta/test_modeling_deberta.py
index c8ecc3bbd526..915d7472f5fb 100644
--- a/tests/models/deberta/test_modeling_deberta.py
+++ b/tests/models/deberta/test_modeling_deberta.py
@@ -237,8 +237,6 @@ class DebertaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
         else {}
     )
 
-    fx_compatible = True
-
     is_encoder_decoder = False
 
     def setUp(self):
@@ -274,14 +272,6 @@ def test_model_from_pretrained(self):
         model = DebertaModel.from_pretrained(model_name)
         self.assertIsNotNone(model)
 
-    @unittest.skip("This test was broken by the refactor in #22105, TODO @ArthurZucker")
-    def test_torch_fx_output_loss(self):
-        pass
-
-    @unittest.skip("This test was broken by the refactor in #22105, TODO @ArthurZucker")
-    def test_torch_fx(self):
-        pass
-
 
 @require_torch
 @require_sentencepiece
diff --git a/tests/models/deberta_v2/test_modeling_deberta_v2.py b/tests/models/deberta_v2/test_modeling_deberta_v2.py
index 895e23139916..7236e740ed83 100644
--- a/tests/models/deberta_v2/test_modeling_deberta_v2.py
+++ b/tests/models/deberta_v2/test_modeling_deberta_v2.py
@@ -251,8 +251,6 @@ class DebertaV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
         else {}
     )
 
-    fx_compatible = True
-
     is_encoder_decoder = False
 
     def setUp(self):
@@ -292,14 +290,6 @@ def test_model_from_pretrained(self):
         model = DebertaV2Model.from_pretrained(model_name)
         self.assertIsNotNone(model)
 
-    @unittest.skip("This test was broken by the refactor in #22105, TODO @ArthurZucker")
-    def test_torch_fx_output_loss(self):
-        pass
-
-    @unittest.skip("This test was broken by the refactor in #22105, TODO @ArthurZucker")
-    def test_torch_fx(self):
-        pass
-
 
 @require_torch
 @require_sentencepiece
diff --git a/tests/models/deepseek_v2/test_modeling_deepseek_v2.py b/tests/models/deepseek_v2/test_modeling_deepseek_v2.py
index 5301ac7c1ff8..8f7da870e700 100644
--- a/tests/models/deepseek_v2/test_modeling_deepseek_v2.py
+++ b/tests/models/deepseek_v2/test_modeling_deepseek_v2.py
@@ -54,7 +54,6 @@ def __init__(
 
 @require_torch
 class DeepseekV2ModelTest(CausalLMModelTest, unittest.TestCase):
-    fx_compatible = False
     test_all_params_have_gradient = False
     model_tester_class = DeepseekV2ModelTester
     model_split_percents = [0.5, 0.7, 0.8]
diff --git a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py
index a5c696aed1de..618aa2f4ca66 100644
--- a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py
+++ b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py
@@ -233,8 +233,6 @@ class DeepseekV3ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
         else {}
     )
 
-    fx_compatible = False
-
     # Need to use `0.8` instead of `0.9` for `test_cpu_offload`
     # This is because we are hitting edge cases with the causal_mask buffer
     model_split_percents = [0.5, 0.7, 0.8]
diff --git a/tests/models/diffllama/test_modeling_diffllama.py b/tests/models/diffllama/test_modeling_diffllama.py
index 90f72ab989c2..f18c75cb3feb 100644
--- a/tests/models/diffllama/test_modeling_diffllama.py
+++ b/tests/models/diffllama/test_modeling_diffllama.py
@@ -194,8 +194,6 @@ class DiffLlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
         else {}
     )
 
-    fx_compatible = False
-
     # Need to use `0.8` instead of `0.9` for `test_cpu_offload`
     # This is because we are hitting edge cases with the causal_mask buffer
     model_split_percents = [0.5, 0.7, 0.8]
diff --git a/tests/models/dinat/test_modeling_dinat.py b/tests/models/dinat/test_modeling_dinat.py
index 2d36617f97f9..bddb82bafbc0 100644
--- a/tests/models/dinat/test_modeling_dinat.py
+++ b/tests/models/dinat/test_modeling_dinat.py
@@ -210,7 +210,6 @@ class DinatModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
     test_resize_embeddings = False
     test_torch_exportable = True
 
diff --git a/tests/models/dinov2/test_modeling_dinov2.py b/tests/models/dinov2/test_modeling_dinov2.py
index f8fa6d86076b..a3624493c5fb 100644
--- a/tests/models/dinov2/test_modeling_dinov2.py
+++ b/tests/models/dinov2/test_modeling_dinov2.py
@@ -227,7 +227,6 @@ class Dinov2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         if is_torch_available()
         else {}
     )
-    fx_compatible = False  # broken by output recording refactor
 
     test_resize_embeddings = False
 
diff --git a/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py
index 8f745c41bd96..dc5fc3b12d66 100644
--- a/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py
+++ b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py
@@ -232,7 +232,6 @@ class Dinov2WithRegistersModelTest(ModelTesterMixin, PipelineTesterMixin, unitte
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
     test_resize_embeddings = False
     test_torch_exportable = True
 
diff --git a/tests/models/dinov3_convnext/test_modeling_dinov3_convnext.py b/tests/models/dinov3_convnext/test_modeling_dinov3_convnext.py
index f3540d90dcec..ccb84f2111d9 100644
--- a/tests/models/dinov3_convnext/test_modeling_dinov3_convnext.py
+++ b/tests/models/dinov3_convnext/test_modeling_dinov3_convnext.py
@@ -165,8 +165,6 @@ class DINOv3ConvNextModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Te
     all_model_classes = (DINOv3ConvNextModel,) if is_torch_available() else ()
     pipeline_model_mapping = {"image-feature-extraction": DINOv3ConvNextModel} if is_torch_available() else {}
 
-    fx_compatible = False
-
     test_resize_embeddings = False
     has_attentions = False
     test_torch_exportable = True
diff --git a/tests/models/dinov3_vit/test_modeling_dinov3_vit.py b/tests/models/dinov3_vit/test_modeling_dinov3_vit.py
index eac994a0569f..09f76e066840 100644
--- a/tests/models/dinov3_vit/test_modeling_dinov3_vit.py
+++ b/tests/models/dinov3_vit/test_modeling_dinov3_vit.py
@@ -150,7 +150,6 @@ class Dinov3ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
     test_resize_embeddings = False
     test_torch_exportable = True
 
diff --git a/tests/models/distilbert/test_modeling_distilbert.py b/tests/models/distilbert/test_modeling_distilbert.py
index b5e82a822a0f..f6a01a5620e1 100644
--- a/tests/models/distilbert/test_modeling_distilbert.py
+++ b/tests/models/distilbert/test_modeling_distilbert.py
@@ -223,7 +223,6 @@ class DistilBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
         if is_torch_available()
         else {}
     )
-    fx_compatible = False  # won't be maintained
     test_resize_embeddings = True
     test_resize_position_embeddings = True
 
diff --git a/tests/models/doge/test_modeling_doge.py b/tests/models/doge/test_modeling_doge.py
index d8557085b3cd..f01e22fb794a 100644
--- a/tests/models/doge/test_modeling_doge.py
+++ b/tests/models/doge/test_modeling_doge.py
@@ -275,8 +275,6 @@ class DogeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
     )
     has_attentions = False
 
-    fx_compatible = False
-
     # Need to use `0.8` instead of `0.9` for `test_cpu_offload`
     # This is because we are hitting edge cases with the causal_mask buffer
     model_split_percents = [0.5, 0.7, 0.8]
diff --git a/tests/models/donut/test_modeling_donut_swin.py b/tests/models/donut/test_modeling_donut_swin.py
index 3f89e0816e41..1f983ecdff43 100644
--- a/tests/models/donut/test_modeling_donut_swin.py
+++ b/tests/models/donut/test_modeling_donut_swin.py
@@ -166,7 +166,6 @@ class DonutSwinModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
         if is_torch_available()
         else {}
     )
-    fx_compatible = True
     test_resize_embeddings = False
 
diff --git a/tests/models/edgetam/test_modeling_edgetam.py b/tests/models/edgetam/test_modeling_edgetam.py
index fe07de425059..4ec9e16c4db2 100644
--- a/tests/models/edgetam/test_modeling_edgetam.py
+++ b/tests/models/edgetam/test_modeling_edgetam.py
@@ -234,7 +234,6 @@ class EdgeTamModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
     pipeline_model_mapping = (
         {"feature-extraction": EdgeTamModel, "mask-generation": EdgeTamModel} if is_torch_available() else {}
     )
-    fx_compatible = False
     test_resize_embeddings = False
     _is_composite = True
 
diff --git a/tests/models/efficientnet/test_modeling_efficientnet.py b/tests/models/efficientnet/test_modeling_efficientnet.py
index 3146962209bf..6eab2ca2d29b 100644
--- a/tests/models/efficientnet/test_modeling_efficientnet.py
+++ b/tests/models/efficientnet/test_modeling_efficientnet.py
@@ -134,8 +134,6 @@ class EfficientNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Test
         else {}
     )
 
-    fx_compatible = False
-
     test_resize_embeddings = False
     has_attentions = False
     test_torch_exportable = True
diff --git a/tests/models/electra/test_modeling_electra.py b/tests/models/electra/test_modeling_electra.py
index e4695e596e66..7bf778aa4d9e 100644
--- a/tests/models/electra/test_modeling_electra.py
+++ b/tests/models/electra/test_modeling_electra.py
@@ -405,7 +405,6 @@ class ElectraModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
         if is_torch_available()
         else {}
     )
-    fx_compatible = False  # won't be maintained
 
     # Overwriting to add `is_decoder` flag
     def prepare_config_and_inputs_for_generate(self, batch_size=2):
diff --git a/tests/models/emu3/test_modeling_emu3.py b/tests/models/emu3/test_modeling_emu3.py
index 013315894067..0feaddb02318 100644
--- a/tests/models/emu3/test_modeling_emu3.py
+++ b/tests/models/emu3/test_modeling_emu3.py
@@ -131,8 +131,6 @@ class Emu3Text2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTe
         else {}
     )
 
-    fx_compatible = False
-
     def setUp(self):
         self.model_tester = Emu3Text2TextModelTester(self)
         self.config_tester = ConfigTester(self, config_class=Emu3TextConfig, hidden_size=37)
@@ -288,8 +286,6 @@ class Emu3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
     )
     pipeline_model_mapping = {}
 
-    fx_compatible = False
-
     def setUp(self):
         self.model_tester = Emu3Vision2TextModelTester(self)
         self.config_tester = ConfigTester(self, config_class=Emu3Config, has_text_modality=False, hidden_size=37)
diff --git a/tests/models/ernie/test_modeling_ernie.py b/tests/models/ernie/test_modeling_ernie.py
index 7eb89aad9cbb..6ca4d53eeea1 100644
--- a/tests/models/ernie/test_modeling_ernie.py
+++ b/tests/models/ernie/test_modeling_ernie.py
@@ -455,7 +455,6 @@ class ErnieModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
 
     # Overwriting to add `is_decoder` flag
     def prepare_config_and_inputs_for_generate(self, batch_size=2):
diff --git a/tests/models/ernie4_5/test_modeling_ernie4_5.py b/tests/models/ernie4_5/test_modeling_ernie4_5.py
index b5ef2766ed06..7339a9d5c892 100644
--- a/tests/models/ernie4_5/test_modeling_ernie4_5.py
+++ b/tests/models/ernie4_5/test_modeling_ernie4_5.py
@@ -45,7 +45,6 @@ class Ernie4_5ModelTester(CausalLMModelTester):
 
 @require_torch
 class Ernie4_5ModelTest(CausalLMModelTest, unittest.TestCase):
-    fx_compatible = False  # Broken by attention refactor cc @Cyrilvallez
     model_tester_class = Ernie4_5ModelTester
 
     # Need to use `0.8` instead of `0.9` for `test_cpu_offload`
diff --git a/tests/models/exaone4/test_modeling_exaone4.py b/tests/models/exaone4/test_modeling_exaone4.py
index a2809388c9e5..3bef0a07cae1 100644
--- a/tests/models/exaone4/test_modeling_exaone4.py
+++ b/tests/models/exaone4/test_modeling_exaone4.py
@@ -52,7 +52,6 @@ class Exaone4ModelTester(CausalLMModelTester):
 
 @require_torch
 class Exaone4ModelTest(CausalLMModelTest, unittest.TestCase):
-    fx_compatible = False  # Broken by attention refactor cc @Cyrilvallez
     model_tester_class = Exaone4ModelTester
     model_split_percents = [0.5, 0.6]
 
diff --git a/tests/models/falcon_h1/test_modeling_falcon_h1.py b/tests/models/falcon_h1/test_modeling_falcon_h1.py
index b3c10867ee4d..966580496c98 100644
--- a/tests/models/falcon_h1/test_modeling_falcon_h1.py
+++ b/tests/models/falcon_h1/test_modeling_falcon_h1.py
@@ -260,8 +260,6 @@ def create_and_check_decoder_model_past_large_inputs(
 class FalconH1ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (FalconH1Model, FalconH1ForCausalLM) if is_torch_available() else ()
 
-    fx_compatible = False
-
     # Need to use `0.8` instead of `0.9` for `test_cpu_offload`
     # This is because we are hitting edge cases with the causal_mask buffer
     model_split_percents = [0.5, 0.7, 0.8]
diff --git a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py
index 3102990833f8..f2e042c11748 100644
--- a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py
+++ b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py
@@ -263,7 +263,6 @@ def prepare_config_and_inputs_for_common(self):
 class FalconMambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (FalconMambaModel, FalconMambaForCausalLM) if is_torch_available() else ()
     has_attentions = False  # FalconMamba does not support attentions
-    fx_compatible = False  # FIXME let's try to support this @ArthurZucker
     test_missing_keys = False
 
     pipeline_model_mapping = (
diff --git a/tests/models/flex_olmo/test_modeling_flex_olmo.py b/tests/models/flex_olmo/test_modeling_flex_olmo.py
index 0bee1ff46489..6f5f27bc4b6f 100644
--- a/tests/models/flex_olmo/test_modeling_flex_olmo.py
+++ b/tests/models/flex_olmo/test_modeling_flex_olmo.py
@@ -47,7 +47,6 @@ class FlexOlmoModelTester(CausalLMModelTester):
 
 @require_torch
 class FlexOlmoModelTest(CausalLMModelTest, unittest.TestCase):
-    fx_compatible = False
     test_all_params_have_gradient = False
     model_tester_class = FlexOlmoModelTester
 
diff --git a/tests/models/focalnet/test_modeling_focalnet.py b/tests/models/focalnet/test_modeling_focalnet.py
index be67090daba3..b39c87721bcf 100644
--- a/tests/models/focalnet/test_modeling_focalnet.py
+++ b/tests/models/focalnet/test_modeling_focalnet.py
@@ -241,7 +241,6 @@ class FocalNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
 
     test_resize_embeddings = False
     has_attentions = False
diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py
index 58753226e05a..19831b427181 100644
--- a/tests/models/git/test_modeling_git.py
+++ b/tests/models/git/test_modeling_git.py
@@ -125,7 +125,6 @@ class GitVisionModelTest(ModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (GitVisionModel,) if is_torch_available() else ()
-    fx_compatible = True
 
     test_resize_embeddings = False
 
@@ -381,7 +380,6 @@ class GitModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
 
     # special case for GitForCausalLM model
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
diff --git a/tests/models/glm4_moe/test_modeling_glm4_moe.py b/tests/models/glm4_moe/test_modeling_glm4_moe.py
index 66c64a86654c..d4cebbd02983 100644
--- a/tests/models/glm4_moe/test_modeling_glm4_moe.py
+++ b/tests/models/glm4_moe/test_modeling_glm4_moe.py
@@ -58,7 +58,6 @@ def __init__(
 
 @require_torch
 class Glm4MoeModelTest(CausalLMModelTest, unittest.TestCase):
-    fx_compatible = False
     model_tester_class = Glm4MoeModelTester
     # used in `test_torch_compile_for_training`. Skip as "Dynamic control flow in MoE"
     _torch_compile_train_cls = None
diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py
index 667366461e46..af045509a9a4 100644
--- a/tests/models/gpt2/test_modeling_gpt2.py
+++ b/tests/models/gpt2/test_modeling_gpt2.py
@@ -169,7 +169,6 @@ class GPT2ModelTest(CausalLMModelTest, unittest.TestCase):
         if is_torch_available()
         else {}
     )
-    fx_compatible = False  # Broken by attention refactor cc @Cyrilvallez
     test_missing_keys = False
     model_tester_class = GPT2ModelTester
 
diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py
index f4885cf4d8cd..cfd56f199cf3 100644
--- a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py
+++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py
@@ -393,7 +393,6 @@ class GPTBigCodeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
     test_missing_keys = False
     multi_query = True
 
diff --git a/tests/models/gpt_neo/test_modeling_gpt_neo.py b/tests/models/gpt_neo/test_modeling_gpt_neo.py
index 93f77412cb06..193f05d56d01 100644
--- a/tests/models/gpt_neo/test_modeling_gpt_neo.py
+++ b/tests/models/gpt_neo/test_modeling_gpt_neo.py
@@ -375,7 +375,6 @@ class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
         if is_torch_available()
         else {}
     )
-    fx_compatible = True
     test_missing_keys = False
 
     # special case for DoubleHeads model
diff --git a/tests/models/gptj/test_modeling_gptj.py b/tests/models/gptj/test_modeling_gptj.py
index 2b853a78a537..3f6bab0a827f 100644
--- a/tests/models/gptj/test_modeling_gptj.py
+++ b/tests/models/gptj/test_modeling_gptj.py
@@ -340,16 +340,8 @@ class GPTJModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         if is_torch_available()
         else {}
     )
-    fx_compatible = True
-
     test_missing_keys = False
 
-    def test_torch_fx(self):
-        super().test_torch_fx()
-
-    def test_torch_fx_output_loss(self):
-        super().test_torch_fx_output_loss()
-
     # TODO: Fix the failed tests
     def is_pipeline_test_to_skip(
         self,
diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py
index 4afa595d6a16..d02923f5fe3d 100644
--- a/tests/models/granite/test_modeling_granite.py
+++ b/tests/models/granite/test_modeling_granite.py
@@ -175,8 +175,6 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
         else {}
     )
 
-    fx_compatible = False
-
     # Need to use `0.8` instead of `0.9` for `test_cpu_offload`
     # This is because we are hitting edge cases with the causal_mask buffer
     model_split_percents = [0.5, 0.7, 0.8]
diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py
index 7d6bc3f6d21c..96ec52e5b2c2 100644
---
a/tests/models/granitemoe/test_modeling_granitemoe.py +++ b/tests/models/granitemoe/test_modeling_granitemoe.py @@ -174,8 +174,6 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test else {} ) - fx_compatible = False - # Need to use `0.8` instead of `0.9` for `test_cpu_offload` # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] diff --git a/tests/models/granitemoeshared/test_modeling_granitemoeshared.py b/tests/models/granitemoeshared/test_modeling_granitemoeshared.py index f99f555e4193..0deca05a7eb5 100644 --- a/tests/models/granitemoeshared/test_modeling_granitemoeshared.py +++ b/tests/models/granitemoeshared/test_modeling_granitemoeshared.py @@ -177,8 +177,6 @@ class GraniteMoeSharedModelTest(ModelTesterMixin, GenerationTesterMixin, unittes else {} ) - fx_compatible = False - # Need to use `0.8` instead of `0.9` for `test_cpu_offload` # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] diff --git a/tests/models/hgnet_v2/test_modeling_hgnet_v2.py b/tests/models/hgnet_v2/test_modeling_hgnet_v2.py index cb330a021425..5f7301c3bd5a 100644 --- a/tests/models/hgnet_v2/test_modeling_hgnet_v2.py +++ b/tests/models/hgnet_v2/test_modeling_hgnet_v2.py @@ -178,8 +178,6 @@ class HGNetV2ForImageClassificationTest(ModelTesterMixin, PipelineTesterMixin, u all_model_classes = (HGNetV2ForImageClassification, HGNetV2Backbone) if is_torch_available() else () pipeline_model_mapping = {"image-classification": HGNetV2ForImageClassification} if is_torch_available() else {} - fx_compatible = False - test_resize_embeddings = False test_torch_exportable = True has_attentions = False diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py index e4d1fe26217b..f945b756936e 100644 --- a/tests/models/hiera/test_modeling_hiera.py +++ b/tests/models/hiera/test_modeling_hiera.py @@ -243,7 +243,6 @@ class HieraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): if is_torch_available() else {} ) - fx_compatible = True test_resize_embeddings = False test_torch_exportable = True diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py index 1e98603a9b5d..f47d20239f2a 100644 --- a/tests/models/hubert/test_modeling_hubert.py +++ b/tests/models/hubert/test_modeling_hubert.py @@ -14,9 +14,6 @@ """Testing suite for the PyTorch Hubert model.""" import math -import os -import pickle -import tempfile import unittest import pytest @@ -27,7 +24,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -47,8 +43,6 @@ ) from transformers.models.hubert.modeling_hubert import _compute_mask_indices -from transformers.utils.fx import symbolic_trace - class HubertModelTester: def __init__( @@ -312,7 +306,6 @@ class HubertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): if is_torch_available() else {} ) - fx_compatible = True def setUp(self): self.model_tester = HubertModelTester(self) @@ -406,120 +399,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(hidden_states.grad) self.assertIsNotNone(attentions.grad) - # Hubert cannot be TorchScripted because of torch.nn.utils.weight_norm - def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): - # TODO: fix it - self.skipTest(reason="torch 2.1 
breaks torch fx tests for wav2vec2/hubert.") - - if not self.fx_compatible: - self.skipTest(reason="torch fx is not compatible with this model") - - configs_no_init = _config_zero_init(config) # To be sure we have no Nan - configs_no_init.return_dict = False - - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - model.to(torch_device) - model.eval() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss) - - try: - if model.config.is_encoder_decoder: - model.config.use_cache = False # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward - labels = inputs.get("labels", None) - input_names = [ - "attention_mask", - "decoder_attention_mask", - "decoder_input_ids", - "input_features", - "input_ids", - "input_values", - ] - if labels is not None: - input_names.append("labels") - - filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} - input_names = list(filtered_inputs.keys()) - - model_output = model(**filtered_inputs) - - traced_model = symbolic_trace(model, input_names) - traced_output = traced_model(**filtered_inputs) - else: - input_names = [ - "attention_mask", - "bbox", - "input_features", - "input_ids", - "input_values", - "pixel_values", - "token_type_ids", - "visual_feats", - "visual_pos", - ] - - labels = inputs.get("labels", None) - start_positions = inputs.get("start_positions", None) - end_positions = inputs.get("end_positions", None) - if labels is not None: - input_names.append("labels") - if start_positions is not None: - input_names.append("start_positions") - if end_positions is not None: - input_names.append("end_positions") - - filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} - input_names = list(filtered_inputs.keys()) - - model_output = model(**filtered_inputs) - - traced_model = symbolic_trace(model, input_names) - traced_output = traced_model(**filtered_inputs) - - except Exception as e: - self.fail(f"Couldn't trace module: {e}") - - def flatten_output(output): - flatten = [] - for x in output: - if isinstance(x, (tuple, list)): - flatten += flatten_output(x) - elif not isinstance(x, torch.Tensor): - continue - else: - flatten.append(x) - return flatten - - model_output = flatten_output(model_output) - traced_output = flatten_output(traced_output) - num_outputs = len(model_output) - - for i in range(num_outputs): - self.assertTrue( - torch.allclose(model_output[i], traced_output[i]), - f"traced {i}th output doesn't match model {i}th output for {model_class}", - ) - - # Test that the model can be serialized and restored properly - with tempfile.TemporaryDirectory() as tmp_dir_name: - pkl_file_name = os.path.join(tmp_dir_name, "model.pkl") - try: - with open(pkl_file_name, "wb") as f: - pickle.dump(traced_model, f) - with open(pkl_file_name, "rb") as f: - loaded = pickle.load(f) - except Exception as e: - self.fail(f"Couldn't serialize / deserialize the traced model: {e}") - - loaded_output = loaded(**filtered_inputs) - loaded_output = flatten_output(loaded_output) - - for i in range(num_outputs): - self.assertTrue( - torch.allclose(model_output[i], loaded_output[i]), - f"serialized model {i}th output doesn't match model {i}th output for {model_class}", - ) - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 
097ba36eefad..a345f5f37574 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -177,7 +177,6 @@ class Idefics2ModelTest(ModelTesterMixin, unittest.TestCase): """ all_model_classes = (Idefics2Model,) if is_torch_available() else () - fx_compatible = False test_resize_embeddings = True _is_composite = True @@ -368,7 +367,6 @@ class Idefics2ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTest all_model_classes = (Idefics2ForConditionalGeneration,) if is_torch_available() else () pipeline_model_mapping = {"image-text-to-text": Idefics2ForConditionalGeneration} if is_torch_available() else () - fx_compatible = False test_resize_embeddings = True diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index 3d3f6562f84b..4642444be237 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -167,7 +167,6 @@ class Idefics3ModelTest(ModelTesterMixin, unittest.TestCase): """ all_model_classes = (Idefics3Model,) if is_torch_available() else () - fx_compatible = False test_resize_embeddings = True @@ -333,7 +332,6 @@ class Idefics3ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTest all_model_classes = (Idefics3ForConditionalGeneration,) if is_torch_available() else () pipeline_model_mapping = {"image-text-to-text": Idefics3ForConditionalGeneration} if is_torch_available() else () - fx_compatible = False test_resize_embeddings = True diff --git a/tests/models/ijepa/test_modeling_ijepa.py b/tests/models/ijepa/test_modeling_ijepa.py index bf2d75987d7e..bf52907d587d 100644 --- a/tests/models/ijepa/test_modeling_ijepa.py +++ b/tests/models/ijepa/test_modeling_ijepa.py @@ -201,7 +201,6 @@ class IJepaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): if is_torch_available() else {} ) - fx_compatible = False # broken by output recording refactor test_resize_embeddings = False test_torch_exportable = True diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 95e2b763ef38..4882c14dba36 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -149,7 +149,6 @@ class InstructBlipVisionModelTest(ModelTesterMixin, unittest.TestCase): """ all_model_classes = (InstructBlipVisionModel,) if is_torch_available() else () - fx_compatible = False test_resize_embeddings = False @@ -475,7 +474,6 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene ) pipeline_model_mapping = {"image-text-to-text": InstructBlipForConditionalGeneration} additional_model_inputs = ["qformer_input_ids", "input_ids"] - fx_compatible = False test_resize_embeddings = True test_attention_outputs = False diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 211fcd3cff43..747469de3ffa 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -153,7 +153,6 @@ class InstructBlipVideoVisionModelTest(ModelTesterMixin, unittest.TestCase): """ all_model_classes = (InstructBlipVideoVisionModel,) if is_torch_available() else () - fx_compatible = False test_resize_embeddings = False @@ -490,7 +489,6 @@ class 
InstructBlipVideoForConditionalGenerationDecoderOnlyTest( (InstructBlipVideoForConditionalGeneration, InstructBlipVideoModel) if is_torch_available() else () ) additional_model_inputs = ["qformer_input_ids", "input_ids"] - fx_compatible = False test_resize_embeddings = True test_attention_outputs = False diff --git a/tests/models/janus/test_modeling_janus.py b/tests/models/janus/test_modeling_janus.py index d8f632ac091d..76e49179fdb5 100644 --- a/tests/models/janus/test_modeling_janus.py +++ b/tests/models/janus/test_modeling_janus.py @@ -193,7 +193,6 @@ def prepare_config_and_inputs_for_common(self): class JanusVisionText2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = (JanusModel, JanusForConditionalGeneration) if is_torch_available() else () all_generative_model_classes = (JanusForConditionalGeneration,) if is_torch_available() else () - fx_compatible = False _is_composite = True @@ -354,7 +353,6 @@ def prepare_config_and_inputs_for_common(self): class JanusVQModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (JanusVQVAE,) if is_torch_available() else () - fx_compatible = False has_attentions = False test_resize_embeddings = False diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 734a7ea97be6..b63076e8f2b4 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -268,7 +268,6 @@ class Kosmos2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi if is_torch_available() else {} ) - fx_compatible = False test_resize_embeddings = False test_attention_outputs = False diff --git a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py index 994bd4c39353..ac8be1982721 100644 --- a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py +++ b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py @@ -300,7 +300,6 @@ class Kosmos2_5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester if is_torch_available() else {} ) - fx_compatible = False test_resize_embeddings = False test_attention_outputs = False diff --git a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py index f009f12d0df4..83ff0d9ec223 100644 --- a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py +++ b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py @@ -253,8 +253,6 @@ class KyutaiSpeechToTextModelTest(ModelTesterMixin, GenerationTesterMixin, Pipel else {} ) - fx_compatible = False # Broken by attention refactor cc @Cyrilvallez - # Need to use `0.8` instead of `0.9` for `test_cpu_offload` # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] diff --git a/tests/models/layoutlm/test_modeling_layoutlm.py b/tests/models/layoutlm/test_modeling_layoutlm.py index 422aaa22eb7b..6c3d45bd2acb 100644 --- a/tests/models/layoutlm/test_modeling_layoutlm.py +++ b/tests/models/layoutlm/test_modeling_layoutlm.py @@ -243,7 +243,6 @@ class LayoutLMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase if is_torch_available() else {} ) - fx_compatible = False # Cannot support if `can_return_tuple` def setUp(self): self.model_tester = LayoutLMModelTester(self) diff --git a/tests/models/lfm2/test_modeling_lfm2.py b/tests/models/lfm2/test_modeling_lfm2.py index cd0169f03bd3..d3c27409562b 100644 --- 
a/tests/models/lfm2/test_modeling_lfm2.py +++ b/tests/models/lfm2/test_modeling_lfm2.py @@ -49,7 +49,6 @@ def __init__( @require_torch class Lfm2ModelTest(CausalLMModelTest, unittest.TestCase): - fx_compatible = False model_tester_class = Lfm2ModelTester # used in `test_torch_compile_for_training` _torch_compile_train_cls = Lfm2ForCausalLM if is_torch_available() else None diff --git a/tests/models/lfm2_moe/test_modeling_lfm2_moe.py b/tests/models/lfm2_moe/test_modeling_lfm2_moe.py index 5ab0301a22ca..55a9f4e40c19 100644 --- a/tests/models/lfm2_moe/test_modeling_lfm2_moe.py +++ b/tests/models/lfm2_moe/test_modeling_lfm2_moe.py @@ -62,7 +62,6 @@ class Lfm2MoeModelTest(CausalLMModelTest, unittest.TestCase): if is_torch_available() else {} ) - fx_compatible = False model_tester_class = Lfm2MoeModelTester # used in `test_torch_compile_for_training` _torch_compile_train_cls = Lfm2MoeForCausalLM if is_torch_available() else None diff --git a/tests/models/lfm2_vl/test_modeling_lfm2_vl.py b/tests/models/lfm2_vl/test_modeling_lfm2_vl.py index 85aa9cb72c06..906a7aa54e6a 100644 --- a/tests/models/lfm2_vl/test_modeling_lfm2_vl.py +++ b/tests/models/lfm2_vl/test_modeling_lfm2_vl.py @@ -162,7 +162,6 @@ class Lfm2VlModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase else {} ) - fx_compatible = False model_tester_class = Lfm2VlModelTester _is_composite = True diff --git a/tests/models/lilt/test_modeling_lilt.py b/tests/models/lilt/test_modeling_lilt.py index 957e836b2e5e..33230c0128de 100644 --- a/tests/models/lilt/test_modeling_lilt.py +++ b/tests/models/lilt/test_modeling_lilt.py @@ -238,7 +238,6 @@ class LiltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): if is_torch_available() else {} ) - fx_compatible = False # TODO: Fix the failed tests def is_pipeline_test_to_skip( diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 7cf52692c293..b4e1e2b1c672 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -51,7 +51,6 @@ class LlamaModelTester(CausalLMModelTester): @require_torch class LlamaModelTest(CausalLMModelTest, unittest.TestCase): - fx_compatible = False # Broken by attention refactor cc @Cyrilvallez model_tester_class = LlamaModelTester # Need to use `0.8` instead of `0.9` for `test_cpu_offload` diff --git a/tests/models/longt5/test_modeling_longt5.py b/tests/models/longt5/test_modeling_longt5.py index 3ec760ad342f..25b769e715b7 100644 --- a/tests/models/longt5/test_modeling_longt5.py +++ b/tests/models/longt5/test_modeling_longt5.py @@ -506,7 +506,6 @@ class LongT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix if is_torch_available() else {} ) - fx_compatible = False test_resize_embeddings = True is_encoder_decoder = True diff --git a/tests/models/lxmert/test_modeling_lxmert.py b/tests/models/lxmert/test_modeling_lxmert.py index 234fe380e1f9..bdcd2f5ab33b 100644 --- a/tests/models/lxmert/test_modeling_lxmert.py +++ b/tests/models/lxmert/test_modeling_lxmert.py @@ -529,8 +529,6 @@ class LxmertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): else {} ) - fx_compatible = True - # overwrite function because qa models takes different input label shape def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = copy.deepcopy(inputs_dict) diff --git a/tests/models/m2m_100/test_modeling_m2m_100.py b/tests/models/m2m_100/test_modeling_m2m_100.py index e4c257d72144..718b5cca2956 100644 --- 
a/tests/models/m2m_100/test_modeling_m2m_100.py +++ b/tests/models/m2m_100/test_modeling_m2m_100.py @@ -237,7 +237,6 @@ class M2M100ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix else {} ) is_encoder_decoder = True - fx_compatible = False test_missing_keys = False # TODO: Fix the failed tests diff --git a/tests/models/mamba/test_modeling_mamba.py b/tests/models/mamba/test_modeling_mamba.py index 6d8dc3b82670..99a20aea5eec 100644 --- a/tests/models/mamba/test_modeling_mamba.py +++ b/tests/models/mamba/test_modeling_mamba.py @@ -235,7 +235,6 @@ def prepare_config_and_inputs_for_common(self): class MambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (MambaModel, MambaForCausalLM) if is_torch_available() else () has_attentions = False # Mamba does not support attentions - fx_compatible = False # FIXME let's try to support this @ArthurZucker test_missing_keys = False pipeline_model_mapping = ( diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py index ae96d256334c..be1d0a351034 100644 --- a/tests/models/mamba2/test_modeling_mamba2.py +++ b/tests/models/mamba2/test_modeling_mamba2.py @@ -238,7 +238,6 @@ def create_and_check_mamba2_slow_vs_fast_forward(self, config, input_ids, *args, class Mamba2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (Mamba2Model, Mamba2ForCausalLM) if is_torch_available() else () has_attentions = False # Mamba does not support attentions - fx_compatible = False # FIXME let's try to support this @molbap test_missing_keys = False pipeline_model_mapping = ( diff --git a/tests/models/marian/test_modeling_marian.py b/tests/models/marian/test_modeling_marian.py index 0ec3f6e61007..b897cb76c6d8 100644 --- a/tests/models/marian/test_modeling_marian.py +++ b/tests/models/marian/test_modeling_marian.py @@ -229,7 +229,6 @@ class MarianModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix else {} ) is_encoder_decoder = True - fx_compatible = False test_missing_keys = False def setUp(self): diff --git a/tests/models/maskformer/test_modeling_maskformer_swin.py b/tests/models/maskformer/test_modeling_maskformer_swin.py index dcb58e931053..e73077dd93ef 100644 --- a/tests/models/maskformer/test_modeling_maskformer_swin.py +++ b/tests/models/maskformer/test_modeling_maskformer_swin.py @@ -174,7 +174,6 @@ class MaskFormerSwinModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Te else () ) pipeline_model_mapping = {"feature-extraction": MaskFormerSwinModel} if is_torch_available() else {} - fx_compatible = False test_resize_embeddings = False test_torch_exportable = True diff --git a/tests/models/mbart/test_modeling_mbart.py b/tests/models/mbart/test_modeling_mbart.py index 797ecda798eb..73c28e9ed573 100644 --- a/tests/models/mbart/test_modeling_mbart.py +++ b/tests/models/mbart/test_modeling_mbart.py @@ -231,7 +231,6 @@ class MBartModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi else {} ) is_encoder_decoder = True - fx_compatible = False # Fix me Michael test_missing_keys = False diff --git a/tests/models/megatron_bert/test_modeling_megatron_bert.py b/tests/models/megatron_bert/test_modeling_megatron_bert.py index f795afab605c..a2a53f728185 100644 --- a/tests/models/megatron_bert/test_modeling_megatron_bert.py +++ b/tests/models/megatron_bert/test_modeling_megatron_bert.py @@ -288,7 +288,6 @@ class MegatronBertModelTest(ModelTesterMixin, 
PipelineTesterMixin, unittest.Test if is_torch_available() else {} ) - fx_compatible = True # test_resize_embeddings = False # special case for ForPreTraining model diff --git a/tests/models/metaclip_2/test_modeling_metaclip_2.py b/tests/models/metaclip_2/test_modeling_metaclip_2.py index b7a39687f7c3..40e17e652098 100644 --- a/tests/models/metaclip_2/test_modeling_metaclip_2.py +++ b/tests/models/metaclip_2/test_modeling_metaclip_2.py @@ -207,7 +207,6 @@ class MetaClip2VisionModelTest(MetaClip2ModelTesterMixin, unittest.TestCase): """ all_model_classes = (MetaClip2VisionModel, MetaClip2VisionModelWithProjection) if is_torch_available() else () - fx_compatible = False test_resize_embeddings = False @@ -402,7 +401,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class MetaClip2TextModelTest(MetaClip2ModelTesterMixin, unittest.TestCase): all_model_classes = (MetaClip2TextModel, MetaClip2TextModelWithProjection) if is_torch_available() else () - fx_compatible = False model_split_percents = [0.5, 0.8, 0.9] @@ -534,7 +532,6 @@ class MetaClip2ModelTest(MetaClip2ModelTesterMixin, PipelineTesterMixin, unittes else {} ) additional_model_inputs = ["pixel_values"] - fx_compatible = False test_resize_embeddings = False test_attention_outputs = False @@ -635,7 +632,6 @@ def prepare_config_and_inputs_for_common(self): class MetaClip2ForImageClassificationModelTest(MetaClip2ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (MetaClip2ForImageClassification,) if is_torch_available() else () pipeline_model_mapping = {"image-classification": MetaClip2ForImageClassification} if is_torch_available() else {} - fx_compatible = False test_resize_embeddings = False test_attention_outputs = False diff --git a/tests/models/mgp_str/test_modeling_mgp_str.py b/tests/models/mgp_str/test_modeling_mgp_str.py index d857c96d8d96..1ceef103dfac 100644 --- a/tests/models/mgp_str/test_modeling_mgp_str.py +++ b/tests/models/mgp_str/test_modeling_mgp_str.py @@ -122,7 +122,6 @@ class MgpstrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): if is_torch_available() else {} ) - fx_compatible = False test_resize_embeddings = False test_attention_outputs = False diff --git a/tests/models/mobilebert/test_modeling_mobilebert.py b/tests/models/mobilebert/test_modeling_mobilebert.py index db2215696fd7..95bf2b654379 100644 --- a/tests/models/mobilebert/test_modeling_mobilebert.py +++ b/tests/models/mobilebert/test_modeling_mobilebert.py @@ -283,7 +283,6 @@ class MobileBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa if is_torch_available() else {} ) - fx_compatible = False # won't be maintained # special case for ForPreTraining model def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/models/modernbert/test_modeling_modernbert.py b/tests/models/modernbert/test_modeling_modernbert.py index 6fc707dc6324..a07b7010f993 100644 --- a/tests/models/modernbert/test_modeling_modernbert.py +++ b/tests/models/modernbert/test_modeling_modernbert.py @@ -260,7 +260,6 @@ class ModernBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa if is_torch_available() else {} ) - fx_compatible = False model_split_percents = [0.5, 0.8, 0.9] diff --git a/tests/models/mpt/test_modeling_mpt.py b/tests/models/mpt/test_modeling_mpt.py index b30ac90276a4..61aeeb5d7ce5 100644 --- a/tests/models/mpt/test_modeling_mpt.py +++ b/tests/models/mpt/test_modeling_mpt.py @@ -350,7 +350,6 @@ class MptModelTest(ModelTesterMixin, 
GenerationTesterMixin, PipelineTesterMixin, else () ) - fx_compatible = False test_missing_keys = False pipeline_model_mapping = ( diff --git a/tests/models/mt5/test_modeling_mt5.py b/tests/models/mt5/test_modeling_mt5.py index fc37c9efa6e6..b5fd56813845 100644 --- a/tests/models/mt5/test_modeling_mt5.py +++ b/tests/models/mt5/test_modeling_mt5.py @@ -13,13 +13,10 @@ # limitations under the License. import copy -import os -import pickle import tempfile import unittest from transformers import MT5Config, is_torch_available -from transformers.models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES from transformers.testing_utils import ( require_sentencepiece, require_tokenizers, @@ -27,11 +24,10 @@ slow, torch_device, ) -from transformers.utils.fx import symbolic_trace from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor +from ...test_modeling_common import ModelTesterMixin, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -557,7 +553,6 @@ class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, if is_torch_available() else {} ) - fx_compatible = True test_resize_embeddings = True is_encoder_decoder = True @@ -587,119 +582,6 @@ def is_pipeline_test_to_skip( return False - def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): - if not self.fx_compatible: - self.skipTest(reason="torch.fx is not compatible with this model") - - configs_no_init = _config_zero_init(config) # To be sure we have no Nan - configs_no_init.return_dict = False - - for model_class in self.all_model_classes: - if model_class.__name__ == "MT5ForSequenceClassification": - continue - model = model_class(config=configs_no_init) - model.to(torch_device) - model.eval() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss) - - try: - if model.config.is_encoder_decoder: - model.config.use_cache = False # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward - labels = inputs.get("labels", None) - input_names = [ - "attention_mask", - "decoder_attention_mask", - "decoder_input_ids", - "input_features", - "input_ids", - "input_values", - ] - if labels is not None: - input_names.append("labels") - filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} - input_names = list(filtered_inputs.keys()) - model_output = model(**filtered_inputs) - traced_model = symbolic_trace(model, input_names) - traced_output = traced_model(**filtered_inputs) - else: - input_names = [ - "attention_mask", - "bbox", - "input_features", - "input_ids", - "input_values", - "pixel_values", - "token_type_ids", - "visual_feats", - "visual_pos", - ] - labels = inputs.get("labels", None) - start_positions = inputs.get("start_positions", None) - end_positions = inputs.get("end_positions", None) - if labels is not None: - input_names.append("labels") - if start_positions is not None: - input_names.append("start_positions") - if end_positions is not None: - input_names.append("end_positions") - filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} - input_names = list(filtered_inputs.keys()) - if model.__class__.__name__ in set(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES.values()) and ( - not hasattr(model.config, "problem_type") or model.config.problem_type is None - ): - model.config.problem_type = 
"single_label_classification" - traced_model = symbolic_trace(model, input_names) - traced_output = traced_model(**filtered_inputs) - model_output = model(**filtered_inputs) - - except Exception as e: - self.fail(f"Couldn't trace module: {e}") - - def flatten_output(output): - flatten = [] - for x in output: - if isinstance(x, (tuple, list)): - flatten += flatten_output(x) - elif not isinstance(x, torch.Tensor): - continue - else: - flatten.append(x) - return flatten - - model_output = flatten_output(model_output) - traced_output = flatten_output(traced_output) - num_outputs = len(model_output) - - for i in range(num_outputs): - self.assertTrue( - torch.allclose(model_output[i], traced_output[i]), - f"traced {i}th output doesn't match model {i}th output for {model_class}", - ) - - # Test that the model can be serialized and restored properly - with tempfile.TemporaryDirectory() as tmp_dir_name: - pkl_file_name = os.path.join(tmp_dir_name, "model.pkl") - try: - with open(pkl_file_name, "wb") as f: - pickle.dump(traced_model, f) - with open(pkl_file_name, "rb") as f: - loaded = pickle.load(f) - except Exception as e: - self.fail(f"Couldn't serialize / deserialize the traced model: {e}") - - loaded_output = loaded(**filtered_inputs) - loaded_output = flatten_output(loaded_output) - - for i in range(num_outputs): - self.assertTrue( - torch.allclose(model_output[i], loaded_output[i]), - f"serialized model {i}th output doesn't match model {i}th output for {model_class}", - ) - - # Avoid memory leak. Without this, each call increase RAM usage by ~20MB. - # (Even with this call, there are still memory leak by ~0.04MB) - self.clear_torch_jit_class_registry() - # overwrite because MT5 doesn't accept position ids as input and expects `decoder_input_ids` def test_custom_4d_attention_mask(self): for model_class in self.all_generative_model_classes: diff --git a/tests/models/mvp/test_modeling_mvp.py b/tests/models/mvp/test_modeling_mvp.py index 53bcc3e77162..e50039d68fe6 100644 --- a/tests/models/mvp/test_modeling_mvp.py +++ b/tests/models/mvp/test_modeling_mvp.py @@ -421,7 +421,6 @@ class MvpModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, else {} ) is_encoder_decoder = True - fx_compatible = False test_missing_keys = False @@ -789,8 +788,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class MvpStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = (MvpDecoder, MvpForCausalLM) if is_torch_available() else () - fx_comptatible = True - is_encoder_decoder = False def setUp( diff --git a/tests/models/nemotron/test_modeling_nemotron.py b/tests/models/nemotron/test_modeling_nemotron.py index a330dd92f83a..f1fa2d66f422 100644 --- a/tests/models/nemotron/test_modeling_nemotron.py +++ b/tests/models/nemotron/test_modeling_nemotron.py @@ -50,7 +50,6 @@ class NemotronModelTest(CausalLMModelTest, unittest.TestCase): # Need to use `0.8` instead of `0.9` for `test_cpu_offload` # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] - fx_compatible = False # used in `test_torch_compile_for_training` _torch_compile_train_cls = NemotronForCausalLM if is_torch_available() else None diff --git a/tests/models/nllb_moe/test_modeling_nllb_moe.py b/tests/models/nllb_moe/test_modeling_nllb_moe.py index e2085ca4ffa5..2040b5ca435a 100644 --- a/tests/models/nllb_moe/test_modeling_nllb_moe.py +++ b/tests/models/nllb_moe/test_modeling_nllb_moe.py @@ -242,7 +242,6 @@ class 
NllbMoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi else {} ) is_encoder_decoder = True - fx_compatible = False test_missing_keys = True diff --git a/tests/models/olmo/test_modeling_olmo.py b/tests/models/olmo/test_modeling_olmo.py index 8b849977fcba..22fc890b1e92 100644 --- a/tests/models/olmo/test_modeling_olmo.py +++ b/tests/models/olmo/test_modeling_olmo.py @@ -172,8 +172,6 @@ class OlmoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin else {} ) - fx_compatible = False - # Need to use `0.8` instead of `0.9` for `test_cpu_offload` # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] diff --git a/tests/models/olmo2/test_modeling_olmo2.py b/tests/models/olmo2/test_modeling_olmo2.py index 59443d4b83c2..cf9c84b950a6 100644 --- a/tests/models/olmo2/test_modeling_olmo2.py +++ b/tests/models/olmo2/test_modeling_olmo2.py @@ -173,8 +173,6 @@ class Olmo2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi else {} ) - fx_compatible = False - # Need to use `0.8` instead of `0.9` for `test_cpu_offload` # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] diff --git a/tests/models/olmo3/test_modeling_olmo3.py b/tests/models/olmo3/test_modeling_olmo3.py index 7e1aac1e261e..d892509a42b9 100644 --- a/tests/models/olmo3/test_modeling_olmo3.py +++ b/tests/models/olmo3/test_modeling_olmo3.py @@ -52,7 +52,6 @@ class Olmo3ModelTester(CausalLMModelTester): @require_torch class Olmo3ModelTest(CausalLMModelTest, unittest.TestCase): - fx_compatible = False test_all_params_have_gradient = False model_tester_class = Olmo3ModelTester diff --git a/tests/models/olmoe/test_modeling_olmoe.py b/tests/models/olmoe/test_modeling_olmoe.py index 99287e714af6..9c56629089d7 100644 --- a/tests/models/olmoe/test_modeling_olmoe.py +++ b/tests/models/olmoe/test_modeling_olmoe.py @@ -183,8 +183,6 @@ class OlmoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi else {} ) - fx_compatible = False - # Need to use `0.8` instead of `0.9` for `test_cpu_offload` # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] diff --git a/tests/models/opt/test_modeling_opt.py b/tests/models/opt/test_modeling_opt.py index 607e6c191246..d195385ecdd5 100644 --- a/tests/models/opt/test_modeling_opt.py +++ b/tests/models/opt/test_modeling_opt.py @@ -215,7 +215,6 @@ class OPTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, else {} ) is_encoder_decoder = False - fx_compatible = False # Broken by attention refactor cc @Cyrilvallez test_missing_keys = False diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py index 68c5a6d3f0fd..2a14a35cefa3 100644 --- a/tests/models/owlv2/test_modeling_owlv2.py +++ b/tests/models/owlv2/test_modeling_owlv2.py @@ -141,7 +141,6 @@ class Owlv2VisionModelTest(ModelTesterMixin, unittest.TestCase): """ all_model_classes = (Owlv2VisionModel,) if is_torch_available() else () - fx_compatible = False test_resize_embeddings = False @@ -304,7 +303,6 @@ def prepare_config_and_inputs_for_common(self): # Copied from tests.models.owlvit.test_modeling_owlvit.OwlViTTextModelTest with OwlViT->Owlv2, OWL-ViT->OwlV2, OWLVIT->OWLV2, owlvit-base-patch32->owlv2-base-patch16-ensemble class Owlv2TextModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (Owlv2TextModel,) if is_torch_available() else () - fx_compatible = 
False def setUp(self): self.model_tester = Owlv2TextModelTester(self) @@ -421,7 +419,6 @@ class Owlv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): if is_torch_available() else {} ) - fx_compatible = False test_resize_embeddings = False test_attention_outputs = False @@ -546,7 +543,6 @@ def prepare_config_and_inputs_for_common(self): # Copied from tests.models.owlvit.test_modeling_owlvit.OwlViTForObjectDetectionTest with OwlViT->Owlv2, OWL-ViT->OwlV2, OWLVIT->OWLV2, owlvit-base-patch32->owlv2-base-patch16-ensemble class Owlv2ForObjectDetectionTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (Owlv2ForObjectDetection,) if is_torch_available() else () - fx_compatible = False test_resize_embeddings = False test_attention_outputs = False diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index b919bfb47807..5ca710cf66b2 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -139,7 +139,6 @@ class OwlViTVisionModelTest(ModelTesterMixin, unittest.TestCase): """ all_model_classes = (OwlViTVisionModel,) if is_torch_available() else () - fx_compatible = False test_resize_embeddings = False @@ -300,7 +299,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class OwlViTTextModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (OwlViTTextModel,) if is_torch_available() else () - fx_compatible = False def setUp(self): self.model_tester = OwlViTTextModelTester(self) @@ -416,7 +414,6 @@ class OwlViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): if is_torch_available() else {} ) - fx_compatible = False test_resize_embeddings = False test_attention_outputs = False @@ -539,7 +536,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class OwlViTForObjectDetectionTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (OwlViTForObjectDetection,) if is_torch_available() else () - fx_compatible = False test_resize_embeddings = False test_attention_outputs = False diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index 15570bbc9e27..efc663f9bb71 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -190,7 +190,6 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes ) pipeline_model_mapping = {"image-text-to-text": PaliGemmaForConditionalGeneration} additional_model_inputs = ["token_type_ids"] - fx_compatible = False _is_composite = True diff --git a/tests/models/paligemma2/test_modeling_paligemma2.py b/tests/models/paligemma2/test_modeling_paligemma2.py index 46914326712a..a000d68d20a1 100644 --- a/tests/models/paligemma2/test_modeling_paligemma2.py +++ b/tests/models/paligemma2/test_modeling_paligemma2.py @@ -169,7 +169,6 @@ class PaliGemma2ForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe all_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else () pipeline_model_mapping = {"image-text-to-text": PaliGemmaForConditionalGeneration} - fx_compatible = False _is_composite = True diff --git a/tests/models/pegasus/test_modeling_pegasus.py b/tests/models/pegasus/test_modeling_pegasus.py index 327e55f38942..1289acefd315 100644 --- a/tests/models/pegasus/test_modeling_pegasus.py +++ b/tests/models/pegasus/test_modeling_pegasus.py @@ -234,7 +234,6 @@ class PegasusModelTest(ModelTesterMixin, 
GenerationTesterMixin, PipelineTesterMi else {} ) is_encoder_decoder = True - fx_compatible = False test_resize_position_embeddings = True test_missing_keys = False diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py index eddc28d7a672..04bbcd28c5be 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -142,7 +142,6 @@ class Pix2StructVisionModelTest(ModelTesterMixin, unittest.TestCase): """ all_model_classes = (Pix2StructVisionModel,) if is_torch_available() else () - fx_compatible = False test_resize_embeddings = False @@ -310,7 +309,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class Pix2StructTextModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (Pix2StructTextModel,) if is_torch_available() else () - fx_compatible = False def setUp(self): self.model_tester = Pix2StructTextModelTester(self) @@ -408,7 +406,6 @@ class Pix2StructModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste if is_torch_available() else {} ) - fx_compatible = False test_resize_embeddings = True test_attention_outputs = False diff --git a/tests/models/plbart/test_modeling_plbart.py b/tests/models/plbart/test_modeling_plbart.py index e76b5e02e634..5da6225b03d5 100644 --- a/tests/models/plbart/test_modeling_plbart.py +++ b/tests/models/plbart/test_modeling_plbart.py @@ -225,7 +225,6 @@ class PLBartModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix else {} ) is_encoder_decoder = True - fx_compatible = False # Fix me Michael test_missing_keys = False diff --git a/tests/models/pop2piano/test_modeling_pop2piano.py b/tests/models/pop2piano/test_modeling_pop2piano.py index 2c536695d8b3..3177df3ca89c 100644 --- a/tests/models/pop2piano/test_modeling_pop2piano.py +++ b/tests/models/pop2piano/test_modeling_pop2piano.py @@ -489,7 +489,6 @@ class Pop2PianoModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas pipeline_model_mapping = ( {"automatic-speech-recognition": Pop2PianoForConditionalGeneration} if is_torch_available() else {} ) - fx_compatible = False test_resize_embeddings = True is_encoder_decoder = True diff --git a/tests/models/resnet/test_modeling_resnet.py b/tests/models/resnet/test_modeling_resnet.py index 383043b0b31c..b8fb99916e42 100644 --- a/tests/models/resnet/test_modeling_resnet.py +++ b/tests/models/resnet/test_modeling_resnet.py @@ -172,8 +172,6 @@ class ResNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): else {} ) - fx_compatible = True - test_resize_embeddings = False has_attentions = False test_torch_exportable = True diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py index 99032b83e8ed..e381631ea839 100644 --- a/tests/models/roberta/test_modeling_roberta.py +++ b/tests/models/roberta/test_modeling_roberta.py @@ -393,7 +393,6 @@ class RobertaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi if is_torch_available() else {} ) - fx_compatible = False # won't be maintained model_split_percents = [0.5, 0.8, 0.9] # Overwriting to add `is_decoder` flag diff --git a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py index f4d2adebfe52..9394d3a8dcfd 100644 --- a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py +++ 
b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py @@ -390,7 +390,6 @@ class RobertaPreLayerNormModelTest(ModelTesterMixin, GenerationTesterMixin, Pipe if is_torch_available() else {} ) - fx_compatible = False model_split_percents = [0.5, 0.8, 0.9] # Overwriting to add `is_decoder` flag diff --git a/tests/models/rwkv/test_modeling_rwkv.py b/tests/models/rwkv/test_modeling_rwkv.py index ab81758f6ce9..e923183c5130 100644 --- a/tests/models/rwkv/test_modeling_rwkv.py +++ b/tests/models/rwkv/test_modeling_rwkv.py @@ -215,7 +215,6 @@ class RwkvModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin pipeline_model_mapping = ( {"feature-extraction": RwkvModel, "text-generation": RwkvForCausalLM} if is_torch_available() else {} ) - fx_compatible = False test_missing_keys = False def setUp(self): diff --git a/tests/models/sam/test_modeling_sam.py b/tests/models/sam/test_modeling_sam.py index f472292a9d5c..935126f57907 100644 --- a/tests/models/sam/test_modeling_sam.py +++ b/tests/models/sam/test_modeling_sam.py @@ -158,7 +158,6 @@ class SamVisionModelTest(ModelTesterMixin, unittest.TestCase): """ all_model_classes = (SamVisionModel,) if is_torch_available() else () - fx_compatible = False test_resize_embeddings = False test_torch_exportable = True @@ -512,7 +511,6 @@ class SamModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): pipeline_model_mapping = ( {"feature-extraction": SamModel, "mask-generation": SamModel} if is_torch_available() else {} ) - fx_compatible = False test_resize_embeddings = False _is_composite = True diff --git a/tests/models/sam2/test_modeling_sam2.py b/tests/models/sam2/test_modeling_sam2.py index 647b9f7a7dff..15dd3039e76a 100644 --- a/tests/models/sam2/test_modeling_sam2.py +++ b/tests/models/sam2/test_modeling_sam2.py @@ -141,7 +141,6 @@ class Sam2VisionModelTest(ModelTesterMixin, unittest.TestCase): """ all_model_classes = (Sam2VisionModel,) if is_torch_available() else () - fx_compatible = False test_resize_embeddings = False test_torch_exportable = True @@ -463,7 +462,6 @@ class Sam2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): pipeline_model_mapping = ( {"feature-extraction": Sam2Model, "mask-generation": Sam2Model} if is_torch_available() else {} ) - fx_compatible = False test_resize_embeddings = False _is_composite = True diff --git a/tests/models/sam_hq/test_modeling_sam_hq.py b/tests/models/sam_hq/test_modeling_sam_hq.py index 300317c12f18..08e95a9ad8a8 100644 --- a/tests/models/sam_hq/test_modeling_sam_hq.py +++ b/tests/models/sam_hq/test_modeling_sam_hq.py @@ -166,7 +166,6 @@ class SamHQVisionModelTest(ModelTesterMixin, unittest.TestCase): """ all_model_classes = (SamHQVisionModel,) if is_torch_available() else () - fx_compatible = False test_resize_embeddings = False test_torch_exportable = True @@ -544,7 +543,6 @@ class SamHQModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): pipeline_model_mapping = ( {"feature-extraction": SamHQModel, "mask-generation": SamHQModel} if is_torch_available() else {} ) - fx_compatible = False test_resize_embeddings = False test_cpu_offload = False diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py index 27363fb4a61a..95f858df2f5b 100644 --- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py @@ -336,7 +336,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class 
SeamlessM4TModelWithSpeechInputTest(ModelTesterMixin, unittest.TestCase):
     is_encoder_decoder = True
-    fx_compatible = False
     test_missing_keys = False
     test_resize_embeddings = False
@@ -529,7 +528,6 @@ def test_retain_grad_hidden_states_attentions(self):
 
 @require_torch
 class SeamlessM4TModelWithTextInputTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     is_encoder_decoder = True
-    fx_compatible = False
     test_missing_keys = False
     test_resize_embeddings = True
diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
index 97fa54188d34..3c68901fe98a 100644
--- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
+++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
@@ -362,7 +362,6 @@ def prepare_config_and_inputs_for_common(self):
 
 @require_torch
 class SeamlessM4Tv2ModelWithSpeechInputTest(ModelTesterMixin, unittest.TestCase):
     is_encoder_decoder = True
-    fx_compatible = False
     test_missing_keys = False
     test_resize_embeddings = False
@@ -554,7 +553,6 @@ def test_retain_grad_hidden_states_attentions(self):
 
 @require_torch
 class SeamlessM4Tv2ModelWithTextInputTest(ModelTesterMixin, unittest.TestCase):
     is_encoder_decoder = True
-    fx_compatible = False
     test_missing_keys = False
     test_resize_embeddings = True
diff --git a/tests/models/segformer/test_modeling_segformer.py b/tests/models/segformer/test_modeling_segformer.py
index 3d1a4a2843e2..b455c42bbb81 100644
--- a/tests/models/segformer/test_modeling_segformer.py
+++ b/tests/models/segformer/test_modeling_segformer.py
@@ -175,8 +175,6 @@ class SegformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
         else {}
     )
 
-    fx_compatible = True
-
     test_resize_embeddings = False
     test_torch_exportable = True
diff --git a/tests/models/seggpt/test_modeling_seggpt.py b/tests/models/seggpt/test_modeling_seggpt.py
index 603cca98ab61..88dccbaa7ab5 100644
--- a/tests/models/seggpt/test_modeling_seggpt.py
+++ b/tests/models/seggpt/test_modeling_seggpt.py
@@ -167,7 +167,6 @@ class SegGptModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (SegGptModel, SegGptForImageSegmentation) if is_torch_available() else ()
-    fx_compatible = False
     test_resize_embeddings = False
     test_torch_exportable = True
diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py
index 9c7a4d0fe199..dc58a7537361 100644
--- a/tests/models/siglip/test_modeling_siglip.py
+++ b/tests/models/siglip/test_modeling_siglip.py
@@ -173,7 +173,6 @@ class SiglipVisionModelTest(SiglipModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (SiglipVisionModel,) if is_torch_available() else ()
-    fx_compatible = False
     test_resize_embeddings = False
 
     # MP works but offload doesn't work when the MultiheadAttention is offloaded
@@ -339,7 +338,6 @@ def prepare_config_and_inputs_for_common(self):
 
 @require_torch
 class SiglipTextModelTest(SiglipModelTesterMixin, unittest.TestCase):
     all_model_classes = (SiglipTextModel,) if is_torch_available() else ()
-    fx_compatible = False
 
     model_split_percents = [0.5, 0.8, 0.9]
@@ -441,7 +439,6 @@ class SiglipModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.Test
     additional_model_inputs = ["pixel_values"]
     all_model_classes = (SiglipModel,) if is_torch_available() else ()
     pipeline_model_mapping = {"feature-extraction": SiglipModel} if is_torch_available() else {}
-    fx_compatible = False
     test_resize_embeddings = False
     test_attention_outputs = False
@@ -533,7 +530,6 @@ def prepare_config_and_inputs_for_common(self):
 class SiglipForImageClassificationModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (SiglipForImageClassification,) if is_torch_available() else ()
     pipeline_model_mapping = {"image-classification": SiglipForImageClassification} if is_torch_available() else {}
-    fx_compatible = False
     test_resize_embeddings = False
     test_attention_outputs = False
diff --git a/tests/models/siglip2/test_modeling_siglip2.py b/tests/models/siglip2/test_modeling_siglip2.py
index 36aafb7ed3eb..7b634ca7acb2 100644
--- a/tests/models/siglip2/test_modeling_siglip2.py
+++ b/tests/models/siglip2/test_modeling_siglip2.py
@@ -267,7 +267,6 @@ class Siglip2VisionModelTest(Siglip2ModelTesterMixin, unittest.TestCase):
     all_model_classes = (Siglip2VisionModel,) if is_torch_available() else ()
     additional_model_inputs = ["pixel_attention_mask", "spatial_shapes"]
-    fx_compatible = False
     test_resize_embeddings = False
 
     # MP works but offload doesn't work when the MultiheadAttention is offloaded
@@ -432,7 +431,6 @@ def prepare_config_and_inputs_for_common(self):
 
 @require_torch
 class Siglip2TextModelTest(Siglip2ModelTesterMixin, unittest.TestCase):
     all_model_classes = (Siglip2TextModel,) if is_torch_available() else ()
-    fx_compatible = False
     test_resize_embeddings = False
 
     model_split_percents = [0.5, 0.8, 0.9]
@@ -541,7 +539,6 @@ class Siglip2ModelTest(Siglip2ModelTesterMixin, PipelineTesterMixin, unittest.Te
         "pixel_attention_mask",
         "spatial_shapes",
     ]
-    fx_compatible = False
     test_resize_embeddings = False
     test_attention_outputs = False
@@ -638,7 +635,6 @@ class Siglip2ForImageClassificationModelTest(Siglip2ModelTesterMixin, PipelineTe
     all_model_classes = (Siglip2ForImageClassification,) if is_torch_available() else ()
     pipeline_model_mapping = {"image-classification": Siglip2ForImageClassification} if is_torch_available() else {}
     additional_model_inputs = ["pixel_values", "pixel_attention_mask", "spatial_shapes"]
-    fx_compatible = False
     test_resize_embeddings = False
     test_attention_outputs = False
diff --git a/tests/models/smolvlm/test_modeling_smolvlm.py b/tests/models/smolvlm/test_modeling_smolvlm.py
index e9f4efc5f1ae..5d2383ded88d 100644
--- a/tests/models/smolvlm/test_modeling_smolvlm.py
+++ b/tests/models/smolvlm/test_modeling_smolvlm.py
@@ -167,7 +167,6 @@ class SmolVLMModelTest(ModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (SmolVLMModel,) if is_torch_available() else ()
-    fx_compatible = False
     test_resize_embeddings = True
 
@@ -330,7 +329,6 @@ class SmolVLMForConditionalGenerationModelTest(GenerationTesterMixin, ModelTeste
     all_model_classes = (SmolVLMForConditionalGeneration,) if is_torch_available() else ()
     all_generative_model_classes = (SmolVLMForConditionalGeneration,) if is_torch_available() else ()
     pipeline_model_mapping = {"image-text-to-text": SmolVLMForConditionalGeneration} if is_torch_available() else ()
-    fx_compatible = False
     test_resize_embeddings = True
 
diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py
index 40d6927db581..26262363bd48 100644
--- a/tests/models/speech_to_text/test_modeling_speech_to_text.py
+++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py
@@ -264,7 +264,6 @@ class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest
         else {}
     )
     is_encoder_decoder = True
-    fx_compatible = False
     test_missing_keys = False
 
     def setUp(self):
diff --git a/tests/models/stablelm/test_modeling_stablelm.py b/tests/models/stablelm/test_modeling_stablelm.py
index 7ca57247bfd6..c5412db98a47 100644
--- a/tests/models/stablelm/test_modeling_stablelm.py
+++ b/tests/models/stablelm/test_modeling_stablelm.py
@@ -46,7 +46,6 @@ class StableLmModelTester(CausalLMModelTester):
 
 @require_torch
 class StableLmModelTest(CausalLMModelTest, unittest.TestCase):
-    fx_compatible = False  # Broken by attention refactor cc @Cyrilvallez
     model_tester_class = StableLmModelTester
diff --git a/tests/models/superglue/test_modeling_superglue.py b/tests/models/superglue/test_modeling_superglue.py
index 4fc8eb54e1a1..10909709318a 100644
--- a/tests/models/superglue/test_modeling_superglue.py
+++ b/tests/models/superglue/test_modeling_superglue.py
@@ -120,8 +120,6 @@ def prepare_config_and_inputs_for_common(self):
 class SuperGlueModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (SuperGlueForKeypointMatching,) if is_torch_available() else ()
 
-    fx_compatible = False
-
     test_resize_embeddings = False
     has_attentions = True
diff --git a/tests/models/superpoint/test_modeling_superpoint.py b/tests/models/superpoint/test_modeling_superpoint.py
index 0090defdfe05..09f159c571fd 100644
--- a/tests/models/superpoint/test_modeling_superpoint.py
+++ b/tests/models/superpoint/test_modeling_superpoint.py
@@ -114,8 +114,6 @@ def prepare_config_and_inputs_for_common(self):
 class SuperPointModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (SuperPointForKeypointDetection,) if is_torch_available() else ()
 
-    fx_compatible = False
-
     test_resize_embeddings = False
     has_attentions = False
     from_pretrained_id = "magic-leap-community/superpoint"
diff --git a/tests/models/swiftformer/test_modeling_swiftformer.py b/tests/models/swiftformer/test_modeling_swiftformer.py
index d70215665f8c..f231599df1d7 100644
--- a/tests/models/swiftformer/test_modeling_swiftformer.py
+++ b/tests/models/swiftformer/test_modeling_swiftformer.py
@@ -141,8 +141,6 @@ class SwiftFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
         else {}
     )
 
-    fx_compatible = False
-
     test_resize_embeddings = False
     has_attentions = False
     test_torch_exportable = True
diff --git a/tests/models/swin/test_modeling_swin.py b/tests/models/swin/test_modeling_swin.py
index 704884a03c30..c8d39802512b 100644
--- a/tests/models/swin/test_modeling_swin.py
+++ b/tests/models/swin/test_modeling_swin.py
@@ -235,7 +235,6 @@ class SwinModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         if is_torch_available()
         else {}
     )
-    fx_compatible = True
     test_resize_embeddings = False
     test_torch_exportable = True
diff --git a/tests/models/swin2sr/test_modeling_swin2sr.py b/tests/models/swin2sr/test_modeling_swin2sr.py
index 89893aff10eb..9710c1a4c4af 100644
--- a/tests/models/swin2sr/test_modeling_swin2sr.py
+++ b/tests/models/swin2sr/test_modeling_swin2sr.py
@@ -166,8 +166,6 @@ class Swin2SRModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
         else {}
     )
 
-    fx_compatible = False
-
     test_resize_embeddings = False
     test_torch_exportable = True
diff --git a/tests/models/swinv2/test_modeling_swinv2.py b/tests/models/swinv2/test_modeling_swinv2.py
index 8ae5b54691c4..c80b0f95e801 100644
--- a/tests/models/swinv2/test_modeling_swinv2.py
+++ b/tests/models/swinv2/test_modeling_swinv2.py
@@ -222,8 +222,6 @@ class Swinv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         else {}
     )
 
-    fx_compatible = False
-
     test_resize_embeddings = False
     test_torch_exportable = True
diff --git a/tests/models/switch_transformers/test_modeling_switch_transformers.py b/tests/models/switch_transformers/test_modeling_switch_transformers.py
index 30c07bee291d..65eb103c1fc4 100644
--- a/tests/models/switch_transformers/test_modeling_switch_transformers.py
+++ b/tests/models/switch_transformers/test_modeling_switch_transformers.py
@@ -566,7 +566,6 @@ class SwitchTransformersModelTest(ModelTesterMixin, GenerationTesterMixin, Pipel
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
    test_resize_embeddings = True
     is_encoder_decoder = True
diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py
index 85d54055bab8..8345cd63b036 100644
--- a/tests/models/t5/test_modeling_t5.py
+++ b/tests/models/t5/test_modeling_t5.py
@@ -14,8 +14,6 @@
 import copy
-import os
-import pickle
 import tempfile
 import unittest
 from functools import cached_property
 
@@ -23,7 +21,6 @@
 import pytest
 
 from transformers import T5Config, is_torch_available
-from transformers.models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
 from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_4
 from transformers.testing_utils import (
     Expectations,
@@ -36,11 +33,10 @@
     slow,
     torch_device,
 )
-from transformers.utils.fx import symbolic_trace
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
@@ -565,7 +561,6 @@ class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
         if is_torch_available()
         else {}
     )
-    fx_compatible = True
     test_resize_embeddings = True
     is_encoder_decoder = True
@@ -595,119 +590,6 @@ def is_pipeline_test_to_skip(
         return False
 
-    def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False):
-        if not self.fx_compatible:
-            self.skipTest(reason="torch.fx is not compatible with this model")
-
-        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-        configs_no_init.return_dict = False
-
-        for model_class in self.all_model_classes:
-            if model_class.__name__ == "T5ForSequenceClassification":
-                continue
-            model = model_class(config=configs_no_init)
-            model.to(torch_device)
-            model.eval()
-            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss)
-
-            try:
-                if model.config.is_encoder_decoder:
-                    model.config.use_cache = False  # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward
-                    labels = inputs.get("labels", None)
-                    input_names = [
-                        "attention_mask",
-                        "decoder_attention_mask",
-                        "decoder_input_ids",
-                        "input_features",
-                        "input_ids",
-                        "input_values",
-                    ]
-                    if labels is not None:
-                        input_names.append("labels")
-                    filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names}
-                    input_names = list(filtered_inputs.keys())
-                    model_output = model(**filtered_inputs)
-                    traced_model = symbolic_trace(model, input_names)
-                    traced_output = traced_model(**filtered_inputs)
-                else:
-                    input_names = [
-                        "attention_mask",
-                        "bbox",
-                        "input_features",
-                        "input_ids",
-                        "input_values",
-                        "pixel_values",
-                        "token_type_ids",
-                        "visual_feats",
-                        "visual_pos",
-                    ]
-                    labels = inputs.get("labels", None)
-                    start_positions = inputs.get("start_positions", None)
-                    end_positions = inputs.get("end_positions", None)
-                    if labels is not None:
-                        input_names.append("labels")
-                    if start_positions is not None:
-                        input_names.append("start_positions")
-                    if end_positions is not None:
-                        input_names.append("end_positions")
-                    filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names}
-                    input_names = list(filtered_inputs.keys())
-                    if model.__class__.__name__ in set(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES.values()) and (
-                        not hasattr(model.config, "problem_type") or model.config.problem_type is None
-                    ):
-                        model.config.problem_type = "single_label_classification"
-                    traced_model = symbolic_trace(model, input_names)
-                    traced_output = traced_model(**filtered_inputs)
-                    model_output = model(**filtered_inputs)
-
-            except Exception as e:
-                self.fail(f"Couldn't trace module: {e}")
-
-            def flatten_output(output):
-                flatten = []
-                for x in output:
-                    if isinstance(x, (tuple, list)):
-                        flatten += flatten_output(x)
-                    elif not isinstance(x, torch.Tensor):
-                        continue
-                    else:
-                        flatten.append(x)
-                return flatten
-
-            model_output = flatten_output(model_output)
-            traced_output = flatten_output(traced_output)
-            num_outputs = len(model_output)
-
-            for i in range(num_outputs):
-                self.assertTrue(
-                    torch.allclose(model_output[i], traced_output[i]),
-                    f"traced {i}th output doesn't match model {i}th output for {model_class}",
-                )
-
-            # Test that the model can be serialized and restored properly
-            with tempfile.TemporaryDirectory() as tmp_dir_name:
-                pkl_file_name = os.path.join(tmp_dir_name, "model.pkl")
-                try:
-                    with open(pkl_file_name, "wb") as f:
-                        pickle.dump(traced_model, f)
-                    with open(pkl_file_name, "rb") as f:
-                        loaded = pickle.load(f)
-                except Exception as e:
-                    self.fail(f"Couldn't serialize / deserialize the traced model: {e}")
-
-                loaded_output = loaded(**filtered_inputs)
-                loaded_output = flatten_output(loaded_output)
-
-                for i in range(num_outputs):
-                    self.assertTrue(
-                        torch.allclose(model_output[i], loaded_output[i]),
-                        f"serialized model {i}th output doesn't match model {i}th output for {model_class}",
-                    )
-
-            # Avoid memory leak. Without this, each call increase RAM usage by ~20MB.
-            # (Even with this call, there are still memory leak by ~0.04MB)
-            self.clear_torch_jit_class_registry()
-
     # overwrite because T5 doesn't accept position ids as input and expects `decoder_input_ids`
     def test_custom_4d_attention_mask(self):
         for model_class in self.all_generative_model_classes:
diff --git a/tests/models/textnet/test_modeling_textnet.py b/tests/models/textnet/test_modeling_textnet.py
index 5ab06cb62ae8..e5babc76693a 100644
--- a/tests/models/textnet/test_modeling_textnet.py
+++ b/tests/models/textnet/test_modeling_textnet.py
@@ -211,8 +211,6 @@ class TextNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
         else {}
     )
 
-    fx_compatible = False
-
     test_resize_embeddings = False
     test_torch_exportable = True
     has_attentions = False
diff --git a/tests/models/timesfm/test_modeling_timesfm.py b/tests/models/timesfm/test_modeling_timesfm.py
index 4269fdb6e91d..62fb00a7f094 100644
--- a/tests/models/timesfm/test_modeling_timesfm.py
+++ b/tests/models/timesfm/test_modeling_timesfm.py
@@ -123,7 +123,6 @@ def prepare_config_and_inputs_for_common(self):
 class TimesFmModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (TimesFmModelForPrediction,) if is_torch_available() else ()
     all_generative_model_classes = ()
-    fx_compatible = False
     test_resize_embeddings = False
     is_encoder_decoder = False
diff --git a/tests/models/trocr/test_modeling_trocr.py b/tests/models/trocr/test_modeling_trocr.py
index b3fefff96b3c..b975f3939067 100644
--- a/tests/models/trocr/test_modeling_trocr.py
+++ b/tests/models/trocr/test_modeling_trocr.py
@@ -161,7 +161,6 @@ def prepare_config_and_inputs_for_common(self):
 class TrOCRStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (TrOCRDecoder, TrOCRForCausalLM) if is_torch_available() else ()
     pipeline_model_mapping = {"text-generation": TrOCRForCausalLM} if is_torch_available() else {}
-    fx_compatible = True
 
     def setUp(self):
         self.model_tester = TrOCRStandaloneDecoderModelTester(self, is_training=False)
diff --git a/tests/models/udop/test_modeling_udop.py b/tests/models/udop/test_modeling_udop.py
index cf80324a4e77..2444ed9f5e78 100644
--- a/tests/models/udop/test_modeling_udop.py
+++ b/tests/models/udop/test_modeling_udop.py
@@ -273,7 +273,6 @@ class UdopModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
     test_resize_embeddings = True
     is_encoder_decoder = True
diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py
index 987b742cfb19..58e6e923e8df 100644
--- a/tests/models/umt5/test_modeling_umt5.py
+++ b/tests/models/umt5/test_modeling_umt5.py
@@ -13,13 +13,9 @@
 # limitations under the License.
 
 import copy
-import os
-import pickle
-import tempfile
 import unittest
 
 from transformers import UMT5Config, is_torch_available
-from transformers.models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
 from transformers.testing_utils import (
     require_sentencepiece,
     require_tokenizers,
@@ -27,11 +23,10 @@
     slow,
     torch_device,
 )
-from transformers.utils.fx import symbolic_trace
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
@@ -249,7 +244,6 @@ class UMT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         else {}
     )
     is_encoder_decoder = True
-    fx_compatible = False
     test_missing_keys = True
 
     # The small UMT5 model needs higher percentages for CPU/MP tests
@@ -275,126 +269,6 @@ def is_pipeline_test_to_skip(
         return False
 
-    def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False):
-        if not self.fx_compatible:
-            self.skipTest(reason="torch fx is not compatible with this model")
-
-        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-        configs_no_init.return_dict = False
-
-        for model_class in self.all_model_classes:
-            if model_class.__name__ == "UMT5ForSequenceClassification":
-                continue
-            model = model_class(config=configs_no_init)
-            model.to(torch_device)
-            model.eval()
-            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss)
-
-            try:
-                if model.config.is_encoder_decoder:
-                    model.config.use_cache = False  # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward
-                    labels = inputs.get("labels", None)
-                    input_names = [
-                        "attention_mask",
-                        "decoder_attention_mask",
-                        "decoder_input_ids",
-                        "input_features",
-                        "input_ids",
-                        "input_values",
-                    ]
-                    if labels is not None:
-                        input_names.append("labels")
-
-                    filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names}
-                    input_names = list(filtered_inputs.keys())
-
-                    model_output = model(**filtered_inputs)
-
-                    traced_model = symbolic_trace(model, input_names)
-                    traced_output = traced_model(**filtered_inputs)
-                else:
-                    input_names = [
-                        "attention_mask",
-                        "bbox",
-                        "input_features",
-                        "input_ids",
-                        "input_values",
-                        "pixel_values",
-                        "token_type_ids",
-                        "visual_feats",
-                        "visual_pos",
-                    ]
-
-                    labels = inputs.get("labels", None)
-                    start_positions = inputs.get("start_positions", None)
-                    end_positions = inputs.get("end_positions", None)
-                    if labels is not None:
-                        input_names.append("labels")
-                    if start_positions is not None:
-                        input_names.append("start_positions")
-                    if end_positions is not None:
-                        input_names.append("end_positions")
-
-                    filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names}
-                    input_names = list(filtered_inputs.keys())
-
-                    if model.__class__.__name__ in set(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES.values()) and (
-                        not hasattr(model.config, "problem_type") or model.config.problem_type is None
-                    ):
-                        model.config.problem_type = "single_label_classification"
-
-                    traced_model = symbolic_trace(model, input_names)
-                    traced_output = traced_model(**filtered_inputs)
-                    model_output = model(**filtered_inputs)
-
-            except Exception as e:
-                self.fail(f"Couldn't trace module: {e}")
-
-            def flatten_output(output):
-                flatten = []
-                for x in output:
-                    if isinstance(x, (tuple, list)):
-                        flatten += flatten_output(x)
-                    elif not isinstance(x, torch.Tensor):
-                        continue
-                    else:
-                        flatten.append(x)
-                return flatten
-
-            model_output = flatten_output(model_output)
-            traced_output = flatten_output(traced_output)
-            num_outputs = len(model_output)
-
-            for i in range(num_outputs):
-                self.assertTrue(
-                    torch.allclose(model_output[i], traced_output[i]),
-                    f"traced {i}th output doesn't match model {i}th output for {model_class}",
-                )
-
-            # Test that the model can be serialized and restored properly
-            with tempfile.TemporaryDirectory() as tmp_dir_name:
-                pkl_file_name = os.path.join(tmp_dir_name, "model.pkl")
-                try:
-                    with open(pkl_file_name, "wb") as f:
-                        pickle.dump(traced_model, f)
-                    with open(pkl_file_name, "rb") as f:
-                        loaded = pickle.load(f)
-                except Exception as e:
-                    self.fail(f"Couldn't serialize / deserialize the traced model: {e}")
-
-                loaded_output = loaded(**filtered_inputs)
-                loaded_output = flatten_output(loaded_output)
-
-                for i in range(num_outputs):
-                    self.assertTrue(
-                        torch.allclose(model_output[i], loaded_output[i]),
-                        f"serialized model {i}th output doesn't match model {i}th output for {model_class}",
-                    )
-
-            # Avoid memory leak. Without this, each call increase RAM usage by ~20MB.
-            # (Even with this call, there are still memory leak by ~0.04MB)
-            self.clear_torch_jit_class_registry()
-
     # UMT5ForSequenceClassification does not support inputs_embeds
     def test_inputs_embeds(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/upernet/test_modeling_upernet.py b/tests/models/upernet/test_modeling_upernet.py
index 341587f9a845..1ac09d8e2006 100644
--- a/tests/models/upernet/test_modeling_upernet.py
+++ b/tests/models/upernet/test_modeling_upernet.py
@@ -149,7 +149,6 @@ class UperNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
     all_model_classes = (UperNetForSemanticSegmentation,) if is_torch_available() else ()
     pipeline_model_mapping = {"image-segmentation": UperNetForSemanticSegmentation} if is_torch_available() else {}
-    fx_compatible = False
     test_resize_embeddings = False
     has_attentions = False
diff --git a/tests/models/video_llama_3/test_modeling_video_llama_3.py b/tests/models/video_llama_3/test_modeling_video_llama_3.py
index eadd50cedb02..90efb9a43fa6 100644
--- a/tests/models/video_llama_3/test_modeling_video_llama_3.py
+++ b/tests/models/video_llama_3/test_modeling_video_llama_3.py
@@ -348,7 +348,6 @@ class VideoLlama3VisionModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (VideoLlama3VisionModel,) if is_torch_available() else ()
     additional_model_inputs = ["grid_thw", "merge_sizes"]
-    # fx_compatible = False
     test_resize_embeddings = False
     test_head_masking = False
     test_cpu_offload = False
diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py
index 2baf0b96b9ec..3bcea272dd51 100644
--- a/tests/models/video_llava/test_modeling_video_llava.py
+++ b/tests/models/video_llava/test_modeling_video_llava.py
@@ -201,7 +201,6 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
         if is_torch_available()
         else ()
     )
-    fx_compatible = False
     test_resize_embeddings = True
     _is_composite = True
diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py
index 550561c7efbd..b158d37dd574 100644
--- a/tests/models/vipllava/test_modeling_vipllava.py
+++ b/tests/models/vipllava/test_modeling_vipllava.py
@@ -177,7 +177,6 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
         else ()
     )
     pipeline_model_mapping = {"image-text-to-text": VipLlavaForConditionalGeneration} if is_torch_available() else {}
-    fx_compatible = False
     test_resize_embeddings = True
     _is_composite = True
diff --git a/tests/models/vit/test_modeling_vit.py b/tests/models/vit/test_modeling_vit.py
index dde15d4eef56..5e454c097c74 100644
--- a/tests/models/vit/test_modeling_vit.py
+++ b/tests/models/vit/test_modeling_vit.py
@@ -202,7 +202,6 @@ class ViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         if is_torch_available()
         else {}
     )
-    fx_compatible = False  # broken by output recording refactor
     test_resize_embeddings = False
     test_torch_exportable = True
diff --git a/tests/models/vitdet/test_modeling_vitdet.py b/tests/models/vitdet/test_modeling_vitdet.py
index 11bb647e4acc..25c6d93fd1d0 100644
--- a/tests/models/vitdet/test_modeling_vitdet.py
+++ b/tests/models/vitdet/test_modeling_vitdet.py
@@ -164,8 +164,6 @@ class VitDetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (VitDetModel, VitDetBackbone) if is_torch_available() else ()
     pipeline_model_mapping = {"feature-extraction": VitDetModel} if is_torch_available() else {}
 
-    fx_compatible = False
-
     test_resize_embeddings = False
     test_torch_exportable = True
diff --git a/tests/models/vitmatte/test_modeling_vitmatte.py b/tests/models/vitmatte/test_modeling_vitmatte.py
index cb2538fe7ea2..64ff2e582b77 100644
--- a/tests/models/vitmatte/test_modeling_vitmatte.py
+++ b/tests/models/vitmatte/test_modeling_vitmatte.py
@@ -139,8 +139,6 @@ class VitMatteModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
     all_model_classes = (VitMatteForImageMatting,) if is_torch_available() else ()
     pipeline_model_mapping = {}
 
-    fx_compatible = False
-
     test_resize_embeddings = False
     test_torch_exportable = True
     test_torch_exportable_strictly = get_torch_major_and_minor_version() != "2.7"
diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py
index e9eb3d2a0a36..645204331bf7 100644
--- a/tests/models/vitpose/test_modeling_vitpose.py
+++ b/tests/models/vitpose/test_modeling_vitpose.py
@@ -150,7 +150,6 @@ class VitPoseModelTest(ModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (VitPoseForPoseEstimation,) if is_torch_available() else ()
-    fx_compatible = False
     test_resize_embeddings = False
     test_torch_exportable = True
diff --git a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
index d74a53fe0f9a..a3671b61b477 100644
--- a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
+++ b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
@@ -124,7 +124,6 @@ class VitPoseBackboneModelTest(ModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (VitPoseBackbone,) if is_torch_available() else ()
-    fx_compatible = False
     test_resize_embeddings = False
     test_torch_exportable = True
diff --git a/tests/models/vjepa2/test_modeling_vjepa2.py b/tests/models/vjepa2/test_modeling_vjepa2.py
index e6e77b6aaf99..dc3034f7a246 100644
--- a/tests/models/vjepa2/test_modeling_vjepa2.py
+++ b/tests/models/vjepa2/test_modeling_vjepa2.py
@@ -155,8 +155,6 @@ class VJEPA2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (VJEPA2Model, VJEPA2ForVideoClassification) if is_torch_available() else ()
 
-    fx_compatible = False
-
     pipeline_model_mapping = {}
     test_resize_embeddings = False
diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py
index f3c3443f05a8..e645070ffa31 100644
--- a/tests/models/wav2vec2/test_modeling_wav2vec2.py
+++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py
@@ -16,7 +16,6 @@
 import math
 import multiprocessing
 import os
-import pickle
 import tempfile
 import traceback
 import unittest
@@ -46,7 +45,6 @@
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
-    _config_zero_init,
     floats_tensor,
     ids_tensor,
     random_attention_mask,
@@ -89,9 +87,6 @@
     from transformers.models.wav2vec2_with_lm import processing_wav2vec2_with_lm
 
 
-from transformers.utils.fx import symbolic_trace
-
-
 def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout):
     error = None
     try:
@@ -497,7 +492,6 @@ class Wav2Vec2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         if is_torch_available()
         else {}
     )
-    fx_compatible = True
 
     def setUp(self):
         self.model_tester = Wav2Vec2ModelTester(self)
@@ -675,109 +669,6 @@ def test_model_from_pretrained(self):
         model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
         self.assertIsNotNone(model)
 
-    # Wav2Vec2 cannot be torchscripted because of group norm.
-    def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False):
-        # TODO: fix it
-        self.skipTest(reason="torch 2.1 breaks torch fx tests for wav2vec2/hubert.")
-
-        if not self.fx_compatible:
-            self.skipTest(reason="torch fx is not compatible with this model")
-
-        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-        configs_no_init.return_dict = False
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            model.to(torch_device)
-            model.eval()
-            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss)
-
-            try:
-                input_names = [
-                    "attention_mask",
-                    "bbox",
-                    "input_features",
-                    "input_ids",
-                    "input_values",
-                    "pixel_values",
-                    "token_type_ids",
-                    "visual_feats",
-                    "visual_pos",
-                ]
-
-                labels = inputs.get("labels", None)
-                start_positions = inputs.get("start_positions", None)
-                end_positions = inputs.get("end_positions", None)
-                if labels is not None:
-                    input_names.append("labels")
-                if start_positions is not None:
-                    input_names.append("start_positions")
-                if end_positions is not None:
-                    input_names.append("end_positions")
-
-                filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names}
-                input_names = list(filtered_inputs.keys())
-
-                model_output = model(**filtered_inputs)
-
-                if (
-                    isinstance(model, Wav2Vec2ForSequenceClassification)
-                    and not hasattr(model.config, "problem_type")
-                    or model.config.problem_type is None
-                ):
-                    model.config.problem_type = "single_label_classification"
-
-                traced_model = symbolic_trace(model, input_names)
-                traced_output = traced_model(**filtered_inputs)
-
-            except Exception as e:
-                self.fail(f"Couldn't trace module: {e}")
-
-            def flatten_output(output):
-                flatten = []
-                for x in output:
-                    if isinstance(x, (tuple, list)):
-                        flatten += flatten_output(x)
-                    elif not isinstance(x, torch.Tensor):
-                        continue
-                    else:
-                        flatten.append(x)
-                return flatten
-
-            model_output = flatten_output(model_output)
-            traced_output = flatten_output(traced_output)
-            num_outputs = len(model_output)
-
-            for i in range(num_outputs):
-                self.assertTrue(
-                    torch.allclose(model_output[i], traced_output[i]),
-                    f"traced {i}th output doesn't match model {i}th output for {model_class}",
-                )
-
-            # Test that the model can be serialized and restored properly
-            with tempfile.TemporaryDirectory() as tmp_dir_name:
-                pkl_file_name = os.path.join(tmp_dir_name, "model.pkl")
-                try:
-                    with open(pkl_file_name, "wb") as f:
-                        pickle.dump(traced_model, f)
-                    with open(pkl_file_name, "rb") as f:
-                        loaded = pickle.load(f)
-                except Exception as e:
-                    self.fail(f"Couldn't serialize / deserialize the traced model: {e}")
-
-                loaded_output = loaded(**filtered_inputs)
-                loaded_output = flatten_output(loaded_output)
-
-                for i in range(num_outputs):
-                    self.assertTrue(
-                        torch.allclose(model_output[i], loaded_output[i]),
-                        f"serialized model {i}th output doesn't match model {i}th output for {model_class}",
-                    )
-
-            # Avoid memory leak. Without this, each call increase RAM usage by ~20MB.
-            # (Even with this call, there are still memory leak by ~0.04MB)
-            self.clear_torch_jit_class_registry()
-
 
 @require_torch
 class Wav2Vec2RobustModelTest(ModelTesterMixin, unittest.TestCase):
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 8fcd66392f09..35d4a8ffd3ca 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -362,7 +362,6 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
         else {}
     )
     is_encoder_decoder = True
-    fx_compatible = False
     test_missing_keys = False
 
     # Needs higher percentages after model tester's vocab_size is changed to 200 (PR #21222)
@@ -3235,7 +3234,6 @@ def create_and_check_model_forward(self, config, inputs_dict, use_weighted_layer
 class WhisperEncoderModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (WhisperForAudioClassification,) if is_torch_available() else ()
     is_encoder_decoder = False
-    fx_compatible = False
     test_missing_keys = False
 
@@ -3503,8 +3501,6 @@ def create_and_check_decoder_model_attention_mask_past(self, config, input_ids):
 @require_torch
 class WhisperStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
     all_model_classes = (WhisperDecoder, WhisperForCausalLM) if is_torch_available() else ()
-    fx_comptatible = False
-
     is_encoder_decoder = False
     test_missing_keys = False
diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py
index fc47cc899d94..dfa7084403f8 100644
--- a/tests/models/x_clip/test_modeling_x_clip.py
+++ b/tests/models/x_clip/test_modeling_x_clip.py
@@ -147,7 +147,6 @@ class XCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (XCLIPVisionModel,) if is_torch_available() else ()
-    fx_compatible = False
     test_resize_embeddings = False
 
@@ -397,7 +396,6 @@ def prepare_config_and_inputs_for_common(self):
 
 @require_torch
 class XCLIPTextModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (XCLIPTextModel,) if is_torch_available() else ()
-    fx_compatible = False
 
     def setUp(self):
         self.model_tester = XCLIPTextModelTester(self)
@@ -517,7 +515,6 @@ def prepare_config_and_inputs_for_common(self):
 class XCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (XCLIPModel,) if is_torch_available() else ()
     pipeline_model_mapping = {"feature-extraction": XCLIPModel} if is_torch_available() else {}
-    fx_compatible = False
     test_resize_embeddings = False
     test_attention_outputs = False
diff --git a/tests/models/xglm/test_modeling_xglm.py b/tests/models/xglm/test_modeling_xglm.py
index 5fb1b3fb72a0..1eafa5cf535a 100644
--- a/tests/models/xglm/test_modeling_xglm.py
+++ b/tests/models/xglm/test_modeling_xglm.py
@@ -279,7 +279,6 @@ class XGLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
     pipeline_model_mapping = (
         {"feature-extraction": XGLMModel, "text-generation": XGLMForCausalLM} if is_torch_available() else {}
     )
-    fx_compatible = True
     test_missing_keys = False
 
     def setUp(self):
diff --git a/tests/models/xlnet/test_modeling_xlnet.py b/tests/models/xlnet/test_modeling_xlnet.py
index a25d76904734..54b59c55d4cc 100644
--- a/tests/models/xlnet/test_modeling_xlnet.py
+++ b/tests/models/xlnet/test_modeling_xlnet.py
@@ -531,7 +531,6 @@ class XLNetModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
         if is_torch_available()
         else {}
     )
-    fx_compatible = False  # TODO: Fix the failed tests
 
     def is_pipeline_test_to_skip(
diff --git a/tests/models/xlstm/test_modeling_xlstm.py b/tests/models/xlstm/test_modeling_xlstm.py
index 18e7d2b14ba0..918734c7f8ac 100644
--- a/tests/models/xlstm/test_modeling_xlstm.py
+++ b/tests/models/xlstm/test_modeling_xlstm.py
@@ -150,7 +150,6 @@ class xLSTMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
     all_model_classes = (xLSTMModel, xLSTMForCausalLM) if is_torch_available() else ()
     all_generative_model_classes = (xLSTMForCausalLM,) if is_torch_available() else ()
     has_attentions = False  # xLSTM does not support attentions
-    fx_compatible = False
     pipeline_model_mapping = (
         {"feature-extraction": xLSTMModel, "text-generation": xLSTMForCausalLM} if is_torch_available() else {}
     )
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 48100dc6d1ff..2d6d470d43ef 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -121,13 +121,9 @@
     from torch import nn
 
     from transformers import MODEL_MAPPING
-    from transformers.cache_utils import DynamicCache
     from transformers.modeling_utils import load_state_dict
     from transformers.pytorch_utils import id_tensor_storage
 
 
-from transformers.utils.fx import _FX_SUPPORTED_MODELS_WITH_KV_CACHE, symbolic_trace
-
-
 if is_deepspeed_available():
     import deepspeed
@@ -565,7 +561,6 @@ def sdpa_kernel(enable_flash, enable_math, enable_mem_efficient):
 class ModelTesterMixin:
     model_tester = None
     all_model_classes = ()
-    fx_compatible = False
     test_resize_embeddings = True
     test_resize_position_embeddings = False
     test_mismatched_shapes = True
@@ -1357,177 +1352,6 @@ def clear_torch_jit_class_registry(self):
         if hasattr(torch.jit._state, "_clear_class_state"):
             torch.jit._state._clear_class_state()
 
-    def test_torch_fx(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        self._create_and_check_torch_fx_tracing(config, inputs_dict)
-
-    def test_torch_fx_output_loss(self):
-        if self.all_model_classes[0].__name__ == "BloomModel":
-            self.skipTest(reason="Bloom currently has issues, @michaelbenayoun")
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        self._create_and_check_torch_fx_tracing(config, inputs_dict, output_loss=True)
-
-    def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False):
-        if not self.fx_compatible:
-            self.skipTest(f"The model type {config.model_type} is not compatible with torch.fx")
-
-        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-        configs_no_init.return_dict = False
-
-        for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            model.to(torch_device)
-            model.eval()
-            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss)
-
-            # We may want to test several inputs (various shapes, etc.).
-            inputs_to_test = [inputs]
-
-            if model.config.is_encoder_decoder:
-                model.config.use_cache = False  # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward
-                labels = inputs.get("labels", None)
-                input_names = [
-                    "attention_mask",
-                    "decoder_attention_mask",
-                    "decoder_input_ids",
-                    "input_features",
-                    "input_ids",
-                    "input_values",
-                ]
-                if labels is not None:
-                    input_names.append("labels")
-            else:
-                input_names = [
-                    "attention_mask",
-                    "bbox",
-                    "input_features",
-                    "input_ids",
-                    "input_values",
-                    "inputs_embeds",
-                    "pixel_values",
-                    "pixel_values_videos",
-                    "token_type_ids",
-                    "visual_feats",
-                    "visual_pos",
-                    "noise",
-                ]
-
-                labels = inputs.get("labels", None)
-                start_positions = inputs.get("start_positions", None)
-                end_positions = inputs.get("end_positions", None)
-                if labels is not None:
-                    input_names.append("labels")
-                if start_positions is not None:
-                    input_names.append("start_positions")
-                if end_positions is not None:
-                    input_names.append("end_positions")
-
-            if model.config.model_type in _FX_SUPPORTED_MODELS_WITH_KV_CACHE:
-                input_names.append("past_key_values")
-
-                # Generally model_tester.prepare_config_and_inputs_for_common seem not to generate past key values inputs.
-                if "past_key_values" not in inputs:
-                    batch_size = inputs[next(iter(inputs))].shape[0]
-                    num_heads = model.config.num_attention_heads
-                    head_dim = model.config.hidden_size // model.config.num_attention_heads
-
-                    cache_shape = (batch_size, num_heads, 0, head_dim)
-                    empty_pkv = DynamicCache(config=model.config)
-
-                    cache_length = 9
-                    cache_shape = (batch_size, num_heads, cache_length, head_dim)
-                    non_empty_pkv = tuple(
-                        (
-                            None,
-                            torch.rand(cache_shape, dtype=torch.float, device=torch_device),
-                            torch.rand(cache_shape, dtype=torch.float, device=torch_device),
-                        )
-                        for i in range(model.config.num_hidden_layers)
-                    )
-                    non_empty_pkv = DynamicCache(non_empty_pkv)
-
-                    inps = copy.deepcopy(inputs_to_test[0])
-
-                    inputs_to_test[0]["past_key_values"] = empty_pkv
-
-                    inps["past_key_values"] = non_empty_pkv
-                    inputs_to_test.append(inps)
-
-                    past_mask = torch.ones(batch_size, cache_length, device=torch_device, dtype=torch.float)
-                    inputs_to_test[1]["attention_mask"] = torch.cat(
-                        (past_mask, inputs_to_test[1]["attention_mask"]), dim=1
-                    )
-
-            forward_parameters = inspect.signature(model.forward).parameters
-            if "input_ids" in forward_parameters and "inputs_embeds" in forward_parameters:
-                inps = copy.deepcopy(inputs_to_test[0])
-
-                embedding_size = (
-                    model.config.embedding_size
-                    if getattr(model.config, "embedding_size", None) is not None
-                    and model.config.model_type != "megatron-bert"
-                    else model.config.hidden_size
-                )
-
-                if (
-                    model.config.model_type in MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES
-                    and model.__class__.__name__
-                    == MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES[model.config.model_type]
-                ):
-                    batch_size, num_choices, sequence_length = inputs["input_ids"].shape
-                    shape = (batch_size, num_choices, sequence_length, embedding_size)
-                elif inps["input_ids"].ndim == 2:
-                    batch_size, sequence_length = inputs["input_ids"].shape
-                    shape = (batch_size, sequence_length, embedding_size)
-                else:
-                    self.skipTest("Unknown case")
-
-                del inps["input_ids"]
-                inps["inputs_embeds"] = torch.rand(shape, dtype=torch.float, device=torch_device)
-                inputs_to_test.append(inps)
-
-            for inps in inputs_to_test:
-                filtered_inputs = {k: v for (k, v) in inps.items() if k in input_names}
-                input_names_to_trace = list(filtered_inputs.keys())
-
-                if model.__class__.__name__ in set(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES.values()) and (
-                    not hasattr(model.config, "problem_type") or model.config.problem_type is None
-                ):
-                    model.config.problem_type = "single_label_classification"
-
-                model.config.use_cache = "past_key_values" in input_names_to_trace
-
-                traced_model = symbolic_trace(model, input_names_to_trace)
-
-                with torch.no_grad():
-                    traced_output = traced_model(**filtered_inputs)
-                    model_output = model(**filtered_inputs)
-
-                def flatten_output(output):
-                    flatten = []
-                    for x in output:
-                        if isinstance(x, (tuple, list)):
-                            flatten += flatten_output(x)
-                        elif not isinstance(x, torch.Tensor):
-                            continue
-                        else:
-                            flatten.append(x)
-                    return flatten
-
-                model_output = flatten_output(model_output)
-                traced_output = flatten_output(traced_output)
-                num_outputs = len(model_output)
-
-                for i in range(num_outputs):
-                    self.assertTrue(
-                        torch.allclose(model_output[i], traced_output[i]),
-                        f"traced {i}th output doesn't match model {i}th output for {model_class}",
-                    )
-
-                # Avoid memory leak. Without this, each call increase RAM usage by ~20MB.
-                # (Even with this call, there are still memory leak by ~0.04MB)
-                self.clear_torch_jit_class_registry()
-
     def test_hidden_states_output(self):
         def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(copy.deepcopy(config))
diff --git a/utils/not_doctested.txt b/utils/not_doctested.txt
index 2f327a23634b..3b95f810729d 100644
--- a/utils/not_doctested.txt
+++ b/utils/not_doctested.txt
@@ -797,7 +797,6 @@ src/transformers/utils/dummy_sentencepiece_objects.py
 src/transformers/utils/dummy_speech_objects.py
 src/transformers/utils/dummy_tokenizers_objects.py
 src/transformers/utils/dummy_vision_objects.py
-src/transformers/utils/fx.py
 src/transformers/utils/generic.py
 src/transformers/utils/hp_naming.py
 src/transformers/utils/hub.py