From 7a1e59e0aa31104ed4632329ab73c8786ef85f2c Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 5 Aug 2025 08:16:06 +0000 Subject: [PATCH 01/14] refactor(inference): reorder automodels --- optimum/neuron/models/inference/auto_models.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/optimum/neuron/models/inference/auto_models.py b/optimum/neuron/models/inference/auto_models.py index 7859bf47d..949493727 100644 --- a/optimum/neuron/models/inference/auto_models.py +++ b/optimum/neuron/models/inference/auto_models.py @@ -58,28 +58,28 @@ class LLamaModelForCausalLM(LlamaNxDModelForCausalLM): pass -@register_neuron_model_for_inference("phi3", "text-generation") -class Phi3ModelForCausalLM(Phi3NxDModelForCausalLM): +@register_neuron_model_for_inference("mixtral", "text-generation") +class MixtralNeuronModelForCausalLM(MixtralNxDModelForCausalLM): """ - Phi3 model with NxD backend for inference on AWS Neuron. + Mixtral model with NxD backend for inference on AWS Neuron. """ pass -@register_neuron_model_for_inference("qwen2", "text-generation") -class Qwen2ModelForCausalLM(Qwen2NxDModelForCausalLM): +@register_neuron_model_for_inference("phi3", "text-generation") +class Phi3ModelForCausalLM(Phi3NxDModelForCausalLM): """ - Qwen2 model with NxD backend for inference on AWS Neuron. + Phi3 model with NxD backend for inference on AWS Neuron. """ pass -@register_neuron_model_for_inference("mixtral", "text-generation") -class MixtralNeuronModelForCausalLM(MixtralNxDModelForCausalLM): +@register_neuron_model_for_inference("qwen2", "text-generation") +class Qwen2ModelForCausalLM(Qwen2NxDModelForCausalLM): """ - Mixtral model with NxD backend for inference on AWS Neuron. + Qwen2 model with NxD backend for inference on AWS Neuron. 
""" pass From 10c54a98d800ba7f272a08c0d26395b2c910e0b4 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 5 Aug 2025 08:18:49 +0000 Subject: [PATCH 02/14] chore: bump transformers version --- optimum/neuron/models/training/granite/modeling_granite.py | 4 ++-- optimum/neuron/models/training/llama/modeling_llama.py | 4 ++-- pyproject.toml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/optimum/neuron/models/training/granite/modeling_granite.py b/optimum/neuron/models/training/granite/modeling_granite.py index 946f3f775..deab7b863 100644 --- a/optimum/neuron/models/training/granite/modeling_granite.py +++ b/optimum/neuron/models/training/granite/modeling_granite.py @@ -27,7 +27,7 @@ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from transformers.models.granite.configuration_granite import GraniteConfig from transformers.processing_utils import Unpack -from transformers.utils import LossKwargs, can_return_tuple, logging +from transformers.utils import TransformersKwargs, can_return_tuple, logging from ..config import TrainingNeuronConfig from ..llama.modeling_llama import ( @@ -225,7 +225,7 @@ def forward( return output -class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ... +class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ... 
class GraniteForCausalLM(LlamaForCausalLM): diff --git a/optimum/neuron/models/training/llama/modeling_llama.py b/optimum/neuron/models/training/llama/modeling_llama.py index 3f608c49c..9dab0d2a4 100644 --- a/optimum/neuron/models/training/llama/modeling_llama.py +++ b/optimum/neuron/models/training/llama/modeling_llama.py @@ -40,7 +40,7 @@ from transformers.models.llama.configuration_llama import LlamaConfig from transformers.processing_utils import Unpack from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS -from transformers.utils import LossKwargs, can_return_tuple, logging +from transformers.utils import TransformersKwargs, can_return_tuple, logging from ..config import TrainingNeuronConfig from ..loss_utils import ForCausalLMLoss @@ -812,7 +812,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( return causal_mask -class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ... +class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ... class LlamaForCausalLM(NeuronModelMixin, LlamaPreTrainedModel): diff --git a/pyproject.toml b/pyproject.toml index 6d460872d..22452e853 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", ] dependencies = [ - "transformers ~= 4.51.0", + "transformers ~= 4.55.4", "accelerate == 1.8.1", "optimum ~= 1.24.0", "huggingface_hub >= 0.29.0", From c6dcb50a6b76d55b3b1eb3d4375e64a4bac0d05f Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 5 Aug 2025 12:50:00 +0000 Subject: [PATCH 03/14] chore: bump vllm version --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 22452e853..242cd5836 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ neuronx = [ "wheel", "neuronx-cc==2.19.8089.0", "torch-neuronx==2.7.0.2.8.6734+ac864f72", - "torch==2.7.0.*", + "torch==2.7.1.*", "torchvision==0.22.*", "neuronx_distributed==0.13.14393", 
"libneuronxla==2.2.4410.0", @@ -114,7 +114,7 @@ sentence-transformers = [ "sentence-transformers >= 2.2.0", ] vllm = [ - "vllm == 0.9.2", + "vllm == 0.10.0", ] [project.scripts] From f0700e05d8e6c86ca3716af85c6983695bbab720 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 5 Aug 2025 14:19:44 +0000 Subject: [PATCH 04/14] test(vllm): device argument is deprecated --- tests/decoder/test_vllm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/decoder/test_vllm.py b/tests/decoder/test_vllm.py index 6e732af00..34f6abf1d 100644 --- a/tests/decoder/test_vllm.py +++ b/tests/decoder/test_vllm.py @@ -42,7 +42,7 @@ def _test_vllm_generation(llm): def test_vllm_from_neuron_model(base_neuron_decoder_path): """Test vLLm generation on a single model exported locally.""" - llm = LLM(model=base_neuron_decoder_path, device="neuron") + llm = LLM(model=base_neuron_decoder_path) _test_vllm_generation(llm) @@ -56,7 +56,6 @@ def test_vllm_from_hub_model(neuron_decoder_config): max_model_len=export_kwargs["sequence_length"], tensor_parallel_size=export_kwargs["num_cores"], dtype=DTYPE_MAPPER.pt(export_kwargs["auto_cast_type"]), - device="neuron", ) _test_vllm_generation(llm) @@ -68,7 +67,6 @@ def test_vllm_greedy_expectations(neuron_decoder_config): llm = LLM( model=neuron_decoder_config["neuron_model_path"], max_num_seqs=neuron_decoder_config["export_kwargs"]["batch_size"], - device="neuron", ) # Send more prompts than the compiled batch size (4) and request # varying generation lengths to test continuous batching. 
From 431556bb0799107f01a87a62a54e516649c043c5 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 5 Aug 2025 12:53:18 +0000 Subject: [PATCH 05/14] chore: bump dev version --- optimum/neuron/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/neuron/version.py b/optimum/neuron/version.py index 1a8863a03..d447f9c78 100644 --- a/optimum/neuron/version.py +++ b/optimum/neuron/version.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.3.1.dev0" +__version__ = "0.3.1.dev1" __sdk_version__ = "2.24.0" From 28c679abec3f0826a0cb1d463d214aa46afd3846 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 5 Aug 2025 08:19:21 +0000 Subject: [PATCH 06/14] feat(inference): add SmolLM3 --- .../neuron/models/inference/auto_models.py | 10 ++ .../inference/smollm3/modeling_smollm3.py | 116 ++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 optimum/neuron/models/inference/smollm3/modeling_smollm3.py diff --git a/optimum/neuron/models/inference/auto_models.py b/optimum/neuron/models/inference/auto_models.py index 949493727..3126ae8b2 100644 --- a/optimum/neuron/models/inference/auto_models.py +++ b/optimum/neuron/models/inference/auto_models.py @@ -28,6 +28,7 @@ from .phi3.modeling_phi3 import Phi3NxDModelForCausalLM from .qwen2.modeling_qwen2 import Qwen2NxDModelForCausalLM from .qwen3.modeling_qwen3 import Qwen3NxDModelForCausalLM +from .smollm3.modeling_smollm3 import SmolLM3NxDModelForCausalLM prioritize_hlo_backend = os.environ.get("OPTIMUM_NEURON_PRIORITIZE_HLO_BACKEND", "0") == "1" @@ -92,3 +93,12 @@ class Qwen3NeuronModelForCausalLM(Qwen3NxDModelForCausalLM): """ pass + + +@register_neuron_model_for_inference("smollm3", "text-generation") +class SmolLM3NeuronModelForCausalLM(SmolLM3NxDModelForCausalLM): + """ + SmolLM3 model with NxD backend for inference on AWS Neuron. 
+ """ + + pass diff --git a/optimum/neuron/models/inference/smollm3/modeling_smollm3.py b/optimum/neuron/models/inference/smollm3/modeling_smollm3.py new file mode 100644 index 000000000..db6602ad2 --- /dev/null +++ b/optimum/neuron/models/inference/smollm3/modeling_smollm3.py @@ -0,0 +1,116 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch SmolLM3 model for NXD inference.""" + +import logging + +from neuronx_distributed.parallel_layers.layers import ( + ColumnParallelLinear, + ParallelEmbedding, +) +from torch import nn +from transformers.models.smollm3.configuration_smollm3 import SmolLM3Config + +from ..backend.config import NxDNeuronConfig # noqa: E402 +from ..backend.modules.attention.attention_base import NeuronAttentionBase +from ..backend.modules.attention.utils import RotaryEmbedding +from ..backend.modules.custom_calls import CustomRMSNorm +from ..backend.modules.decoder import NxDDecoderModel +from ..llama.modeling_llama import ( + LlamaNxDModelForCausalLM, + NeuronLlamaDecoderLayer, +) + + +logger = logging.getLogger("Neuron") + + +class NeuronSmolLM3Attention(NeuronAttentionBase): + """ + The only difference with the NeuronAttentionBase is the definition of the SmolLM3 rotary embedding + """ + + def __init__( + self, + config: SmolLM3Config, + neuron_config: NxDNeuronConfig, + layer_idx: int, + qkv_proj_bias: bool | None = False, + o_proj_bias: bool | None = 
False, + qk_scale: float | None = None, + ): + if config.use_sliding_window: + raise ValueError("SmolLM3 for Neuron does not support sliding window attention.") + if getattr(config, "rope_scaling", None) is not None: + raise ValueError("SmolLM3 for Neuron does not support rope scaling.") + super().__init__( + config, neuron_config, qkv_proj_bias=qkv_proj_bias, o_proj_bias=o_proj_bias, qk_scale=qk_scale + ) + if config.no_rope_layers[layer_idx]: + # Yes, the condition is slightly counter-intuitive, but that is the transformers convention + head_dim = config.hidden_size // config.num_attention_heads + self.rotary_emb = RotaryEmbedding( + head_dim, + max_position_embeddings=config.max_position_embeddings, + base=config.rope_theta, + ) + else: + self.rotary_emb = None + + +class NeuronSmolLM3DecoderLayer(NeuronLlamaDecoderLayer): + def __init__(self, config: SmolLM3Config, neuron_config: NxDNeuronConfig, layer_idx: int): + super().__init__(config, neuron_config) + self.self_attn = NeuronSmolLM3Attention(config, neuron_config, layer_idx) + + +class NxDSmolLM3Model(NxDDecoderModel): + """ + The neuron version of the SmolLM3Model + """ + + def __init__(self, config: SmolLM3Config, neuron_config: NxDNeuronConfig): + super().__init__(config, neuron_config) + + self.embed_tokens = ParallelEmbedding( + config.vocab_size, + config.hidden_size, + config.pad_token_id, + dtype=neuron_config.torch_dtype, + shard_across_embedding=not neuron_config.vocab_parallel, + sequence_parallel_enabled=False, + pad=True, + use_spmd_rank=neuron_config.vocab_parallel, + ) + + self.lm_head = ColumnParallelLinear( + config.hidden_size, + config.vocab_size, + gather_output=not neuron_config.on_device_sampling, + bias=False, + pad=True, + ) + + self.layers = nn.ModuleList( + [ + NeuronSmolLM3DecoderLayer(config, neuron_config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.norm = CustomRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + +class 
SmolLM3NxDModelForCausalLM(LlamaNxDModelForCausalLM): + _model_cls = NxDSmolLM3Model From a7fadc2ff90394e7949af6803f86397bca629e3f Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Fri, 8 Aug 2025 09:13:17 +0000 Subject: [PATCH 07/14] test(decoder): add smollm3 tests --- tests/decoder/conftest.py | 9 +++++++++ tests/decoder/test_decoder_export.py | 1 + tests/decoder/test_decoder_generation.py | 1 + tests/decoder/test_vllm.py | 8 ++++++++ 4 files changed, 19 insertions(+) diff --git a/tests/decoder/conftest.py b/tests/decoder/conftest.py index 6a7a16868..de0ba0405 100644 --- a/tests/decoder/conftest.py +++ b/tests/decoder/conftest.py @@ -72,6 +72,15 @@ "auto_cast_type": "bf16", }, }, + "smollm3": { + "model_id": "HuggingFaceTB/SmolLM3-3B", + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + "num_cores": 2, + "auto_cast_type": "bf16", + }, + }, } diff --git a/tests/decoder/test_decoder_export.py b/tests/decoder/test_decoder_export.py index 2d0d97c78..489732b29 100644 --- a/tests/decoder/test_decoder_export.py +++ b/tests/decoder/test_decoder_export.py @@ -30,6 +30,7 @@ "granite": "hf-internal-testing/tiny-random-GraniteForCausalLM", "phi3": "yujiepan/phi-4-tiny-random", "mixtral": "dacorvo/Mixtral-tiny", + "smollm3": "HuggingFaceTB/SmolLM3-3B", } diff --git a/tests/decoder/test_decoder_generation.py b/tests/decoder/test_decoder_generation.py index 7557f476b..45db4369c 100644 --- a/tests/decoder/test_decoder_generation.py +++ b/tests/decoder/test_decoder_generation.py @@ -105,6 +105,7 @@ def test_decoder_generation_greedy_expectations(neuron_decoder_config): "qwen3": " What is the difference between Deep Learning and Machine Learning?\n\nDeep Learning is a subset of", "granite": "\n\nDeep Learning is a subset of machine learning that is inspired by the structure and", "phi": "\n\nDeep learning is a subfield of machine learning that focuses on creating", + "smollm3": " Deep learning is a subset of machine learning that uses neural networks with 
many layers to learn", } config_name = neuron_decoder_config["name"] generated_text = tokenizer.decode(outputs[0]) diff --git a/tests/decoder/test_vllm.py b/tests/decoder/test_vllm.py index 34f6abf1d..778620f15 100644 --- a/tests/decoder/test_vllm.py +++ b/tests/decoder/test_vllm.py @@ -131,6 +131,14 @@ def test_vllm_greedy_expectations(neuron_decoder_config): " due to the absorption of light by the atmosphere.", " the time I was in the first grade. I remember the day I got the first grade, the", ], + "smollm3": [ + " the head of state and government of the United States", + " Paris. The Eiffel Tower is in Paris. The Eiffel Tower is a famous landmark", + " It was a special day, for it was the first of April, and people were putting the finishing touches to their April Fools' Day pranks", + " to be happy. I believe that happiness is the most important thing in life. I believe that happiness is not just a feeling, but a state of being. I believe that happiness is not something that", + " blue because of Rayleigh scattering. This is the", + " of my grandmother, who was a wonderful cook, making a delicious chicken and dumplings soup. 
She", + ], }[neuron_decoder_config["name"]] for expected_output, output in zip(expected_outputs, outputs): From 0e393aac9f08f51c45349b33d37532f5b1e9acd2 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Fri, 8 Aug 2025 09:40:31 +0000 Subject: [PATCH 08/14] ci: add smollm3 models to cache workflow --- .github/workflows/inference_cache_llm.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/inference_cache_llm.yml b/.github/workflows/inference_cache_llm.yml index b73c48d9a..8b5a5ce6d 100644 --- a/.github/workflows/inference_cache_llm.yml +++ b/.github/workflows/inference_cache_llm.yml @@ -32,6 +32,7 @@ jobs: llama3.1-70b, qwen2.5-large, llama-variants, + smollm3, ] steps: - name: Checkout From 81439780174f01699bd9844a322768a7f56f6cdc Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Fri, 8 Aug 2025 12:30:24 +0000 Subject: [PATCH 09/14] fix(Mixtral): workaround null head_dim --- .../inference/backend/modules/decoder/modeling_decoder.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/optimum/neuron/models/inference/backend/modules/decoder/modeling_decoder.py b/optimum/neuron/models/inference/backend/modules/decoder/modeling_decoder.py index b28437465..dd99dcd52 100644 --- a/optimum/neuron/models/inference/backend/modules/decoder/modeling_decoder.py +++ b/optimum/neuron/models/inference/backend/modules/decoder/modeling_decoder.py @@ -845,6 +845,9 @@ def export( ) # Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type config.torch_dtype = neuron_config.torch_dtype + # Evaluate head_dim if it is defined but set to null (like in Mixtral for transformers 4.54+) + if hasattr(config, "head_dim") and config.head_dim is None: + config.head_dim = config.hidden_size // config.num_attention_heads context_encoding_model, token_generation_model, speculation_model = cls.create_model_wrappers( model_cls=cls._model_cls, config=config, From 52a50fc2321fccc5731ad5acf5ef28b61169c564 Mon Sep 17 
00:00:00 2001 From: David Corvoysier Date: Fri, 8 Aug 2025 13:19:26 +0000 Subject: [PATCH 10/14] fix(pipeline): increase minimum sequence_length Starting from transformers 4.54, there is an error when compiling Qwen2.5-0.5M with a sequence length of 128. This is a very unlikely configuration, and not one we want to cache. The pipeline code is therefore modified to align on default values that are actually tested in the NeuronModelForCausalLM export tests. --- optimum/neuron/pipelines/transformers/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/neuron/pipelines/transformers/base.py b/optimum/neuron/pipelines/transformers/base.py index bc3ab886d..6049afff1 100644 --- a/optimum/neuron/pipelines/transformers/base.py +++ b/optimum/neuron/pipelines/transformers/base.py @@ -274,7 +274,8 @@ def pipeline( if export: if neuron_config is not None: raise ValueError("This model has already been exported to Neuron format") - if not input_shapes: + # Decoder models can select default input shapes from the config + if task != "text-generation" and not input_shapes: input_shapes = {"batch_size": 1, "sequence_length": 128} logger.warning(f"No input shapes provided, using default shapes, {input_shapes}") else: From af17ad48c725e90cdebfda3ea8ee8a015984b75a Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 26 Aug 2025 14:35:47 +0000 Subject: [PATCH 11/14] fix: do not return dict in CLIP models CLIP models used in SD pipelines do not specify return_dict in their config but the tracing fails if return_dict is True, which is now the default in transformers. 
--- optimum/exporters/neuron/model_configs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 83f9c3a69..a2884e722 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -19,6 +19,7 @@ import os from functools import partial from pathlib import Path +from typing import Any import neuronx_distributed import torch @@ -354,6 +355,10 @@ def outputs(self) -> list[str]: return common_outputs + @property + def values_override(self) -> dict[str, Any] | None: + return {"return_dict": False} + @register_in_tasks_manager("clip-text-model", *["feature-extraction"], library_name="diffusers") class CLIPTextNeuronConfig(CLIPTextWithProjectionNeuronConfig): From ea64e279826c10770a6c6c93b8e34071a8ea5332 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 26 Aug 2025 15:06:22 +0000 Subject: [PATCH 12/14] fix(t5): explicitly convert past_key_values to a Cache In the latest transformers version, it is not done automatically anymore. 
--- optimum/exporters/neuron/model_wrappers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index 9d277a131..e5a1425b7 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -17,6 +17,7 @@ from typing import TYPE_CHECKING import torch +from transformers.cache_utils import EncoderDecoderCache from transformers.models.t5.modeling_t5 import T5LayerCrossAttention from ...neuron.utils import is_neuronx_available @@ -621,7 +622,7 @@ def forward( decoder_output = self.model.decoder( input_ids=input_ids, attention_mask=decoder_attention_mask, - past_key_values=past_key_values, + past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values), encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, use_cache=True, From d665dc8fd268afcfb1b45f0db221a31ac0045858 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 27 Aug 2025 11:40:44 +0000 Subject: [PATCH 13/14] fix(t5): adapt T5 attention custom modeling The latest T5Block layer in transformers does not expect the past_key_value to be returned by the T5Attention anymore. 
--- optimum/neuron/models/inference/t5/modeling_t5.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/neuron/models/inference/t5/modeling_t5.py b/optimum/neuron/models/inference/t5/modeling_t5.py index b39e91e77..15c3e8c67 100644 --- a/optimum/neuron/models/inference/t5/modeling_t5.py +++ b/optimum/neuron/models/inference/t5/modeling_t5.py @@ -257,7 +257,7 @@ def forward( attn_output = attn_output.view(batch_size, -1, self.hidden_size_per_partition) attn_output = self.o(attn_output) - outputs = (attn_output, past_key_value, position_bias) + outputs = (attn_output, position_bias) if output_attentions: outputs = outputs + (attn_weights,) From 1932d2885325d4cdde52eab5dcfbd239991ee4c0 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 27 Aug 2025 12:18:22 +0000 Subject: [PATCH 14/14] fix(generation): call to non-existent method --- optimum/neuron/generation/utils.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index 2cd0921ba..fd654b492 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -233,9 +233,6 @@ def generate( generation_config: GenerationConfig | None = None, **kwargs, ): - # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call - self._validate_model_class() - # priority: `generation_config` argument > `model.generation_config` (the default generation config) if generation_config is None: # legacy: users may modify the model configuration to control generation. To trigger this legacy behavior, @@ -623,9 +620,6 @@ def generate( - [`~generation.BeamSampleEncoderDecoderOutput`] """ - # 1. 
Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call - self._validate_model_class() - # priority: `generation_config` argument > `model.generation_config` (the default generation config) if generation_config is None: # legacy: users may modify the model configuration to control generation -- update the generation config