Merged
1 change: 1 addition & 0 deletions .github/workflows/inference_cache_llm.yml
@@ -32,6 +32,7 @@ jobs:
llama3.1-70b,
qwen2.5-large,
llama-variants,
smollm3,
]
steps:
- name: Checkout
5 changes: 5 additions & 0 deletions optimum/exporters/neuron/model_configs.py
@@ -19,6 +19,7 @@
import os
from functools import partial
from pathlib import Path
from typing import Any

import neuronx_distributed
import torch
@@ -354,6 +355,10 @@ def outputs(self) -> list[str]:

return common_outputs

@property
def values_override(self) -> dict[str, Any] | None:
return {"return_dict": False}


@register_in_tasks_manager("clip-text-model", *["feature-extraction"], library_name="diffusers")
class CLIPTextNeuronConfig(CLIPTextWithProjectionNeuronConfig):
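For reference, `values_override` entries are applied to the exported model's config before tracing; returning `{"return_dict": False}` makes the traced module emit plain tuples instead of `ModelOutput` objects. A minimal sketch of how such overrides are typically consumed (the helper name is hypothetical, not the actual optimum internals):

```python
def apply_values_override(model, neuron_config):
    # Hypothetical helper: copy each override onto the model config before
    # tracing. With return_dict=False, the model returns plain tuples, which
    # torch tracing can handle.
    for attr_name, value in (neuron_config.values_override or {}).items():
        setattr(model.config, attr_name, value)
```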
3 changes: 2 additions & 1 deletion optimum/exporters/neuron/model_wrappers.py
@@ -17,6 +17,7 @@
from typing import TYPE_CHECKING

import torch
from transformers.cache_utils import EncoderDecoderCache
from transformers.models.t5.modeling_t5 import T5LayerCrossAttention

from ...neuron.utils import is_neuronx_available
@@ -621,7 +622,7 @@ def forward(
decoder_output = self.model.decoder(
input_ids=input_ids,
attention_mask=decoder_attention_mask,
past_key_values=past_key_values,
past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values),
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=True,
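The one-line change above reflects that recent transformers decoders expect a `Cache` object rather than legacy tuples for `past_key_values`. A minimal sketch of the conversion, with made-up tensor shapes:

```python
import torch
from transformers.cache_utils import EncoderDecoderCache

# Made-up shapes (batch=1, heads=8, past_len=16, head_dim=64); the legacy
# format is one (self_key, self_value, cross_key, cross_value) tuple per
# decoder layer.
legacy_cache = tuple(
    tuple(torch.zeros(1, 8, 16, 64) for _ in range(4)) for _layer in range(2)
)
cache = EncoderDecoderCache.from_legacy_cache(legacy_cache)
# `cache` wraps two DynamicCache objects (self- and cross-attention), the
# format recent transformers decoders expect as `past_key_values`.
```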
6 changes: 0 additions & 6 deletions optimum/neuron/generation/utils.py
@@ -233,9 +233,6 @@ def generate(
generation_config: GenerationConfig | None = None,
**kwargs,
):
# 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
self._validate_model_class()

# priority: `generation_config` argument > `model.generation_config` (the default generation config)
if generation_config is None:
# legacy: users may modify the model configuration to control generation. To trigger this legacy behavior,
@@ -623,9 +620,6 @@ def generate(
- [`~generation.BeamSampleEncoderDecoderOutput`]
"""

# 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
self._validate_model_class()

# priority: `generation_config` argument > `model.generation_config` (the default generation config)
if generation_config is None:
# legacy: users may modify the model configuration to control generation -- update the generation config
22 changes: 16 additions & 6 deletions optimum/neuron/models/inference/auto_models.py
@@ -28,6 +28,7 @@
from .phi3.modeling_phi3 import Phi3NxDModelForCausalLM
from .qwen2.modeling_qwen2 import Qwen2NxDModelForCausalLM
from .qwen3.modeling_qwen3 import Qwen3NxDModelForCausalLM
from .smollm3.modeling_smollm3 import SmolLM3NxDModelForCausalLM


prioritize_hlo_backend = os.environ.get("OPTIMUM_NEURON_PRIORITIZE_HLO_BACKEND", "0") == "1"
@@ -58,6 +59,15 @@ class LLamaModelForCausalLM(LlamaNxDModelForCausalLM):
pass


@register_neuron_model_for_inference("mixtral", "text-generation")
class MixtralNeuronModelForCausalLM(MixtralNxDModelForCausalLM):
"""
Mixtral model with NxD backend for inference on AWS Neuron.
"""

pass


@register_neuron_model_for_inference("phi3", "text-generation")
class Phi3ModelForCausalLM(Phi3NxDModelForCausalLM):
"""
@@ -76,19 +86,19 @@ class Qwen2ModelForCausalLM(Qwen2NxDModelForCausalLM):
pass


@register_neuron_model_for_inference("mixtral", "text-generation")
class MixtralNeuronModelForCausalLM(MixtralNxDModelForCausalLM):
@register_neuron_model_for_inference("qwen3", "text-generation")
class Qwen3NeuronModelForCausalLM(Qwen3NxDModelForCausalLM):
"""
Mixtral model with NxD backend for inference on AWS Neuron.
Qwen3 model with NxD backend for inference on AWS Neuron.
"""

pass


@register_neuron_model_for_inference("qwen3", "text-generation")
class Qwen3NeuronModelForCausalLM(Qwen3NxDModelForCausalLM):
@register_neuron_model_for_inference("smollm3", "text-generation")
class SmolLM3NeuronModelForCausalLM(SmolLM3NxDModelForCausalLM):
"""
Qwen3 model with NxD backend for inference on AWS Neuron.
SmolLM3 model with NxD backend for inference on AWS Neuron.
"""

pass
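For context, the `register_neuron_model_for_inference` decorator used throughout this file maps a (model_type, task) pair to an implementation class so the auto classes can dispatch on a checkpoint's config. A rough sketch of the pattern (illustrative, not the actual optimum-neuron machinery):

```python
# Illustrative registry sketch -- not the actual optimum-neuron internals.
_NEURON_MODEL_REGISTRY: dict[tuple[str, str], type] = {}

def register_neuron_model_for_inference(model_type: str, task: str):
    def wrapper(cls: type) -> type:
        _NEURON_MODEL_REGISTRY[(model_type, task)] = cls
        return cls
    return wrapper

# Dispatch then reduces to a lookup keyed on the checkpoint's model_type:
# _NEURON_MODEL_REGISTRY[("smollm3", "text-generation")]
```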
@@ -845,6 +845,9 @@ def export(
)
# Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type
config.torch_dtype = neuron_config.torch_dtype
# Compute head_dim if it is defined but set to None (as in Mixtral with transformers 4.54+)
if hasattr(config, "head_dim") and config.head_dim is None:
config.head_dim = config.hidden_size // config.num_attention_heads
context_encoding_model, token_generation_model, speculation_model = cls.create_model_wrappers(
model_cls=cls._model_cls,
config=config,
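Illustration of the fallback above: transformers 4.54+ can leave `head_dim` as `None` in some configs (e.g. Mixtral), while neuronx_distributed needs a concrete value.

```python
from transformers import AutoConfig

# Illustrative only: resolve a null head_dim the same way the export path
# above does. Mixtral-style dimensions: 4096 hidden size / 32 heads -> 128.
config = AutoConfig.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
if getattr(config, "head_dim", None) is None:
    config.head_dim = config.hidden_size // config.num_attention_heads
```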
116 changes: 116 additions & 0 deletions optimum/neuron/models/inference/smollm3/modeling_smollm3.py
@@ -0,0 +1,116 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch SmolLM3 model for NXD inference."""

import logging

from neuronx_distributed.parallel_layers.layers import (
ColumnParallelLinear,
ParallelEmbedding,
)
from torch import nn
from transformers.models.smollm3.configuration_smollm3 import SmolLM3Config

from ..backend.config import NxDNeuronConfig # noqa: E402
from ..backend.modules.attention.attention_base import NeuronAttentionBase
from ..backend.modules.attention.utils import RotaryEmbedding
from ..backend.modules.custom_calls import CustomRMSNorm
from ..backend.modules.decoder import NxDDecoderModel
from ..llama.modeling_llama import (
LlamaNxDModelForCausalLM,
NeuronLlamaDecoderLayer,
)


logger = logging.getLogger("Neuron")


class NeuronSmolLM3Attention(NeuronAttentionBase):
"""
The only difference from NeuronAttentionBase is the definition of the SmolLM3 rotary embedding.
"""

def __init__(
self,
config: SmolLM3Config,
neuron_config: NxDNeuronConfig,
layer_idx: int,
qkv_proj_bias: bool | None = False,
o_proj_bias: bool | None = False,
qk_scale: float | None = None,
):
if config.use_sliding_window:
raise ValueError("SmolLM3 for Neuron does not support sliding window attention.")
if getattr(config, "rope_scaling", None) is not None:
raise ValueError("SmolLM3 for Neuron does not support rope scaling.")
super().__init__(
config, neuron_config, qkv_proj_bias=qkv_proj_bias, o_proj_bias=o_proj_bias, qk_scale=qk_scale
)
if config.no_rope_layers[layer_idx]:
# Yes, the condition is slightly counter-intuitive, but that is the transformers convention
head_dim = config.hidden_size // config.num_attention_heads
self.rotary_emb = RotaryEmbedding(
head_dim,
max_position_embeddings=config.max_position_embeddings,
base=config.rope_theta,
)
else:
self.rotary_emb = None


class NeuronSmolLM3DecoderLayer(NeuronLlamaDecoderLayer):
def __init__(self, config: SmolLM3Config, neuron_config: NxDNeuronConfig, layer_idx: int):
super().__init__(config, neuron_config)
self.self_attn = NeuronSmolLM3Attention(config, neuron_config, layer_idx)


class NxDSmolLM3Model(NxDDecoderModel):
"""
The neuron version of the SmolLM3Model
"""

def __init__(self, config: SmolLM3Config, neuron_config: NxDNeuronConfig):
super().__init__(config, neuron_config)

self.embed_tokens = ParallelEmbedding(
config.vocab_size,
config.hidden_size,
config.pad_token_id,
dtype=neuron_config.torch_dtype,
shard_across_embedding=not neuron_config.vocab_parallel,
sequence_parallel_enabled=False,
pad=True,
use_spmd_rank=neuron_config.vocab_parallel,
)

self.lm_head = ColumnParallelLinear(
config.hidden_size,
config.vocab_size,
gather_output=not neuron_config.on_device_sampling,
bias=False,
pad=True,
)

self.layers = nn.ModuleList(
[
NeuronSmolLM3DecoderLayer(config, neuron_config, layer_idx)
for layer_idx in range(config.num_hidden_layers)
]
)
self.norm = CustomRMSNorm(config.hidden_size, eps=config.rms_norm_eps)


class SmolLM3NxDModelForCausalLM(LlamaNxDModelForCausalLM):
_model_cls = NxDSmolLM3Model
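For reference, the `no_rope_layers` convention checked in `NeuronSmolLM3Attention` above follows transformers' SmolLM3Config: a 1 means the layer does apply RoPE, a 0 marks a NoPE layer. A sketch of how the default mask is derived, assuming SmolLM3's documented interval of 4:

```python
# Sketch of the default `no_rope_layers` mask (assumption: a NoPE layer
# every 4th layer): 1 -> the layer applies RoPE, 0 -> a NoPE layer.
no_rope_layer_interval = 4
num_hidden_layers = 36  # SmolLM3-3B
no_rope_layers = [
    int((layer_idx + 1) % no_rope_layer_interval != 0)
    for layer_idx in range(num_hidden_layers)
]
# Layers 3, 7, 11, ... end up with rotary_emb = None in the class above.
```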
2 changes: 1 addition & 1 deletion optimum/neuron/models/inference/t5/modeling_t5.py
@@ -257,7 +257,7 @@ def forward(
attn_output = attn_output.view(batch_size, -1, self.hidden_size_per_partition)
attn_output = self.o(attn_output)

outputs = (attn_output, past_key_value, position_bias)
outputs = (attn_output, position_bias)

if output_attentions:
outputs = outputs + (attn_weights,)
4 changes: 2 additions & 2 deletions optimum/neuron/models/training/granite/modeling_granite.py
@@ -27,7 +27,7 @@
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from transformers.models.granite.configuration_granite import GraniteConfig
from transformers.processing_utils import Unpack
from transformers.utils import LossKwargs, can_return_tuple, logging
from transformers.utils import TransformersKwargs, can_return_tuple, logging

from ..config import TrainingNeuronConfig
from ..llama.modeling_llama import (
@@ -225,7 +225,7 @@ def forward(
return output


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...


class GraniteForCausalLM(LlamaForCausalLM):
4 changes: 2 additions & 2 deletions optimum/neuron/models/training/llama/modeling_llama.py
@@ -40,7 +40,7 @@
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.processing_utils import Unpack
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.utils import LossKwargs, can_return_tuple, logging
from transformers.utils import TransformersKwargs, can_return_tuple, logging

from ..config import TrainingNeuronConfig
from ..loss_utils import ForCausalLMLoss
@@ -812,7 +812,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
return causal_mask


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...


class LlamaForCausalLM(NeuronModelMixin, LlamaPreTrainedModel):
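`LossKwargs` was renamed to `TransformersKwargs` in recent transformers releases, hence the one-line change in both training models. The usage pattern is unchanged; roughly:

```python
from transformers.processing_utils import Unpack
from transformers.utils import TransformersKwargs

# Rough sketch: TransformersKwargs is a TypedDict, so subclasses stay plain
# TypedDicts and Unpack lets forward() accept the extra kwargs type-safely.
class KwargsForCausalLM(TransformersKwargs): ...

def forward(input_ids, **kwargs: Unpack[KwargsForCausalLM]):
    ...
```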
3 changes: 2 additions & 1 deletion optimum/neuron/pipelines/transformers/base.py
@@ -274,7 +274,8 @@ def pipeline(
if export:
if neuron_config is not None:
raise ValueError("This model has already been exported to Neuron format")
if not input_shapes:
# Decoder models can select default input shapes from the config
if task != "text-generation" and not input_shapes:
input_shapes = {"batch_size": 1, "sequence_length": 128}
logger.warning(f"No input shapes provided, using default shapes, {input_shapes}")
else:
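With this change, a text-generation pipeline export no longer requires explicit shapes, since decoder models can derive defaults from their config. A hedged usage sketch (model id illustrative):

```python
from optimum.neuron import pipeline

# Sketch of the behavior enabled above: for text-generation, no
# input_shapes need to be passed at export time; decoder models pick
# defaults from the config instead of the generic
# {"batch_size": 1, "sequence_length": 128} fallback.
pipe = pipeline("text-generation", model="HuggingFaceTB/SmolLM3-3B", export=True)
print(pipe("Deep learning is")[0]["generated_text"])
```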
2 changes: 1 addition & 1 deletion optimum/neuron/version.py
@@ -12,6 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.3.1.dev0"
__version__ = "0.3.1.dev1"

__sdk_version__ = "2.24.0"
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -37,7 +37,7 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
"transformers ~= 4.51.0",
"transformers ~= 4.55.4",
"accelerate == 1.8.1",
"optimum ~= 1.24.0",
"huggingface_hub >= 0.29.0",
@@ -101,7 +101,7 @@ neuronx = [
"wheel",
"neuronx-cc==2.19.8089.0",
"torch-neuronx==2.7.0.2.8.6734+ac864f72",
"torch==2.7.0.*",
"torch==2.7.1.*",
"torchvision==0.22.*",
"neuronx_distributed==0.13.14393",
"libneuronxla==2.2.4410.0",
@@ -114,7 +114,7 @@ sentence-transformers = [
"sentence-transformers >= 2.2.0",
]
vllm = [
"vllm == 0.9.2",
"vllm == 0.10.0",
]

[project.scripts]
9 changes: 9 additions & 0 deletions tests/decoder/conftest.py
@@ -72,6 +72,15 @@
"auto_cast_type": "bf16",
},
},
"smollm3": {
"model_id": "HuggingFaceTB/SmolLM3-3B",
"export_kwargs": {
"batch_size": 4,
"sequence_length": 4096,
"num_cores": 2,
"auto_cast_type": "bf16",
},
},
}


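The new conftest entry corresponds to a direct export along these lines (a sketch using the same kwargs):

```python
from optimum.neuron import NeuronModelForCausalLM

# Sketch of the export described by the new "smollm3" entry above.
model = NeuronModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM3-3B",
    export=True,
    batch_size=4,
    sequence_length=4096,
    num_cores=2,
    auto_cast_type="bf16",
)
```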
1 change: 1 addition & 0 deletions tests/decoder/test_decoder_export.py
@@ -30,6 +30,7 @@
"granite": "hf-internal-testing/tiny-random-GraniteForCausalLM",
"phi3": "yujiepan/phi-4-tiny-random",
"mixtral": "dacorvo/Mixtral-tiny",
"smollm3": "HuggingFaceTB/SmolLM3-3B",
Collaborator: no tiny version?

Collaborator (author): Unfortunately, no

}


1 change: 1 addition & 0 deletions tests/decoder/test_decoder_generation.py
@@ -105,6 +105,7 @@ def test_decoder_generation_greedy_expectations(neuron_decoder_config):
"qwen3": " What is the difference between Deep Learning and Machine Learning?\n\nDeep Learning is a subset of",
"granite": "\n\nDeep Learning is a subset of machine learning that is inspired by the structure and",
"phi": "\n\nDeep learning is a subfield of machine learning that focuses on creating",
"smollm3": " Deep learning is a subset of machine learning that uses neural networks with many layers to learn",
}
config_name = neuron_decoder_config["name"]
generated_text = tokenizer.decode(outputs[0])