From 7a1e59e0aa31104ed4632329ab73c8786ef85f2c Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 5 Aug 2025 08:16:06 +0000 Subject: [PATCH 01/14] refactor(inference): reorder automodels --- optimum/neuron/models/inference/auto_models.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/optimum/neuron/models/inference/auto_models.py b/optimum/neuron/models/inference/auto_models.py index 7859bf47d..949493727 100644 --- a/optimum/neuron/models/inference/auto_models.py +++ b/optimum/neuron/models/inference/auto_models.py @@ -58,28 +58,28 @@ class LLamaModelForCausalLM(LlamaNxDModelForCausalLM): pass -@register_neuron_model_for_inference("phi3", "text-generation") -class Phi3ModelForCausalLM(Phi3NxDModelForCausalLM): +@register_neuron_model_for_inference("mixtral", "text-generation") +class MixtralNeuronModelForCausalLM(MixtralNxDModelForCausalLM): """ - Phi3 model with NxD backend for inference on AWS Neuron. + Mixtral model with NxD backend for inference on AWS Neuron. """ pass -@register_neuron_model_for_inference("qwen2", "text-generation") -class Qwen2ModelForCausalLM(Qwen2NxDModelForCausalLM): +@register_neuron_model_for_inference("phi3", "text-generation") +class Phi3ModelForCausalLM(Phi3NxDModelForCausalLM): """ - Qwen2 model with NxD backend for inference on AWS Neuron. + Phi3 model with NxD backend for inference on AWS Neuron. """ pass -@register_neuron_model_for_inference("mixtral", "text-generation") -class MixtralNeuronModelForCausalLM(MixtralNxDModelForCausalLM): +@register_neuron_model_for_inference("qwen2", "text-generation") +class Qwen2ModelForCausalLM(Qwen2NxDModelForCausalLM): """ - Mixtral model with NxD backend for inference on AWS Neuron. + Qwen2 model with NxD backend for inference on AWS Neuron. 
""" pass From 10c54a98d800ba7f272a08c0d26395b2c910e0b4 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 5 Aug 2025 08:18:49 +0000 Subject: [PATCH 02/14] chore: bump transformers version --- optimum/neuron/models/training/granite/modeling_granite.py | 4 ++-- optimum/neuron/models/training/llama/modeling_llama.py | 4 ++-- pyproject.toml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/optimum/neuron/models/training/granite/modeling_granite.py b/optimum/neuron/models/training/granite/modeling_granite.py index 946f3f775..deab7b863 100644 --- a/optimum/neuron/models/training/granite/modeling_granite.py +++ b/optimum/neuron/models/training/granite/modeling_granite.py @@ -27,7 +27,7 @@ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from transformers.models.granite.configuration_granite import GraniteConfig from transformers.processing_utils import Unpack -from transformers.utils import LossKwargs, can_return_tuple, logging +from transformers.utils import TransformersKwargs, can_return_tuple, logging from ..config import TrainingNeuronConfig from ..llama.modeling_llama import ( @@ -225,7 +225,7 @@ def forward( return output -class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ... +class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ... 
class GraniteForCausalLM(LlamaForCausalLM): diff --git a/optimum/neuron/models/training/llama/modeling_llama.py b/optimum/neuron/models/training/llama/modeling_llama.py index 3f608c49c..9dab0d2a4 100644 --- a/optimum/neuron/models/training/llama/modeling_llama.py +++ b/optimum/neuron/models/training/llama/modeling_llama.py @@ -40,7 +40,7 @@ from transformers.models.llama.configuration_llama import LlamaConfig from transformers.processing_utils import Unpack from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS -from transformers.utils import LossKwargs, can_return_tuple, logging +from transformers.utils import TransformersKwargs, can_return_tuple, logging from ..config import TrainingNeuronConfig from ..loss_utils import ForCausalLMLoss @@ -812,7 +812,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( return causal_mask -class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ... +class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ... class LlamaForCausalLM(NeuronModelMixin, LlamaPreTrainedModel): diff --git a/pyproject.toml b/pyproject.toml index 6d460872d..22452e853 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", ] dependencies = [ - "transformers ~= 4.51.0", + "transformers ~= 4.55.4", "accelerate == 1.8.1", "optimum ~= 1.24.0", "huggingface_hub >= 0.29.0", From c6dcb50a6b76d55b3b1eb3d4375e64a4bac0d05f Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 5 Aug 2025 12:50:00 +0000 Subject: [PATCH 03/14] chore: bump vllm version --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 22452e853..242cd5836 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ neuronx = [ "wheel", "neuronx-cc==2.19.8089.0", "torch-neuronx==2.7.0.2.8.6734+ac864f72", - "torch==2.7.0.*", + "torch==2.7.1.*", "torchvision==0.22.*", "neuronx_distributed==0.13.14393", 
"libneuronxla==2.2.4410.0", @@ -114,7 +114,7 @@ sentence-transformers = [ "sentence-transformers >= 2.2.0", ] vllm = [ - "vllm == 0.9.2", + "vllm == 0.10.0", ] [project.scripts] From f0700e05d8e6c86ca3716af85c6983695bbab720 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 5 Aug 2025 14:19:44 +0000 Subject: [PATCH 04/14] test(vllm): device argument is deprecated --- tests/decoder/test_vllm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/decoder/test_vllm.py b/tests/decoder/test_vllm.py index 6e732af00..34f6abf1d 100644 --- a/tests/decoder/test_vllm.py +++ b/tests/decoder/test_vllm.py @@ -42,7 +42,7 @@ def _test_vllm_generation(llm): def test_vllm_from_neuron_model(base_neuron_decoder_path): """Test vLLm generation on a single model exported locally.""" - llm = LLM(model=base_neuron_decoder_path, device="neuron") + llm = LLM(model=base_neuron_decoder_path) _test_vllm_generation(llm) @@ -56,7 +56,6 @@ def test_vllm_from_hub_model(neuron_decoder_config): max_model_len=export_kwargs["sequence_length"], tensor_parallel_size=export_kwargs["num_cores"], dtype=DTYPE_MAPPER.pt(export_kwargs["auto_cast_type"]), - device="neuron", ) _test_vllm_generation(llm) @@ -68,7 +67,6 @@ def test_vllm_greedy_expectations(neuron_decoder_config): llm = LLM( model=neuron_decoder_config["neuron_model_path"], max_num_seqs=neuron_decoder_config["export_kwargs"]["batch_size"], - device="neuron", ) # Send more prompts than the compiled batch size (4) and request # varying generation lengths to test continuous batching. 
From 431556bb0799107f01a87a62a54e516649c043c5 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 5 Aug 2025 12:53:18 +0000 Subject: [PATCH 05/14] chore: bump dev version --- optimum/neuron/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/neuron/version.py b/optimum/neuron/version.py index 1a8863a03..d447f9c78 100644 --- a/optimum/neuron/version.py +++ b/optimum/neuron/version.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.3.1.dev0" +__version__ = "0.3.1.dev1" __sdk_version__ = "2.24.0" From 28c679abec3f0826a0cb1d463d214aa46afd3846 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 5 Aug 2025 08:19:21 +0000 Subject: [PATCH 06/14] feat(inference): add SmolLM3 --- .../neuron/models/inference/auto_models.py | 10 ++ .../inference/smollm3/modeling_smollm3.py | 116 ++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 optimum/neuron/models/inference/smollm3/modeling_smollm3.py diff --git a/optimum/neuron/models/inference/auto_models.py b/optimum/neuron/models/inference/auto_models.py index 949493727..3126ae8b2 100644 --- a/optimum/neuron/models/inference/auto_models.py +++ b/optimum/neuron/models/inference/auto_models.py @@ -28,6 +28,7 @@ from .phi3.modeling_phi3 import Phi3NxDModelForCausalLM from .qwen2.modeling_qwen2 import Qwen2NxDModelForCausalLM from .qwen3.modeling_qwen3 import Qwen3NxDModelForCausalLM +from .smollm3.modeling_smollm3 import SmolLM3NxDModelForCausalLM prioritize_hlo_backend = os.environ.get("OPTIMUM_NEURON_PRIORITIZE_HLO_BACKEND", "0") == "1" @@ -92,3 +93,12 @@ class Qwen3NeuronModelForCausalLM(Qwen3NxDModelForCausalLM): """ pass + + +@register_neuron_model_for_inference("smollm3", "text-generation") +class SmolLM3NeuronModelForCausalLM(SmolLM3NxDModelForCausalLM): + """ + SmolLM3 model with NxD backend for inference on AWS Neuron. 
+ """ + + pass diff --git a/optimum/neuron/models/inference/smollm3/modeling_smollm3.py b/optimum/neuron/models/inference/smollm3/modeling_smollm3.py new file mode 100644 index 000000000..db6602ad2 --- /dev/null +++ b/optimum/neuron/models/inference/smollm3/modeling_smollm3.py @@ -0,0 +1,116 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch SmolLM3 model for NXD inference.""" + +import logging + +from neuronx_distributed.parallel_layers.layers import ( + ColumnParallelLinear, + ParallelEmbedding, +) +from torch import nn +from transformers.models.smollm3.configuration_smollm3 import SmolLM3Config + +from ..backend.config import NxDNeuronConfig # noqa: E402 +from ..backend.modules.attention.attention_base import NeuronAttentionBase +from ..backend.modules.attention.utils import RotaryEmbedding +from ..backend.modules.custom_calls import CustomRMSNorm +from ..backend.modules.decoder import NxDDecoderModel +from ..llama.modeling_llama import ( + LlamaNxDModelForCausalLM, + NeuronLlamaDecoderLayer, +) + + +logger = logging.getLogger("Neuron") + + +class NeuronSmolLM3Attention(NeuronAttentionBase): + """ + The only difference with the NeuronAttentionBase is the definition of the SmolLM3 rotary embedding + """ + + def __init__( + self, + config: SmolLM3Config, + neuron_config: NxDNeuronConfig, + layer_idx: int, + qkv_proj_bias: bool | None = False, + o_proj_bias: bool | None = 
False, + qk_scale: float | None = None, + ): + if config.use_sliding_window: + raise ValueError("SmolLM3 for Neuron does not support sliding window attention.") + if getattr(config, "rope_scaling", None) is not None: + raise ValueError("SmolLM3 for Neuron does not support rope scaling.") + super().__init__( + config, neuron_config, qkv_proj_bias=qkv_proj_bias, o_proj_bias=o_proj_bias, qk_scale=qk_scale + ) + if config.no_rope_layers[layer_idx]: + # Yes, the condition is slightly counter-intuitive, but that is the transformers convention + head_dim = config.hidden_size // config.num_attention_heads + self.rotary_emb = RotaryEmbedding( + head_dim, + max_position_embeddings=config.max_position_embeddings, + base=config.rope_theta, + ) + else: + self.rotary_emb = None + + +class NeuronSmolLM3DecoderLayer(NeuronLlamaDecoderLayer): + def __init__(self, config: SmolLM3Config, neuron_config: NxDNeuronConfig, layer_idx: int): + super().__init__(config, neuron_config) + self.self_attn = NeuronSmolLM3Attention(config, neuron_config, layer_idx) + + +class NxDSmolLM3Model(NxDDecoderModel): + """ + The neuron version of the SmolLM3Model + """ + + def __init__(self, config: SmolLM3Config, neuron_config: NxDNeuronConfig): + super().__init__(config, neuron_config) + + self.embed_tokens = ParallelEmbedding( + config.vocab_size, + config.hidden_size, + config.pad_token_id, + dtype=neuron_config.torch_dtype, + shard_across_embedding=not neuron_config.vocab_parallel, + sequence_parallel_enabled=False, + pad=True, + use_spmd_rank=neuron_config.vocab_parallel, + ) + + self.lm_head = ColumnParallelLinear( + config.hidden_size, + config.vocab_size, + gather_output=not neuron_config.on_device_sampling, + bias=False, + pad=True, + ) + + self.layers = nn.ModuleList( + [ + NeuronSmolLM3DecoderLayer(config, neuron_config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.norm = CustomRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + +class 
SmolLM3NxDModelForCausalLM(LlamaNxDModelForCausalLM): + _model_cls = NxDSmolLM3Model From a7fadc2ff90394e7949af6803f86397bca629e3f Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Fri, 8 Aug 2025 09:13:17 +0000 Subject: [PATCH 07/14] test(decoder): add smollm3 tests --- tests/decoder/conftest.py | 9 +++++++++ tests/decoder/test_decoder_export.py | 1 + tests/decoder/test_decoder_generation.py | 1 + tests/decoder/test_vllm.py | 8 ++++++++ 4 files changed, 19 insertions(+) diff --git a/tests/decoder/conftest.py b/tests/decoder/conftest.py index 6a7a16868..de0ba0405 100644 --- a/tests/decoder/conftest.py +++ b/tests/decoder/conftest.py @@ -72,6 +72,15 @@ "auto_cast_type": "bf16", }, }, + "smollm3": { + "model_id": "HuggingFaceTB/SmolLM3-3B", + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + "num_cores": 2, + "auto_cast_type": "bf16", + }, + }, } diff --git a/tests/decoder/test_decoder_export.py b/tests/decoder/test_decoder_export.py index 2d0d97c78..489732b29 100644 --- a/tests/decoder/test_decoder_export.py +++ b/tests/decoder/test_decoder_export.py @@ -30,6 +30,7 @@ "granite": "hf-internal-testing/tiny-random-GraniteForCausalLM", "phi3": "yujiepan/phi-4-tiny-random", "mixtral": "dacorvo/Mixtral-tiny", + "smollm3": "HuggingFaceTB/SmolLM3-3B", } diff --git a/tests/decoder/test_decoder_generation.py b/tests/decoder/test_decoder_generation.py index 7557f476b..45db4369c 100644 --- a/tests/decoder/test_decoder_generation.py +++ b/tests/decoder/test_decoder_generation.py @@ -105,6 +105,7 @@ def test_decoder_generation_greedy_expectations(neuron_decoder_config): "qwen3": " What is the difference between Deep Learning and Machine Learning?\n\nDeep Learning is a subset of", "granite": "\n\nDeep Learning is a subset of machine learning that is inspired by the structure and", "phi": "\n\nDeep learning is a subfield of machine learning that focuses on creating", + "smollm3": " Deep learning is a subset of machine learning that uses neural networks with 
many layers to learn", } config_name = neuron_decoder_config["name"] generated_text = tokenizer.decode(outputs[0]) diff --git a/tests/decoder/test_vllm.py b/tests/decoder/test_vllm.py index 34f6abf1d..778620f15 100644 --- a/tests/decoder/test_vllm.py +++ b/tests/decoder/test_vllm.py @@ -131,6 +131,14 @@ def test_vllm_greedy_expectations(neuron_decoder_config): " due to the absorption of light by the atmosphere.", " the time I was in the first grade. I remember the day I got the first grade, the", ], + "smollm3": [ + " the head of state and government of the United States", + " Paris. The Eiffel Tower is in Paris. The Eiffel Tower is a famous landmark", + " It was a special day, for it was the first of April, and people were putting the finishing touches to their April Fools' Day pranks", + " to be happy. I believe that happiness is the most important thing in life. I believe that happiness is not just a feeling, but a state of being. I believe that happiness is not something that", + " blue because of Rayleigh scattering. This is the", + " of my grandmother, who was a wonderful cook, making a delicious chicken and dumplings soup. 
She", + ], }[neuron_decoder_config["name"]] for expected_output, output in zip(expected_outputs, outputs): From 0e393aac9f08f51c45349b33d37532f5b1e9acd2 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Fri, 8 Aug 2025 09:40:31 +0000 Subject: [PATCH 08/14] ci: add smollm3 models to cache workflow --- .github/workflows/inference_cache_llm.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/inference_cache_llm.yml b/.github/workflows/inference_cache_llm.yml index b73c48d9a..8b5a5ce6d 100644 --- a/.github/workflows/inference_cache_llm.yml +++ b/.github/workflows/inference_cache_llm.yml @@ -32,6 +32,7 @@ jobs: llama3.1-70b, qwen2.5-large, llama-variants, + smollm3, ] steps: - name: Checkout From 81439780174f01699bd9844a322768a7f56f6cdc Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Fri, 8 Aug 2025 12:30:24 +0000 Subject: [PATCH 09/14] fix(Mixtral): workaround null head_dim --- .../inference/backend/modules/decoder/modeling_decoder.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/optimum/neuron/models/inference/backend/modules/decoder/modeling_decoder.py b/optimum/neuron/models/inference/backend/modules/decoder/modeling_decoder.py index b28437465..dd99dcd52 100644 --- a/optimum/neuron/models/inference/backend/modules/decoder/modeling_decoder.py +++ b/optimum/neuron/models/inference/backend/modules/decoder/modeling_decoder.py @@ -845,6 +845,9 @@ def export( ) # Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type config.torch_dtype = neuron_config.torch_dtype + # Evaluate head_dim if it is defined but set to null (like in Mixtral for transformers 4.54+) + if hasattr(config, "head_dim") and config.head_dim is None: + config.head_dim = config.hidden_size // config.num_attention_heads context_encoding_model, token_generation_model, speculation_model = cls.create_model_wrappers( model_cls=cls._model_cls, config=config, From 52a50fc2321fccc5731ad5acf5ef28b61169c564 Mon Sep 17 
00:00:00 2001 From: David Corvoysier Date: Fri, 8 Aug 2025 13:19:26 +0000 Subject: [PATCH 10/14] fix(pipeline): increase minimum sequence_length Starting from transformers 4.54, there is an error when compiling Qwen2.5-0.5M with a sequence length of 128. This is a very unlikely configuration, and not one we want to cache. The pipeline code is therefore modified to align on default values that are actually tested in the NeuronModelForCausalLM export tests. --- optimum/neuron/pipelines/transformers/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/neuron/pipelines/transformers/base.py b/optimum/neuron/pipelines/transformers/base.py index bc3ab886d..6049afff1 100644 --- a/optimum/neuron/pipelines/transformers/base.py +++ b/optimum/neuron/pipelines/transformers/base.py @@ -274,7 +274,8 @@ def pipeline( if export: if neuron_config is not None: raise ValueError("This model has already been exported to Neuron format") - if not input_shapes: + # Decoder models can select default input shapes from the config + if task != "text-generation" and not input_shapes: input_shapes = {"batch_size": 1, "sequence_length": 128} logger.warning(f"No input shapes provided, using default shapes, {input_shapes}") else: From af17ad48c725e90cdebfda3ea8ee8a015984b75a Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 26 Aug 2025 14:35:47 +0000 Subject: [PATCH 11/14] fix: do not return dict in CLIP models CLIP models used in SD pipelines do not specify return_dict in their config but the tracing fails if return_dict is True, which is now the default in transformers. 
--- optimum/exporters/neuron/model_configs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 83f9c3a69..a2884e722 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -19,6 +19,7 @@ import os from functools import partial from pathlib import Path +from typing import Any import neuronx_distributed import torch @@ -354,6 +355,10 @@ def outputs(self) -> list[str]: return common_outputs + @property + def values_override(self) -> dict[str, Any] | None: + return {"return_dict": False} + @register_in_tasks_manager("clip-text-model", *["feature-extraction"], library_name="diffusers") class CLIPTextNeuronConfig(CLIPTextWithProjectionNeuronConfig): From ea64e279826c10770a6c6c93b8e34071a8ea5332 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 26 Aug 2025 15:06:22 +0000 Subject: [PATCH 12/14] fix(t5): explicitly convert past_key_values to a Cache In the latest transformers version, it is not done automatically anymore. 
--- optimum/exporters/neuron/model_wrappers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index 9d277a131..e5a1425b7 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -17,6 +17,7 @@ from typing import TYPE_CHECKING import torch +from transformers.cache_utils import EncoderDecoderCache from transformers.models.t5.modeling_t5 import T5LayerCrossAttention from ...neuron.utils import is_neuronx_available @@ -621,7 +622,7 @@ def forward( decoder_output = self.model.decoder( input_ids=input_ids, attention_mask=decoder_attention_mask, - past_key_values=past_key_values, + past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values), encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, use_cache=True, From d665dc8fd268afcfb1b45f0db221a31ac0045858 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 27 Aug 2025 11:40:44 +0000 Subject: [PATCH 13/14] fix(t5): adapt T5 attention custom modeling The latest T5Block layer in transformers does not expect the past_key_value to be returned by the T5Attention anymore. 
--- optimum/neuron/models/inference/t5/modeling_t5.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/neuron/models/inference/t5/modeling_t5.py b/optimum/neuron/models/inference/t5/modeling_t5.py index b39e91e77..15c3e8c67 100644 --- a/optimum/neuron/models/inference/t5/modeling_t5.py +++ b/optimum/neuron/models/inference/t5/modeling_t5.py @@ -257,7 +257,7 @@ def forward( attn_output = attn_output.view(batch_size, -1, self.hidden_size_per_partition) attn_output = self.o(attn_output) - outputs = (attn_output, past_key_value, position_bias) + outputs = (attn_output, position_bias) if output_attentions: outputs = outputs + (attn_weights,) From 1932d2885325d4cdde52eab5dcfbd239991ee4c0 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 27 Aug 2025 12:18:22 +0000 Subject: [PATCH 14/14] fix(generation): call to non-existent method --- optimum/neuron/generation/utils.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py index 2cd0921ba..fd654b492 100644 --- a/optimum/neuron/generation/utils.py +++ b/optimum/neuron/generation/utils.py @@ -233,9 +233,6 @@ def generate( generation_config: GenerationConfig | None = None, **kwargs, ): - # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call - self._validate_model_class() - # priority: `generation_config` argument > `model.generation_config` (the default generation config) if generation_config is None: # legacy: users may modify the model configuration to control generation. To trigger this legacy behavior, @@ -623,9 +620,6 @@ def generate( - [`~generation.BeamSampleEncoderDecoderOutput`] """ - # 1. 
Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call - self._validate_model_class() - # priority: `generation_config` argument > `model.generation_config` (the default generation config) if generation_config is None: # legacy: users may modify the model configuration to control generation -- update the generation config