diff --git a/.github/workflows/inference_cache_llm.yml b/.github/workflows/inference_cache_llm.yml
index b73c48d9a..8b5a5ce6d 100644
--- a/.github/workflows/inference_cache_llm.yml
+++ b/.github/workflows/inference_cache_llm.yml
@@ -32,6 +32,7 @@ jobs:
           llama3.1-70b,
           qwen2.5-large,
           llama-variants,
+          smollm3,
         ]
     steps:
       - name: Checkout
diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py
index 83f9c3a69..a2884e722 100644
--- a/optimum/exporters/neuron/model_configs.py
+++ b/optimum/exporters/neuron/model_configs.py
@@ -19,6 +19,7 @@
 import os
 from functools import partial
 from pathlib import Path
+from typing import Any

 import neuronx_distributed
 import torch
@@ -354,6 +355,10 @@ def outputs(self) -> list[str]:

         return common_outputs

+    @property
+    def values_override(self) -> dict[str, Any] | None:
+        return {"return_dict": False}
+

 @register_in_tasks_manager("clip-text-model", *["feature-extraction"], library_name="diffusers")
 class CLIPTextNeuronConfig(CLIPTextWithProjectionNeuronConfig):
diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py
index 9d277a131..e5a1425b7 100644
--- a/optimum/exporters/neuron/model_wrappers.py
+++ b/optimum/exporters/neuron/model_wrappers.py
@@ -17,6 +17,7 @@
 from typing import TYPE_CHECKING

 import torch
+from transformers.cache_utils import EncoderDecoderCache
 from transformers.models.t5.modeling_t5 import T5LayerCrossAttention

 from ...neuron.utils import is_neuronx_available
@@ -621,7 +622,7 @@ def forward(
         decoder_output = self.model.decoder(
             input_ids=input_ids,
             attention_mask=decoder_attention_mask,
-            past_key_values=past_key_values,
+            past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values),
             encoder_hidden_states=encoder_hidden_states,
             encoder_attention_mask=encoder_attention_mask,
             use_cache=True,
diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py
index 2cd0921ba..fd654b492 100644
--- a/optimum/neuron/generation/utils.py
+++ b/optimum/neuron/generation/utils.py
@@ -233,9 +233,6 @@ def generate(
         generation_config: GenerationConfig | None = None,
         **kwargs,
     ):
-        # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
-        self._validate_model_class()
-
         # priority: `generation_config` argument > `model.generation_config` (the default generation config)
         if generation_config is None:
             # legacy: users may modify the model configuration to control generation. To trigger this legacy behavior,
@@ -623,9 +620,6 @@
             - [`~generation.BeamSampleEncoderDecoderOutput`]
         """

-        # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
-        self._validate_model_class()
-
         # priority: `generation_config` argument > `model.generation_config` (the default generation config)
         if generation_config is None:
             # legacy: users may modify the model configuration to control generation -- update the generation config
diff --git a/optimum/neuron/models/inference/auto_models.py b/optimum/neuron/models/inference/auto_models.py
index 7859bf47d..3126ae8b2 100644
--- a/optimum/neuron/models/inference/auto_models.py
+++ b/optimum/neuron/models/inference/auto_models.py
@@ -28,6 +28,7 @@
 from .phi3.modeling_phi3 import Phi3NxDModelForCausalLM
 from .qwen2.modeling_qwen2 import Qwen2NxDModelForCausalLM
 from .qwen3.modeling_qwen3 import Qwen3NxDModelForCausalLM
+from .smollm3.modeling_smollm3 import SmolLM3NxDModelForCausalLM


 prioritize_hlo_backend = os.environ.get("OPTIMUM_NEURON_PRIORITIZE_HLO_BACKEND", "0") == "1"
@@ -58,6 +59,15 @@ class LLamaModelForCausalLM(LlamaNxDModelForCausalLM):
     pass


+@register_neuron_model_for_inference("mixtral", "text-generation")
+class MixtralNeuronModelForCausalLM(MixtralNxDModelForCausalLM):
+    """
+    Mixtral model with NxD backend for inference on AWS Neuron.
+    """
+
+    pass
+
+
 @register_neuron_model_for_inference("phi3", "text-generation")
 class Phi3ModelForCausalLM(Phi3NxDModelForCausalLM):
     """
@@ -76,19 +86,19 @@ class Qwen2ModelForCausalLM(Qwen2NxDModelForCausalLM):
     pass


-@register_neuron_model_for_inference("mixtral", "text-generation")
-class MixtralNeuronModelForCausalLM(MixtralNxDModelForCausalLM):
+@register_neuron_model_for_inference("qwen3", "text-generation")
+class Qwen3NeuronModelForCausalLM(Qwen3NxDModelForCausalLM):
     """
-    Mixtral model with NxD backend for inference on AWS Neuron.
+    Qwen3 model with NxD backend for inference on AWS Neuron.
     """

     pass


-@register_neuron_model_for_inference("qwen3", "text-generation")
-class Qwen3NeuronModelForCausalLM(Qwen3NxDModelForCausalLM):
+@register_neuron_model_for_inference("smollm3", "text-generation")
+class SmolLM3NeuronModelForCausalLM(SmolLM3NxDModelForCausalLM):
     """
-    Qwen3 model with NxD backend for inference on AWS Neuron.
+    SmolLM3 model with NxD backend for inference on AWS Neuron.
     """

     pass
diff --git a/optimum/neuron/models/inference/backend/modules/decoder/modeling_decoder.py b/optimum/neuron/models/inference/backend/modules/decoder/modeling_decoder.py
index b28437465..dd99dcd52 100644
--- a/optimum/neuron/models/inference/backend/modules/decoder/modeling_decoder.py
+++ b/optimum/neuron/models/inference/backend/modules/decoder/modeling_decoder.py
@@ -845,6 +845,9 @@ def export(
         )
         # Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type
         config.torch_dtype = neuron_config.torch_dtype
+        # Evaluate head_dim if it is defined but set to null (like in Mixtral for transformers 4.54+)
+        if hasattr(config, "head_dim") and config.head_dim is None:
+            config.head_dim = config.hidden_size // config.num_attention_heads
         context_encoding_model, token_generation_model, speculation_model = cls.create_model_wrappers(
             model_cls=cls._model_cls,
             config=config,
diff --git a/optimum/neuron/models/inference/smollm3/modeling_smollm3.py b/optimum/neuron/models/inference/smollm3/modeling_smollm3.py
new file mode 100644
index 000000000..db6602ad2
--- /dev/null
+++ b/optimum/neuron/models/inference/smollm3/modeling_smollm3.py
@@ -0,0 +1,116 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch SmolLM3 model for NXD inference."""
+
+import logging
+
+from neuronx_distributed.parallel_layers.layers import (
+    ColumnParallelLinear,
+    ParallelEmbedding,
+)
+from torch import nn
+from transformers.models.smollm3.configuration_smollm3 import SmolLM3Config
+
+from ..backend.config import NxDNeuronConfig  # noqa: E402
+from ..backend.modules.attention.attention_base import NeuronAttentionBase
+from ..backend.modules.attention.utils import RotaryEmbedding
+from ..backend.modules.custom_calls import CustomRMSNorm
+from ..backend.modules.decoder import NxDDecoderModel
+from ..llama.modeling_llama import (
+    LlamaNxDModelForCausalLM,
+    NeuronLlamaDecoderLayer,
+)
+
+
+logger = logging.getLogger("Neuron")
+
+
+class NeuronSmolLM3Attention(NeuronAttentionBase):
+    """
+    The only difference with the NeuronAttentionBase is the definition of the SmolLM3 rotary embedding
+    """
+
+    def __init__(
+        self,
+        config: SmolLM3Config,
+        neuron_config: NxDNeuronConfig,
+        layer_idx: int,
+        qkv_proj_bias: bool | None = False,
+        o_proj_bias: bool | None = False,
+        qk_scale: float | None = None,
+    ):
+        if config.use_sliding_window:
+            raise ValueError("SmolLM3 for Neuron does not support sliding window attention.")
+        if getattr(config, "rope_scaling", None) is not None:
+            raise ValueError("SmolLM3 for Neuron does not support rope scaling.")
+        super().__init__(
+            config, neuron_config, qkv_proj_bias=qkv_proj_bias, o_proj_bias=o_proj_bias, qk_scale=qk_scale
+        )
+        if config.no_rope_layers[layer_idx]:
+            # Yes, the condition is slightly counter-intuitive, but that is the transformers convention
+            head_dim = config.hidden_size // config.num_attention_heads
+            self.rotary_emb = RotaryEmbedding(
+                head_dim,
+                max_position_embeddings=config.max_position_embeddings,
+                base=config.rope_theta,
+            )
+        else:
+            self.rotary_emb = None
+
+
+class NeuronSmolLM3DecoderLayer(NeuronLlamaDecoderLayer):
+    def __init__(self, config: SmolLM3Config, neuron_config: NxDNeuronConfig, layer_idx: int):
+        super().__init__(config, neuron_config)
+        self.self_attn = NeuronSmolLM3Attention(config, neuron_config, layer_idx)
+
+
+class NxDSmolLM3Model(NxDDecoderModel):
+    """
+    The neuron version of the SmolLM3Model
+    """
+
+    def __init__(self, config: SmolLM3Config, neuron_config: NxDNeuronConfig):
+        super().__init__(config, neuron_config)
+
+        self.embed_tokens = ParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            config.pad_token_id,
+            dtype=neuron_config.torch_dtype,
+            shard_across_embedding=not neuron_config.vocab_parallel,
+            sequence_parallel_enabled=False,
+            pad=True,
+            use_spmd_rank=neuron_config.vocab_parallel,
+        )
+
+        self.lm_head = ColumnParallelLinear(
+            config.hidden_size,
+            config.vocab_size,
+            gather_output=not neuron_config.on_device_sampling,
+            bias=False,
+            pad=True,
+        )
+
+        self.layers = nn.ModuleList(
+            [
+                NeuronSmolLM3DecoderLayer(config, neuron_config, layer_idx)
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = CustomRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+
+class SmolLM3NxDModelForCausalLM(LlamaNxDModelForCausalLM):
+    _model_cls = NxDSmolLM3Model
diff --git a/optimum/neuron/models/inference/t5/modeling_t5.py b/optimum/neuron/models/inference/t5/modeling_t5.py
index b39e91e77..15c3e8c67 100644
--- a/optimum/neuron/models/inference/t5/modeling_t5.py
+++ b/optimum/neuron/models/inference/t5/modeling_t5.py
@@ -257,7 +257,7 @@ def forward(
         attn_output = attn_output.view(batch_size, -1, self.hidden_size_per_partition)
         attn_output = self.o(attn_output)

-        outputs = (attn_output, past_key_value, position_bias)
+        outputs = (attn_output, position_bias)

         if output_attentions:
             outputs = outputs + (attn_weights,)
diff --git a/optimum/neuron/models/training/granite/modeling_granite.py b/optimum/neuron/models/training/granite/modeling_granite.py
index 946f3f775..deab7b863 100644
--- a/optimum/neuron/models/training/granite/modeling_granite.py
+++ b/optimum/neuron/models/training/granite/modeling_granite.py
@@ -27,7 +27,7 @@
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.models.granite.configuration_granite import GraniteConfig
 from transformers.processing_utils import Unpack
-from transformers.utils import LossKwargs, can_return_tuple, logging
+from transformers.utils import TransformersKwargs, can_return_tuple, logging

 from ..config import TrainingNeuronConfig
 from ..llama.modeling_llama import (
@@ -225,7 +225,7 @@ def forward(
         return output


-class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
+class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...


 class GraniteForCausalLM(LlamaForCausalLM):
diff --git a/optimum/neuron/models/training/llama/modeling_llama.py b/optimum/neuron/models/training/llama/modeling_llama.py
index 3f608c49c..9dab0d2a4 100644
--- a/optimum/neuron/models/training/llama/modeling_llama.py
+++ b/optimum/neuron/models/training/llama/modeling_llama.py
@@ -40,7 +40,7 @@
 from transformers.models.llama.configuration_llama import LlamaConfig
 from transformers.processing_utils import Unpack
 from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
-from transformers.utils import LossKwargs, can_return_tuple, logging
+from transformers.utils import TransformersKwargs, can_return_tuple, logging

 from ..config import TrainingNeuronConfig
 from ..loss_utils import ForCausalLMLoss
@@ -812,7 +812,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         return causal_mask


-class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
+class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...


 class LlamaForCausalLM(NeuronModelMixin, LlamaPreTrainedModel):
diff --git a/optimum/neuron/pipelines/transformers/base.py b/optimum/neuron/pipelines/transformers/base.py
index bc3ab886d..6049afff1 100644
--- a/optimum/neuron/pipelines/transformers/base.py
+++ b/optimum/neuron/pipelines/transformers/base.py
@@ -274,7 +274,8 @@ def pipeline(
     if export:
         if neuron_config is not None:
             raise ValueError("This model has already been exported to Neuron format")
-        if not input_shapes:
+        # Decoder models can select default input shapes from the config
+        if task != "text-generation" and not input_shapes:
             input_shapes = {"batch_size": 1, "sequence_length": 128}
             logger.warning(f"No input shapes provided, using default shapes, {input_shapes}")
     else:
diff --git a/optimum/neuron/version.py b/optimum/neuron/version.py
index 1a8863a03..d447f9c78 100644
--- a/optimum/neuron/version.py
+++ b/optimum/neuron/version.py
@@ -12,6 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "0.3.1.dev0"
+__version__ = "0.3.1.dev1"

 __sdk_version__ = "2.24.0"
diff --git a/pyproject.toml b/pyproject.toml
index 6d460872d..242cd5836 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
 ]
 dependencies = [
-    "transformers ~= 4.51.0",
+    "transformers ~= 4.55.4",
     "accelerate == 1.8.1",
     "optimum ~= 1.24.0",
     "huggingface_hub >= 0.29.0",
@@ -101,7 +101,7 @@ neuronx = [
     "wheel",
     "neuronx-cc==2.19.8089.0",
     "torch-neuronx==2.7.0.2.8.6734+ac864f72",
-    "torch==2.7.0.*",
+    "torch==2.7.1.*",
     "torchvision==0.22.*",
     "neuronx_distributed==0.13.14393",
     "libneuronxla==2.2.4410.0",
@@ -114,7 +114,7 @@ sentence-transformers = [
     "sentence-transformers >= 2.2.0",
 ]
 vllm = [
-    "vllm == 0.9.2",
+    "vllm == 0.10.0",
 ]

 [project.scripts]
diff --git a/tests/decoder/conftest.py b/tests/decoder/conftest.py
index 6a7a16868..de0ba0405 100644
--- a/tests/decoder/conftest.py
+++ b/tests/decoder/conftest.py
@@ -72,6 +72,15 @@
             "auto_cast_type": "bf16",
         },
     },
+    "smollm3": {
+        "model_id": "HuggingFaceTB/SmolLM3-3B",
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
+    },
 }
diff --git a/tests/decoder/test_decoder_export.py b/tests/decoder/test_decoder_export.py
index 2d0d97c78..489732b29 100644
--- a/tests/decoder/test_decoder_export.py
+++ b/tests/decoder/test_decoder_export.py
@@ -30,6 +30,7 @@
     "granite": "hf-internal-testing/tiny-random-GraniteForCausalLM",
     "phi3": "yujiepan/phi-4-tiny-random",
     "mixtral": "dacorvo/Mixtral-tiny",
+    "smollm3": "HuggingFaceTB/SmolLM3-3B",
 }
diff --git a/tests/decoder/test_decoder_generation.py b/tests/decoder/test_decoder_generation.py
index 7557f476b..45db4369c 100644
--- a/tests/decoder/test_decoder_generation.py
+++ b/tests/decoder/test_decoder_generation.py
@@ -105,6 +105,7 @@ def test_decoder_generation_greedy_expectations(neuron_decoder_config):
         "qwen3": " What is the difference between Deep Learning and Machine Learning?\n\nDeep Learning is a subset of",
         "granite": "\n\nDeep Learning is a subset of machine learning that is inspired by the structure and",
         "phi": "\n\nDeep learning is a subfield of machine learning that focuses on creating",
+        "smollm3": " Deep learning is a subset of machine learning that uses neural networks with many layers to learn",
     }
     config_name = neuron_decoder_config["name"]
     generated_text = tokenizer.decode(outputs[0])
diff --git a/tests/decoder/test_vllm.py b/tests/decoder/test_vllm.py
index 6e732af00..778620f15 100644
--- a/tests/decoder/test_vllm.py
+++ b/tests/decoder/test_vllm.py
@@ -42,7 +42,7 @@ def _test_vllm_generation(llm):

 def test_vllm_from_neuron_model(base_neuron_decoder_path):
     """Test vLLm generation on a single model exported locally."""
-    llm = LLM(model=base_neuron_decoder_path, device="neuron")
+    llm = LLM(model=base_neuron_decoder_path)
     _test_vllm_generation(llm)


@@ -56,7 +56,6 @@ def test_vllm_from_hub_model(neuron_decoder_config):
         max_model_len=export_kwargs["sequence_length"],
         tensor_parallel_size=export_kwargs["num_cores"],
         dtype=DTYPE_MAPPER.pt(export_kwargs["auto_cast_type"]),
-        device="neuron",
     )
     _test_vllm_generation(llm)

@@ -68,7 +67,6 @@ def test_vllm_greedy_expectations(neuron_decoder_config):
     llm = LLM(
         model=neuron_decoder_config["neuron_model_path"],
         max_num_seqs=neuron_decoder_config["export_kwargs"]["batch_size"],
-        device="neuron",
     )
     # Send more prompts than the compiled batch size (4) and request
     # varying generation lengths to test continuous batching.
@@ -133,6 +131,14 @@ def test_vllm_greedy_expectations(neuron_decoder_config):
             " due to the absorption of light by the atmosphere.",
             " the time I was in the first grade. I remember the day I got the first grade, the",
         ],
+        "smollm3": [
+            " the head of state and government of the United States",
+            " Paris. The Eiffel Tower is in Paris. The Eiffel Tower is a famous landmark",
+            " It was a special day, for it was the first of April, and people were putting the finishing touches to their April Fools' Day pranks",
+            " to be happy. I believe that happiness is the most important thing in life. I believe that happiness is not just a feeling, but a state of being. I believe that happiness is not something that",
+            " blue because of Rayleigh scattering. This is the",
+            " of my grandmother, who was a wonderful cook, making a delicious chicken and dumplings soup. She",
+        ],
     }[neuron_decoder_config["name"]]
     for expected_output, output in zip(expected_outputs, outputs):