Merged
1 change: 1 addition & 0 deletions .github/workflows/inference_cache_llm.yml
@@ -32,6 +32,7 @@ jobs:
llama3.1-70b,
qwen2.5-large,
llama-variants,
smollm3,
]
steps:
- name: Checkout
5 changes: 5 additions & 0 deletions optimum/exporters/neuron/model_configs.py
@@ -19,6 +19,7 @@
import os
from functools import partial
from pathlib import Path
from typing import Any

import neuronx_distributed
import torch
@@ -354,6 +355,10 @@ def outputs(self) -> list[str]:

return common_outputs

@property
def values_override(self) -> dict[str, Any] | None:
return {"return_dict": False}


@register_in_tasks_manager("clip-text-model", *["feature-extraction"], library_name="diffusers")
class CLIPTextNeuronConfig(CLIPTextWithProjectionNeuronConfig):
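For reference, `values_override` entries are applied to the exported model's config before tracing; returning `{"return_dict": False}` makes the traced module emit plain tuples instead of `ModelOutput` objects. A minimal sketch of how such overrides are typically consumed (the helper name is hypothetical, not the actual optimum internals):

```python
def apply_values_override(model, neuron_config):
    # Hypothetical helper: copy each override onto the model config before
    # tracing. With return_dict=False, the model returns plain tuples, which
    # torch tracing can handle.
    for attr_name, value in (neuron_config.values_override or {}).items():
        setattr(model.config, attr_name, value)
```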
3 changes: 2 additions & 1 deletion optimum/exporters/neuron/model_wrappers.py
@@ -17,6 +17,7 @@
from typing import TYPE_CHECKING

import torch
from transformers.cache_utils import EncoderDecoderCache
from transformers.models.t5.modeling_t5 import T5LayerCrossAttention

from ...neuron.utils import is_neuronx_available
@@ -621,7 +622,7 @@ def forward(
decoder_output = self.model.decoder(
input_ids=input_ids,
attention_mask=decoder_attention_mask,
past_key_values=past_key_values,
past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values),
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=True,
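The one-line change above reflects that recent transformers decoders expect a `Cache` object rather than legacy tuples for `past_key_values`. A minimal sketch of the conversion, with made-up tensor shapes:

```python
import torch
from transformers.cache_utils import EncoderDecoderCache

# Made-up shapes (batch=1, heads=8, past_len=16, head_dim=64); the legacy
# format is one (self_key, self_value, cross_key, cross_value) tuple per
# decoder layer.
legacy_cache = tuple(
    tuple(torch.zeros(1, 8, 16, 64) for _ in range(4)) for _layer in range(2)
)
cache = EncoderDecoderCache.from_legacy_cache(legacy_cache)
# `cache` wraps two DynamicCache objects (self- and cross-attention), the
# format recent transformers decoders expect as `past_key_values`.
```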
6 changes: 0 additions & 6 deletions optimum/neuron/generation/utils.py
@@ -233,9 +233,6 @@ def generate(
generation_config: GenerationConfig | None = None,
**kwargs,
):
# 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
self._validate_model_class()

# priority: `generation_config` argument > `model.generation_config` (the default generation config)
if generation_config is None:
# legacy: users may modify the model configuration to control generation. To trigger this legacy behavior,
@@ -623,9 +620,6 @@ def generate(
- [`~generation.BeamSampleEncoderDecoderOutput`]
"""

# 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
self._validate_model_class()

# priority: `generation_config` argument > `model.generation_config` (the default generation config)
if generation_config is None:
# legacy: users may modify the model configuration to control generation -- update the generation config
22 changes: 16 additions & 6 deletions optimum/neuron/models/inference/auto_models.py
@@ -28,6 +28,7 @@
from .phi3.modeling_phi3 import Phi3NxDModelForCausalLM
from .qwen2.modeling_qwen2 import Qwen2NxDModelForCausalLM
from .qwen3.modeling_qwen3 import Qwen3NxDModelForCausalLM
from .smollm3.modeling_smollm3 import SmolLM3NxDModelForCausalLM


prioritize_hlo_backend = os.environ.get("OPTIMUM_NEURON_PRIORITIZE_HLO_BACKEND", "0") == "1"
@@ -58,6 +59,15 @@ class LLamaModelForCausalLM(LlamaNxDModelForCausalLM):
pass


@register_neuron_model_for_inference("mixtral", "text-generation")
class MixtralNeuronModelForCausalLM(MixtralNxDModelForCausalLM):
"""
Mixtral model with NxD backend for inference on AWS Neuron.
"""

pass


@register_neuron_model_for_inference("phi3", "text-generation")
class Phi3ModelForCausalLM(Phi3NxDModelForCausalLM):
"""
@@ -76,19 +86,19 @@ class Qwen2ModelForCausalLM(Qwen2NxDModelForCausalLM):
pass


@register_neuron_model_for_inference("mixtral", "text-generation")
class MixtralNeuronModelForCausalLM(MixtralNxDModelForCausalLM):
@register_neuron_model_for_inference("qwen3", "text-generation")
class Qwen3NeuronModelForCausalLM(Qwen3NxDModelForCausalLM):
"""
Mixtral model with NxD backend for inference on AWS Neuron.
Qwen3 model with NxD backend for inference on AWS Neuron.
"""

pass


@register_neuron_model_for_inference("qwen3", "text-generation")
class Qwen3NeuronModelForCausalLM(Qwen3NxDModelForCausalLM):
@register_neuron_model_for_inference("smollm3", "text-generation")
class SmolLM3NeuronModelForCausalLM(SmolLM3NxDModelForCausalLM):
"""
Qwen3 model with NxD backend for inference on AWS Neuron.
SmolLM3 model with NxD backend for inference on AWS Neuron.
"""

pass
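For context, the `register_neuron_model_for_inference` decorator used throughout this file maps a (model_type, task) pair to an implementation class so the auto classes can dispatch on a checkpoint's config. A rough sketch of the pattern (illustrative, not the actual optimum-neuron machinery):

```python
# Illustrative registry sketch -- not the actual optimum-neuron internals.
_NEURON_MODEL_REGISTRY: dict[tuple[str, str], type] = {}

def register_neuron_model_for_inference(model_type: str, task: str):
    def wrapper(cls: type) -> type:
        _NEURON_MODEL_REGISTRY[(model_type, task)] = cls
        return cls
    return wrapper

# Dispatch then reduces to a lookup keyed on the checkpoint's model_type:
# _NEURON_MODEL_REGISTRY[("smollm3", "text-generation")]
```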
@@ -845,6 +845,9 @@ def export(
)
# Override torch_dtype in config as it is used by the neuronx_distributed code to cast weights to the correct type
config.torch_dtype = neuron_config.torch_dtype
# Compute head_dim if it is defined but set to None (as in Mixtral with transformers 4.54+)
if hasattr(config, "head_dim") and config.head_dim is None:
config.head_dim = config.hidden_size // config.num_attention_heads
context_encoding_model, token_generation_model, speculation_model = cls.create_model_wrappers(
model_cls=cls._model_cls,
config=config,
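Illustration of the fallback above: transformers 4.54+ can leave `head_dim` as `None` in some configs (e.g. Mixtral), while neuronx_distributed needs a concrete value.

```python
from transformers import AutoConfig

# Illustrative only: resolve a null head_dim the same way the export path
# above does. Mixtral-style dimensions: 4096 hidden size / 32 heads -> 128.
config = AutoConfig.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
if getattr(config, "head_dim", None) is None:
    config.head_dim = config.hidden_size // config.num_attention_heads
```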
116 changes: 116 additions & 0 deletions optimum/neuron/models/inference/smollm3/modeling_smollm3.py
@@ -0,0 +1,116 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch SmolLM3 model for NXD inference."""

import logging

from neuronx_distributed.parallel_layers.layers import (
ColumnParallelLinear,
ParallelEmbedding,
)
from torch import nn
from transformers.models.smollm3.configuration_smollm3 import SmolLM3Config

from ..backend.config import NxDNeuronConfig # noqa: E402
from ..backend.modules.attention.attention_base import NeuronAttentionBase
from ..backend.modules.attention.utils import RotaryEmbedding
from ..backend.modules.custom_calls import CustomRMSNorm
from ..backend.modules.decoder import NxDDecoderModel
from ..llama.modeling_llama import (
LlamaNxDModelForCausalLM,
NeuronLlamaDecoderLayer,
)


logger = logging.getLogger("Neuron")


class NeuronSmolLM3Attention(NeuronAttentionBase):
"""
The only difference from NeuronAttentionBase is the definition of the SmolLM3 rotary embedding.
"""

def __init__(
self,
config: SmolLM3Config,
neuron_config: NxDNeuronConfig,
layer_idx: int,
qkv_proj_bias: bool | None = False,
o_proj_bias: bool | None = False,
qk_scale: float | None = None,
):
if config.use_sliding_window:
raise ValueError("SmolLM3 for Neuron does not support sliding window attention.")
if getattr(config, "rope_scaling", None) is not None:
raise ValueError("SmolLM3 for Neuron does not support rope scaling.")
super().__init__(
config, neuron_config, qkv_proj_bias=qkv_proj_bias, o_proj_bias=o_proj_bias, qk_scale=qk_scale
)
if config.no_rope_layers[layer_idx]:
# Yes, the condition is slightly counter-intuitive, but that is the transformers convention
head_dim = config.hidden_size // config.num_attention_heads
self.rotary_emb = RotaryEmbedding(
head_dim,
max_position_embeddings=config.max_position_embeddings,
base=config.rope_theta,
)
else:
self.rotary_emb = None


class NeuronSmolLM3DecoderLayer(NeuronLlamaDecoderLayer):
def __init__(self, config: SmolLM3Config, neuron_config: NxDNeuronConfig, layer_idx: int):
super().__init__(config, neuron_config)
self.self_attn = NeuronSmolLM3Attention(config, neuron_config, layer_idx)


class NxDSmolLM3Model(NxDDecoderModel):
"""
The neuron version of the SmolLM3Model
"""

def __init__(self, config: SmolLM3Config, neuron_config: NxDNeuronConfig):
super().__init__(config, neuron_config)

self.embed_tokens = ParallelEmbedding(
config.vocab_size,
config.hidden_size,
config.pad_token_id,
dtype=neuron_config.torch_dtype,
shard_across_embedding=not neuron_config.vocab_parallel,
sequence_parallel_enabled=False,
pad=True,
use_spmd_rank=neuron_config.vocab_parallel,
)

self.lm_head = ColumnParallelLinear(
config.hidden_size,
config.vocab_size,
gather_output=not neuron_config.on_device_sampling,
bias=False,
pad=True,
)

self.layers = nn.ModuleList(
[
NeuronSmolLM3DecoderLayer(config, neuron_config, layer_idx)
for layer_idx in range(config.num_hidden_layers)
]
)
self.norm = CustomRMSNorm(config.hidden_size, eps=config.rms_norm_eps)


class SmolLM3NxDModelForCausalLM(LlamaNxDModelForCausalLM):
_model_cls = NxDSmolLM3Model
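For reference, the `no_rope_layers` convention checked in `NeuronSmolLM3Attention` above follows transformers' SmolLM3Config: a 1 means the layer does apply RoPE, a 0 marks a NoPE layer. A sketch of how the default mask is derived, assuming SmolLM3's documented interval of 4:

```python
# Sketch of the default `no_rope_layers` mask (assumption: a NoPE layer
# every 4th layer): 1 -> the layer applies RoPE, 0 -> a NoPE layer.
no_rope_layer_interval = 4
num_hidden_layers = 36  # SmolLM3-3B
no_rope_layers = [
    int((layer_idx + 1) % no_rope_layer_interval != 0)
    for layer_idx in range(num_hidden_layers)
]
# Layers 3, 7, 11, ... end up with rotary_emb = None in the class above.
```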
2 changes: 1 addition & 1 deletion optimum/neuron/models/inference/t5/modeling_t5.py
@@ -257,7 +257,7 @@ def forward(
attn_output = attn_output.view(batch_size, -1, self.hidden_size_per_partition)
attn_output = self.o(attn_output)

outputs = (attn_output, past_key_value, position_bias)
outputs = (attn_output, position_bias)

if output_attentions:
outputs = outputs + (attn_weights,)
4 changes: 2 additions & 2 deletions optimum/neuron/models/training/granite/modeling_granite.py
@@ -27,7 +27,7 @@
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from transformers.models.granite.configuration_granite import GraniteConfig
from transformers.processing_utils import Unpack
from transformers.utils import LossKwargs, can_return_tuple, logging
from transformers.utils import TransformersKwargs, can_return_tuple, logging

from ..config import TrainingNeuronConfig
from ..llama.modeling_llama import (
@@ -225,7 +225,7 @@ def forward(
return output


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...


class GraniteForCausalLM(LlamaForCausalLM):
4 changes: 2 additions & 2 deletions optimum/neuron/models/training/llama/modeling_llama.py
@@ -40,7 +40,7 @@
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.processing_utils import Unpack
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.utils import LossKwargs, can_return_tuple, logging
from transformers.utils import TransformersKwargs, can_return_tuple, logging

from ..config import TrainingNeuronConfig
from ..loss_utils import ForCausalLMLoss
@@ -812,7 +812,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
return causal_mask


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
class KwargsForCausalLM(FlashAttentionKwargs, TransformersKwargs): ...


class LlamaForCausalLM(NeuronModelMixin, LlamaPreTrainedModel):
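`LossKwargs` was renamed to `TransformersKwargs` in recent transformers releases, hence the one-line change in both training models. The usage pattern is unchanged; roughly:

```python
from transformers.processing_utils import Unpack
from transformers.utils import TransformersKwargs

# Rough sketch: TransformersKwargs is a TypedDict, so subclasses stay plain
# TypedDicts and Unpack lets forward() accept the extra kwargs type-safely.
class KwargsForCausalLM(TransformersKwargs): ...

def forward(input_ids, **kwargs: Unpack[KwargsForCausalLM]):
    ...
```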
3 changes: 2 additions & 1 deletion optimum/neuron/pipelines/transformers/base.py
@@ -274,7 +274,8 @@ def pipeline(
if export:
if neuron_config is not None:
raise ValueError("This model has already been exported to Neuron format")
if not input_shapes:
# Decoder models can select default input shapes from the config
if task != "text-generation" and not input_shapes:
input_shapes = {"batch_size": 1, "sequence_length": 128}
logger.warning(f"No input shapes provided, using default shapes, {input_shapes}")
else:
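With this change, a text-generation pipeline export no longer requires explicit shapes, since decoder models can derive defaults from their config. A hedged usage sketch (model id illustrative):

```python
from optimum.neuron import pipeline

# Sketch of the behavior enabled above: for text-generation, no
# input_shapes need to be passed at export time; decoder models pick
# defaults from the config instead of the generic
# {"batch_size": 1, "sequence_length": 128} fallback.
pipe = pipeline("text-generation", model="HuggingFaceTB/SmolLM3-3B", export=True)
print(pipe("Deep learning is")[0]["generated_text"])
```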
2 changes: 1 addition & 1 deletion optimum/neuron/version.py
@@ -12,6 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.3.1.dev0"
__version__ = "0.3.1.dev1"

__sdk_version__ = "2.24.0"
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -37,7 +37,7 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
"transformers ~= 4.51.0",
"transformers ~= 4.55.4",
"accelerate == 1.8.1",
"optimum ~= 1.24.0",
"huggingface_hub >= 0.29.0",
@@ -101,7 +101,7 @@ neuronx = [
"wheel",
"neuronx-cc==2.19.8089.0",
"torch-neuronx==2.7.0.2.8.6734+ac864f72",
"torch==2.7.0.*",
"torch==2.7.1.*",
"torchvision==0.22.*",
"neuronx_distributed==0.13.14393",
"libneuronxla==2.2.4410.0",
@@ -114,7 +114,7 @@ sentence-transformers = [
"sentence-transformers >= 2.2.0",
]
vllm = [
"vllm == 0.9.2",
"vllm == 0.10.0",
]

[project.scripts]
9 changes: 9 additions & 0 deletions tests/decoder/conftest.py
@@ -72,6 +72,15 @@
"auto_cast_type": "bf16",
},
},
"smollm3": {
"model_id": "HuggingFaceTB/SmolLM3-3B",
"export_kwargs": {
"batch_size": 4,
"sequence_length": 4096,
"num_cores": 2,
"auto_cast_type": "bf16",
},
},
}


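The new conftest entry corresponds to a direct export along these lines (a sketch using the same kwargs):

```python
from optimum.neuron import NeuronModelForCausalLM

# Sketch of the export described by the new "smollm3" entry above.
model = NeuronModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM3-3B",
    export=True,
    batch_size=4,
    sequence_length=4096,
    num_cores=2,
    auto_cast_type="bf16",
)
```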
1 change: 1 addition & 0 deletions tests/decoder/test_decoder_export.py
@@ -30,6 +30,7 @@
"granite": "hf-internal-testing/tiny-random-GraniteForCausalLM",
"phi3": "yujiepan/phi-4-tiny-random",
"mixtral": "dacorvo/Mixtral-tiny",
"smollm3": "HuggingFaceTB/SmolLM3-3B",
Collaborator: no tiny version?

Collaborator (author): Unfortunately, no

}


1 change: 1 addition & 0 deletions tests/decoder/test_decoder_generation.py
@@ -105,6 +105,7 @@ def test_decoder_generation_greedy_expectations(neuron_decoder_config):
"qwen3": " What is the difference between Deep Learning and Machine Learning?\n\nDeep Learning is a subset of",
"granite": "\n\nDeep Learning is a subset of machine learning that is inspired by the structure and",
"phi": "\n\nDeep learning is a subfield of machine learning that focuses on creating",
"smollm3": " Deep learning is a subset of machine learning that uses neural networks with many layers to learn",
}
config_name = neuron_decoder_config["name"]
generated_text = tokenizer.decode(outputs[0])