From 4a074a778e5fe2eb90d122bb64cabea417dcafef Mon Sep 17 00:00:00 2001 From: Abhishek Kumar Singh Date: Mon, 5 Jan 2026 11:07:15 +0000 Subject: [PATCH 01/12] Added all the changes for enabling subfunction for VLMs Signed-off-by: Abhishek Kumar Singh --- .../models/falcon/modeling_falcon.py | 12 ++++++++- .../models/gemma/modeling_gemma.py | 11 +++++++- .../models/gemma2/modeling_gemma2.py | 11 +++++++- .../models/gemma3/modeling_gemma3.py | 20 +++++++++++++- .../transformers/models/gpt2/modeling_gpt2.py | 11 +++++++- .../gpt_bigcode/modeling_gpt_bigcode.py | 11 +++++++- .../models/gpt_oss/modeling_gpt_oss.py | 12 ++++++++- .../transformers/models/gptj/modeling_gptj.py | 11 +++++++- .../models/granite/modeling_granite.py | 11 +++++++- .../models/granitemoe/modeling_granitemoe.py | 11 +++++++- .../models/grok_1/modeling_grok1.py | 11 +++++++- .../models/internvl/modeling_internvl.py | 20 +++++++++++++- .../models/llama/modeling_llama.py | 11 +++++++- .../models/llama4/modeling_llama4.py | 20 +++++++++++++- .../llama_swiftkv/modeling_llama_swiftkv.py | 11 +++++++- .../models/llava/modeling_llava.py | 20 +++++++++++++- .../models/llava_next/modeling_llava_next.py | 11 +++++++- .../models/mistral/modeling_mistral.py | 11 +++++++- .../models/mistral3/modeling_mistral3.py | 20 +++++++++++++- .../models/mixtral_moe/modeling_mixtral.py | 11 +++++++- .../models/molmo/modeling_molmo.py | 20 +++++++++++++- .../transformers/models/mpt/modeling_mpt.py | 11 +++++++- .../models/olmo2/modeling_olmo2.py | 11 +++++++- .../transformers/models/phi/modeling_phi.py | 11 +++++++- .../transformers/models/phi3/modeling_phi3.py | 11 +++++++- .../transformers/models/pytorch_transforms.py | 26 ------------------- .../models/qwen2/modeling_qwen2.py | 11 +++++++- .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 25 +++++++++++++++--- .../models/qwen3/modeling_qwen3.py | 11 +++++++- .../models/qwen3_moe/modeling_qwen3_moe.py | 11 +++++++- .../models/starcoder2/modeling_starcoder2.py | 11 +++++++- 
.../models/whisper/modeling_whisper.py | 11 +++++++- QEfficient/utils/export_utils.py | 15 ++++++++--- QEfficient/utils/torch_patches.py | 8 ++++-- 34 files changed, 394 insertions(+), 66 deletions(-) diff --git a/QEfficient/transformers/models/falcon/modeling_falcon.py b/QEfficient/transformers/models/falcon/modeling_falcon.py index 1cfdf88e1..e29a06241 100644 --- a/QEfficient/transformers/models/falcon/modeling_falcon.py +++ b/QEfficient/transformers/models/falcon/modeling_falcon.py @@ -8,9 +8,10 @@ """PyTorch Falcon model.""" import math -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Type, Union import torch +import torch.nn as nn import torch.utils.checkpoint from torch.nn import functional as F from transformers.cache_utils import Cache @@ -353,6 +354,15 @@ class QEffFalconForCausalLM(FalconForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffFalconDecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/gemma/modeling_gemma.py b/QEfficient/transformers/models/gemma/modeling_gemma.py index 1edb8ef53..59a9d6809 100644 --- a/QEfficient/transformers/models/gemma/modeling_gemma.py +++ b/QEfficient/transformers/models/gemma/modeling_gemma.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch from torch import nn @@ -336,6 +336,15 @@ class QEffGemmaForCausalLM(GemmaForCausalLM): - add new args cache idx for the kv retention """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffGemmaDecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/gemma2/modeling_gemma2.py b/QEfficient/transformers/models/gemma2/modeling_gemma2.py index 2944601c9..00df57240 100644 --- a/QEfficient/transformers/models/gemma2/modeling_gemma2.py +++ b/QEfficient/transformers/models/gemma2/modeling_gemma2.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Type, Union import torch from torch import nn @@ -388,6 +388,15 @@ class QEffGemma2ForCausalLM(Gemma2ForCausalLM, GenerationMixin): - add new args cache idx for the kv retention """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. 
+ Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffGemma2DecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index a6e451bec..29f7b13d0 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -6,7 +6,7 @@ # ----------------------------------------------------------------------------- import copy -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch from torch import nn @@ -589,6 +589,15 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.vision_tower + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.vision_tower.vision_model.encoder.layers[0].__class__} + def forward(self, pixel_values): image_features = self.model.get_image_features(pixel_values=pixel_values) return image_features @@ -602,6 +611,15 @@ def __init__(self, model): self.config = self.model.config self.lm_head = self.model.lm_head + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffGemma3DecoderLayer} + def forward( self, input_ids, diff --git a/QEfficient/transformers/models/gpt2/modeling_gpt2.py b/QEfficient/transformers/models/gpt2/modeling_gpt2.py index 6136a2c5d..ab452baea 100644 --- a/QEfficient/transformers/models/gpt2/modeling_gpt2.py +++ b/QEfficient/transformers/models/gpt2/modeling_gpt2.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import Callable, Optional, Tuple, Union +from typing import Callable, Optional, Tuple, Type, Union import torch from torch import nn @@ -397,6 +397,15 @@ class QEffGPT2LMHeadModel(GPT2LMHeadModel): - add new args position idx for the cache_kwargs for kv retention """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffGPT2Block} + def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 85ea42674..604def959 100644 --- a/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -7,7 +7,7 @@ """PyTorch GPTBigCode model.""" -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Type, Union import torch import torch.utils.checkpoint @@ -378,6 +378,15 @@ def forward( class QEffGPTBigCodeForCausalLM(GPTBigCodeForCausalLM): + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). 
+ Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffGPTBigCodeBlock} + def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py index 3efe890b8..b82cd7c81 100644 --- a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py +++ b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py @@ -6,7 +6,7 @@ # ----------------------------------------------------------------------------- import math import os -from typing import Callable, Optional, Union +from typing import Callable, Optional, Type, Union import torch from torch import nn @@ -1205,6 +1205,16 @@ def forward( class QEffGptOssForCausalLM(GptOssForCausalLM): + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffGptOssDecoderLayer} + def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/QEfficient/transformers/models/gptj/modeling_gptj.py b/QEfficient/transformers/models/gptj/modeling_gptj.py index 1a9e45e97..2a7c475ed 100644 --- a/QEfficient/transformers/models/gptj/modeling_gptj.py +++ b/QEfficient/transformers/models/gptj/modeling_gptj.py @@ -7,7 +7,7 @@ """PyTorch GPT-J model.""" -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Type, Union import torch from torch import nn @@ -318,6 +318,15 @@ class QEffGPTJForCausalLM(GPTJForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. 
+ Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffGPTJBlock} + def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/QEfficient/transformers/models/granite/modeling_granite.py b/QEfficient/transformers/models/granite/modeling_granite.py index 62be5f54d..c791b02f4 100644 --- a/QEfficient/transformers/models/granite/modeling_granite.py +++ b/QEfficient/transformers/models/granite/modeling_granite.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Type, Union import torch from torch import nn @@ -347,6 +347,15 @@ class QEffGraniteForCausalLM(GraniteForCausalLM): Copied from GraniteForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/granite/modeling_granite.py """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffGraniteDecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py index b158b4046..fbeaae68c 100644 --- a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py +++ b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch import torch.nn.functional as F @@ -493,6 +493,15 @@ class QEffGraniteMoeForCausalLM(GraniteMoeForCausalLM): Copied from GraniteForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/granite/modeling_granite.py """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.layers[0].__class__} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/grok_1/modeling_grok1.py b/QEfficient/transformers/models/grok_1/modeling_grok1.py index 2d8fc412d..9c1e7c4b6 100644 --- a/QEfficient/transformers/models/grok_1/modeling_grok1.py +++ b/QEfficient/transformers/models/grok_1/modeling_grok1.py @@ -5,7 +5,7 @@ # # ---------------------------------------------------------------------------- -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch import torch.nn as nn @@ -397,6 +397,15 @@ class QEffGrok1ModelForCausalLM(nn.Module): Grok model for causal language modeling. 
""" + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffGrok1DecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index b47db7eda..026b1f9ae 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import List, Optional +from typing import List, Optional, Type import torch import torch.nn as nn @@ -21,6 +21,15 @@ def __init__(self, model): super().__init__() self.model = model + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.vision_model.encoder.layers[0].__class__} + def forward(self, pixel_values): vision_embeds = self.model.extract_feature(pixel_values) # Reshape from [num_patches, 256, hidden_dim] -> [1, num_patches*256, head_dim] @@ -36,6 +45,15 @@ def __init__(self, model): self.config = self.model.language_model.config self.language_model = self.model.language_model + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). 
+ Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.language_model.model.layers[0].__class__} + def forward( self, input_ids, diff --git a/QEfficient/transformers/models/llama/modeling_llama.py b/QEfficient/transformers/models/llama/modeling_llama.py index fb3aed556..065db2193 100644 --- a/QEfficient/transformers/models/llama/modeling_llama.py +++ b/QEfficient/transformers/models/llama/modeling_llama.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union import torch from torch import nn @@ -404,6 +404,15 @@ class QEffLlamaForCausalLM(LlamaForCausalLM): Copied from LlamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffLlamaDecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/llama4/modeling_llama4.py b/QEfficient/transformers/models/llama4/modeling_llama4.py index 834ee8880..16a576d02 100644 --- a/QEfficient/transformers/models/llama4/modeling_llama4.py +++ b/QEfficient/transformers/models/llama4/modeling_llama4.py @@ -6,7 +6,7 @@ # ----------------------------------------------------------------------------- import math -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Type, Union import torch from torch import nn @@ -822,6 +822,15 @@ def __init__(self, model): super().__init__() self.model = model + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.vision_model.model.layers[0].__class__} + def forward(self, pixel_values): vision_feature_layer = self.model.config.vision_config.vision_feature_layer vision_feature_select_strategy = self.model.config.vision_config.vision_feature_select_strategy @@ -849,6 +858,15 @@ def __init__(self, model): self.language_model = self.model.language_model self.config = self.model.config + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffLlama4TextDecoderLayer} + def forward( self, input_ids, diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py index fa42b3f96..be1cc8cdc 100644 --- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py +++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py @@ -11,7 +11,7 @@ """Inference-only LLaMA model compatible with HuggingFace weights.""" import math -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch from torch import nn @@ -416,6 +416,15 @@ def __init__(self, config: QEffLlamaSwiftKVConfig): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.config = config + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffLlamaSwiftKVDecoderLayer} + def forward( self, input_ids: torch.Tensor, diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index abdb77ea5..64fc41c09 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import List, Optional +from typing import List, Optional, Type import torch import torch.nn as nn @@ -30,6 +30,15 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.vision_tower + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.vision_tower.vision_model.encoder.layers[0].__class__} + def forward(self, pixel_values): # Image features image_outputs = self.model.vision_tower(pixel_values, output_hidden_states=True) @@ -54,6 +63,15 @@ def __init__(self, model): self.language_model = self.model.language_model self.lm_head = self.model.lm_head + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {self.model.language_model.layers[0].__class__} + def forward( self, input_ids, diff --git a/QEfficient/transformers/models/llava_next/modeling_llava_next.py b/QEfficient/transformers/models/llava_next/modeling_llava_next.py index 627f7393e..a51272980 100755 --- a/QEfficient/transformers/models/llava_next/modeling_llava_next.py +++ b/QEfficient/transformers/models/llava_next/modeling_llava_next.py @@ -6,7 +6,7 @@ # ----------------------------------------------------------------------------- -from typing import List, Optional +from typing import List, Optional, Type import numpy as np import torch @@ -30,6 +30,15 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.vision_tower + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {self.model.vision_tower.vision_model.encoder.layers[0].__class__} + def forward(self, pixel_values, image_sizes): if pixel_values.dim() == constants.GRANITEVISION_PIXEL_VALUE_DIM: pixel_values_new = pixel_values.squeeze(0) diff --git a/QEfficient/transformers/models/mistral/modeling_mistral.py b/QEfficient/transformers/models/mistral/modeling_mistral.py index 5edfb8f3a..de9b1a7e6 100644 --- a/QEfficient/transformers/models/mistral/modeling_mistral.py +++ b/QEfficient/transformers/models/mistral/modeling_mistral.py @@ -7,7 +7,7 @@ """PyTorch Mistral model.""" -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Type, Union import torch import torch.utils.checkpoint @@ -356,6 +356,15 @@ class QEffMistralForCausalLM(MistralForCausalLM): - add new args cache idx for the kv retention """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffMistralDecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py index d2149b6bd..3bf151b97 100644 --- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py +++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch import torch.nn as nn @@ -151,6 +151,15 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.vision_tower + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.vision_tower.transformer.layers[0].__class__} + def forward(self, pixel_values): image_sizes = torch.tensor([[pixel_values.shape[2], pixel_values.shape[3]]]).repeat(pixel_values.shape[0], 1) image_features = self.model.get_image_features( @@ -168,6 +177,15 @@ def __init__(self, model): self.config = self.model.config self.language_model = self.model.language_model + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {self.model.language_model.layers[0].__class__} + def forward( self, input_ids, diff --git a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py index 862714fea..f811bea65 100644 --- a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py +++ b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py @@ -7,7 +7,7 @@ """PyTorch Mixtral model.""" -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch import torch.nn.functional as F @@ -414,6 +414,15 @@ class QEffMixtralForCausalLM(MixtralForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QeffMixtralDecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/molmo/modeling_molmo.py b/QEfficient/transformers/models/molmo/modeling_molmo.py index b686e6aed..093e468ff 100644 --- a/QEfficient/transformers/models/molmo/modeling_molmo.py +++ b/QEfficient/transformers/models/molmo/modeling_molmo.py @@ -6,7 +6,7 @@ # ----------------------------------------------------------------------------- import math -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Type, Union import torch import torch.nn as nn @@ -568,6 +568,15 @@ def __init__(self, model): super().__init__() self.model = model + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. 
+ Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.model.vision_backbone.image_vit.transformer.resblocks[0].__class__} + def forward(self, pixel_values, image_masks, image_input_idx, valid_idx): image_features, _ = self.model.model.vision_backbone(pixel_values, image_masks) num_image, num_patch = image_features.shape[1:3] @@ -588,6 +597,15 @@ def __init__(self, model): # self.language_model = self.model.language_model self.config = self.model.config + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.model.transformer.blocks[0].__class__} + def forward( self, input_ids, diff --git a/QEfficient/transformers/models/mpt/modeling_mpt.py b/QEfficient/transformers/models/mpt/modeling_mpt.py index c1d98c1f8..929e157cc 100644 --- a/QEfficient/transformers/models/mpt/modeling_mpt.py +++ b/QEfficient/transformers/models/mpt/modeling_mpt.py @@ -7,7 +7,7 @@ """PyTorch MPT model.""" -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Type, Union import torch import torch.utils.checkpoint @@ -254,6 +254,15 @@ class QEffMptForCausalLM(MptForCausalLM): - add new args cache idx for the kv retention """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffMptBlock} + def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/QEfficient/transformers/models/olmo2/modeling_olmo2.py b/QEfficient/transformers/models/olmo2/modeling_olmo2.py index 00755cae5..02645e185 100644 --- a/QEfficient/transformers/models/olmo2/modeling_olmo2.py +++ b/QEfficient/transformers/models/olmo2/modeling_olmo2.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Type, Union import torch from torch import nn @@ -324,6 +324,15 @@ class QEffOlmo2ForCausalLM(Olmo2ForCausalLM): - add new args cache idx for the kv retention """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffOlmo2DecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/phi/modeling_phi.py b/QEfficient/transformers/models/phi/modeling_phi.py index 4bf2e8785..2efbb313e 100644 --- a/QEfficient/transformers/models/phi/modeling_phi.py +++ b/QEfficient/transformers/models/phi/modeling_phi.py @@ -7,7 +7,7 @@ """PyTorch Phi model.""" -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Type, Union import torch from torch import nn @@ -323,6 +323,15 @@ class QEffPhiForCausalLM(PhiForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). 
+ Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffPhiDecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/phi3/modeling_phi3.py b/QEfficient/transformers/models/phi3/modeling_phi3.py index b97a0ab8d..e25deed37 100644 --- a/QEfficient/transformers/models/phi3/modeling_phi3.py +++ b/QEfficient/transformers/models/phi3/modeling_phi3.py @@ -7,7 +7,7 @@ """PyTorch Phi-3 model.""" -from typing import Callable, Optional, Tuple, Union +from typing import Callable, Optional, Tuple, Type, Union import torch import torch.utils.checkpoint @@ -351,6 +351,15 @@ class QEffPhi3ForCausalLM(Phi3ForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffPhi3DecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index b978b6193..abb364d0a 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -893,32 +893,6 @@ def apply(cls, model: nn.Module, pooling: Union[str, Callable]) -> Tuple[nn.Modu return model, transformed -def get_decoder_layer_classes_for_export(model: nn.Module) -> set: - """ - Dynamically determine which DecoderLayer classes should be exported as functions - based on the model's architecture using the existing KVCacheTransform mapping. 
- """ - # Define patterns that identify decoder layer classes - DECODER_LAYER_PATTERNS = ["DecoderLayer", "Block", "Layer"] - - # Get all QEff classes that are decoder layers from the existing mapping - decoder_layer_classes = set() - - for original_class, qeff_class in KVCacheTransform._module_mapping.items(): - # Check if the QEff class name contains decoder layer patterns - qeff_class_name = qeff_class.__name__ - if any(pattern in qeff_class_name for pattern in DECODER_LAYER_PATTERNS): - decoder_layer_classes.add(qeff_class) - - # Filter to only include classes that are actually used in the current model - model_decoder_classes = set() - for module in model.modules(): - if module.__class__ in decoder_layer_classes: - model_decoder_classes.add(module.__class__) - - return model_decoder_classes - - class BlockedKVAttentionTransform: _module_mapping = { QEffLlamaAttention, diff --git a/QEfficient/transformers/models/qwen2/modeling_qwen2.py b/QEfficient/transformers/models/qwen2/modeling_qwen2.py index 7c093a4b0..7404f2f6c 100644 --- a/QEfficient/transformers/models/qwen2/modeling_qwen2.py +++ b/QEfficient/transformers/models/qwen2/modeling_qwen2.py @@ -7,7 +7,7 @@ """PyTorch Qwen2 model.""" -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch import torch.utils.checkpoint @@ -350,6 +350,15 @@ class QEffQwen2ForCausalLM(Qwen2ForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffQwen2DecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 21d2e026e..718c50e34 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -7,7 +7,7 @@ import math import os -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import torch import torch.nn as nn @@ -74,12 +74,11 @@ def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, mrope_section, unsqu `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. """ - mrope_section = mrope_section * 2 cos = cos[position_ids] sin = sin[position_ids] - cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(unsqueeze_dim) - sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(unsqueeze_dim) + cos = torch.cat([cos[0, ..., 0:32], cos[0, ..., 32:80], cos[0, ..., 80:128]], dim=-1).unsqueeze(0) + sin = torch.cat([sin[0, ..., 0:32], sin[0, ..., 32:80], sin[0, ..., 80:128]], dim=-1).unsqueeze(0) q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) @@ -872,6 +871,15 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.visual + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {self.model.visual.blocks[0].__class__} + def forward(self, pixel_values, image_grid_thw): image_embeds = self.model.visual(pixel_values, grid_thw=image_grid_thw) bs = image_grid_thw.shape[0] @@ -887,6 +895,15 @@ def __init__(self, model): self.model = model self.language_model = self.model.model.language_model + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffQwen2_5_VLDecoderLayer} + def forward( self, input_ids, diff --git a/QEfficient/transformers/models/qwen3/modeling_qwen3.py b/QEfficient/transformers/models/qwen3/modeling_qwen3.py index 540bad4c7..b310499be 100644 --- a/QEfficient/transformers/models/qwen3/modeling_qwen3.py +++ b/QEfficient/transformers/models/qwen3/modeling_qwen3.py @@ -7,7 +7,7 @@ """PyTorch Qwen3 model.""" -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch import torch.utils.checkpoint @@ -351,6 +351,15 @@ class QEffQwen3ForCausalLM(Qwen3ForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffQwen3DecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py index cbd80d8ca..18e1e7611 100644 --- a/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py +++ b/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Type import torch import torch.nn.functional as F @@ -371,6 +371,15 @@ def forward( class QEffQwen3MoeForCausalLM(Qwen3MoeForCausalLM): + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffQwen3MoeDecoderLayer} + def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py index c86e7478b..3387f0fba 100644 --- a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py +++ b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py @@ -7,7 +7,7 @@ """PyTorch Starcoder2 model.""" -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch from torch import nn @@ -275,6 +275,15 @@ class QEffStarcoder2ForCausalLM(Starcoder2ForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. 
+ Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEFFStarcoder2DecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py index a03ffecf7..650258328 100644 --- a/QEfficient/transformers/models/whisper/modeling_whisper.py +++ b/QEfficient/transformers/models/whisper/modeling_whisper.py @@ -5,7 +5,7 @@ # # ---------------------------------------------------------------------------- -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Type, Union import torch from torch import nn @@ -718,6 +718,15 @@ class QEffWhisperForConditionalGeneration(WhisperForConditionalGeneration): - changed forward inputs decoder_input_ids and decoder_position_ids to input_ids and position_ids """ + def get_repeated_layer_class(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {self.model.encoder.layers[0].__class__, QEffWhisperDecoderLayer} + def forward( self, input_features: Optional[torch.FloatTensor] = None, diff --git a/QEfficient/utils/export_utils.py b/QEfficient/utils/export_utils.py index 33ba694cf..9380ae440 100644 --- a/QEfficient/utils/export_utils.py +++ b/QEfficient/utils/export_utils.py @@ -14,7 +14,6 @@ from QEfficient.base.onnx_transforms import CustomOpTransform, RenameFunctionOutputsTransform from QEfficient.transformers.cache_utils import InvalidIndexProvider -from QEfficient.transformers.models.pytorch_transforms import get_decoder_layer_classes_for_export from QEfficient.utils.cache import QEFF_HOME from QEfficient.utils.hash_utils import create_export_hash from QEfficient.utils.logging_utils import logger @@ -164,18 +163,26 @@ def _setup_onnx_subfunctions(qeff_model, args, kwargs): # Transform output names for subfunction compatibility if "output_names" in kwargs: kwargs["output_names"] = [ - re.sub("_RetainedState", "_InternalRetainedState", name) for name in kwargs["output_names"] + re.sub("_RetainedState", "_InternalRetainedState", name) + if name.endswith("_RetainedState") and ("key" in name or "value" in name) + else name + for name in kwargs["output_names"] ] else: args = list(args) - args[1] = [re.sub("_RetainedState", "_InternalRetainedState", name) for name in args[1]] + args[1] = [ + re.sub("_RetainedState", "_InternalRetainedState", name) + if name.endswith("_RetainedState") and ("key" in name or "value" in name) + else name + for name in args[1] + ] args = tuple(args) # Add subfunction-specific ONNX transforms qeff_model._onnx_transforms.append(RenameFunctionOutputsTransform) qeff_model._onnx_transforms.append(CustomOpTransform) # TODO: Handle this in the modelling class QEFFTransformersBase,remove from here. 
Refer diffusers implementation - decoder_layer_classes = get_decoder_layer_classes_for_export(qeff_model.model) + decoder_layer_classes = qeff_model.model.get_repeated_layer_class() if decoder_layer_classes: kwargs["export_modules_as_functions"] = decoder_layer_classes return args, kwargs diff --git a/QEfficient/utils/torch_patches.py b/QEfficient/utils/torch_patches.py index 0b9b37afa..1752b5979 100644 --- a/QEfficient/utils/torch_patches.py +++ b/QEfficient/utils/torch_patches.py @@ -11,6 +11,8 @@ import torch.onnx.utils as onnx_utils from torch import _C +from QEfficient.utils.logging_utils import logger + # Store original references before patching _original_setup_trace_module_map = onnx_utils._setup_trace_module_map _original_get_module_attributes = getattr(onnx_utils, "_get_module_attributes", None) @@ -38,8 +40,10 @@ def _track_module_attributes_forward_hook(module, input, output): onnx_attrs = getattr(module, attr_name) delattr(module, attr_name) # FIX: use empty dict to avoid type mismatch - onnx_attrs = {} - _C._jit_pass_onnx_track_scope_attributes(graph, onnx_attrs) + try: + _C._jit_pass_onnx_track_scope_attributes(graph, onnx_attrs) + except Exception as e: + logger.warning(f"Failed to track ONNX scope attributes: {e}. 
Skipping this step.") for m in model.modules(): m.register_forward_hook(_track_module_attributes_forward_hook) From 4458154d6bffccf378da106c940484032d1a7e22 Mon Sep 17 00:00:00 2001 From: abhishek-singh591 Date: Sun, 11 Jan 2026 20:19:33 +0000 Subject: [PATCH 02/12] Fixed rope method for batch size > 1 Signed-off-by: abhishek-singh591 --- QEfficient/transformers/models/modeling_auto.py | 1 + .../transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index d2cc1e681..55253f9b0 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1030,6 +1030,7 @@ def export( offload_pt_weights=False, use_onnx_subfunctions=use_onnx_subfunctions, ) + self.lang_model.export( inputs["lang"], output_names["lang"], diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 718c50e34..784ebdd84 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -76,9 +76,8 @@ def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, mrope_section, unsqu cos = cos[position_ids] sin = sin[position_ids] - - cos = torch.cat([cos[0, ..., 0:32], cos[0, ..., 32:80], cos[0, ..., 80:128]], dim=-1).unsqueeze(0) - sin = torch.cat([sin[0, ..., 0:32], sin[0, ..., 32:80], sin[0, ..., 80:128]], dim=-1).unsqueeze(0) + cos = torch.cat([cos[0, ..., 0:32], cos[1, ..., 32:80], cos[2, ..., 80:128]], dim=-1).unsqueeze(0) + sin = torch.cat([sin[0, ..., 0:32], sin[1, ..., 32:80], sin[2, ..., 80:128]], dim=-1).unsqueeze(0) q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) From d2a81ad9957568e380f1af538221ca18746e92cc Mon Sep 17 00:00:00 2001 From: abhishek-singh591 
Date: Mon, 12 Jan 2026 08:44:52 +0000 Subject: [PATCH 03/12] Added test file for subfunction with VLM Signed-off-by: abhishek-singh591 --- .../test_subfunction_vlm.py | 343 ++++++++++++++++++ 1 file changed, 343 insertions(+) create mode 100644 tests/transformers/models/image_text_to_text/test_subfunction_vlm.py diff --git a/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py new file mode 100644 index 000000000..88f89c618 --- /dev/null +++ b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py @@ -0,0 +1,343 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +from typing import Optional + +import pytest +import requests +import torch +from PIL import Image +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoModelForImageTextToText, + AutoProcessor, + TextStreamer, +) + +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForImageTextToText +from QEfficient.utils import hf_download +from QEfficient.utils._utils import get_num_layers_vlm +from QEfficient.utils.device_utils import get_available_device_id + +NEW_GENERATION_TOKENS = 10 +test_models_config = [ + # CONFIG PARAMS NEEDED FOR A MODEL TO BE TESTED + # ( + # model_name, + # kv_offload, + # batch_size, + # prompt_len, + # ctx_len, + # img_size, + # img_url, + # text_prompt, + # number of layers of the model, + # ), + ( + "llava-hf/llava-1.5-7b-hf", + True, + 1, + 784, + 1024, + 336, + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", + "What does the label 15 represent?
(1) lava (2) core (3) tunnel (4) ash cloud", + 1, + ), + ( + "llava-hf/llava-1.5-7b-hf", + False, + 1, + 784, + 1024, + 336, + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", + "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", + 1, + ), + # Disabled in CI due to performance issues + # ( + # "meta-llama/Llama-4-Scout-17B-16E-Instruct", + # True, + # 1, + # 128, + # 3072, + # 336, + # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", + # "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", + # 4, + # ), + # ( + # "meta-llama/Llama-4-Scout-17B-16E-Instruct", + # False, + # 1, + # 128, + # 3072, + # 336, + # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", + # "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", + # 4, + # ), + ( + "google/gemma-3-4b-it", + True, + 1, + 128, + 3072, + 896, + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "Can you describe the image in detail.", + 1, + ), + ( + "google/gemma-3-4b-it", + False, + 1, + 128, + 3072, + 896, + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "Can you describe the image in detail.", + 1, + ), + ( + "mistralai/Mistral-Small-3.1-24B-Instruct-2503", + True, + 1, + 128, + 4096, + 1540, + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "Can you describe the image in detail.", + 1, + ), + ( + "mistralai/Mistral-Small-3.1-24B-Instruct-2503", + False, + 1, + 128, + 4096, + 1540, + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "Can you describe the image in detail.", + 1, + ), + ( +
"Qwen/Qwen2.5-VL-3B-Instruct", + True, + 1, + 128, + 4096, + 1540, + "https://picsum.photos/id/237/536/354", + "Can you describe the image in detail.", + 1, + ), + # ( + # "meta-llama/Llama-3.2-11B-Vision-Instruct", + # True, + # 1, + # 32, + # 512, + # 560, + # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + # "Explain this image", + # 7, + # ), +] + +intern_model_config = [ + ( + "OpenGVLab/InternVL2_5-1B", + True, + 1, + 384, + 512, + "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", + "Please describe the image in detail.", + 2, + ), + ( + "OpenGVLab/InternVL3_5-1B", + True, + 1, + 384, + 512, + "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", + "Please describe the image in detail.", + 2, + ), + # ( + # "OpenGVLab/InternVL2_5-1B", + # False, + # 1, + # 384, + # 512, + # "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", + # "Please describe the image in detail.", + # 2, + # ), # commented out because the QNN converter is not supported for this model yet.
+] + +molmo_model_config = [ + # Disabled in CI due to HF issues + # ( + # "allenai/Molmo-7B-D-0924", + # True, + # 1, + # 128, + # 4096, + # "https://picsum.photos/id/237/536/354", + # "Can you describe the image in detail.", + # 2, + # ), +] + + +def load_image_text_to_text_model(model_config): + model_path = hf_download( + repo_id=model_config._name_or_path, + ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], + ) + try: + model_hf = AutoModelForImageTextToText.from_pretrained( + model_path, + low_cpu_mem_usage=False, + config=model_config, + ) + except ValueError: + model_hf = AutoModelForCausalLM.from_pretrained( + model_path, + low_cpu_mem_usage=False, + trust_remote_code=True, + config=model_config, + ) + params = sum(p.numel() for p in model_hf.parameters()) + model_hf.eval() + return model_hf, params + + +def set_num_layers(config, n_layer=1): + ## -1 indicates use all the layers of the model. + if n_layer == -1: + return config + elif hasattr(config, "model_type") and "mllama" in config.model_type: + config.text_config.num_hidden_layers = n_layer + config.text_config.cross_attention_layers = [ + x for x in config.text_config.cross_attention_layers if x < n_layer + ] + elif hasattr(config, "text_config"): + config.text_config.num_hidden_layers = n_layer + config.vision_config.num_hidden_layers = n_layer + elif hasattr(config, "llm_config"): + config.llm_config.num_hidden_layers = n_layer + config.vision_config.num_hidden_layers = n_layer + else: + config.num_hidden_layers = n_layer + return config + + +def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( + model_name: str, + img_size: int, + img_url: str, + query: str, + prompt_len: int, + ctx_len: int, + max_gen_len: int = 20, + batch_size: int = 1, + n_layer: int = 1, + kv_offload: bool = False, + num_devices: int = 1, + enable_qnn: Optional[bool] = False, + qnn_config: Optional[str] = None, +): + model_config = {"model_name": model_name} + 
model_config["img_size"] = img_size + config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True, padding=True) + config = set_num_layers(config, n_layer=n_layer) + model_hf, _ = load_image_text_to_text_model(config) + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) + + n_layer = get_num_layers_vlm(config) + image = Image.open(requests.get(img_url, stream=True).raw) + if model_name == "mistralai/Mistral-Small-3.1-24B-Instruct-2503": + image = image.resize((1540, 1540)) + + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": query}, + {"type": "image"}, + ], + }, + ] + prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + + inputs = processor(images=image, text=prompt, return_tensors="pt") + if "pixel_values" in inputs: + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + streamer = TextStreamer(processor.tokenizer) + qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_config["model_name"], + kv_offload=kv_offload, + config=config, + ) + + # pytorch_kv_tokens = api_runner.run_vlm_kv_model_on_pytorch(qeff_model.model) + # assert (pytorch_kv_tokens == pytorch_hf_tokens).all(), ( + # "Tokens don't match for pytorch HF output and pytorch KV output" + # ) + + with_sub_func_onnx = qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False) + without_sub_func_onnx = qeff_model.export(use_onnx_subfunctions=False) + + if not get_available_device_id(): + pytest.skip("No available devices to run model on Cloud AI 100") + + inputs = processor(images=image, text=prompt, return_tensors="pt") + if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl": + inputs = qeff_model.model.prepare_inputs_for_generation( + inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size + ) + if "pixel_values" in inputs: + inputs["pixel_values"] = 
inputs["pixel_values"].to(torch.float32) + + qeff_model.compile( + img_size=model_config["img_size"], + num_devices=num_devices, + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + mxfp6=False, + enable_qnn=enable_qnn, + qnn_config=qnn_config, + onnx_path=with_sub_func_onnx, + ) + + print("Output With Subfunction Enabled:") + output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) + tokens_sub = output.generated_ids[:, :-1] + + qeff_model.compile( + img_size=model_config["img_size"], + num_devices=num_devices, + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + mxfp6=False, + enable_qnn=enable_qnn, + qnn_config=qnn_config, + onnx_path=without_sub_func_onnx, + ) + + print("Output With Subfunction Not Enabled:") + output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) + tokens_no_sub = output.generated_ids[:, :-1] + + assert (tokens_sub == tokens_no_sub).all(), "Tokens don't match for pytorch HF output and QPC output" + return From 441e2ba1ec7d8867d1bb2ca4b19f1b81795cc186 Mon Sep 17 00:00:00 2001 From: abhishek-singh591 Date: Tue, 13 Jan 2026 06:46:05 +0000 Subject: [PATCH 04/12] Made minor fixes Signed-off-by: abhishek-singh591 --- QEfficient/transformers/models/falcon/modeling_falcon.py | 2 +- QEfficient/transformers/models/gemma/modeling_gemma.py | 2 +- QEfficient/transformers/models/gemma2/modeling_gemma2.py | 2 +- QEfficient/transformers/models/gemma3/modeling_gemma3.py | 4 ++-- QEfficient/transformers/models/gpt2/modeling_gpt2.py | 2 +- .../transformers/models/gpt_bigcode/modeling_gpt_bigcode.py | 2 +- QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py | 2 +- QEfficient/transformers/models/gptj/modeling_gptj.py | 2 +- QEfficient/transformers/models/granite/modeling_granite.py | 2 +- .../transformers/models/granitemoe/modeling_granitemoe.py | 2 +- QEfficient/transformers/models/grok_1/modeling_grok1.py | 2 +- 
QEfficient/transformers/models/internvl/modeling_internvl.py | 4 ++-- QEfficient/transformers/models/llama/modeling_llama.py | 2 +- QEfficient/transformers/models/llama4/modeling_llama4.py | 4 ++-- .../models/llama_swiftkv/modeling_llama_swiftkv.py | 2 +- QEfficient/transformers/models/llava/modeling_llava.py | 4 ++-- .../transformers/models/llava_next/modeling_llava_next.py | 2 +- QEfficient/transformers/models/mistral/modeling_mistral.py | 2 +- QEfficient/transformers/models/mistral3/modeling_mistral3.py | 4 ++-- .../transformers/models/mixtral_moe/modeling_mixtral.py | 2 +- QEfficient/transformers/models/molmo/modeling_molmo.py | 4 ++-- QEfficient/transformers/models/mpt/modeling_mpt.py | 2 +- QEfficient/transformers/models/olmo2/modeling_olmo2.py | 2 +- QEfficient/transformers/models/phi/modeling_phi.py | 2 +- QEfficient/transformers/models/phi3/modeling_phi3.py | 2 +- QEfficient/transformers/models/qwen2/modeling_qwen2.py | 2 +- .../transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py | 4 ++-- QEfficient/transformers/models/qwen3/modeling_qwen3.py | 2 +- .../transformers/models/qwen3_moe/modeling_qwen3_moe.py | 2 +- .../transformers/models/starcoder2/modeling_starcoder2.py | 2 +- QEfficient/transformers/models/whisper/modeling_whisper.py | 2 +- QEfficient/utils/export_utils.py | 2 +- tests/transformers/test_causal_lm.py | 3 +-- 33 files changed, 40 insertions(+), 41 deletions(-) diff --git a/QEfficient/transformers/models/falcon/modeling_falcon.py b/QEfficient/transformers/models/falcon/modeling_falcon.py index e29a06241..4ebb2fb96 100644 --- a/QEfficient/transformers/models/falcon/modeling_falcon.py +++ b/QEfficient/transformers/models/falcon/modeling_falcon.py @@ -354,7 +354,7 @@ class QEffFalconForCausalLM(FalconForCausalLM): - update the hidden_states, and fix for onnx model """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across 
the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/gemma/modeling_gemma.py b/QEfficient/transformers/models/gemma/modeling_gemma.py index 59a9d6809..260d1857a 100644 --- a/QEfficient/transformers/models/gemma/modeling_gemma.py +++ b/QEfficient/transformers/models/gemma/modeling_gemma.py @@ -336,7 +336,7 @@ class QEffGemmaForCausalLM(GemmaForCausalLM): - add new args cache idx for the kv retention """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/gemma2/modeling_gemma2.py b/QEfficient/transformers/models/gemma2/modeling_gemma2.py index 00df57240..6dee8c85d 100644 --- a/QEfficient/transformers/models/gemma2/modeling_gemma2.py +++ b/QEfficient/transformers/models/gemma2/modeling_gemma2.py @@ -388,7 +388,7 @@ class QEffGemma2ForCausalLM(Gemma2ForCausalLM, GenerationMixin): - add new args cache idx for the kv retention """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index 29f7b13d0..930cf5141 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -589,7 +589,7 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.vision_tower - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. 
Notes: @@ -611,7 +611,7 @@ def __init__(self, model): self.config = self.model.config self.lm_head = self.model.lm_head - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/gpt2/modeling_gpt2.py b/QEfficient/transformers/models/gpt2/modeling_gpt2.py index ab452baea..7de674cce 100644 --- a/QEfficient/transformers/models/gpt2/modeling_gpt2.py +++ b/QEfficient/transformers/models/gpt2/modeling_gpt2.py @@ -397,7 +397,7 @@ class QEffGPT2LMHeadModel(GPT2LMHeadModel): - add new args position idx for the cache_kwargs for kv retention """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 604def959..d1220589f 100644 --- a/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -378,7 +378,7 @@ def forward( class QEffGPTBigCodeForCausalLM(GPTBigCodeForCausalLM): - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. 
Notes: diff --git a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py index b82cd7c81..57bcb842d 100644 --- a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py +++ b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py @@ -1205,7 +1205,7 @@ def forward( class QEffGptOssForCausalLM(GptOssForCausalLM): - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. diff --git a/QEfficient/transformers/models/gptj/modeling_gptj.py b/QEfficient/transformers/models/gptj/modeling_gptj.py index 2a7c475ed..a4c81dbec 100644 --- a/QEfficient/transformers/models/gptj/modeling_gptj.py +++ b/QEfficient/transformers/models/gptj/modeling_gptj.py @@ -318,7 +318,7 @@ class QEffGPTJForCausalLM(GPTJForCausalLM): - update the hidden_states, and fix for onnx model """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/granite/modeling_granite.py b/QEfficient/transformers/models/granite/modeling_granite.py index c791b02f4..8a32c52ef 100644 --- a/QEfficient/transformers/models/granite/modeling_granite.py +++ b/QEfficient/transformers/models/granite/modeling_granite.py @@ -347,7 +347,7 @@ class QEffGraniteForCausalLM(GraniteForCausalLM): Copied from GraniteForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/granite/modeling_granite.py """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. 
Notes: diff --git a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py index fbeaae68c..07cba09d5 100644 --- a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py +++ b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py @@ -493,7 +493,7 @@ class QEffGraniteMoeForCausalLM(GraniteMoeForCausalLM): Copied from GraniteForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/granite/modeling_granite.py """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/grok_1/modeling_grok1.py b/QEfficient/transformers/models/grok_1/modeling_grok1.py index 9c1e7c4b6..1a1c919bb 100644 --- a/QEfficient/transformers/models/grok_1/modeling_grok1.py +++ b/QEfficient/transformers/models/grok_1/modeling_grok1.py @@ -397,7 +397,7 @@ class QEffGrok1ModelForCausalLM(nn.Module): Grok model for causal language modeling. """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index 026b1f9ae..e389e6a84 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -21,7 +21,7 @@ def __init__(self, model): super().__init__() self.model = model - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. 
Notes: @@ -45,7 +45,7 @@ def __init__(self, model): self.config = self.model.language_model.config self.language_model = self.model.language_model - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/llama/modeling_llama.py b/QEfficient/transformers/models/llama/modeling_llama.py index 065db2193..57bccdb1b 100644 --- a/QEfficient/transformers/models/llama/modeling_llama.py +++ b/QEfficient/transformers/models/llama/modeling_llama.py @@ -404,7 +404,7 @@ class QEffLlamaForCausalLM(LlamaForCausalLM): Copied from LlamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/llama4/modeling_llama4.py b/QEfficient/transformers/models/llama4/modeling_llama4.py index 16a576d02..3abaef5a7 100644 --- a/QEfficient/transformers/models/llama4/modeling_llama4.py +++ b/QEfficient/transformers/models/llama4/modeling_llama4.py @@ -822,7 +822,7 @@ def __init__(self, model): super().__init__() self.model = model - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. 
Notes: @@ -858,7 +858,7 @@ def __init__(self, model): self.language_model = self.model.language_model self.config = self.model.config - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py index be1cc8cdc..e219d5e03 100644 --- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py +++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py @@ -416,7 +416,7 @@ def __init__(self, config: QEffLlamaSwiftKVConfig): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.config = config - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index 64fc41c09..48b002a31 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -30,7 +30,7 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.vision_tower - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. 
Notes: @@ -63,7 +63,7 @@ def __init__(self, model): self.language_model = self.model.language_model self.lm_head = self.model.lm_head - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/llava_next/modeling_llava_next.py b/QEfficient/transformers/models/llava_next/modeling_llava_next.py index a51272980..8b338420e 100755 --- a/QEfficient/transformers/models/llava_next/modeling_llava_next.py +++ b/QEfficient/transformers/models/llava_next/modeling_llava_next.py @@ -30,7 +30,7 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.vision_tower - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/mistral/modeling_mistral.py b/QEfficient/transformers/models/mistral/modeling_mistral.py index de9b1a7e6..47107384e 100644 --- a/QEfficient/transformers/models/mistral/modeling_mistral.py +++ b/QEfficient/transformers/models/mistral/modeling_mistral.py @@ -356,7 +356,7 @@ class QEffMistralForCausalLM(MistralForCausalLM): - add new args cache idx for the kv retention """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. 
Notes: diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py index 3bf151b97..d1391a71a 100644 --- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py +++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py @@ -151,7 +151,7 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.vision_tower - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: @@ -177,7 +177,7 @@ def __init__(self, model): self.config = self.model.config self.language_model = self.model.language_model - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py index f811bea65..ec7a9a8c8 100644 --- a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py +++ b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py @@ -414,7 +414,7 @@ class QEffMixtralForCausalLM(MixtralForCausalLM): - update the hidden_states, and fix for onnx model """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. 
Notes: diff --git a/QEfficient/transformers/models/molmo/modeling_molmo.py b/QEfficient/transformers/models/molmo/modeling_molmo.py index 093e468ff..57f2729b9 100644 --- a/QEfficient/transformers/models/molmo/modeling_molmo.py +++ b/QEfficient/transformers/models/molmo/modeling_molmo.py @@ -568,7 +568,7 @@ def __init__(self, model): super().__init__() self.model = model - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: @@ -597,7 +597,7 @@ def __init__(self, model): # self.language_model = self.model.language_model self.config = self.model.config - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/mpt/modeling_mpt.py b/QEfficient/transformers/models/mpt/modeling_mpt.py index 929e157cc..5a808c7f2 100644 --- a/QEfficient/transformers/models/mpt/modeling_mpt.py +++ b/QEfficient/transformers/models/mpt/modeling_mpt.py @@ -254,7 +254,7 @@ class QEffMptForCausalLM(MptForCausalLM): - add new args cache idx for the kv retention """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. 
Notes: diff --git a/QEfficient/transformers/models/olmo2/modeling_olmo2.py b/QEfficient/transformers/models/olmo2/modeling_olmo2.py index 02645e185..c79ad7fae 100644 --- a/QEfficient/transformers/models/olmo2/modeling_olmo2.py +++ b/QEfficient/transformers/models/olmo2/modeling_olmo2.py @@ -324,7 +324,7 @@ class QEffOlmo2ForCausalLM(Olmo2ForCausalLM): - add new args cache idx for the kv retention """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/phi/modeling_phi.py b/QEfficient/transformers/models/phi/modeling_phi.py index 2efbb313e..82f18b7e0 100644 --- a/QEfficient/transformers/models/phi/modeling_phi.py +++ b/QEfficient/transformers/models/phi/modeling_phi.py @@ -323,7 +323,7 @@ class QEffPhiForCausalLM(PhiForCausalLM): - update the hidden_states, and fix for onnx model """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/phi3/modeling_phi3.py b/QEfficient/transformers/models/phi3/modeling_phi3.py index e25deed37..b48ab2897 100644 --- a/QEfficient/transformers/models/phi3/modeling_phi3.py +++ b/QEfficient/transformers/models/phi3/modeling_phi3.py @@ -351,7 +351,7 @@ class QEffPhi3ForCausalLM(Phi3ForCausalLM): - update the hidden_states, and fix for onnx model """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. 
Notes: diff --git a/QEfficient/transformers/models/qwen2/modeling_qwen2.py b/QEfficient/transformers/models/qwen2/modeling_qwen2.py index 7404f2f6c..841df6526 100644 --- a/QEfficient/transformers/models/qwen2/modeling_qwen2.py +++ b/QEfficient/transformers/models/qwen2/modeling_qwen2.py @@ -350,7 +350,7 @@ class QEffQwen2ForCausalLM(Qwen2ForCausalLM): - update the hidden_states, and fix for onnx model """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 784ebdd84..e8b95dec6 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -870,7 +870,7 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.visual - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: @@ -894,7 +894,7 @@ def __init__(self, model): self.model = model self.language_model = self.model.model.language_model - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. 
Notes: diff --git a/QEfficient/transformers/models/qwen3/modeling_qwen3.py b/QEfficient/transformers/models/qwen3/modeling_qwen3.py index b310499be..ccc4bbac2 100644 --- a/QEfficient/transformers/models/qwen3/modeling_qwen3.py +++ b/QEfficient/transformers/models/qwen3/modeling_qwen3.py @@ -351,7 +351,7 @@ class QEffQwen3ForCausalLM(Qwen3ForCausalLM): - update the hidden_states, and fix for onnx model """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py index 18e1e7611..5270a5c54 100644 --- a/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py +++ b/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py @@ -371,7 +371,7 @@ def forward( class QEffQwen3MoeForCausalLM(Qwen3MoeForCausalLM): - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py index 3387f0fba..fdbbbf05d 100644 --- a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py +++ b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py @@ -275,7 +275,7 @@ class QEffStarcoder2ForCausalLM(Starcoder2ForCausalLM): - update the hidden_states, and fix for onnx model """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. 
Notes: diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py index 650258328..246f005a7 100644 --- a/QEfficient/transformers/models/whisper/modeling_whisper.py +++ b/QEfficient/transformers/models/whisper/modeling_whisper.py @@ -718,7 +718,7 @@ class QEffWhisperForConditionalGeneration(WhisperForConditionalGeneration): - changed forward inputs decoder_input_ids and decoder_position_ids to input_ids and position_ids """ - def get_repeated_layer_class(self) -> Type[nn.Module]: + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. Notes: diff --git a/QEfficient/utils/export_utils.py b/QEfficient/utils/export_utils.py index 9380ae440..bba282b99 100644 --- a/QEfficient/utils/export_utils.py +++ b/QEfficient/utils/export_utils.py @@ -182,7 +182,7 @@ def _setup_onnx_subfunctions(qeff_model, args, kwargs): qeff_model._onnx_transforms.append(CustomOpTransform) # TODO: Handle this in the modelling class QEFFTransformersBase,remove from here. 
Refer diffusers implementation - decoder_layer_classes = qeff_model.model.get_repeated_layer_class() + decoder_layer_classes = qeff_model.model.get_submodules_for_export() if decoder_layer_classes: kwargs["export_modules_as_functions"] = decoder_layer_classes return args, kwargs diff --git a/tests/transformers/test_causal_lm.py b/tests/transformers/test_causal_lm.py index 6480fcdc9..fc89fdf8b 100644 --- a/tests/transformers/test_causal_lm.py +++ b/tests/transformers/test_causal_lm.py @@ -14,7 +14,6 @@ from transformers import AutoConfig, AutoModel, AutoModelForCausalLM from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM -from QEfficient.transformers.models.pytorch_transforms import get_decoder_layer_classes_for_export from QEfficient.utils import constants, get_padding_shape_from_config from QEfficient.utils.hash_utils import hash_dict_params @@ -225,7 +224,7 @@ def test_causal_lm_hash_creation(config, cb, subfunc, prefill_only, tmp_path): export_params["dynamic_axes"] = dynamic_axes hash_params["export_params"] = export_params if subfunc: - hash_params["export_modules_as_functions"] = get_decoder_layer_classes_for_export(qeff_model.model) + hash_params["export_modules_as_functions"] = qeff_model.model.get_submodules_for_export() manual_hash = hash_dict_params(hash_params) From fc71b963a98d73b3901b021164df28f93b4e5616 Mon Sep 17 00:00:00 2001 From: Abhishek Kumar Singh Date: Wed, 14 Jan 2026 12:07:57 +0530 Subject: [PATCH 05/12] Update modeling_codegen.py Signed-off-by: Abhishek Kumar Singh --- .../transformers/models/codegen/modeling_codegen.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/QEfficient/transformers/models/codegen/modeling_codegen.py b/QEfficient/transformers/models/codegen/modeling_codegen.py index 3addd7501..d85791912 100644 --- a/QEfficient/transformers/models/codegen/modeling_codegen.py +++ b/QEfficient/transformers/models/codegen/modeling_codegen.py @@ -295,7 +295,15 @@ class 
QEffCodeGenForCausalLM(CodeGenForCausalLM): - add new args position idx for the cache_kwargs for kv retention - update the hidden_states, and fix for onnx model """ - + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffCodeGenBlock} + def forward( self, input_ids: Optional[torch.LongTensor] = None, From 1b2800240221be10c05403bb44ae6320e779ba4b Mon Sep 17 00:00:00 2001 From: Abhishek kumar singh Date: Wed, 14 Jan 2026 08:04:11 +0000 Subject: [PATCH 06/12] Resolved lint error Signed-off-by: Abhishek kumar singh --- QEfficient/transformers/models/codegen/modeling_codegen.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/QEfficient/transformers/models/codegen/modeling_codegen.py b/QEfficient/transformers/models/codegen/modeling_codegen.py index d85791912..21968a7c0 100644 --- a/QEfficient/transformers/models/codegen/modeling_codegen.py +++ b/QEfficient/transformers/models/codegen/modeling_codegen.py @@ -7,7 +7,7 @@ """PyTorch Codegen model.""" -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Type, Union import torch from torch import nn @@ -295,6 +295,7 @@ class QEffCodeGenForCausalLM(CodeGenForCausalLM): - add new args position idx for the cache_kwargs for kv retention - update the hidden_states, and fix for onnx model """ + def get_submodules_for_export(self) -> Type[nn.Module]: """ Return the set of class used as the repeated layer across the model for subfunction extraction. @@ -303,7 +304,7 @@ def get_submodules_for_export(self) -> Type[nn.Module]: Downstream code can use this to find/build subfunctions for repeated blocks. 
""" return {QEffCodeGenBlock} - + def forward( self, input_ids: Optional[torch.LongTensor] = None, From 2b1f09cf307a6ccc880503f1d7f669a9888e0dda Mon Sep 17 00:00:00 2001 From: abhishek-singh591 Date: Fri, 16 Jan 2026 12:17:30 +0000 Subject: [PATCH 07/12] Made Minor Fixes Signed-off-by: abhishek-singh591 --- .../models/llava_next/modeling_llava_next.py | 9 ++++++ .../test_subfunction_vlm.py | 30 ++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/QEfficient/transformers/models/llava_next/modeling_llava_next.py b/QEfficient/transformers/models/llava_next/modeling_llava_next.py index 8b338420e..59d5cad22 100755 --- a/QEfficient/transformers/models/llava_next/modeling_llava_next.py +++ b/QEfficient/transformers/models/llava_next/modeling_llava_next.py @@ -137,6 +137,15 @@ def __init__(self, model): self.language_model = self.model.language_model self.lm_head = self.model.lm_head + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {self.model.language_model.layers[0].__class__} + def forward( self, input_ids, diff --git a/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py index 88f89c618..008280f72 100644 --- a/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py +++ b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py @@ -242,7 +242,7 @@ def set_num_layers(config, n_layer=1): return config -def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( +def test_image_text_to_text_subfunction_core( model_name: str, img_size: int, img_url: str, @@ -317,6 +317,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( mxfp6=False, enable_qnn=enable_qnn, qnn_config=qnn_config, + offload_pt_weights=True, onnx_path=with_sub_func_onnx, ) @@ -341,3 +342,30 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( assert (tokens_sub == tokens_no_sub).all(), "Tokens don't match for pytorch HF output and QPC output" return + + +@pytest.mark.on_qaic +@pytest.mark.multimodal +@pytest.mark.parametrize( + "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config +) +def test_image_text_to_text_subfunction( + model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer +): + """ + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching. 
+ ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + test_image_text_to_text_subfunction_core( + model_name=model_name, + prompt_len=prompt_len, + ctx_len=ctx_len, + max_gen_len=NEW_GENERATION_TOKENS, + img_size=img_size, + img_url=img_url, + query=query, + n_layer=n_layer, + batch_size=batch_size, + kv_offload=kv_offload, + ) From f06028afb6fdc7394f442fc8e999f601f516a260 Mon Sep 17 00:00:00 2001 From: abhishek-singh591 Date: Sun, 18 Jan 2026 14:57:07 +0000 Subject: [PATCH 08/12] Fixed test file for subfunction Signed-off-by: abhishek-singh591 --- .../transformers/models/modeling_auto.py | 3 +- .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 4 +- .../test_subfunction_vlm.py | 343 ++++++++++++++---- 3 files changed, 268 insertions(+), 82 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 55253f9b0..bad767b65 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1031,12 +1031,13 @@ def export( use_onnx_subfunctions=use_onnx_subfunctions, ) + offload_pt_weights = kwargs.get("offload_pt_weights", True) self.lang_model.export( inputs["lang"], output_names["lang"], dynamic_axes["lang"], export_dir=export_dir, - offload_pt_weights=True, + offload_pt_weights=offload_pt_weights, use_onnx_subfunctions=use_onnx_subfunctions, ) diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index e8b95dec6..d6bfbda81 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -76,8 +76,8 @@ def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, mrope_section, unsqu cos = cos[position_ids] sin = sin[position_ids] - cos = torch.cat([cos[0, ..., 0:32], cos[1, ..., 32:80], cos[2, ..., 80:128]], dim=-1).unsqueeze(0) - sin 
= torch.cat([sin[0, ..., 0:32], sin[1, ..., 32:80], sin[2, ..., 80:128]], dim=-1).unsqueeze(0) + cos = torch.cat([cos[0, ..., 0:32], cos[1, ..., 32:80], cos[2, ..., 80:128]], dim=-1).unsqueeze(unsqueeze_dim) + sin = torch.cat([sin[0, ..., 0:32], sin[1, ..., 32:80], sin[2, ..., 80:128]], dim=-1).unsqueeze(unsqueeze_dim) q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) diff --git a/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py index 008280f72..e683ea859 100644 --- a/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py +++ b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py @@ -5,7 +5,8 @@ # # ---------------------------------------------------------------------------- -from typing import Optional +from io import BytesIO +from typing import List, Optional import pytest import requests @@ -16,13 +17,15 @@ AutoModelForCausalLM, AutoModelForImageTextToText, AutoProcessor, + AutoTokenizer, TextStreamer, ) -from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForImageTextToText +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM, QEFFAutoModelForImageTextToText from QEfficient.utils import hf_download from QEfficient.utils._utils import get_num_layers_vlm from QEfficient.utils.device_utils import get_available_device_id +from QEfficient.utils.test_utils import InternProcessor NEW_GENERATION_TOKENS = 10 test_models_config = [ @@ -49,29 +52,29 @@ "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", 1, ), - ( - "llava-hf/llava-1.5-7b-hf", - False, - 1, - 784, - 1024, - 336, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - "What does the label 15 represent? 
(1) lava (2) core (3) tunnel (4) ash cloud", - 1, - ), - # Disabled in CI due to performance issues # ( - # "meta-llama/Llama-4-Scout-17B-16E-Instruct", + # "llava-hf/llava-1.5-7b-hf", # True, # 1, - # 128, - # 3072, + # 784, + # 1024, # 336, # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", # "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - # 4, + # 1, # ), + # Disabled in CI due to performance issues + ( + "meta-llama/Llama-4-Scout-17B-16E-Instruct", + True, + 1, + 128, + 3072, + 336, + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", + "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", + 4, + ), # ( # "meta-llama/Llama-4-Scout-17B-16E-Instruct", # False, @@ -94,17 +97,17 @@ "Can you describe the image in detail.", 1, ), - ( - "google/gemma-3-4b-it", - False, - 1, - 128, - 3072, - 896, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "Can you describe the image in detail.", - 1, - ), + # ( + # "google/gemma-3-4b-it", + # True, + # 1, + # 128, + # 3072, + # 896, + # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + # "Can you describe the image in detail.", + # 1, + # ), ( "mistralai/Mistral-Small-3.1-24B-Instruct-2503", True, @@ -116,39 +119,39 @@ "Can you describe the image in detail.", 1, ), + # ( + # "mistralai/Mistral-Small-3.1-24B-Instruct-2503", + # True, + # 1, + # 128, + # 4096, + # 1540, + # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + # "Can you describe the image in detail.", + # 1, + # ), ( - "mistralai/Mistral-Small-3.1-24B-Instruct-2503", - False, + "Qwen/Qwen2.5-VL-3B-Instruct", + True, 1, 128, 4096, 1540, - 
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://picsum.photos/id/237/536/354", "Can you describe the image in detail.", 1, ), ( - "Qwen/Qwen2.5-VL-3B-Instruct", + "meta-llama/Llama-3.2-11B-Vision-Instruct", True, 1, - 128, - 4096, - 1540, - "https://picsum.photos/id/237/536/354", - "Can you describe the image in detail.", - 1, + 32, + 512, + 560, + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + "Explain this image", + 7, ), - # ( - # "meta-llama/Llama-3.2-11B-Vision-Instruct", - # True, - # 1, - # 32, - # 512, - # 560, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", - # "Explain this image", - # 7, - # ), ] intern_model_config = [ @@ -186,16 +189,16 @@ molmo_model_config = [ # Disabled in CI due to HF issues - # ( - # "allenai/Molmo-7B-D-0924", - # True, - # 1, - # 128, - # 4096, - # "https://picsum.photos/id/237/536/354", - # "Can you describe the image in detail.", - # 2, - # ), + ( + "allenai/Molmo-7B-D-0924", + True, + 1, + 128, + 4096, + "https://picsum.photos/id/237/536/354", + "Can you describe the image in detail.", + 2, + ), ] @@ -242,7 +245,7 @@ def set_num_layers(config, n_layer=1): return config -def test_image_text_to_text_subfunction_core( +def check_image_text_to_text_subfunction_core( model_name: str, img_size: int, img_url: str, @@ -290,13 +293,7 @@ def test_image_text_to_text_subfunction_core( config=config, ) - # pytorch_kv_tokens = api_runner.run_vlm_kv_model_on_pytorch(qeff_model.model) - # assert (pytorch_kv_tokens == pytorch_hf_tokens).all(), ( - # "Tokens don't match for pytorch HF output and pytorch KV output" - # ) - - with_sub_func_onnx = qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False) - without_sub_func_onnx = qeff_model.export(use_onnx_subfunctions=False) + 
qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False) if not get_available_device_id(): pytest.skip("No available devices to run model on Cloud AI 100") @@ -317,30 +314,176 @@ def test_image_text_to_text_subfunction_core( mxfp6=False, enable_qnn=enable_qnn, qnn_config=qnn_config, - offload_pt_weights=True, - onnx_path=with_sub_func_onnx, ) - print("Output With Subfunction Enabled:") output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) - tokens_sub = output.generated_ids[:, :-1] + print("Output With Subfunction Enabled:\n", output) + return + + +def check_image_text_to_text_subfunction_molmo( + model_name: str, + img_url: str, + query: str, + prompt_len: int, + ctx_len: int, + max_gen_len: int = 20, + batch_size: int = 1, + n_layer: int = 1, + kv_offload: bool = False, + num_devices: int = 1, + enable_qnn: Optional[bool] = False, + qnn_config: Optional[str] = None, +): + model_config = {"model_name": model_name} + + config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True) + config._attn_implementation = "eager" + config = set_num_layers(config, n_layer=n_layer) + model_hf, _ = load_image_text_to_text_model(config) + n_layer = (n_layer, n_layer) + + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) + img = requests.get(img_url, stream=True) + image = Image.open(BytesIO(img.content)).convert("RGB") + image = image.resize((536, 354)) + + inputs = processor.process(images=[image], text=query) + inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} + + batch_size, prompt_len = inputs["input_ids"].shape + inputs["attention_mask"] = torch.ones((inputs["input_ids"].shape), dtype=torch.int64) + valid = inputs["image_input_idx"] > 0 + valid = valid.reshape(1, -1) + inputs["valid_idx"] = torch.nonzero(valid)[:, 1].unsqueeze(0) + inputs["pixel_values"] = inputs.pop("images") + + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + 
model_config["model_name"], + kv_offload=kv_offload, + config=config, + ) + + qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False) + + if not get_available_device_id(): + pytest.skip("No available devices to run model on Cloud AI 100") + + if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl": + inputs = qeff_model.model.prepare_inputs_for_generation( + inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size + ) + if "pixel_values" in inputs: + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) qeff_model.compile( - img_size=model_config["img_size"], + num_devices=num_devices, + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + mxfp6=False, + ) + + streamer = TextStreamer(processor.tokenizer) + output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) + print("Output With Subfunction Enabled:\n", output) + return + + +def check_image_text_to_text_subfunction_internvl( + model_name: str, + img_url: str, + query: str, + prompt_len: int, + ctx_len: int, + max_gen_len: int = 20, + batch_size: int = 1, + n_layer: int = 1, + kv_offload: bool = False, + num_devices: int = 1, + enable_qnn: Optional[bool] = False, + qnn_config: Optional[str] = None, +): + model_config = {"model_name": model_name} + + config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True) + config._attn_implementation = "eager" + config = set_num_layers(config, n_layer=n_layer) + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, + low_cpu_mem_usage=False, + trust_remote_code=True, + config=config, + ) + n_layer = get_num_layers_vlm(config) + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False) + processor = InternProcessor(model_hf, tokenizer) + + prompt = [query] + img_url = [img_url] + pixel_values = [] + num_patches_list = [] + questions = [] + for i in range(len(prompt)): + img = 
requests.get(img_url[i], stream=True) + image = Image.open(BytesIO(img.content)).convert("RGB") + + image = image.resize((448, 448)) + + # preprocess the resized image + pixel_value = processor.load_image(image, max_num=12) + num_patches_list.append(pixel_value.shape[0]) + pixel_values.append(pixel_value) + + question = "\n" + prompt[i] + questions.append(question) + + pixel_values = torch.cat(pixel_values, dim=0) + + # Chat Template information for prompt preprocessing + messages: List[List[str]] = [] + roles = ("<|im_start|>user\n", "<|im_start|>assistant\n") + prompt = processor(pixel_values, questions, messages, roles, num_patches_list=num_patches_list) + + inputs = tokenizer(prompt, return_tensors="pt") + batch_size, prompt_len = inputs["input_ids"].shape + inputs["pixel_values"] = pixel_values.clone() + + generation_config = dict(max_new_tokens=max_gen_len, do_sample=False) + generation_config["eos_token_id"] = tokenizer.convert_tokens_to_ids("<|im_end|>\n".strip()) + + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_config["model_name"], + kv_offload=kv_offload, + config=config, + ) + + streamer = TextStreamer(processor.tokenizer) + qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False) + + if not get_available_device_id(): + pytest.skip("No available devices to run model on Cloud AI 100") + + inputs = processor(images=image, text=prompt, return_tensors="pt") + if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl": + inputs = qeff_model.model.prepare_inputs_for_generation( + inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size + ) + if "pixel_values" in inputs: + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + + qeff_model.compile( + num_patches=1, num_devices=num_devices, prefill_seq_len=prompt_len, ctx_len=ctx_len, mxfp6=False, enable_qnn=enable_qnn, qnn_config=qnn_config, - onnx_path=without_sub_func_onnx, ) - print("Output With Subfunction 
Not Enabled:") output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) - tokens_no_sub = output.generated_ids[:, :-1] - - assert (tokens_sub == tokens_no_sub).all(), "Tokens don't match for pytorch HF output and QPC output" + print("Output With Subfunction Enabled:\n", output) return @@ -357,7 +500,7 @@ def test_image_text_to_text_subfunction( ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ - test_image_text_to_text_subfunction_core( + check_image_text_to_text_subfunction_core( model_name=model_name, prompt_len=prompt_len, ctx_len=ctx_len, @@ -369,3 +512,45 @@ def test_image_text_to_text_subfunction( batch_size=batch_size, kv_offload=kv_offload, ) + + +@pytest.mark.on_qaic +@pytest.mark.multimodal +@pytest.mark.parametrize( + "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", molmo_model_config +) +def test_image_text_to_text_subfunction_molmo( + model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer +): + check_image_text_to_text_subfunction_molmo( + model_name=model_name, + prompt_len=prompt_len, + ctx_len=ctx_len, + max_gen_len=NEW_GENERATION_TOKENS, + img_url=img_url, + query=query, + n_layer=n_layer, + batch_size=batch_size, + kv_offload=kv_offload, + ) + + +@pytest.mark.on_qaic +@pytest.mark.multimodal +@pytest.mark.parametrize( + "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", intern_model_config +) +def test_image_text_to_text_subfunction_internvl( + model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer +): + check_image_text_to_text_subfunction_internvl( + model_name=model_name, + prompt_len=prompt_len, + ctx_len=ctx_len, + max_gen_len=NEW_GENERATION_TOKENS, + img_url=img_url, + query=query, + n_layer=n_layer, + batch_size=batch_size, + kv_offload=kv_offload, + ) From 5fd672db940f87eb6e44688604a130db751396f7 Mon Sep 17 00:00:00 2001 From: 
abhishek-singh591 Date: Mon, 19 Jan 2026 08:23:42 +0000 Subject: [PATCH 09/12] Changed test file for subfunction with VLMs Signed-off-by: abhishek-singh591 --- .../test_subfunction_vlm.py | 424 +----------------- 1 file changed, 24 insertions(+), 400 deletions(-) diff --git a/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py index e683ea859..9e98ab7d7 100644 --- a/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py +++ b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py @@ -5,27 +5,23 @@ # # ---------------------------------------------------------------------------- -from io import BytesIO -from typing import List, Optional +from typing import Optional +import onnx import pytest import requests import torch from PIL import Image from transformers import ( AutoConfig, - AutoModelForCausalLM, AutoModelForImageTextToText, AutoProcessor, - AutoTokenizer, - TextStreamer, ) -from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM, QEFFAutoModelForImageTextToText +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForImageTextToText from QEfficient.utils import hf_download from QEfficient.utils._utils import get_num_layers_vlm from QEfficient.utils.device_utils import get_available_device_id -from QEfficient.utils.test_utils import InternProcessor NEW_GENERATION_TOKENS = 10 test_models_config = [ @@ -41,95 +37,6 @@ # text_prompt, # number of layers of the model, # ), - ( - "llava-hf/llava-1.5-7b-hf", - True, - 1, - 784, - 1024, - 336, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - "What does the label 15 represent? 
(1) lava (2) core (3) tunnel (4) ash cloud", - 1, - ), - # ( - # "llava-hf/llava-1.5-7b-hf", - # True, - # 1, - # 784, - # 1024, - # 336, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - # "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - # 1, - # ), - # Disabled in CI due to performance issues - ( - "meta-llama/Llama-4-Scout-17B-16E-Instruct", - True, - 1, - 128, - 3072, - 336, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - 4, - ), - # ( - # "meta-llama/Llama-4-Scout-17B-16E-Instruct", - # False, - # 1, - # 128, - # 3072, - # 336, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - # "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - # 4, - # ), - ( - "google/gemma-3-4b-it", - True, - 1, - 128, - 3072, - 896, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "Can you describe the image in detail.", - 1, - ), - # ( - # "google/gemma-3-4b-it", - # True, - # 1, - # 128, - # 3072, - # 896, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - # "Can you describe the image in detail.", - # 1, - # ), - ( - "mistralai/Mistral-Small-3.1-24B-Instruct-2503", - True, - 1, - 128, - 4096, - 1540, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "Can you describe the image in detail.", - 1, - ), - # ( - # "mistralai/Mistral-Small-3.1-24B-Instruct-2503", - # True, - # 1, - # 128, - # 4096, - # 1540, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - # "Can you describe the 
image in detail.", - # 1, - # ), ( "Qwen/Qwen2.5-VL-3B-Instruct", True, @@ -141,64 +48,6 @@ "Can you describe the image in detail.", 1, ), - ( - "meta-llama/Llama-3.2-11B-Vision-Instruct", - True, - 1, - 32, - 512, - 560, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", - "Explain this image", - 7, - ), -] - -intern_model_config = [ - ( - "OpenGVLab/InternVL2_5-1B", - True, - 1, - 384, - 512, - "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", - "Please describe the image in detail.", - 2, - ), - ( - "OpenGVLab/InternVL3_5-1B", - True, - 1, - 384, - 512, - "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", - "Please describe the image in detail.", - 2, - ), - # ( - # "OpenGVLab/InternVL2_5-1B", - # False, - # 1, - # 384, - # 512, - # "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", - # "Please describe the image in detail.", - # 2, - # ), # commented becuase QNN Convertor is not supported for this model yet. 
-] - -molmo_model_config = [ - # Disabled in CI due to HF issues - ( - "allenai/Molmo-7B-D-0924", - True, - 1, - 128, - 4096, - "https://picsum.photos/id/237/536/354", - "Can you describe the image in detail.", - 2, - ), ] @@ -207,42 +56,23 @@ def load_image_text_to_text_model(model_config): repo_id=model_config._name_or_path, ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], ) - try: - model_hf = AutoModelForImageTextToText.from_pretrained( - model_path, - low_cpu_mem_usage=False, - config=model_config, - ) - except ValueError: - model_hf = AutoModelForCausalLM.from_pretrained( - model_path, - low_cpu_mem_usage=False, - trust_remote_code=True, - config=model_config, - ) + + model_hf = AutoModelForImageTextToText.from_pretrained( + model_path, + low_cpu_mem_usage=False, + config=model_config, + ) params = sum(p.numel() for p in model_hf.parameters()) model_hf.eval() return model_hf, params -def set_num_layers(config, n_layer=1): - ## -1 indicates use all the layers of the model. 
- if n_layer == -1: - return config - elif hasattr(config, "model_type") and "mllama" in config.model_type: - config.text_config.num_hidden_layers = n_layer - config.text_config.cross_attention_layers = [ - x for x in config.text_config.cross_attention_layers if x < n_layer - ] - elif hasattr(config, "text_config"): - config.text_config.num_hidden_layers = n_layer - config.vision_config.num_hidden_layers = n_layer - elif hasattr(config, "llm_config"): - config.llm_config.num_hidden_layers = n_layer - config.vision_config.num_hidden_layers = n_layer - else: - config.num_hidden_layers = n_layer - return config +def has_QwenLayer_function(onnx_path): + """Check if ONNX model contains QEffqwenlayer function definition.""" + model = onnx.load(onnx_path, load_external_data=False) + function_names = [f.name for f in model.functions] + QwenLayer_functions = [name for name in function_names if "QEffQwen2_5_VLDecoderLayer" in name] + return len(QwenLayer_functions) > 0, QwenLayer_functions def check_image_text_to_text_subfunction_core( @@ -263,14 +93,13 @@ def check_image_text_to_text_subfunction_core( model_config = {"model_name": model_name} model_config["img_size"] = img_size config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True, padding=True) - config = set_num_layers(config, n_layer=n_layer) + config.text_config.num_hidden_layers = n_layer + config.vision_config.num_hidden_layers = n_layer model_hf, _ = load_image_text_to_text_model(config) processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) n_layer = get_num_layers_vlm(config) image = Image.open(requests.get(img_url, stream=True).raw) - if model_name == "mistralai/Mistral-Small-3.1-24B-Instruct-2503": - image = image.resize((1540, 1540)) conversation = [ { @@ -286,14 +115,13 @@ def check_image_text_to_text_subfunction_core( inputs = processor(images=image, text=prompt, return_tensors="pt") if "pixel_values" in inputs: inputs["pixel_values"] = 
inputs["pixel_values"].to(torch.float32) - streamer = TextStreamer(processor.tokenizer) qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( model_config["model_name"], kv_offload=kv_offload, config=config, ) - qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False) + with_sub_func_onnx = qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False) if not get_available_device_id(): pytest.skip("No available devices to run model on Cloud AI 100") @@ -306,174 +134,15 @@ def check_image_text_to_text_subfunction_core( if "pixel_values" in inputs: inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) - qeff_model.compile( - img_size=model_config["img_size"], - num_devices=num_devices, - prefill_seq_len=prompt_len, - ctx_len=ctx_len, - mxfp6=False, - enable_qnn=enable_qnn, - qnn_config=qnn_config, - ) - - output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) - print("Output With Subfunction Enabled:\n", output) - return - - -def check_image_text_to_text_subfunction_molmo( - model_name: str, - img_url: str, - query: str, - prompt_len: int, - ctx_len: int, - max_gen_len: int = 20, - batch_size: int = 1, - n_layer: int = 1, - kv_offload: bool = False, - num_devices: int = 1, - enable_qnn: Optional[bool] = False, - qnn_config: Optional[str] = None, -): - model_config = {"model_name": model_name} - - config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True) - config._attn_implementation = "eager" - config = set_num_layers(config, n_layer=n_layer) - model_hf, _ = load_image_text_to_text_model(config) - n_layer = (n_layer, n_layer) - - processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) - img = requests.get(img_url, stream=True) - image = Image.open(BytesIO(img.content)).convert("RGB") - image = image.resize((536, 354)) - - inputs = processor.process(images=[image], text=query) - inputs = {k: v.unsqueeze(0) for k, v 
in inputs.items()} - - batch_size, prompt_len = inputs["input_ids"].shape - inputs["attention_mask"] = torch.ones((inputs["input_ids"].shape), dtype=torch.int64) - valid = inputs["image_input_idx"] > 0 - valid = valid.reshape(1, -1) - inputs["valid_idx"] = torch.nonzero(valid)[:, 1].unsqueeze(0) - inputs["pixel_values"] = inputs.pop("images") - - qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_config["model_name"], - kv_offload=kv_offload, - config=config, - ) - - qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False) - - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") - - if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl": - inputs = qeff_model.model.prepare_inputs_for_generation( - inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size - ) - if "pixel_values" in inputs: - inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) - - qeff_model.compile( - num_devices=num_devices, - prefill_seq_len=prompt_len, - ctx_len=ctx_len, - mxfp6=False, - ) - - streamer = TextStreamer(processor.tokenizer) - output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) - print("Output With Subfunction Enabled:\n", output) - return - - -def check_image_text_to_text_subfunction_internvl( - model_name: str, - img_url: str, - query: str, - prompt_len: int, - ctx_len: int, - max_gen_len: int = 20, - batch_size: int = 1, - n_layer: int = 1, - kv_offload: bool = False, - num_devices: int = 1, - enable_qnn: Optional[bool] = False, - qnn_config: Optional[str] = None, -): - model_config = {"model_name": model_name} - - config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True) - config._attn_implementation = "eager" - config = set_num_layers(config, n_layer=n_layer) - model_hf = AutoModelForCausalLM.from_pretrained( - model_name, - low_cpu_mem_usage=False, - 
trust_remote_code=True, - config=config, - ) - n_layer = get_num_layers_vlm(config) - - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False) - processor = InternProcessor(model_hf, tokenizer) - - prompt = [query] - img_url = [img_url] - pixel_values = [] - num_patches_list = [] - questions = [] - for i in range(len(prompt)): - img = requests.get(img_url[i], stream=True) - image = Image.open(BytesIO(img.content)).convert("RGB") - - image = image.resize((448, 448)) - - # preprocess the resized image - pixel_value = processor.load_image(image, max_num=12) - num_patches_list.append(pixel_value.shape[0]) - pixel_values.append(pixel_value) - - question = "\n" + prompt[i] - questions.append(question) - - pixel_values = torch.cat(pixel_values, dim=0) - - # Chat Template information for prompt preprocessing - messages: List[List[str]] = [] - roles = ("<|im_start|>user\n", "<|im_start|>assistant\n") - prompt = processor(pixel_values, questions, messages, roles, num_patches_list=num_patches_list) - - inputs = tokenizer(prompt, return_tensors="pt") - batch_size, prompt_len = inputs["input_ids"].shape - inputs["pixel_values"] = pixel_values.clone() - - generation_config = dict(max_new_tokens=max_gen_len, do_sample=False) - generation_config["eos_token_id"] = tokenizer.convert_tokens_to_ids("<|im_end|>\n".strip()) - - qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_config["model_name"], - kv_offload=kv_offload, - config=config, + # Verify that the model with subfunctions has QEffQwen2_5_VLDecoderLayer function definition + has_qwenlayer, qwenlayer_names = has_QwenLayer_function(with_sub_func_onnx[-1]) + assert has_qwenlayer, ( + "Model exported with use_onnx_subfunctions=True should contain QEffQwen2_5_VLDecoderLayer function definition" ) - - streamer = TextStreamer(processor.tokenizer) - qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False) - - if not get_available_device_id(): - pytest.skip("No 
available devices to run model on Cloud AI 100") - - inputs = processor(images=image, text=prompt, return_tensors="pt") - if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl": - inputs = qeff_model.model.prepare_inputs_for_generation( - inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size - ) - if "pixel_values" in inputs: - inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + print(f"\nQwenLayer functions found: {qwenlayer_names}") qeff_model.compile( - num_patches=1, + img_size=model_config["img_size"], num_devices=num_devices, prefill_seq_len=prompt_len, ctx_len=ctx_len, @@ -481,9 +150,6 @@ def check_image_text_to_text_subfunction_internvl( enable_qnn=enable_qnn, qnn_config=qnn_config, ) - - output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) - print("Output With Subfunction Enabled:\n", output) return @@ -512,45 +178,3 @@ def test_image_text_to_text_subfunction( batch_size=batch_size, kv_offload=kv_offload, ) - - -@pytest.mark.on_qaic -@pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", molmo_model_config -) -def test_image_text_to_text_subfunction_molmo( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer -): - check_image_text_to_text_subfunction_molmo( - model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, - max_gen_len=NEW_GENERATION_TOKENS, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, - kv_offload=kv_offload, - ) - - -@pytest.mark.on_qaic -@pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", intern_model_config -) -def test_image_text_to_text_subfunction_internvl( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer -): - 
check_image_text_to_text_subfunction_internvl( - model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, - max_gen_len=NEW_GENERATION_TOKENS, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, - kv_offload=kv_offload, - ) From dca8322240050fdddbb59c36e36bf183f36e3aa9 Mon Sep 17 00:00:00 2001 From: abhishek-singh591 Date: Mon, 19 Jan 2026 09:55:09 +0000 Subject: [PATCH 10/12] Made Minor Fixes Signed-off-by: abhishek-singh591 --- .../transformers/models/mistral3/modeling_mistral3.py | 2 +- QEfficient/utils/export_utils.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py index d1391a71a..a8fb34baf 100644 --- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py +++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py @@ -184,7 +184,7 @@ def get_submodules_for_export(self) -> Type[nn.Module]: This method should return the *class object* (not an instance). Downstream code can use this to find/build subfunctions for repeated blocks. """ - return self.model.language_model.layers[0].__class__ + return {self.model.language_model.layers[0].__class__} def forward( self, diff --git a/QEfficient/utils/export_utils.py b/QEfficient/utils/export_utils.py index bba282b99..dbb3aca01 100644 --- a/QEfficient/utils/export_utils.py +++ b/QEfficient/utils/export_utils.py @@ -182,9 +182,9 @@ def _setup_onnx_subfunctions(qeff_model, args, kwargs): qeff_model._onnx_transforms.append(CustomOpTransform) # TODO: Handle this in the modelling class QEFFTransformersBase,remove from here. 
Refer diffusers implementation - decoder_layer_classes = qeff_model.model.get_submodules_for_export() - if decoder_layer_classes: - kwargs["export_modules_as_functions"] = decoder_layer_classes + submodule_classes = qeff_model.model.get_submodules_for_export() + if submodule_classes: + kwargs["export_modules_as_functions"] = submodule_classes return args, kwargs From 1407f61f8b5a383c7911f2435bb1831295e061d3 Mon Sep 17 00:00:00 2001 From: abhishek-singh591 Date: Mon, 19 Jan 2026 10:27:30 +0000 Subject: [PATCH 11/12] Added support of subfunction to mllama Signed-off-by: abhishek-singh591 --- .../models/mllama/modeling_mllama.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 74de1c6c1..3cba022b4 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -7,7 +7,7 @@ """PyTorch Mllama model.""" -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch import torch.nn.functional as F @@ -749,6 +749,15 @@ def __init__(self, model): self.model = model self.cross_attention_layers = self.model.config.get_text_config().cross_attention_layers + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {self.model.vision_model.transformer.layers[0].__class__} + def forward( self, pixel_values: Optional[torch.FloatTensor] = None, @@ -861,6 +870,15 @@ def get_qeff_vision_encoder(self): def get_qeff_language_decoder(self): return self + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffMllamaSelfAttentionDecoderLayer} + def forward( self, input_ids: Optional[torch.LongTensor] = None, From 129be5bffd30b4721e0f7e902fe5d24499f53a52 Mon Sep 17 00:00:00 2001 From: Abhishek Kumar Singh Date: Wed, 21 Jan 2026 21:58:34 +0530 Subject: [PATCH 12/12] Update torch_patches.py Signed-off-by: Abhishek Kumar Singh --- QEfficient/utils/torch_patches.py | 1 - 1 file changed, 1 deletion(-) diff --git a/QEfficient/utils/torch_patches.py b/QEfficient/utils/torch_patches.py index 1752b5979..cec5455d7 100644 --- a/QEfficient/utils/torch_patches.py +++ b/QEfficient/utils/torch_patches.py @@ -39,7 +39,6 @@ def _track_module_attributes_forward_hook(module, input, output): if hasattr(module, attr_name): onnx_attrs = getattr(module, attr_name) delattr(module, attr_name) - # FIX: use empty dict to avoid type mismatch try: _C._jit_pass_onnx_track_scope_attributes(graph, onnx_attrs) except Exception as e: