diff --git a/QEfficient/transformers/models/codegen/modeling_codegen.py b/QEfficient/transformers/models/codegen/modeling_codegen.py index 3addd7501..21968a7c0 100644 --- a/QEfficient/transformers/models/codegen/modeling_codegen.py +++ b/QEfficient/transformers/models/codegen/modeling_codegen.py @@ -7,7 +7,7 @@ """PyTorch Codegen model.""" -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Type, Union import torch from torch import nn @@ -296,6 +296,15 @@ class QEffCodeGenForCausalLM(CodeGenForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffCodeGenBlock} + def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/QEfficient/transformers/models/falcon/modeling_falcon.py b/QEfficient/transformers/models/falcon/modeling_falcon.py index 1cfdf88e1..4ebb2fb96 100644 --- a/QEfficient/transformers/models/falcon/modeling_falcon.py +++ b/QEfficient/transformers/models/falcon/modeling_falcon.py @@ -8,9 +8,10 @@ """PyTorch Falcon model.""" import math -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Type, Union import torch +import torch.nn as nn import torch.utils.checkpoint from torch.nn import functional as F from transformers.cache_utils import Cache @@ -353,6 +354,15 @@ class QEffFalconForCausalLM(FalconForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffFalconDecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/gemma/modeling_gemma.py b/QEfficient/transformers/models/gemma/modeling_gemma.py index 1edb8ef53..260d1857a 100644 --- a/QEfficient/transformers/models/gemma/modeling_gemma.py +++ b/QEfficient/transformers/models/gemma/modeling_gemma.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch from torch import nn @@ -336,6 +336,15 @@ class QEffGemmaForCausalLM(GemmaForCausalLM): - add new args cache idx for the kv retention """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffGemmaDecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/gemma2/modeling_gemma2.py b/QEfficient/transformers/models/gemma2/modeling_gemma2.py index 2944601c9..6dee8c85d 100644 --- a/QEfficient/transformers/models/gemma2/modeling_gemma2.py +++ b/QEfficient/transformers/models/gemma2/modeling_gemma2.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Type, Union import torch from torch import nn @@ -388,6 +388,15 @@ class QEffGemma2ForCausalLM(Gemma2ForCausalLM, GenerationMixin): - add new args cache idx for the kv retention """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffGemma2DecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index 74901401b..61730b17d 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -6,7 +6,7 @@ # ----------------------------------------------------------------------------- import copy -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch from torch import nn @@ -589,6 +589,15 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.vision_tower + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.vision_tower.vision_model.encoder.layers[0].__class__} + def forward(self, pixel_values): image_features = self.model.get_image_features(pixel_values=pixel_values) return image_features @@ -602,6 +611,15 @@ def __init__(self, model): self.config = self.model.config self.lm_head = self.model.lm_head + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffGemma3DecoderLayer} + def forward( self, input_ids, diff --git a/QEfficient/transformers/models/gpt2/modeling_gpt2.py b/QEfficient/transformers/models/gpt2/modeling_gpt2.py index 6136a2c5d..7de674cce 100644 --- a/QEfficient/transformers/models/gpt2/modeling_gpt2.py +++ b/QEfficient/transformers/models/gpt2/modeling_gpt2.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import Callable, Optional, Tuple, Union +from typing import Callable, Optional, Tuple, Type, Union import torch from torch import nn @@ -397,6 +397,15 @@ class QEffGPT2LMHeadModel(GPT2LMHeadModel): - add new args position idx for the cache_kwargs for kv retention """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffGPT2Block} + def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 85ea42674..d1220589f 100644 --- a/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -7,7 +7,7 @@ """PyTorch GPTBigCode model.""" -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Type, Union import torch import torch.utils.checkpoint @@ -378,6 +378,15 @@ def forward( class QEffGPTBigCodeForCausalLM(GPTBigCodeForCausalLM): + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffGPTBigCodeBlock} + def forward( self, input_ids: Optional[torch.Tensor] = None, diff --git a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py index 3efe890b8..57bcb842d 100644 --- a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py +++ b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py @@ -6,7 +6,7 @@ # ----------------------------------------------------------------------------- import math import os -from typing import Callable, Optional, Union +from typing import Callable, Optional, Type, Union import torch from torch import nn @@ -1205,6 +1205,16 @@ def forward( class QEffGptOssForCausalLM(GptOssForCausalLM): + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffGptOssDecoderLayer} + def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/QEfficient/transformers/models/gptj/modeling_gptj.py b/QEfficient/transformers/models/gptj/modeling_gptj.py index 1a9e45e97..a4c81dbec 100644 --- a/QEfficient/transformers/models/gptj/modeling_gptj.py +++ b/QEfficient/transformers/models/gptj/modeling_gptj.py @@ -7,7 +7,7 @@ """PyTorch GPT-J model.""" -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Type, Union import torch from torch import nn @@ -318,6 +318,15 @@ class QEffGPTJForCausalLM(GPTJForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffGPTJBlock} + def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/QEfficient/transformers/models/granite/modeling_granite.py b/QEfficient/transformers/models/granite/modeling_granite.py index 62be5f54d..8a32c52ef 100644 --- a/QEfficient/transformers/models/granite/modeling_granite.py +++ b/QEfficient/transformers/models/granite/modeling_granite.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Type, Union import torch from torch import nn @@ -347,6 +347,15 @@ class QEffGraniteForCausalLM(GraniteForCausalLM): Copied from GraniteForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/granite/modeling_granite.py """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffGraniteDecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py index b158b4046..07cba09d5 100644 --- a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py +++ b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch import torch.nn.functional as F @@ -493,6 +493,15 @@ class QEffGraniteMoeForCausalLM(GraniteMoeForCausalLM): Copied from GraniteForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/granite/modeling_granite.py """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {self.model.layers[0].__class__} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/grok_1/modeling_grok1.py b/QEfficient/transformers/models/grok_1/modeling_grok1.py index 2d8fc412d..1a1c919bb 100644 --- a/QEfficient/transformers/models/grok_1/modeling_grok1.py +++ b/QEfficient/transformers/models/grok_1/modeling_grok1.py @@ -5,7 +5,7 @@ # # ---------------------------------------------------------------------------- -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch import torch.nn as nn @@ -397,6 +397,15 @@ class QEffGrok1ModelForCausalLM(nn.Module): Grok model for causal language modeling. """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffGrok1DecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index b47db7eda..e389e6a84 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import List, Optional +from typing import List, Optional, Type import torch import torch.nn as nn @@ -21,6 +21,15 @@ def __init__(self, model): super().__init__() self.model = model + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.vision_model.encoder.layers[0].__class__} + def forward(self, pixel_values): vision_embeds = self.model.extract_feature(pixel_values) # Reshape from [num_patches, 256, hidden_dim] -> [1, num_patches*256, head_dim] @@ -36,6 +45,15 @@ def __init__(self, model): self.config = self.model.language_model.config self.language_model = self.model.language_model + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {self.model.language_model.model.layers[0].__class__} + def forward( self, input_ids, diff --git a/QEfficient/transformers/models/llama/modeling_llama.py b/QEfficient/transformers/models/llama/modeling_llama.py index fb3aed556..57bccdb1b 100644 --- a/QEfficient/transformers/models/llama/modeling_llama.py +++ b/QEfficient/transformers/models/llama/modeling_llama.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union import torch from torch import nn @@ -404,6 +404,15 @@ class QEffLlamaForCausalLM(LlamaForCausalLM): Copied from LlamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffLlamaDecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/llama4/modeling_llama4.py b/QEfficient/transformers/models/llama4/modeling_llama4.py index 834ee8880..3abaef5a7 100644 --- a/QEfficient/transformers/models/llama4/modeling_llama4.py +++ b/QEfficient/transformers/models/llama4/modeling_llama4.py @@ -6,7 +6,7 @@ # ----------------------------------------------------------------------------- import math -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Type, Union import torch from torch import nn @@ -822,6 +822,15 @@ def __init__(self, model): super().__init__() self.model = model + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.vision_model.model.layers[0].__class__} + def forward(self, pixel_values): vision_feature_layer = self.model.config.vision_config.vision_feature_layer vision_feature_select_strategy = self.model.config.vision_config.vision_feature_select_strategy @@ -849,6 +858,15 @@ def __init__(self, model): self.language_model = self.model.language_model self.config = self.model.config + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffLlama4TextDecoderLayer} + def forward( self, input_ids, diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py index fa42b3f96..e219d5e03 100644 --- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py +++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py @@ -11,7 +11,7 @@ """Inference-only LLaMA model compatible with HuggingFace weights.""" import math -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch from torch import nn @@ -416,6 +416,15 @@ def __init__(self, config: QEffLlamaSwiftKVConfig): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.config = config + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffLlamaSwiftKVDecoderLayer} + def forward( self, input_ids: torch.Tensor, diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index abdb77ea5..48b002a31 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import List, Optional +from typing import List, Optional, Type import torch import torch.nn as nn @@ -30,6 +30,15 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.vision_tower + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.vision_tower.vision_model.encoder.layers[0].__class__} + def forward(self, pixel_values): # Image features image_outputs = self.model.vision_tower(pixel_values, output_hidden_states=True) @@ -54,6 +63,15 @@ def __init__(self, model): self.language_model = self.model.language_model self.lm_head = self.model.lm_head + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {self.model.language_model.layers[0].__class__} + def forward( self, input_ids, diff --git a/QEfficient/transformers/models/llava_next/modeling_llava_next.py b/QEfficient/transformers/models/llava_next/modeling_llava_next.py index 627f7393e..59d5cad22 100755 --- a/QEfficient/transformers/models/llava_next/modeling_llava_next.py +++ b/QEfficient/transformers/models/llava_next/modeling_llava_next.py @@ -6,7 +6,7 @@ # ----------------------------------------------------------------------------- -from typing import List, Optional +from typing import List, Optional, Type import numpy as np import torch @@ -30,6 +30,15 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.vision_tower + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.vision_tower.vision_model.encoder.layers[0].__class__} + def forward(self, pixel_values, image_sizes): if pixel_values.dim() == constants.GRANITEVISION_PIXEL_VALUE_DIM: pixel_values_new = pixel_values.squeeze(0) @@ -128,6 +137,15 @@ def __init__(self, model): self.language_model = self.model.language_model self.lm_head = self.model.lm_head + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.language_model.layers[0].__class__} + def forward( self, input_ids, diff --git a/QEfficient/transformers/models/mistral/modeling_mistral.py b/QEfficient/transformers/models/mistral/modeling_mistral.py index 5edfb8f3a..47107384e 100644 --- a/QEfficient/transformers/models/mistral/modeling_mistral.py +++ b/QEfficient/transformers/models/mistral/modeling_mistral.py @@ -7,7 +7,7 @@ """PyTorch Mistral model.""" -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Type, Union import torch import torch.utils.checkpoint @@ -356,6 +356,15 @@ class QEffMistralForCausalLM(MistralForCausalLM): - add new args cache idx for the kv retention """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffMistralDecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py index d2149b6bd..a8fb34baf 100644 --- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py +++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch import torch.nn as nn @@ -151,6 +151,15 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.vision_tower + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.vision_tower.transformer.layers[0].__class__} + def forward(self, pixel_values): image_sizes = torch.tensor([[pixel_values.shape[2], pixel_values.shape[3]]]).repeat(pixel_values.shape[0], 1) image_features = self.model.get_image_features( @@ -168,6 +177,15 @@ def __init__(self, model): self.config = self.model.config self.language_model = self.model.language_model + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.language_model.layers[0].__class__} + def forward( self, input_ids, diff --git a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py index 862714fea..ec7a9a8c8 100644 --- a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py +++ b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py @@ -7,7 +7,7 @@ """PyTorch Mixtral model.""" -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch import torch.nn.functional as F @@ -414,6 +414,15 @@ class QEffMixtralForCausalLM(MixtralForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QeffMixtralDecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 74de1c6c1..3cba022b4 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -7,7 +7,7 @@ """PyTorch Mllama model.""" -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch import torch.nn.functional as F @@ -749,6 +749,15 @@ def __init__(self, model): self.model = model self.cross_attention_layers = self.model.config.get_text_config().cross_attention_layers + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.vision_model.transformer.layers[0].__class__} + def forward( self, pixel_values: Optional[torch.FloatTensor] = None, @@ -861,6 +870,15 @@ def get_qeff_vision_encoder(self): def get_qeff_language_decoder(self): return self + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffMllamaSelfAttentionDecoderLayer} + def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 40c7185d2..e45eed259 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1030,12 +1030,14 @@ def export( offload_pt_weights=False, use_onnx_subfunctions=use_onnx_subfunctions, ) + + offload_pt_weights = kwargs.get("offload_pt_weights", True) self.lang_model.export( inputs["lang"], output_names["lang"], dynamic_axes["lang"], export_dir=export_dir, - offload_pt_weights=True, + offload_pt_weights=offload_pt_weights, use_onnx_subfunctions=use_onnx_subfunctions, ) diff --git a/QEfficient/transformers/models/molmo/modeling_molmo.py b/QEfficient/transformers/models/molmo/modeling_molmo.py index b686e6aed..57f2729b9 100644 --- a/QEfficient/transformers/models/molmo/modeling_molmo.py +++ b/QEfficient/transformers/models/molmo/modeling_molmo.py @@ -6,7 +6,7 @@ # ----------------------------------------------------------------------------- import math -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Type, Union import torch import torch.nn as nn @@ -568,6 +568,15 @@ def __init__(self, model): super().__init__() self.model = model + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {self.model.model.transformer.blocks[0].__class__} + def forward(self, pixel_values, image_masks, image_input_idx, valid_idx): image_features, _ = self.model.model.vision_backbone(pixel_values, image_masks) num_image, num_patch = image_features.shape[1:3] @@ -588,6 +597,15 @@ def __init__(self, model): # self.language_model = self.model.language_model self.config = self.model.config + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {self.model.model.vision_backbone.image_vit.transformer.resblocks[0].__class__} + def forward( self, input_ids, diff --git a/QEfficient/transformers/models/mpt/modeling_mpt.py b/QEfficient/transformers/models/mpt/modeling_mpt.py index c1d98c1f8..5a808c7f2 100644 --- a/QEfficient/transformers/models/mpt/modeling_mpt.py +++ b/QEfficient/transformers/models/mpt/modeling_mpt.py @@ -7,7 +7,7 @@ """PyTorch MPT model.""" -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Type, Union import torch import torch.utils.checkpoint @@ -254,6 +254,15 @@ class QEffMptForCausalLM(MptForCausalLM): - add new args cache idx for the kv retention """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffMptBlock} + def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/QEfficient/transformers/models/olmo2/modeling_olmo2.py b/QEfficient/transformers/models/olmo2/modeling_olmo2.py index 00755cae5..c79ad7fae 100644 --- a/QEfficient/transformers/models/olmo2/modeling_olmo2.py +++ b/QEfficient/transformers/models/olmo2/modeling_olmo2.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Type, Union import torch from torch import nn @@ -324,6 +324,15 @@ class QEffOlmo2ForCausalLM(Olmo2ForCausalLM): - add new args cache idx for the kv retention """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffOlmo2DecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/phi/modeling_phi.py b/QEfficient/transformers/models/phi/modeling_phi.py index 4bf2e8785..82f18b7e0 100644 --- a/QEfficient/transformers/models/phi/modeling_phi.py +++ b/QEfficient/transformers/models/phi/modeling_phi.py @@ -7,7 +7,7 @@ """PyTorch Phi model.""" -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Type, Union import torch from torch import nn @@ -323,6 +323,15 @@ class QEffPhiForCausalLM(PhiForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffPhiDecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/phi3/modeling_phi3.py b/QEfficient/transformers/models/phi3/modeling_phi3.py index b97a0ab8d..b48ab2897 100644 --- a/QEfficient/transformers/models/phi3/modeling_phi3.py +++ b/QEfficient/transformers/models/phi3/modeling_phi3.py @@ -7,7 +7,7 @@ """PyTorch Phi-3 model.""" -from typing import Callable, Optional, Tuple, Union +from typing import Callable, Optional, Tuple, Type, Union import torch import torch.utils.checkpoint @@ -351,6 +351,15 @@ class QEffPhi3ForCausalLM(Phi3ForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffPhi3DecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 2be4ea4d1..abb364d0a 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -893,39 +893,6 @@ def apply(cls, model: nn.Module, pooling: Union[str, Callable]) -> Tuple[nn.Modu return model, transformed -def get_decoder_layer_classes_for_export(model: nn.Module) -> set: - """ - Dynamically determine which DecoderLayer classes should be exported as functions - based on the model's architecture using the existing KVCacheTransform mapping. 
- """ - # Define patterns that identify decoder layer classes - DECODER_LAYER_PATTERNS = ["DecoderLayer", "Block", "Layer"] - - # Get all QEff classes that are decoder layers from the existing mapping - decoder_layer_classes = set() - - for original_class, qeff_class in KVCacheTransform._module_mapping.items(): - # Check if the QEff class name contains decoder layer patterns - qeff_class_name = qeff_class.__name__ - if any(pattern in qeff_class_name for pattern in DECODER_LAYER_PATTERNS): - decoder_layer_classes.add(qeff_class) - - # Filter to only include classes that are actually used in the current model - model_decoder_classes = set() - model_class_name = model.__class__.__name__ - if "EncoderWrapper" in model_class_name: - model_decoder_classes.update( - module.__class__ for module in model.modules() if "Qwen2_5_VLVisionBlock" in module.__class__.__name__ - ) - return model_decoder_classes - - model_decoder_classes.update( - module.__class__ for module in model.modules() if module.__class__ in decoder_layer_classes - ) - - return model_decoder_classes - - class BlockedKVAttentionTransform: _module_mapping = { QEffLlamaAttention, diff --git a/QEfficient/transformers/models/qwen2/modeling_qwen2.py b/QEfficient/transformers/models/qwen2/modeling_qwen2.py index 7c093a4b0..841df6526 100644 --- a/QEfficient/transformers/models/qwen2/modeling_qwen2.py +++ b/QEfficient/transformers/models/qwen2/modeling_qwen2.py @@ -7,7 +7,7 @@ """PyTorch Qwen2 model.""" -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch import torch.utils.checkpoint @@ -350,6 +350,15 @@ class QEffQwen2ForCausalLM(Qwen2ForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffQwen2DecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index fa1bdd9b9..d6bfbda81 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -7,7 +7,7 @@ import math import os -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import torch import torch.nn as nn @@ -870,6 +870,15 @@ def __init__(self, model): self.model = model self.model.vision_model = self.model.visual + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {self.model.visual.blocks[0].__class__} + def forward(self, pixel_values, image_grid_thw): image_embeds = self.model.visual(pixel_values, grid_thw=image_grid_thw) bs = image_grid_thw.shape[0] @@ -885,6 +894,15 @@ def __init__(self, model): self.model = model self.language_model = self.model.model.language_model + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffQwen2_5_VLDecoderLayer} + def forward( self, input_ids, diff --git a/QEfficient/transformers/models/qwen3/modeling_qwen3.py b/QEfficient/transformers/models/qwen3/modeling_qwen3.py index 540bad4c7..ccc4bbac2 100644 --- a/QEfficient/transformers/models/qwen3/modeling_qwen3.py +++ b/QEfficient/transformers/models/qwen3/modeling_qwen3.py @@ -7,7 +7,7 @@ """PyTorch Qwen3 model.""" -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch import torch.utils.checkpoint @@ -351,6 +351,15 @@ class QEffQwen3ForCausalLM(Qwen3ForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEffQwen3DecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py index cbd80d8ca..5270a5c54 100644 --- a/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py +++ b/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Type import torch import torch.nn.functional as F @@ -371,6 +371,15 @@ def forward( class QEffQwen3MoeForCausalLM(Qwen3MoeForCausalLM): + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {QEffQwen3MoeDecoderLayer} + def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py index c86e7478b..fdbbbf05d 100644 --- a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py +++ b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py @@ -7,7 +7,7 @@ """PyTorch Starcoder2 model.""" -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import torch from torch import nn @@ -275,6 +275,15 @@ class QEffStarcoder2ForCausalLM(Starcoder2ForCausalLM): - update the hidden_states, and fix for onnx model """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. + """ + return {QEFFStarcoder2DecoderLayer} + def forward( self, input_ids: torch.LongTensor = None, diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py index a03ffecf7..246f005a7 100644 --- a/QEfficient/transformers/models/whisper/modeling_whisper.py +++ b/QEfficient/transformers/models/whisper/modeling_whisper.py @@ -5,7 +5,7 @@ # # ---------------------------------------------------------------------------- -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Type, Union import torch from torch import nn @@ -718,6 +718,15 @@ class QEffWhisperForConditionalGeneration(WhisperForConditionalGeneration): - changed forward inputs decoder_input_ids and decoder_position_ids to input_ids and position_ids """ + def get_submodules_for_export(self) -> Type[nn.Module]: + """ + Return the set of class used as the repeated layer across the model for subfunction extraction. + Notes: + This method should return the *class object* (not an instance). + Downstream code can use this to find/build subfunctions for repeated blocks. 
+ """ + return {self.model.encoder.layers[0].__class__, QEffWhisperDecoderLayer} + def forward( self, input_features: Optional[torch.FloatTensor] = None, diff --git a/QEfficient/utils/export_utils.py b/QEfficient/utils/export_utils.py index 32b34557e..3a954556f 100644 --- a/QEfficient/utils/export_utils.py +++ b/QEfficient/utils/export_utils.py @@ -14,7 +14,6 @@ from QEfficient.base.onnx_transforms import CustomOpTransform, RenameFunctionOutputsTransform from QEfficient.transformers.cache_utils import InvalidIndexProvider -from QEfficient.transformers.models.pytorch_transforms import get_decoder_layer_classes_for_export from QEfficient.utils.cache import QEFF_HOME from QEfficient.utils.hash_utils import create_export_hash from QEfficient.utils.logging_utils import logger @@ -165,7 +164,10 @@ def _setup_onnx_subfunctions(qeff_model, args, kwargs): # Transform output names for subfunction compatibility if "output_names" in kwargs: kwargs["output_names"] = [ - re.sub("_RetainedState", "_InternalRetainedState", name) for name in kwargs["output_names"] + re.sub("_RetainedState", "_InternalRetainedState", name) + if name.endswith("_RetainedState") and ("key" in name or "value" in name) + else name + for name in kwargs["output_names"] ] else: warnings.warn( @@ -178,9 +180,9 @@ def _setup_onnx_subfunctions(qeff_model, args, kwargs): qeff_model._onnx_transforms.append(CustomOpTransform) # TODO: Handle this in the modelling class QEFFTransformersBase,remove from here. Refer diffusers implementation - decoder_layer_classes = get_decoder_layer_classes_for_export(qeff_model.model) - if decoder_layer_classes: - kwargs["export_modules_as_functions"] = decoder_layer_classes + submodule_classes = qeff_model.model.get_submodules_for_export() + if submodule_classes: + kwargs["export_modules_as_functions"] = submodule_classes return args, kwargs diff --git a/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py new file mode 100644 index 000000000..9e98ab7d7 --- /dev/null +++ b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py @@ -0,0 +1,180 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
# SPDX-License-Identifier: BSD-3-Clause + # + # ---------------------------------------------------------------------------- + + from typing import Optional + + import onnx + import pytest + import requests + import torch + from PIL import Image + from transformers import ( + AutoConfig, + AutoModelForImageTextToText, + AutoProcessor, + ) + + from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForImageTextToText + from QEfficient.utils import hf_download + from QEfficient.utils._utils import get_num_layers_vlm + from QEfficient.utils.device_utils import get_available_device_id + + NEW_GENERATION_TOKENS = 10 + test_models_config = [ + # CONFIG PARAMS NEEDED FOR A MODEL TO BE TESTED + # ( + # model_name, + # kv_offload, + # batch_size, + # prompt_len, + # ctx_len, + # img_size, + # img_url, + # text_prompt, + # number of layers of the model, + # ), + ( + "Qwen/Qwen2.5-VL-3B-Instruct", + True, + 1, + 128, + 4096, + 1540, + "https://picsum.photos/id/237/536/354", + "Can you describe the image in detail.", + 1, + ), +] + + +def load_image_text_to_text_model(model_config): + model_path = hf_download( + repo_id=model_config._name_or_path, + ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], + ) + + model_hf = AutoModelForImageTextToText.from_pretrained( + model_path, + low_cpu_mem_usage=False, + config=model_config, + ) + params = sum(p.numel() for p in model_hf.parameters()) + model_hf.eval() + return model_hf, params + + +def has_QwenLayer_function(onnx_path): + """Check whether the ONNX model contains a QEffQwen2_5_VLDecoderLayer function definition.""" + model = onnx.load(onnx_path, load_external_data=False) + function_names = [f.name for f in model.functions] + QwenLayer_functions = [name for name in function_names if "QEffQwen2_5_VLDecoderLayer" in name] + return len(QwenLayer_functions) > 0, QwenLayer_functions + + +def check_image_text_to_text_subfunction_core( + model_name: str, + img_size: int, + img_url: str, + query: str, + prompt_len: int, + ctx_len: int, + max_gen_len: int = 20, + batch_size: int = 1, + n_layer: int = 1, + kv_offload: bool = False, + num_devices: int = 1, + enable_qnn: Optional[bool] = False, + qnn_config: Optional[str] = None, +): + model_config = {"model_name": model_name} + model_config["img_size"] = img_size + config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True, padding=True) + config.text_config.num_hidden_layers = n_layer + config.vision_config.num_hidden_layers = n_layer + model_hf, _ = load_image_text_to_text_model(config) + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) + + n_layer = get_num_layers_vlm(config) + image = Image.open(requests.get(img_url, stream=True).raw) + + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": query}, + {"type": "image"}, + ], + }, + ] + prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + + inputs = processor(images=image, text=prompt, return_tensors="pt") + if "pixel_values" in inputs: + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_config["model_name"], + kv_offload=kv_offload, + config=config, + ) + + with_sub_func_onnx = qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False) + + if not get_available_device_id(): + pytest.skip("No available devices to run model on Cloud AI 100") + + inputs = processor(images=image, text=prompt, return_tensors="pt") + if 
hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl": inputs = qeff_model.model.prepare_inputs_for_generation( inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size ) if "pixel_values" in inputs: inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + + # Verify that the model with subfunctions has QEffQwen2_5_VLDecoderLayer function definition + has_qwenlayer, qwenlayer_names = has_QwenLayer_function(with_sub_func_onnx[-1]) + assert has_qwenlayer, ( + "Model exported with use_onnx_subfunctions=True should contain QEffQwen2_5_VLDecoderLayer function definition" + ) + print(f"\nQwenLayer functions found: {qwenlayer_names}") + + qeff_model.compile( + img_size=model_config["img_size"], + num_devices=num_devices, + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + mxfp6=False, + enable_qnn=enable_qnn, + qnn_config=qnn_config, + ) + return + + +@pytest.mark.on_qaic +@pytest.mark.multimodal +@pytest.mark.parametrize( + "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config +) +def test_image_text_to_text_subfunction( + model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer +): + """ + Validate that exporting an image-text-to-text model with ``use_onnx_subfunctions=True`` produces an ONNX model containing the repeated decoder layer as an ONNX function definition, and that the exported model compiles for Cloud AI 100. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``Qwen/Qwen2.5-VL-3B-Instruct`` + """ + check_image_text_to_text_subfunction_core( + model_name=model_name, + prompt_len=prompt_len, + ctx_len=ctx_len, + max_gen_len=NEW_GENERATION_TOKENS, + img_size=img_size, + img_url=img_url, + query=query, + n_layer=n_layer, + batch_size=batch_size, + kv_offload=kv_offload, + ) diff --git a/tests/transformers/test_causal_lm.py b/tests/transformers/test_causal_lm.py index 6480fcdc9..fc89fdf8b 100644 --- a/tests/transformers/test_causal_lm.py +++ b/tests/transformers/test_causal_lm.py @@ -14,7 +14,6 @@ from transformers import AutoConfig, AutoModel, AutoModelForCausalLM from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM -from QEfficient.transformers.models.pytorch_transforms import get_decoder_layer_classes_for_export from QEfficient.utils import constants, get_padding_shape_from_config from QEfficient.utils.hash_utils import hash_dict_params @@ -225,7 +224,7 @@ def test_causal_lm_hash_creation(config, cb, subfunc, prefill_only, tmp_path): export_params["dynamic_axes"] = dynamic_axes hash_params["export_params"] = export_params if subfunc: - hash_params["export_modules_as_functions"] = get_decoder_layer_classes_for_export(qeff_model.model) + hash_params["export_modules_as_functions"] = qeff_model.model.get_submodules_for_export() manual_hash = hash_dict_params(hash_params)
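For reviewers, a minimal sketch (not part of this patch) of how the new get_submodules_for_export() hooks are consumed downstream: _setup_onnx_subfunctions passes the returned class set to torch.onnx.export via its documented export_modules_as_functions argument (TorchScript exporter, opset >= 15), so every repeated block is emitted once as an ONNX function. TinyBlock and TinyModel below are hypothetical stand-ins, not QEfficient classes.

import torch
import torch.nn as nn


class TinyBlock(nn.Module):
    # Hypothetical stand-in for a repeated decoder layer such as QEffLlamaDecoderLayer.
    def __init__(self, dim):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):
        return torch.relu(self.proj(x))


class TinyModel(nn.Module):
    def __init__(self, dim=16, n_layers=4):
        super().__init__()
        self.layers = nn.ModuleList([TinyBlock(dim) for _ in range(n_layers)])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

    def get_submodules_for_export(self):
        # Mirror the methods added in this diff: return class objects, not instances.
        return {self.layers[0].__class__}


model = TinyModel().eval()
dummy = torch.randn(1, 16)
torch.onnx.export(
    model,
    (dummy,),
    "tiny_model.onnx",
    input_names=["x"],
    output_names=["y"],
    opset_version=17,
    # Each module whose class appears in this set is exported once as an ONNX
    # function and invoked per layer, mirroring what export_utils does with the
    # set returned by get_submodules_for_export().
    export_modules_as_functions=model.get_submodules_for_export(),
)

The exported file can then be inspected the same way the new test does: onnx.load(path, load_external_data=False).functions lists one FunctionProto per exported block class.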