From 4a074a778e5fe2eb90d122bb64cabea417dcafef Mon Sep 17 00:00:00 2001
From: Abhishek Kumar Singh <sabhis@qti.qualcomm.com>
Date: Mon, 5 Jan 2026 11:07:15 +0000
Subject: [PATCH 01/12] Added all the changes for enabling subfunction for VLMs

Signed-off-by: Abhishek Kumar Singh <sabhis@qti.qualcomm.com>
---
 .../models/falcon/modeling_falcon.py          | 12 ++++++++-
 .../models/gemma/modeling_gemma.py            | 11 +++++++-
 .../models/gemma2/modeling_gemma2.py          | 11 +++++++-
 .../models/gemma3/modeling_gemma3.py          | 20 +++++++++++++-
 .../transformers/models/gpt2/modeling_gpt2.py | 11 +++++++-
 .../gpt_bigcode/modeling_gpt_bigcode.py       | 11 +++++++-
 .../models/gpt_oss/modeling_gpt_oss.py        | 12 ++++++++-
 .../transformers/models/gptj/modeling_gptj.py | 11 +++++++-
 .../models/granite/modeling_granite.py        | 11 +++++++-
 .../models/granitemoe/modeling_granitemoe.py  | 11 +++++++-
 .../models/grok_1/modeling_grok1.py           | 11 +++++++-
 .../models/internvl/modeling_internvl.py      | 20 +++++++++++++-
 .../models/llama/modeling_llama.py            | 11 +++++++-
 .../models/llama4/modeling_llama4.py          | 20 +++++++++++++-
 .../llama_swiftkv/modeling_llama_swiftkv.py   | 11 +++++++-
 .../models/llava/modeling_llava.py            | 20 +++++++++++++-
 .../models/llava_next/modeling_llava_next.py  | 11 +++++++-
 .../models/mistral/modeling_mistral.py        | 11 +++++++-
 .../models/mistral3/modeling_mistral3.py      | 20 +++++++++++++-
 .../models/mixtral_moe/modeling_mixtral.py    | 11 +++++++-
 .../models/molmo/modeling_molmo.py            | 20 +++++++++++++-
 .../transformers/models/mpt/modeling_mpt.py   | 11 +++++++-
 .../models/olmo2/modeling_olmo2.py            | 11 +++++++-
 .../transformers/models/phi/modeling_phi.py   | 11 +++++++-
 .../transformers/models/phi3/modeling_phi3.py | 11 +++++++-
 .../transformers/models/pytorch_transforms.py | 26 -------------------
 .../models/qwen2/modeling_qwen2.py            | 11 +++++++-
 .../models/qwen2_5_vl/modeling_qwen2_5_vl.py  | 25 +++++++++++++++---
 .../models/qwen3/modeling_qwen3.py            | 11 +++++++-
 .../models/qwen3_moe/modeling_qwen3_moe.py    | 11 +++++++-
 .../models/starcoder2/modeling_starcoder2.py  | 11 +++++++-
 .../models/whisper/modeling_whisper.py        | 11 +++++++-
 QEfficient/utils/export_utils.py              | 15 ++++++++---
 QEfficient/utils/torch_patches.py             |  8 ++++--
 34 files changed, 394 insertions(+), 66 deletions(-)

diff --git a/QEfficient/transformers/models/falcon/modeling_falcon.py b/QEfficient/transformers/models/falcon/modeling_falcon.py
index 1cfdf88e1..e29a06241 100644
--- a/QEfficient/transformers/models/falcon/modeling_falcon.py
+++ b/QEfficient/transformers/models/falcon/modeling_falcon.py
@@ -8,9 +8,10 @@
 """PyTorch Falcon model."""
 
 import math
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Type, Union
 
 import torch
+import torch.nn as nn
 import torch.utils.checkpoint
 from torch.nn import functional as F
 from transformers.cache_utils import Cache
@@ -353,6 +354,15 @@ class QEffFalconForCausalLM(FalconForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of repeated-layer classes used for subfunction extraction.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {QEffFalconDecoderLayer}
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/QEfficient/transformers/models/gemma/modeling_gemma.py b/QEfficient/transformers/models/gemma/modeling_gemma.py
index 1edb8ef53..59a9d6809 100644
--- a/QEfficient/transformers/models/gemma/modeling_gemma.py
+++ b/QEfficient/transformers/models/gemma/modeling_gemma.py
@@ -5,7 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -336,6 +336,15 @@ class QEffGemmaForCausalLM(GemmaForCausalLM):
     - add new args cache idx for the kv retention
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of repeated-layer classes used for subfunction extraction.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {QEffGemmaDecoderLayer}
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/QEfficient/transformers/models/gemma2/modeling_gemma2.py b/QEfficient/transformers/models/gemma2/modeling_gemma2.py
index 2944601c9..00df57240 100644
--- a/QEfficient/transformers/models/gemma2/modeling_gemma2.py
+++ b/QEfficient/transformers/models/gemma2/modeling_gemma2.py
@@ -5,7 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -388,6 +388,15 @@ class QEffGemma2ForCausalLM(Gemma2ForCausalLM, GenerationMixin):
     - add new args cache idx for the kv retention
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of repeated-layer classes used for subfunction extraction.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {QEffGemma2DecoderLayer}
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py
index a6e451bec..29f7b13d0 100644
--- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py
+++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py
@@ -6,7 +6,7 @@
 # -----------------------------------------------------------------------------
 
 import copy
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -589,6 +589,15 @@ def __init__(self, model):
         self.model = model
         self.model.vision_model = self.model.vision_tower
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return a set with the class of `self.model.vision_tower.vision_model.encoder.layers[0]`.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {self.model.vision_tower.vision_model.encoder.layers[0].__class__}
+
     def forward(self, pixel_values):
         image_features = self.model.get_image_features(pixel_values=pixel_values)
         return image_features
@@ -602,6 +611,15 @@ def __init__(self, model):
         self.config = self.model.config
         self.lm_head = self.model.lm_head
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of repeated-layer classes used for subfunction extraction.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {QEffGemma3DecoderLayer}
+
     def forward(
         self,
         input_ids,
diff --git a/QEfficient/transformers/models/gpt2/modeling_gpt2.py b/QEfficient/transformers/models/gpt2/modeling_gpt2.py
index 6136a2c5d..ab452baea 100644
--- a/QEfficient/transformers/models/gpt2/modeling_gpt2.py
+++ b/QEfficient/transformers/models/gpt2/modeling_gpt2.py
@@ -5,7 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -397,6 +397,15 @@ class QEffGPT2LMHeadModel(GPT2LMHeadModel):
     - add new args position idx for the cache_kwargs for kv retention
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of repeated-layer classes used for subfunction extraction.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {QEffGPT2Block}
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
diff --git a/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
index 85ea42674..604def959 100644
--- a/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
+++ b/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
@@ -7,7 +7,7 @@
 
 """PyTorch GPTBigCode model."""
 
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Type, Union
 
 import torch
 import torch.utils.checkpoint
@@ -378,6 +378,15 @@ def forward(
 
 
 class QEffGPTBigCodeForCausalLM(GPTBigCodeForCausalLM):
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of repeated-layer classes used for subfunction extraction.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {QEffGPTBigCodeBlock}
+
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
diff --git a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py
index 3efe890b8..b82cd7c81 100644
--- a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py
+++ b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py
@@ -6,7 +6,7 @@
 # -----------------------------------------------------------------------------
 import math
 import os
-from typing import Callable, Optional, Union
+from typing import Callable, Optional, Type, Union
 
 import torch
 from torch import nn
@@ -1205,6 +1205,16 @@ def forward(
 
 
 class QEffGptOssForCausalLM(GptOssForCausalLM):
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of repeated-layer classes used for subfunction extraction.
+
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {QEffGptOssDecoderLayer}
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
diff --git a/QEfficient/transformers/models/gptj/modeling_gptj.py b/QEfficient/transformers/models/gptj/modeling_gptj.py
index 1a9e45e97..2a7c475ed 100644
--- a/QEfficient/transformers/models/gptj/modeling_gptj.py
+++ b/QEfficient/transformers/models/gptj/modeling_gptj.py
@@ -7,7 +7,7 @@
 
 """PyTorch GPT-J model."""
 
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -318,6 +318,15 @@ class QEffGPTJForCausalLM(GPTJForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of repeated-layer classes used for subfunction extraction.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {QEffGPTJBlock}
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
diff --git a/QEfficient/transformers/models/granite/modeling_granite.py b/QEfficient/transformers/models/granite/modeling_granite.py
index 62be5f54d..c791b02f4 100644
--- a/QEfficient/transformers/models/granite/modeling_granite.py
+++ b/QEfficient/transformers/models/granite/modeling_granite.py
@@ -5,7 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -347,6 +347,15 @@ class QEffGraniteForCausalLM(GraniteForCausalLM):
     Copied from GraniteForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/granite/modeling_granite.py
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of repeated-layer classes used for subfunction extraction.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {QEffGraniteDecoderLayer}
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py
index b158b4046..fbeaae68c 100644
--- a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py
+++ b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py
@@ -5,7 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Type, Union
 
 import torch
 import torch.nn.functional as F
@@ -493,6 +493,15 @@ class QEffGraniteMoeForCausalLM(GraniteMoeForCausalLM):
     Copied from GraniteForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/granite/modeling_granite.py
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return a set with the class of `self.model.layers[0]`, for subfunction extraction.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {self.model.layers[0].__class__}
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/QEfficient/transformers/models/grok_1/modeling_grok1.py b/QEfficient/transformers/models/grok_1/modeling_grok1.py
index 2d8fc412d..9c1e7c4b6 100644
--- a/QEfficient/transformers/models/grok_1/modeling_grok1.py
+++ b/QEfficient/transformers/models/grok_1/modeling_grok1.py
@@ -5,7 +5,7 @@
 #
 # ----------------------------------------------------------------------------
 
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Type, Union
 
 import torch
 import torch.nn as nn
@@ -397,6 +397,15 @@ class QEffGrok1ModelForCausalLM(nn.Module):
     Grok model for causal language modeling.
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of repeated-layer classes used for subfunction extraction.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {QEffGrok1DecoderLayer}
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py
index b47db7eda..026b1f9ae 100644
--- a/QEfficient/transformers/models/internvl/modeling_internvl.py
+++ b/QEfficient/transformers/models/internvl/modeling_internvl.py
@@ -5,7 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
-from typing import List, Optional
+from typing import List, Optional, Type
 
 import torch
 import torch.nn as nn
@@ -21,6 +21,15 @@ def __init__(self, model):
         super().__init__()
         self.model = model
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return a set with the class of `self.model.vision_model.encoder.layers[0]`.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {self.model.vision_model.encoder.layers[0].__class__}
+
     def forward(self, pixel_values):
         vision_embeds = self.model.extract_feature(pixel_values)
         # Reshape from [num_patches, 256, hidden_dim] -> [1, num_patches*256, head_dim]
@@ -36,6 +45,15 @@ def __init__(self, model):
         self.config = self.model.language_model.config
         self.language_model = self.model.language_model
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return a set with the class of `self.model.language_model.model.layers[0]`.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {self.model.language_model.model.layers[0].__class__}
+
     def forward(
         self,
         input_ids,
diff --git a/QEfficient/transformers/models/llama/modeling_llama.py b/QEfficient/transformers/models/llama/modeling_llama.py
index fb3aed556..065db2193 100644
--- a/QEfficient/transformers/models/llama/modeling_llama.py
+++ b/QEfficient/transformers/models/llama/modeling_llama.py
@@ -5,7 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -404,6 +404,15 @@ class QEffLlamaForCausalLM(LlamaForCausalLM):
     Copied from LlamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of repeated-layer classes used for subfunction extraction.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {QEffLlamaDecoderLayer}
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/QEfficient/transformers/models/llama4/modeling_llama4.py b/QEfficient/transformers/models/llama4/modeling_llama4.py
index 834ee8880..16a576d02 100644
--- a/QEfficient/transformers/models/llama4/modeling_llama4.py
+++ b/QEfficient/transformers/models/llama4/modeling_llama4.py
@@ -6,7 +6,7 @@
 # -----------------------------------------------------------------------------
 
 import math
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -822,6 +822,15 @@ def __init__(self, model):
         super().__init__()
         self.model = model
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return a set with the class of `self.model.vision_model.model.layers[0]`.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {self.model.vision_model.model.layers[0].__class__}
+
     def forward(self, pixel_values):
         vision_feature_layer = self.model.config.vision_config.vision_feature_layer
         vision_feature_select_strategy = self.model.config.vision_config.vision_feature_select_strategy
@@ -849,6 +858,15 @@ def __init__(self, model):
         self.language_model = self.model.language_model
         self.config = self.model.config
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of repeated-layer classes used for subfunction extraction.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {QEffLlama4TextDecoderLayer}
+
     def forward(
         self,
         input_ids,
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index fa42b3f96..be1cc8cdc 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -11,7 +11,7 @@
 """Inference-only LLaMA model compatible with HuggingFace weights."""
 
 import math
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -416,6 +416,15 @@ def __init__(self, config: QEffLlamaSwiftKVConfig):
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.config = config
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of repeated-layer classes used for subfunction extraction.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {QEffLlamaSwiftKVDecoderLayer}
+
     def forward(
         self,
         input_ids: torch.Tensor,
diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py
index abdb77ea5..64fc41c09 100644
--- a/QEfficient/transformers/models/llava/modeling_llava.py
+++ b/QEfficient/transformers/models/llava/modeling_llava.py
@@ -5,7 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
-from typing import List, Optional
+from typing import List, Optional, Type
 
 import torch
 import torch.nn as nn
@@ -30,6 +30,15 @@ def __init__(self, model):
         self.model = model
         self.model.vision_model = self.model.vision_tower
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return a set with the class of `self.model.vision_tower.vision_model.encoder.layers[0]`.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {self.model.vision_tower.vision_model.encoder.layers[0].__class__}
+
     def forward(self, pixel_values):
         # Image features
         image_outputs = self.model.vision_tower(pixel_values, output_hidden_states=True)
@@ -54,6 +63,15 @@ def __init__(self, model):
         self.language_model = self.model.language_model
         self.lm_head = self.model.lm_head
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return a set with the class of `self.model.language_model.layers[0]`.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {self.model.language_model.layers[0].__class__}
+
     def forward(
         self,
         input_ids,
diff --git a/QEfficient/transformers/models/llava_next/modeling_llava_next.py b/QEfficient/transformers/models/llava_next/modeling_llava_next.py
index 627f7393e..a51272980 100755
--- a/QEfficient/transformers/models/llava_next/modeling_llava_next.py
+++ b/QEfficient/transformers/models/llava_next/modeling_llava_next.py
@@ -6,7 +6,7 @@
 # -----------------------------------------------------------------------------
 
 
-from typing import List, Optional
+from typing import List, Optional, Type
 
 import numpy as np
 import torch
@@ -30,6 +30,15 @@ def __init__(self, model):
         self.model = model
         self.model.vision_model = self.model.vision_tower
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return a set with the class of `self.model.vision_tower.vision_model.encoder.layers[0]`.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {self.model.vision_tower.vision_model.encoder.layers[0].__class__}
+
     def forward(self, pixel_values, image_sizes):
         if pixel_values.dim() == constants.GRANITEVISION_PIXEL_VALUE_DIM:
             pixel_values_new = pixel_values.squeeze(0)
diff --git a/QEfficient/transformers/models/mistral/modeling_mistral.py b/QEfficient/transformers/models/mistral/modeling_mistral.py
index 5edfb8f3a..de9b1a7e6 100644
--- a/QEfficient/transformers/models/mistral/modeling_mistral.py
+++ b/QEfficient/transformers/models/mistral/modeling_mistral.py
@@ -7,7 +7,7 @@
 
 """PyTorch Mistral model."""
 
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple, Type, Union
 
 import torch
 import torch.utils.checkpoint
@@ -356,6 +356,15 @@ class QEffMistralForCausalLM(MistralForCausalLM):
     - add new args cache idx for the kv retention
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of repeated-layer classes used for subfunction extraction.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {QEffMistralDecoderLayer}
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py
index d2149b6bd..3bf151b97 100644
--- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py
+++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py
@@ -5,7 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Type, Union
 
 import torch
 import torch.nn as nn
@@ -151,6 +151,15 @@ def __init__(self, model):
         self.model = model
         self.model.vision_model = self.model.vision_tower
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return a set with the class of `self.model.vision_tower.transformer.layers[0]`.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {self.model.vision_tower.transformer.layers[0].__class__}
+
     def forward(self, pixel_values):
         image_sizes = torch.tensor([[pixel_values.shape[2], pixel_values.shape[3]]]).repeat(pixel_values.shape[0], 1)
         image_features = self.model.get_image_features(
@@ -168,6 +177,15 @@ def __init__(self, model):
         self.config = self.model.config
         self.language_model = self.model.language_model
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return a set with the class of `self.model.language_model.layers[0]`, for subfunction extraction.
+        Notes:
+            The class object (not an instance) is wrapped in a set, as the vision wrapper in this file does.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {self.model.language_model.layers[0].__class__}
+
     def forward(
         self,
         input_ids,
diff --git a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py
index 862714fea..f811bea65 100644
--- a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py
+++ b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py
@@ -7,7 +7,7 @@
 
 """PyTorch Mixtral model."""
 
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Type, Union
 
 import torch
 import torch.nn.functional as F
@@ -414,6 +414,15 @@ class QEffMixtralForCausalLM(MixtralForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of repeated-layer classes used for subfunction extraction.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): the return annotation names a bare Type, but a set is returned.
+        """
+        return {QeffMixtralDecoderLayer}
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/QEfficient/transformers/models/molmo/modeling_molmo.py b/QEfficient/transformers/models/molmo/modeling_molmo.py
index b686e6aed..093e468ff 100644
--- a/QEfficient/transformers/models/molmo/modeling_molmo.py
+++ b/QEfficient/transformers/models/molmo/modeling_molmo.py
@@ -6,7 +6,7 @@
 # -----------------------------------------------------------------------------
 
 import math
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple, Type, Union
 
 import torch
 import torch.nn as nn
@@ -568,6 +568,15 @@ def __init__(self, model):
         super().__init__()
         self.model = model
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return a set with the class of `self.model.model.transformer.blocks[0]` (annotation names a bare Type).
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): forward() here calls vision_backbone, yet this picks transformer.blocks[0] - swapped?
+        """
+        return {self.model.model.transformer.blocks[0].__class__}
+
     def forward(self, pixel_values, image_masks, image_input_idx, valid_idx):
         image_features, _ = self.model.model.vision_backbone(pixel_values, image_masks)
         num_image, num_patch = image_features.shape[1:3]
@@ -588,6 +597,15 @@ def __init__(self, model):
         # self.language_model = self.model.language_model
         self.config = self.model.config
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return a set with the class of `self.model.model.vision_backbone.image_vit.transformer.resblocks[0]`.
+        Notes:
+            Elements are class objects, not instances; downstream code can use them to find/build subfunctions.
+            NOTE(review): forward() here takes input_ids, yet this picks a vision_backbone resblock - swapped?
+        """
+        return {self.model.model.vision_backbone.image_vit.transformer.resblocks[0].__class__}
+
     def forward(
         self,
         input_ids,
diff --git a/QEfficient/transformers/models/mpt/modeling_mpt.py b/QEfficient/transformers/models/mpt/modeling_mpt.py
index c1d98c1f8..929e157cc 100644
--- a/QEfficient/transformers/models/mpt/modeling_mpt.py
+++ b/QEfficient/transformers/models/mpt/modeling_mpt.py
@@ -7,7 +7,7 @@
 
 """PyTorch MPT model."""
 
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Type, Union
 
 import torch
 import torch.utils.checkpoint
@@ -254,6 +254,15 @@ class QEffMptForCausalLM(MptForCausalLM):
     - add new args cache idx for the kv retention
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of class used as the repeated layer across the model for subfunction extraction.
+        Notes:
+            The result is a ``set`` of *class objects* (not instances); here it holds only ``QEffMptBlock``.
+            ``_setup_onnx_subfunctions`` forwards this set to the exporter as ``export_modules_as_functions``.
+        """
+        return {QEffMptBlock}
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
diff --git a/QEfficient/transformers/models/olmo2/modeling_olmo2.py b/QEfficient/transformers/models/olmo2/modeling_olmo2.py
index 00755cae5..02645e185 100644
--- a/QEfficient/transformers/models/olmo2/modeling_olmo2.py
+++ b/QEfficient/transformers/models/olmo2/modeling_olmo2.py
@@ -5,7 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -324,6 +324,15 @@ class QEffOlmo2ForCausalLM(Olmo2ForCausalLM):
     - add new args cache idx for the kv retention
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of class used as the repeated layer across the model for subfunction extraction.
+        Notes:
+            The result is a ``set`` of *class objects* (not instances); here it holds only ``QEffOlmo2DecoderLayer``.
+            ``_setup_onnx_subfunctions`` forwards this set to the exporter as ``export_modules_as_functions``.
+        """
+        return {QEffOlmo2DecoderLayer}
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/QEfficient/transformers/models/phi/modeling_phi.py b/QEfficient/transformers/models/phi/modeling_phi.py
index 4bf2e8785..2efbb313e 100644
--- a/QEfficient/transformers/models/phi/modeling_phi.py
+++ b/QEfficient/transformers/models/phi/modeling_phi.py
@@ -7,7 +7,7 @@
 
 """PyTorch Phi model."""
 
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -323,6 +323,15 @@ class QEffPhiForCausalLM(PhiForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of class used as the repeated layer across the model for subfunction extraction.
+        Notes:
+            The result is a ``set`` of *class objects* (not instances); here it holds only ``QEffPhiDecoderLayer``.
+            ``_setup_onnx_subfunctions`` forwards this set to the exporter as ``export_modules_as_functions``.
+        """
+        return {QEffPhiDecoderLayer}
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/QEfficient/transformers/models/phi3/modeling_phi3.py b/QEfficient/transformers/models/phi3/modeling_phi3.py
index b97a0ab8d..e25deed37 100644
--- a/QEfficient/transformers/models/phi3/modeling_phi3.py
+++ b/QEfficient/transformers/models/phi3/modeling_phi3.py
@@ -7,7 +7,7 @@
 
 """PyTorch Phi-3 model."""
 
-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, Optional, Tuple, Type, Union
 
 import torch
 import torch.utils.checkpoint
@@ -351,6 +351,15 @@ class QEffPhi3ForCausalLM(Phi3ForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of class used as the repeated layer across the model for subfunction extraction.
+        Notes:
+            The result is a ``set`` of *class objects* (not instances); here it holds only ``QEffPhi3DecoderLayer``.
+            ``_setup_onnx_subfunctions`` forwards this set to the exporter as ``export_modules_as_functions``.
+        """
+        return {QEffPhi3DecoderLayer}
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index b978b6193..abb364d0a 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -893,32 +893,6 @@ def apply(cls, model: nn.Module, pooling: Union[str, Callable]) -> Tuple[nn.Modu
         return model, transformed
 
 
-def get_decoder_layer_classes_for_export(model: nn.Module) -> set:
-    """
-    Dynamically determine which DecoderLayer classes should be exported as functions
-    based on the model's architecture using the existing KVCacheTransform mapping.
-    """
-    # Define patterns that identify decoder layer classes
-    DECODER_LAYER_PATTERNS = ["DecoderLayer", "Block", "Layer"]
-
-    # Get all QEff classes that are decoder layers from the existing mapping
-    decoder_layer_classes = set()
-
-    for original_class, qeff_class in KVCacheTransform._module_mapping.items():
-        # Check if the QEff class name contains decoder layer patterns
-        qeff_class_name = qeff_class.__name__
-        if any(pattern in qeff_class_name for pattern in DECODER_LAYER_PATTERNS):
-            decoder_layer_classes.add(qeff_class)
-
-    # Filter to only include classes that are actually used in the current model
-    model_decoder_classes = set()
-    for module in model.modules():
-        if module.__class__ in decoder_layer_classes:
-            model_decoder_classes.add(module.__class__)
-
-    return model_decoder_classes
-
-
 class BlockedKVAttentionTransform:
     _module_mapping = {
         QEffLlamaAttention,
diff --git a/QEfficient/transformers/models/qwen2/modeling_qwen2.py b/QEfficient/transformers/models/qwen2/modeling_qwen2.py
index 7c093a4b0..7404f2f6c 100644
--- a/QEfficient/transformers/models/qwen2/modeling_qwen2.py
+++ b/QEfficient/transformers/models/qwen2/modeling_qwen2.py
@@ -7,7 +7,7 @@
 
 """PyTorch Qwen2 model."""
 
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Type, Union
 
 import torch
 import torch.utils.checkpoint
@@ -350,6 +350,15 @@ class QEffQwen2ForCausalLM(Qwen2ForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of class used as the repeated layer across the model for subfunction extraction.
+        Notes:
+            The result is a ``set`` of *class objects* (not instances); here it holds only ``QEffQwen2DecoderLayer``.
+            ``_setup_onnx_subfunctions`` forwards this set to the exporter as ``export_modules_as_functions``.
+        """
+        return {QEffQwen2DecoderLayer}
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index 21d2e026e..718c50e34 100644
--- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -7,7 +7,7 @@
 
 import math
 import os
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
 
 import torch
 import torch.nn as nn
@@ -74,12 +74,11 @@ def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, mrope_section, unsqu
         `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
     """
 
-    mrope_section = mrope_section * 2
     cos = cos[position_ids]
     sin = sin[position_ids]
 
-    cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(unsqueeze_dim)
-    sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(unsqueeze_dim)
+    cos = torch.cat([cos[0, ..., 0:32], cos[0, ..., 32:80], cos[0, ..., 80:128]], dim=-1).unsqueeze(0)
+    sin = torch.cat([sin[0, ..., 0:32], sin[0, ..., 32:80], sin[0, ..., 80:128]], dim=-1).unsqueeze(0)
 
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
@@ -872,6 +871,15 @@ def __init__(self, model):
         self.model = model
         self.model.vision_model = self.model.visual
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of class used as the repeated layer across the model for subfunction extraction.
+        Notes:
+            The result is a ``set`` holding the class (not an instance) of the first ``self.model.visual`` block.
+            ``_setup_onnx_subfunctions`` forwards this set to the exporter as ``export_modules_as_functions``.
+        """
+        return {self.model.visual.blocks[0].__class__}
+
     def forward(self, pixel_values, image_grid_thw):
         image_embeds = self.model.visual(pixel_values, grid_thw=image_grid_thw)
         bs = image_grid_thw.shape[0]
@@ -887,6 +895,15 @@ def __init__(self, model):
         self.model = model
         self.language_model = self.model.model.language_model
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of class used as the repeated layer across the model for subfunction extraction.
+        Notes:
+            The result is a ``set`` of *class objects* (not instances); it holds ``QEffQwen2_5_VLDecoderLayer``.
+            ``_setup_onnx_subfunctions`` forwards this set to the exporter as ``export_modules_as_functions``.
+        """
+        return {QEffQwen2_5_VLDecoderLayer}
+
     def forward(
         self,
         input_ids,
diff --git a/QEfficient/transformers/models/qwen3/modeling_qwen3.py b/QEfficient/transformers/models/qwen3/modeling_qwen3.py
index 540bad4c7..b310499be 100644
--- a/QEfficient/transformers/models/qwen3/modeling_qwen3.py
+++ b/QEfficient/transformers/models/qwen3/modeling_qwen3.py
@@ -7,7 +7,7 @@
 
 """PyTorch Qwen3 model."""
 
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Type, Union
 
 import torch
 import torch.utils.checkpoint
@@ -351,6 +351,15 @@ class QEffQwen3ForCausalLM(Qwen3ForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of class used as the repeated layer across the model for subfunction extraction.
+        Notes:
+            The result is a ``set`` of *class objects* (not instances); here it holds only ``QEffQwen3DecoderLayer``.
+            ``_setup_onnx_subfunctions`` forwards this set to the exporter as ``export_modules_as_functions``.
+        """
+        return {QEffQwen3DecoderLayer}
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py
index cbd80d8ca..18e1e7611 100644
--- a/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py
+++ b/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py
@@ -5,7 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Type
 
 import torch
 import torch.nn.functional as F
@@ -371,6 +371,15 @@ def forward(
 
 
 class QEffQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of class used as the repeated layer across the model for subfunction extraction.
+        Notes:
+            The result is a ``set`` of *class objects* (not instances); it holds ``QEffQwen3MoeDecoderLayer``.
+            ``_setup_onnx_subfunctions`` forwards this set to the exporter as ``export_modules_as_functions``.
+        """
+        return {QEffQwen3MoeDecoderLayer}
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
diff --git a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py
index c86e7478b..3387f0fba 100644
--- a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py
+++ b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py
@@ -7,7 +7,7 @@
 
 """PyTorch Starcoder2 model."""
 
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -275,6 +275,15 @@ class QEffStarcoder2ForCausalLM(Starcoder2ForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of class used as the repeated layer across the model for subfunction extraction.
+        Notes:
+            The result is a ``set`` of *class objects* (not instances); it holds ``QEFFStarcoder2DecoderLayer``.
+            ``_setup_onnx_subfunctions`` forwards this set to the exporter as ``export_modules_as_functions``.
+        """
+        return {QEFFStarcoder2DecoderLayer}
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py
index a03ffecf7..650258328 100644
--- a/QEfficient/transformers/models/whisper/modeling_whisper.py
+++ b/QEfficient/transformers/models/whisper/modeling_whisper.py
@@ -5,7 +5,7 @@
 #
 # ----------------------------------------------------------------------------
 
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -718,6 +718,15 @@ class QEffWhisperForConditionalGeneration(WhisperForConditionalGeneration):
     - changed forward inputs decoder_input_ids and decoder_position_ids to input_ids and position_ids
     """
 
+    def get_repeated_layer_class(self) -> Type[nn.Module]:
+        """
+        Return the set of class used as the repeated layer across the model for subfunction extraction.
+        Notes:
+            The result is a ``set`` with two classes: that of the first encoder layer and the decoder layer class.
+            ``_setup_onnx_subfunctions`` forwards this set to the exporter as ``export_modules_as_functions``.
+        """
+        return {self.model.encoder.layers[0].__class__, QEffWhisperDecoderLayer}
+
     def forward(
         self,
         input_features: Optional[torch.FloatTensor] = None,
diff --git a/QEfficient/utils/export_utils.py b/QEfficient/utils/export_utils.py
index 33ba694cf..9380ae440 100644
--- a/QEfficient/utils/export_utils.py
+++ b/QEfficient/utils/export_utils.py
@@ -14,7 +14,6 @@
 
 from QEfficient.base.onnx_transforms import CustomOpTransform, RenameFunctionOutputsTransform
 from QEfficient.transformers.cache_utils import InvalidIndexProvider
-from QEfficient.transformers.models.pytorch_transforms import get_decoder_layer_classes_for_export
 from QEfficient.utils.cache import QEFF_HOME
 from QEfficient.utils.hash_utils import create_export_hash
 from QEfficient.utils.logging_utils import logger
@@ -164,18 +163,26 @@ def _setup_onnx_subfunctions(qeff_model, args, kwargs):
     # Transform output names for subfunction compatibility
     if "output_names" in kwargs:
         kwargs["output_names"] = [
-            re.sub("_RetainedState", "_InternalRetainedState", name) for name in kwargs["output_names"]
+            re.sub("_RetainedState", "_InternalRetainedState", name)
+            if name.endswith("_RetainedState") and ("key" in name or "value" in name)
+            else name
+            for name in kwargs["output_names"]
         ]
     else:
         args = list(args)
-        args[1] = [re.sub("_RetainedState", "_InternalRetainedState", name) for name in args[1]]
+        args[1] = [
+            re.sub("_RetainedState", "_InternalRetainedState", name)
+            if name.endswith("_RetainedState") and ("key" in name or "value" in name)
+            else name
+            for name in args[1]
+        ]
         args = tuple(args)
     # Add subfunction-specific ONNX transforms
     qeff_model._onnx_transforms.append(RenameFunctionOutputsTransform)
     qeff_model._onnx_transforms.append(CustomOpTransform)
 
     # TODO: Handle this in the modelling class QEFFTransformersBase,remove from here. Refer diffusers implementation
-    decoder_layer_classes = get_decoder_layer_classes_for_export(qeff_model.model)
+    decoder_layer_classes = qeff_model.model.get_repeated_layer_class()
     if decoder_layer_classes:
         kwargs["export_modules_as_functions"] = decoder_layer_classes
     return args, kwargs
diff --git a/QEfficient/utils/torch_patches.py b/QEfficient/utils/torch_patches.py
index 0b9b37afa..1752b5979 100644
--- a/QEfficient/utils/torch_patches.py
+++ b/QEfficient/utils/torch_patches.py
@@ -11,6 +11,8 @@
 import torch.onnx.utils as onnx_utils
 from torch import _C
 
+from QEfficient.utils.logging_utils import logger
+
 # Store original references before patching
 _original_setup_trace_module_map = onnx_utils._setup_trace_module_map
 _original_get_module_attributes = getattr(onnx_utils, "_get_module_attributes", None)
@@ -38,8 +40,10 @@ def _track_module_attributes_forward_hook(module, input, output):
                 onnx_attrs = getattr(module, attr_name)
                 delattr(module, attr_name)
             # FIX: use empty dict to avoid type mismatch
-            onnx_attrs = {}
-            _C._jit_pass_onnx_track_scope_attributes(graph, onnx_attrs)
+            try:
+                _C._jit_pass_onnx_track_scope_attributes(graph, onnx_attrs)
+            except Exception as e:
+                logger.warning(f"Failed to track ONNX scope attributes: {e}. Skipping this step.")
 
         for m in model.modules():
             m.register_forward_hook(_track_module_attributes_forward_hook)

From 4458154d6bffccf378da106c940484032d1a7e22 Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Sun, 11 Jan 2026 20:19:33 +0000
Subject: [PATCH 02/12] Fixed rope method for batch size > 1

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 QEfficient/transformers/models/modeling_auto.py              | 1 +
 .../transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py    | 5 ++---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index d2cc1e681..55253f9b0 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1030,6 +1030,7 @@ def export(
             offload_pt_weights=False,
             use_onnx_subfunctions=use_onnx_subfunctions,
         )
+
         self.lang_model.export(
             inputs["lang"],
             output_names["lang"],
diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index 718c50e34..784ebdd84 100644
--- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -76,9 +76,8 @@ def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, mrope_section, unsqu
 
     cos = cos[position_ids]
     sin = sin[position_ids]
-
-    cos = torch.cat([cos[0, ..., 0:32], cos[0, ..., 32:80], cos[0, ..., 80:128]], dim=-1).unsqueeze(0)
-    sin = torch.cat([sin[0, ..., 0:32], sin[0, ..., 32:80], sin[0, ..., 80:128]], dim=-1).unsqueeze(0)
+    cos = torch.cat([cos[0, ..., 0:32], cos[1, ..., 32:80], cos[2, ..., 80:128]], dim=-1).unsqueeze(0)
+    sin = torch.cat([sin[0, ..., 0:32], sin[1, ..., 32:80], sin[2, ..., 80:128]], dim=-1).unsqueeze(0)
 
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)

From d2a81ad9957568e380f1af538221ca18746e92cc Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Mon, 12 Jan 2026 08:44:52 +0000
Subject: [PATCH 03/12] Added test file for subfunction with VLM

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 .../test_subfunction_vlm.py                   | 343 ++++++++++++++++++
 1 file changed, 343 insertions(+)
 create mode 100644 tests/transformers/models/image_text_to_text/test_subfunction_vlm.py

diff --git a/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py
new file mode 100644
index 000000000..88f89c618
--- /dev/null
+++ b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py
@@ -0,0 +1,343 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# ----------------------------------------------------------------------------
+
+from typing import Optional
+
+import pytest
+import requests
+import torch
+from PIL import Image
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoModelForImageTextToText,
+    AutoProcessor,
+    TextStreamer,
+)
+
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForImageTextToText
+from QEfficient.utils import hf_download
+from QEfficient.utils._utils import get_num_layers_vlm
+from QEfficient.utils.device_utils import get_available_device_id
+
+NEW_GENERATION_TOKENS = 10
+test_models_config = [
+    # CONFIG PARAMS NEEDED FOR A MODEL TO BE TESTED
+    # (
+    # model_name,
+    # kv_offload,
+    # batch_size,
+    # prompt_len,
+    # ctx_len,
+    # img_size,
+    # img_url,
+    # text_prompt,
+    # number of layers of the model,
+    # ),
+    (
+        "llava-hf/llava-1.5-7b-hf",
+        True,
+        1,
+        784,
+        1024,
+        336,
+        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
+        "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
+        1,
+    ),
+    (
+        "llava-hf/llava-1.5-7b-hf",
+        False,
+        1,
+        784,
+        1024,
+        336,
+        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
+        "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
+        1,
+    ),
+    # Disabled in CI due to performance issues
+    # (
+    #     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    #     True,
+    #     1,
+    #     128,
+    #     3072,
+    #     336,
+    #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
+    #     "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
+    #     4,
+    # ),
+    # (
+    #     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    #     False,
+    #     1,
+    #     128,
+    #     3072,
+    #     336,
+    #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
+    #     "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
+    #     4,
+    # ),
+    (
+        "google/gemma-3-4b-it",
+        True,
+        1,
+        128,
+        3072,
+        896,
+        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
+        "Can you describe the image in detail.",
+        1,
+    ),
+    (
+        "google/gemma-3-4b-it",
+        False,
+        1,
+        128,
+        3072,
+        896,
+        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
+        "Can you describe the image in detail.",
+        1,
+    ),
+    (
+        "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
+        True,
+        1,
+        128,
+        4096,
+        1540,
+        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
+        "Can you describe the image in detail.",
+        1,
+    ),
+    (
+        "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
+        False,
+        1,
+        128,
+        4096,
+        1540,
+        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
+        "Can you describe the image in detail.",
+        1,
+    ),
+    (
+        "Qwen/Qwen2.5-VL-3B-Instruct",
+        True,
+        1,
+        128,
+        4096,
+        1540,
+        "https://picsum.photos/id/237/536/354",
+        "Can you describe the image in detail.",
+        1,
+    ),
+    # (
+    #     "meta-llama/Llama-3.2-11B-Vision-Instruct",
+    #     True,
+    #     1,
+    #     32,
+    #     512,
+    #     560,
+    #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg",
+    #     "Explain this image",
+    #     7,
+    # ),
+]
+
+intern_model_config = [
+    (
+        "OpenGVLab/InternVL2_5-1B",
+        True,
+        1,
+        384,
+        512,
+        "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg",
+        "Please describe the image in detail.",
+        2,
+    ),
+    (
+        "OpenGVLab/InternVL3_5-1B",
+        True,
+        1,
+        384,
+        512,
+        "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg",
+        "Please describe the image in detail.",
+        2,
+    ),
+    # (
+    #     "OpenGVLab/InternVL2_5-1B",
+    #     False,
+    #     1,
+    #     384,
+    #     512,
+    #     "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg",
+    #     "Please describe the image in detail.",
+    #     2,
+    # ), # commented because QNN Converter is not supported for this model yet.
+]
+
+molmo_model_config = [
+    # Disabled in CI due to HF issues
+    # (
+    #     "allenai/Molmo-7B-D-0924",
+    #     True,
+    #     1,
+    #     128,
+    #     4096,
+    #     "https://picsum.photos/id/237/536/354",
+    #     "Can you describe the image in detail.",
+    #     2,
+    # ),
+]
+
+
+def load_image_text_to_text_model(model_config):  # returns (HF model in eval mode, total parameter count)
+    model_path = hf_download(
+        repo_id=model_config._name_or_path,
+        ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"],
+    )
+    try:
+        model_hf = AutoModelForImageTextToText.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=False,
+            config=model_config,
+        )
+    except ValueError:  # fall back to AutoModelForCausalLM (remote code allowed) if the first loader raises
+        model_hf = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=False,
+            trust_remote_code=True,
+            config=model_config,
+        )
+    params = sum(p.numel() for p in model_hf.parameters())
+    model_hf.eval()
+    return model_hf, params
+
+
+def set_num_layers(config, n_layer=1):  # mutates `config` in place and returns it
+    # n_layer == -1 keeps every layer of the model (config is returned unchanged).
+    if n_layer == -1:
+        return config
+    elif hasattr(config, "model_type") and "mllama" in config.model_type:
+        config.text_config.num_hidden_layers = n_layer
+        config.text_config.cross_attention_layers = [  # keep only cross-attention indices below n_layer
+            x for x in config.text_config.cross_attention_layers if x < n_layer
+        ]
+    elif hasattr(config, "text_config"):  # both the text and the vision sub-configs are truncated
+        config.text_config.num_hidden_layers = n_layer
+        config.vision_config.num_hidden_layers = n_layer
+    elif hasattr(config, "llm_config"):  # same, for configs that name the text part `llm_config`
+        config.llm_config.num_hidden_layers = n_layer
+        config.vision_config.num_hidden_layers = n_layer
+    else:  # no nested text/LLM sub-config: set the depth on the top-level config
+        config.num_hidden_layers = n_layer
+    return config
+
+
+def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
+    model_name: str,
+    img_size: int,
+    img_url: str,
+    query: str,
+    prompt_len: int,
+    ctx_len: int,
+    max_gen_len: int = 20,  # NOTE(review): not read in this function; generation uses NEW_GENERATION_TOKENS
+    batch_size: int = 1,
+    n_layer: int = 1,
+    kv_offload: bool = False,
+    num_devices: int = 1,
+    enable_qnn: Optional[bool] = False,
+    qnn_config: Optional[str] = None,
+):  # exports with and without ONNX subfunctions, compiles each, and asserts both generate the same tokens
+    model_config = {"model_name": model_name}
+    model_config["img_size"] = img_size
+    config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True, padding=True)
+    config = set_num_layers(config, n_layer=n_layer)
+    model_hf, _ = load_image_text_to_text_model(config)  # NOTE(review): model_hf is not used below
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True)
+
+    n_layer = get_num_layers_vlm(config)  # NOTE(review): the reassigned value is not read afterwards
+    image = Image.open(requests.get(img_url, stream=True).raw)  # downloads the test image; no timeout is set
+    if model_name == "mistralai/Mistral-Small-3.1-24B-Instruct-2503":
+        image = image.resize((1540, 1540))
+
+    conversation = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": query},
+                {"type": "image"},
+            ],
+        },
+    ]
+    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+
+    inputs = processor(images=image, text=prompt, return_tensors="pt")  # rebuilt further down before first use
+    if "pixel_values" in inputs:
+        inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
+    streamer = TextStreamer(processor.tokenizer)
+    qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
+        model_config["model_name"],
+        kv_offload=kv_offload,
+        config=config,
+    )
+
+    # pytorch_kv_tokens = api_runner.run_vlm_kv_model_on_pytorch(qeff_model.model)
+    # assert (pytorch_kv_tokens == pytorch_hf_tokens).all(), (
+    #     "Tokens don't match for pytorch HF output and pytorch KV output"
+    # )
+
+    with_sub_func_onnx = qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False)
+    without_sub_func_onnx = qeff_model.export(use_onnx_subfunctions=False)  # both results are used as onnx_path below
+
+    if not get_available_device_id():  # without a device only the two exports above are exercised
+        pytest.skip("No available devices to run model on Cloud AI 100")
+
+    inputs = processor(images=image, text=prompt, return_tensors="pt")
+    if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl":
+        inputs = qeff_model.model.prepare_inputs_for_generation(
+            inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size
+        )
+    if "pixel_values" in inputs:
+        inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
+
+    qeff_model.compile(  # first compile: the export produced with subfunctions
+        img_size=model_config["img_size"],
+        num_devices=num_devices,
+        prefill_seq_len=prompt_len,
+        ctx_len=ctx_len,
+        mxfp6=False,
+        enable_qnn=enable_qnn,
+        qnn_config=qnn_config,
+        onnx_path=with_sub_func_onnx,
+    )
+
+    print("Output With Subfunction Enabled:")
+    output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer)
+    tokens_sub = output.generated_ids[:, :-1]  # the last generated position is dropped before comparing
+
+    qeff_model.compile(  # second compile: the export produced without subfunctions
+        img_size=model_config["img_size"],
+        num_devices=num_devices,
+        prefill_seq_len=prompt_len,
+        ctx_len=ctx_len,
+        mxfp6=False,
+        enable_qnn=enable_qnn,
+        qnn_config=qnn_config,
+        onnx_path=without_sub_func_onnx,
+    )
+
+    print("Output With Subfunction Not Enabled:")
+    output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer)
+    tokens_no_sub = output.generated_ids[:, :-1]  # NOTE(review): assert text below says "pytorch HF"/"QPC"; both sides come from qeff_model.generate()
+
+    assert (tokens_sub == tokens_no_sub).all(), "Tokens don't match for pytorch HF output and QPC output"
+    return

From 441e2ba1ec7d8867d1bb2ca4b19f1b81795cc186 Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Tue, 13 Jan 2026 06:46:05 +0000
Subject: [PATCH 04/12] Made minor fixes

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 QEfficient/transformers/models/falcon/modeling_falcon.py      | 2 +-
 QEfficient/transformers/models/gemma/modeling_gemma.py        | 2 +-
 QEfficient/transformers/models/gemma2/modeling_gemma2.py      | 2 +-
 QEfficient/transformers/models/gemma3/modeling_gemma3.py      | 4 ++--
 QEfficient/transformers/models/gpt2/modeling_gpt2.py          | 2 +-
 .../transformers/models/gpt_bigcode/modeling_gpt_bigcode.py   | 2 +-
 QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py    | 2 +-
 QEfficient/transformers/models/gptj/modeling_gptj.py          | 2 +-
 QEfficient/transformers/models/granite/modeling_granite.py    | 2 +-
 .../transformers/models/granitemoe/modeling_granitemoe.py     | 2 +-
 QEfficient/transformers/models/grok_1/modeling_grok1.py       | 2 +-
 QEfficient/transformers/models/internvl/modeling_internvl.py  | 4 ++--
 QEfficient/transformers/models/llama/modeling_llama.py        | 2 +-
 QEfficient/transformers/models/llama4/modeling_llama4.py      | 4 ++--
 .../models/llama_swiftkv/modeling_llama_swiftkv.py            | 2 +-
 QEfficient/transformers/models/llava/modeling_llava.py        | 4 ++--
 .../transformers/models/llava_next/modeling_llava_next.py     | 2 +-
 QEfficient/transformers/models/mistral/modeling_mistral.py    | 2 +-
 QEfficient/transformers/models/mistral3/modeling_mistral3.py  | 4 ++--
 .../transformers/models/mixtral_moe/modeling_mixtral.py       | 2 +-
 QEfficient/transformers/models/molmo/modeling_molmo.py        | 4 ++--
 QEfficient/transformers/models/mpt/modeling_mpt.py            | 2 +-
 QEfficient/transformers/models/olmo2/modeling_olmo2.py        | 2 +-
 QEfficient/transformers/models/phi/modeling_phi.py            | 2 +-
 QEfficient/transformers/models/phi3/modeling_phi3.py          | 2 +-
 QEfficient/transformers/models/qwen2/modeling_qwen2.py        | 2 +-
 .../transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py     | 4 ++--
 QEfficient/transformers/models/qwen3/modeling_qwen3.py        | 2 +-
 .../transformers/models/qwen3_moe/modeling_qwen3_moe.py       | 2 +-
 .../transformers/models/starcoder2/modeling_starcoder2.py     | 2 +-
 QEfficient/transformers/models/whisper/modeling_whisper.py    | 2 +-
 QEfficient/utils/export_utils.py                              | 2 +-
 tests/transformers/test_causal_lm.py                          | 3 +--
 33 files changed, 40 insertions(+), 41 deletions(-)

diff --git a/QEfficient/transformers/models/falcon/modeling_falcon.py b/QEfficient/transformers/models/falcon/modeling_falcon.py
index e29a06241..4ebb2fb96 100644
--- a/QEfficient/transformers/models/falcon/modeling_falcon.py
+++ b/QEfficient/transformers/models/falcon/modeling_falcon.py
@@ -354,7 +354,7 @@ class QEffFalconForCausalLM(FalconForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/gemma/modeling_gemma.py b/QEfficient/transformers/models/gemma/modeling_gemma.py
index 59a9d6809..260d1857a 100644
--- a/QEfficient/transformers/models/gemma/modeling_gemma.py
+++ b/QEfficient/transformers/models/gemma/modeling_gemma.py
@@ -336,7 +336,7 @@ class QEffGemmaForCausalLM(GemmaForCausalLM):
     - add new args cache idx for the kv retention
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/gemma2/modeling_gemma2.py b/QEfficient/transformers/models/gemma2/modeling_gemma2.py
index 00df57240..6dee8c85d 100644
--- a/QEfficient/transformers/models/gemma2/modeling_gemma2.py
+++ b/QEfficient/transformers/models/gemma2/modeling_gemma2.py
@@ -388,7 +388,7 @@ class QEffGemma2ForCausalLM(Gemma2ForCausalLM, GenerationMixin):
     - add new args cache idx for the kv retention
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py
index 29f7b13d0..930cf5141 100644
--- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py
+++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py
@@ -589,7 +589,7 @@ def __init__(self, model):
         self.model = model
         self.model.vision_model = self.model.vision_tower
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
@@ -611,7 +611,7 @@ def __init__(self, model):
         self.config = self.model.config
         self.lm_head = self.model.lm_head
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/gpt2/modeling_gpt2.py b/QEfficient/transformers/models/gpt2/modeling_gpt2.py
index ab452baea..7de674cce 100644
--- a/QEfficient/transformers/models/gpt2/modeling_gpt2.py
+++ b/QEfficient/transformers/models/gpt2/modeling_gpt2.py
@@ -397,7 +397,7 @@ class QEffGPT2LMHeadModel(GPT2LMHeadModel):
     - add new args position idx for the cache_kwargs for kv retention
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
index 604def959..d1220589f 100644
--- a/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
+++ b/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
@@ -378,7 +378,7 @@ def forward(
 
 
 class QEffGPTBigCodeForCausalLM(GPTBigCodeForCausalLM):
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py
index b82cd7c81..57bcb842d 100644
--- a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py
+++ b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py
@@ -1205,7 +1205,7 @@ def forward(
 
 
 class QEffGptOssForCausalLM(GptOssForCausalLM):
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
 
diff --git a/QEfficient/transformers/models/gptj/modeling_gptj.py b/QEfficient/transformers/models/gptj/modeling_gptj.py
index 2a7c475ed..a4c81dbec 100644
--- a/QEfficient/transformers/models/gptj/modeling_gptj.py
+++ b/QEfficient/transformers/models/gptj/modeling_gptj.py
@@ -318,7 +318,7 @@ class QEffGPTJForCausalLM(GPTJForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/granite/modeling_granite.py b/QEfficient/transformers/models/granite/modeling_granite.py
index c791b02f4..8a32c52ef 100644
--- a/QEfficient/transformers/models/granite/modeling_granite.py
+++ b/QEfficient/transformers/models/granite/modeling_granite.py
@@ -347,7 +347,7 @@ class QEffGraniteForCausalLM(GraniteForCausalLM):
     Copied from GraniteForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/granite/modeling_granite.py
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py
index fbeaae68c..07cba09d5 100644
--- a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py
+++ b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py
@@ -493,7 +493,7 @@ class QEffGraniteMoeForCausalLM(GraniteMoeForCausalLM):
     Copied from GraniteForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/granite/modeling_granite.py
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/grok_1/modeling_grok1.py b/QEfficient/transformers/models/grok_1/modeling_grok1.py
index 9c1e7c4b6..1a1c919bb 100644
--- a/QEfficient/transformers/models/grok_1/modeling_grok1.py
+++ b/QEfficient/transformers/models/grok_1/modeling_grok1.py
@@ -397,7 +397,7 @@ class QEffGrok1ModelForCausalLM(nn.Module):
     Grok model for causal language modeling.
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py
index 026b1f9ae..e389e6a84 100644
--- a/QEfficient/transformers/models/internvl/modeling_internvl.py
+++ b/QEfficient/transformers/models/internvl/modeling_internvl.py
@@ -21,7 +21,7 @@ def __init__(self, model):
         super().__init__()
         self.model = model
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
@@ -45,7 +45,7 @@ def __init__(self, model):
         self.config = self.model.language_model.config
         self.language_model = self.model.language_model
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of  class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/llama/modeling_llama.py b/QEfficient/transformers/models/llama/modeling_llama.py
index 065db2193..57bccdb1b 100644
--- a/QEfficient/transformers/models/llama/modeling_llama.py
+++ b/QEfficient/transformers/models/llama/modeling_llama.py
@@ -404,7 +404,7 @@ class QEffLlamaForCausalLM(LlamaForCausalLM):
     Copied from LlamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/llama4/modeling_llama4.py b/QEfficient/transformers/models/llama4/modeling_llama4.py
index 16a576d02..3abaef5a7 100644
--- a/QEfficient/transformers/models/llama4/modeling_llama4.py
+++ b/QEfficient/transformers/models/llama4/modeling_llama4.py
@@ -822,7 +822,7 @@ def __init__(self, model):
         super().__init__()
         self.model = model
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
@@ -858,7 +858,7 @@ def __init__(self, model):
         self.language_model = self.model.language_model
         self.config = self.model.config
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index be1cc8cdc..e219d5e03 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -416,7 +416,7 @@ def __init__(self, config: QEffLlamaSwiftKVConfig):
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.config = config
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py
index 64fc41c09..48b002a31 100644
--- a/QEfficient/transformers/models/llava/modeling_llava.py
+++ b/QEfficient/transformers/models/llava/modeling_llava.py
@@ -30,7 +30,7 @@ def __init__(self, model):
         self.model = model
         self.model.vision_model = self.model.vision_tower
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
@@ -63,7 +63,7 @@ def __init__(self, model):
         self.language_model = self.model.language_model
         self.lm_head = self.model.lm_head
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/llava_next/modeling_llava_next.py b/QEfficient/transformers/models/llava_next/modeling_llava_next.py
index a51272980..8b338420e 100755
--- a/QEfficient/transformers/models/llava_next/modeling_llava_next.py
+++ b/QEfficient/transformers/models/llava_next/modeling_llava_next.py
@@ -30,7 +30,7 @@ def __init__(self, model):
         self.model = model
         self.model.vision_model = self.model.vision_tower
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/mistral/modeling_mistral.py b/QEfficient/transformers/models/mistral/modeling_mistral.py
index de9b1a7e6..47107384e 100644
--- a/QEfficient/transformers/models/mistral/modeling_mistral.py
+++ b/QEfficient/transformers/models/mistral/modeling_mistral.py
@@ -356,7 +356,7 @@ class QEffMistralForCausalLM(MistralForCausalLM):
     - add new args cache idx for the kv retention
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py
index 3bf151b97..d1391a71a 100644
--- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py
+++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py
@@ -151,7 +151,7 @@ def __init__(self, model):
         self.model = model
         self.model.vision_model = self.model.vision_tower
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
@@ -177,7 +177,7 @@ def __init__(self, model):
         self.config = self.model.config
         self.language_model = self.model.language_model
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py
index f811bea65..ec7a9a8c8 100644
--- a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py
+++ b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py
@@ -414,7 +414,7 @@ class QEffMixtralForCausalLM(MixtralForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/molmo/modeling_molmo.py b/QEfficient/transformers/models/molmo/modeling_molmo.py
index 093e468ff..57f2729b9 100644
--- a/QEfficient/transformers/models/molmo/modeling_molmo.py
+++ b/QEfficient/transformers/models/molmo/modeling_molmo.py
@@ -568,7 +568,7 @@ def __init__(self, model):
         super().__init__()
         self.model = model
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
@@ -597,7 +597,7 @@ def __init__(self, model):
         # self.language_model = self.model.language_model
         self.config = self.model.config
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/mpt/modeling_mpt.py b/QEfficient/transformers/models/mpt/modeling_mpt.py
index 929e157cc..5a808c7f2 100644
--- a/QEfficient/transformers/models/mpt/modeling_mpt.py
+++ b/QEfficient/transformers/models/mpt/modeling_mpt.py
@@ -254,7 +254,7 @@ class QEffMptForCausalLM(MptForCausalLM):
     - add new args cache idx for the kv retention
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/olmo2/modeling_olmo2.py b/QEfficient/transformers/models/olmo2/modeling_olmo2.py
index 02645e185..c79ad7fae 100644
--- a/QEfficient/transformers/models/olmo2/modeling_olmo2.py
+++ b/QEfficient/transformers/models/olmo2/modeling_olmo2.py
@@ -324,7 +324,7 @@ class QEffOlmo2ForCausalLM(Olmo2ForCausalLM):
     - add new args cache idx for the kv retention
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/phi/modeling_phi.py b/QEfficient/transformers/models/phi/modeling_phi.py
index 2efbb313e..82f18b7e0 100644
--- a/QEfficient/transformers/models/phi/modeling_phi.py
+++ b/QEfficient/transformers/models/phi/modeling_phi.py
@@ -323,7 +323,7 @@ class QEffPhiForCausalLM(PhiForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/phi3/modeling_phi3.py b/QEfficient/transformers/models/phi3/modeling_phi3.py
index e25deed37..b48ab2897 100644
--- a/QEfficient/transformers/models/phi3/modeling_phi3.py
+++ b/QEfficient/transformers/models/phi3/modeling_phi3.py
@@ -351,7 +351,7 @@ class QEffPhi3ForCausalLM(Phi3ForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/qwen2/modeling_qwen2.py b/QEfficient/transformers/models/qwen2/modeling_qwen2.py
index 7404f2f6c..841df6526 100644
--- a/QEfficient/transformers/models/qwen2/modeling_qwen2.py
+++ b/QEfficient/transformers/models/qwen2/modeling_qwen2.py
@@ -350,7 +350,7 @@ class QEffQwen2ForCausalLM(Qwen2ForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index 784ebdd84..e8b95dec6 100644
--- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -870,7 +870,7 @@ def __init__(self, model):
         self.model = model
         self.model.vision_model = self.model.visual
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
@@ -894,7 +894,7 @@ def __init__(self, model):
         self.model = model
         self.language_model = self.model.model.language_model
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/qwen3/modeling_qwen3.py b/QEfficient/transformers/models/qwen3/modeling_qwen3.py
index b310499be..ccc4bbac2 100644
--- a/QEfficient/transformers/models/qwen3/modeling_qwen3.py
+++ b/QEfficient/transformers/models/qwen3/modeling_qwen3.py
@@ -351,7 +351,7 @@ class QEffQwen3ForCausalLM(Qwen3ForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py
index 18e1e7611..5270a5c54 100644
--- a/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py
+++ b/QEfficient/transformers/models/qwen3_moe/modeling_qwen3_moe.py
@@ -371,7 +371,7 @@ def forward(
 
 
 class QEffQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py
index 3387f0fba..fdbbbf05d 100644
--- a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py
+++ b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py
@@ -275,7 +275,7 @@ class QEffStarcoder2ForCausalLM(Starcoder2ForCausalLM):
     - update the hidden_states, and fix for onnx model
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py
index 650258328..246f005a7 100644
--- a/QEfficient/transformers/models/whisper/modeling_whisper.py
+++ b/QEfficient/transformers/models/whisper/modeling_whisper.py
@@ -718,7 +718,7 @@ class QEffWhisperForConditionalGeneration(WhisperForConditionalGeneration):
     - changed forward inputs decoder_input_ids and decoder_position_ids to input_ids and position_ids
     """
 
-    def get_repeated_layer_class(self) -> Type[nn.Module]:
+    def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
         Notes:
diff --git a/QEfficient/utils/export_utils.py b/QEfficient/utils/export_utils.py
index 9380ae440..bba282b99 100644
--- a/QEfficient/utils/export_utils.py
+++ b/QEfficient/utils/export_utils.py
@@ -182,7 +182,7 @@ def _setup_onnx_subfunctions(qeff_model, args, kwargs):
     qeff_model._onnx_transforms.append(CustomOpTransform)
 
     # TODO: Handle this in the modelling class QEFFTransformersBase,remove from here. Refer diffusers implementation
-    decoder_layer_classes = qeff_model.model.get_repeated_layer_class()
+    decoder_layer_classes = qeff_model.model.get_submodules_for_export()
     if decoder_layer_classes:
         kwargs["export_modules_as_functions"] = decoder_layer_classes
     return args, kwargs
diff --git a/tests/transformers/test_causal_lm.py b/tests/transformers/test_causal_lm.py
index 6480fcdc9..fc89fdf8b 100644
--- a/tests/transformers/test_causal_lm.py
+++ b/tests/transformers/test_causal_lm.py
@@ -14,7 +14,6 @@
 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
 
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
-from QEfficient.transformers.models.pytorch_transforms import get_decoder_layer_classes_for_export
 from QEfficient.utils import constants, get_padding_shape_from_config
 from QEfficient.utils.hash_utils import hash_dict_params
 
@@ -225,7 +224,7 @@ def test_causal_lm_hash_creation(config, cb, subfunc, prefill_only, tmp_path):
     export_params["dynamic_axes"] = dynamic_axes
     hash_params["export_params"] = export_params
     if subfunc:
-        hash_params["export_modules_as_functions"] = get_decoder_layer_classes_for_export(qeff_model.model)
+        hash_params["export_modules_as_functions"] = qeff_model.model.get_submodules_for_export()
 
     manual_hash = hash_dict_params(hash_params)
 

From fc71b963a98d73b3901b021164df28f93b4e5616 Mon Sep 17 00:00:00 2001
From: Abhishek Kumar Singh <sabhis@qti.qualcomm.com>
Date: Wed, 14 Jan 2026 12:07:57 +0530
Subject: [PATCH 05/12] Add get_submodules_for_export to QEffCodeGenForCausalLM

Signed-off-by: Abhishek Kumar Singh <sabhis@qti.qualcomm.com>
---
 .../transformers/models/codegen/modeling_codegen.py    | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/codegen/modeling_codegen.py b/QEfficient/transformers/models/codegen/modeling_codegen.py
index 3addd7501..d85791912 100644
--- a/QEfficient/transformers/models/codegen/modeling_codegen.py
+++ b/QEfficient/transformers/models/codegen/modeling_codegen.py
@@ -295,7 +295,15 @@ class QEffCodeGenForCausalLM(CodeGenForCausalLM):
     - add new args position idx for the cache_kwargs for kv retention
     - update the hidden_states, and fix for onnx model
     """
-
+    def get_submodules_for_export(self) -> Type[nn.Module]:
+        """
+        Return the set of class used as the repeated layer across the model for subfunction extraction.
+        Notes:
+            This method should return the *class object* (not an instance).
+            Downstream code can use this to find/build subfunctions for repeated blocks.
+        """
+        return {QEffCodeGenBlock}
+    
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

From 1b2800240221be10c05403bb44ae6320e779ba4b Mon Sep 17 00:00:00 2001
From: Abhishek kumar singh <sabhis@qti.qualcomm.com>
Date: Wed, 14 Jan 2026 08:04:11 +0000
Subject: [PATCH 06/12] Resolved lint error

Signed-off-by: Abhishek kumar singh <sabhis@qti.qualcomm.com>
---
 QEfficient/transformers/models/codegen/modeling_codegen.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/QEfficient/transformers/models/codegen/modeling_codegen.py b/QEfficient/transformers/models/codegen/modeling_codegen.py
index d85791912..21968a7c0 100644
--- a/QEfficient/transformers/models/codegen/modeling_codegen.py
+++ b/QEfficient/transformers/models/codegen/modeling_codegen.py
@@ -7,7 +7,7 @@
 
 """PyTorch Codegen model."""
 
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Type, Union
 
 import torch
 from torch import nn
@@ -295,6 +295,7 @@ class QEffCodeGenForCausalLM(CodeGenForCausalLM):
     - add new args position idx for the cache_kwargs for kv retention
     - update the hidden_states, and fix for onnx model
     """
+
     def get_submodules_for_export(self) -> Type[nn.Module]:
         """
         Return the set of class used as the repeated layer across the model for subfunction extraction.
@@ -303,7 +304,7 @@ def get_submodules_for_export(self) -> Type[nn.Module]:
             Downstream code can use this to find/build subfunctions for repeated blocks.
         """
         return {QEffCodeGenBlock}
-    
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

From 2b1f09cf307a6ccc880503f1d7f669a9888e0dda Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Fri, 16 Jan 2026 12:17:30 +0000
Subject: [PATCH 07/12] Add llava_next get_submodules_for_export and parametrized VLM subfunction test

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 .../models/llava_next/modeling_llava_next.py  |  9 ++++++
 .../test_subfunction_vlm.py                   | 30 ++++++++++++++++++-
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/llava_next/modeling_llava_next.py b/QEfficient/transformers/models/llava_next/modeling_llava_next.py
index 8b338420e..59d5cad22 100755
--- a/QEfficient/transformers/models/llava_next/modeling_llava_next.py
+++ b/QEfficient/transformers/models/llava_next/modeling_llava_next.py
@@ -137,6 +137,15 @@ def __init__(self, model):
         self.language_model = self.model.language_model
         self.lm_head = self.model.lm_head
 
+    def get_submodules_for_export(self) -> Type[nn.Module]:
+        """
+        Return the set of classes used as the repeated layer across the model for subfunction extraction.
+        Notes:
+            This method should return a set of *class objects* (not instances).
+            Downstream code can use this to find/build subfunctions for repeated blocks.
+        """
+        return {self.model.language_model.layers[0].__class__}
+
     def forward(
         self,
         input_ids,
diff --git a/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py
index 88f89c618..008280f72 100644
--- a/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py
+++ b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py
@@ -242,7 +242,7 @@ def set_num_layers(config, n_layer=1):
     return config
 
 
-def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
+def test_image_text_to_text_subfunction_core(
     model_name: str,
     img_size: int,
     img_url: str,
@@ -317,6 +317,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
         mxfp6=False,
         enable_qnn=enable_qnn,
         qnn_config=qnn_config,
+        offload_pt_weights=True,
         onnx_path=with_sub_func_onnx,
     )
 
@@ -341,3 +342,30 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
 
     assert (tokens_sub == tokens_no_sub).all(), "Tokens don't match for pytorch HF output and QPC output"
     return
+
+
+@pytest.mark.on_qaic
+@pytest.mark.multimodal
+@pytest.mark.parametrize(
+    "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config
+)
+def test_image_text_to_text_subfunction(
+    model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer
+):
+    """
+    Test that a model exported with ONNX subfunctions generates the same tokens as the export without subfunctions, without continuous batching.
+    ``Mandatory`` Args:
+        :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
+    """
+    test_image_text_to_text_subfunction_core(
+        model_name=model_name,
+        prompt_len=prompt_len,
+        ctx_len=ctx_len,
+        max_gen_len=NEW_GENERATION_TOKENS,
+        img_size=img_size,
+        img_url=img_url,
+        query=query,
+        n_layer=n_layer,
+        batch_size=batch_size,
+        kv_offload=kv_offload,
+    )

From f06028afb6fdc7394f442fc8e999f601f516a260 Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Sun, 18 Jan 2026 14:57:07 +0000
Subject: [PATCH 08/12] Fixed test file for subfunction

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 .../transformers/models/modeling_auto.py      |   3 +-
 .../models/qwen2_5_vl/modeling_qwen2_5_vl.py  |   4 +-
 .../test_subfunction_vlm.py                   | 343 ++++++++++++++----
 3 files changed, 268 insertions(+), 82 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 55253f9b0..bad767b65 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1031,12 +1031,13 @@ def export(
             use_onnx_subfunctions=use_onnx_subfunctions,
         )
 
+        offload_pt_weights = kwargs.get("offload_pt_weights", True)
         self.lang_model.export(
             inputs["lang"],
             output_names["lang"],
             dynamic_axes["lang"],
             export_dir=export_dir,
-            offload_pt_weights=True,
+            offload_pt_weights=offload_pt_weights,
             use_onnx_subfunctions=use_onnx_subfunctions,
         )
 
diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index e8b95dec6..d6bfbda81 100644
--- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -76,8 +76,8 @@ def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, mrope_section, unsqu
 
     cos = cos[position_ids]
     sin = sin[position_ids]
-    cos = torch.cat([cos[0, ..., 0:32], cos[1, ..., 32:80], cos[2, ..., 80:128]], dim=-1).unsqueeze(0)
-    sin = torch.cat([sin[0, ..., 0:32], sin[1, ..., 32:80], sin[2, ..., 80:128]], dim=-1).unsqueeze(0)
+    cos = torch.cat([cos[0, ..., 0:32], cos[1, ..., 32:80], cos[2, ..., 80:128]], dim=-1).unsqueeze(unsqueeze_dim)
+    sin = torch.cat([sin[0, ..., 0:32], sin[1, ..., 32:80], sin[2, ..., 80:128]], dim=-1).unsqueeze(unsqueeze_dim)
 
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
diff --git a/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py
index 008280f72..e683ea859 100644
--- a/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py
+++ b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py
@@ -5,7 +5,8 @@
 #
 # ----------------------------------------------------------------------------
 
-from typing import Optional
+from io import BytesIO
+from typing import List, Optional
 
 import pytest
 import requests
@@ -16,13 +17,15 @@
     AutoModelForCausalLM,
     AutoModelForImageTextToText,
     AutoProcessor,
+    AutoTokenizer,
     TextStreamer,
 )
 
-from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForImageTextToText
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM, QEFFAutoModelForImageTextToText
 from QEfficient.utils import hf_download
 from QEfficient.utils._utils import get_num_layers_vlm
 from QEfficient.utils.device_utils import get_available_device_id
+from QEfficient.utils.test_utils import InternProcessor
 
 NEW_GENERATION_TOKENS = 10
 test_models_config = [
@@ -49,29 +52,29 @@
         "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
         1,
     ),
-    (
-        "llava-hf/llava-1.5-7b-hf",
-        False,
-        1,
-        784,
-        1024,
-        336,
-        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
-        "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
-        1,
-    ),
-    # Disabled in CI due to performance issues
     # (
-    #     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    #     "llava-hf/llava-1.5-7b-hf",
     #     True,
     #     1,
-    #     128,
-    #     3072,
+    #     784,
+    #     1024,
     #     336,
     #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
     #     "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
-    #     4,
+    #     1,
     # ),
+    # Disabled in CI due to performance issues
+    (
+        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+        True,
+        1,
+        128,
+        3072,
+        336,
+        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
+        "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
+        4,
+    ),
     # (
     #     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
     #     False,
@@ -94,17 +97,17 @@
         "Can you describe the image in detail.",
         1,
     ),
-    (
-        "google/gemma-3-4b-it",
-        False,
-        1,
-        128,
-        3072,
-        896,
-        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
-        "Can you describe the image in detail.",
-        1,
-    ),
+    # (
+    #     "google/gemma-3-4b-it",
+    #     True,
+    #     1,
+    #     128,
+    #     3072,
+    #     896,
+    #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
+    #     "Can you describe the image in detail.",
+    #     1,
+    # ),
     (
         "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
         True,
@@ -116,39 +119,39 @@
         "Can you describe the image in detail.",
         1,
     ),
+    # (
+    #     "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
+    #     True,
+    #     1,
+    #     128,
+    #     4096,
+    #     1540,
+    #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
+    #     "Can you describe the image in detail.",
+    #     1,
+    # ),
     (
-        "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
-        False,
+        "Qwen/Qwen2.5-VL-3B-Instruct",
+        True,
         1,
         128,
         4096,
         1540,
-        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
+        "https://picsum.photos/id/237/536/354",
         "Can you describe the image in detail.",
         1,
     ),
     (
-        "Qwen/Qwen2.5-VL-3B-Instruct",
+        "meta-llama/Llama-3.2-11B-Vision-Instruct",
         True,
         1,
-        128,
-        4096,
-        1540,
-        "https://picsum.photos/id/237/536/354",
-        "Can you describe the image in detail.",
-        1,
+        32,
+        512,
+        560,
+        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg",
+        "Explain this image",
+        7,
     ),
-    # (
-    #     "meta-llama/Llama-3.2-11B-Vision-Instruct",
-    #     True,
-    #     1,
-    #     32,
-    #     512,
-    #     560,
-    #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg",
-    #     "Explain this image",
-    #     7,
-    # ),
 ]
 
 intern_model_config = [
@@ -186,16 +189,16 @@
 
 molmo_model_config = [
     # Disabled in CI due to HF issues
-    # (
-    #     "allenai/Molmo-7B-D-0924",
-    #     True,
-    #     1,
-    #     128,
-    #     4096,
-    #     "https://picsum.photos/id/237/536/354",
-    #     "Can you describe the image in detail.",
-    #     2,
-    # ),
+    (
+        "allenai/Molmo-7B-D-0924",
+        True,
+        1,
+        128,
+        4096,
+        "https://picsum.photos/id/237/536/354",
+        "Can you describe the image in detail.",
+        2,
+    ),
 ]
 
 
@@ -242,7 +245,7 @@ def set_num_layers(config, n_layer=1):
     return config
 
 
-def test_image_text_to_text_subfunction_core(
+def check_image_text_to_text_subfunction_core(
     model_name: str,
     img_size: int,
     img_url: str,
@@ -290,13 +293,7 @@ def test_image_text_to_text_subfunction_core(
         config=config,
     )
 
-    # pytorch_kv_tokens = api_runner.run_vlm_kv_model_on_pytorch(qeff_model.model)
-    # assert (pytorch_kv_tokens == pytorch_hf_tokens).all(), (
-    #     "Tokens don't match for pytorch HF output and pytorch KV output"
-    # )
-
-    with_sub_func_onnx = qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False)
-    without_sub_func_onnx = qeff_model.export(use_onnx_subfunctions=False)
+    qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False)
 
     if not get_available_device_id():
         pytest.skip("No available devices to run model on Cloud AI 100")
@@ -317,30 +314,176 @@ def test_image_text_to_text_subfunction_core(
         mxfp6=False,
         enable_qnn=enable_qnn,
         qnn_config=qnn_config,
-        offload_pt_weights=True,
-        onnx_path=with_sub_func_onnx,
     )
 
-    print("Output With Subfunction Enabled:")
     output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer)
-    tokens_sub = output.generated_ids[:, :-1]
+    print("Output With Subfunction Enabled:\n", output)
+    return
+
+
+def check_image_text_to_text_subfunction_molmo(
+    model_name: str,
+    img_url: str,
+    query: str,
+    prompt_len: int,
+    ctx_len: int,
+    max_gen_len: int = 20,
+    batch_size: int = 1,
+    n_layer: int = 1,
+    kv_offload: bool = False,
+    num_devices: int = 1,
+    enable_qnn: Optional[bool] = False,
+    qnn_config: Optional[str] = None,
+):
+    model_config = {"model_name": model_name}
+
+    config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True)
+    config._attn_implementation = "eager"
+    config = set_num_layers(config, n_layer=n_layer)
+    model_hf, _ = load_image_text_to_text_model(config)
+    n_layer = (n_layer, n_layer)
+
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True)
+    img = requests.get(img_url, stream=True)
+    image = Image.open(BytesIO(img.content)).convert("RGB")
+    image = image.resize((536, 354))
+
+    inputs = processor.process(images=[image], text=query)
+    inputs = {k: v.unsqueeze(0) for k, v in inputs.items()}
+
+    batch_size, prompt_len = inputs["input_ids"].shape
+    inputs["attention_mask"] = torch.ones((inputs["input_ids"].shape), dtype=torch.int64)
+    valid = inputs["image_input_idx"] > 0
+    valid = valid.reshape(1, -1)
+    inputs["valid_idx"] = torch.nonzero(valid)[:, 1].unsqueeze(0)
+    inputs["pixel_values"] = inputs.pop("images")
+
+    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
+        model_config["model_name"],
+        kv_offload=kv_offload,
+        config=config,
+    )
+
+    qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False)
+
+    if not get_available_device_id():
+        pytest.skip("No available devices to run model on Cloud AI 100")
+
+    if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl":
+        inputs = qeff_model.model.prepare_inputs_for_generation(
+            inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size
+        )
+    if "pixel_values" in inputs:
+        inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
 
     qeff_model.compile(
-        img_size=model_config["img_size"],
+        num_devices=num_devices,
+        prefill_seq_len=prompt_len,
+        ctx_len=ctx_len,
+        mxfp6=False,
+    )
+
+    streamer = TextStreamer(processor.tokenizer)
+    output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer)
+    print("Output With Subfunction Enabled:\n", output)
+    return
+
+
+def check_image_text_to_text_subfunction_internvl(
+    model_name: str,
+    img_url: str,
+    query: str,
+    prompt_len: int,
+    ctx_len: int,
+    max_gen_len: int = 20,
+    batch_size: int = 1,
+    n_layer: int = 1,
+    kv_offload: bool = False,
+    num_devices: int = 1,
+    enable_qnn: Optional[bool] = False,
+    qnn_config: Optional[str] = None,
+):
+    model_config = {"model_name": model_name}
+
+    config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True)
+    config._attn_implementation = "eager"
+    config = set_num_layers(config, n_layer=n_layer)
+    model_hf = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        low_cpu_mem_usage=False,
+        trust_remote_code=True,
+        config=config,
+    )
+    n_layer = get_num_layers_vlm(config)
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
+    processor = InternProcessor(model_hf, tokenizer)
+
+    prompt = [query]
+    img_url = [img_url]
+    pixel_values = []
+    num_patches_list = []
+    questions = []
+    for i in range(len(prompt)):
+        img = requests.get(img_url[i], stream=True)
+        image = Image.open(BytesIO(img.content)).convert("RGB")
+
+        image = image.resize((448, 448))
+
+        # preprocess the resized image
+        pixel_value = processor.load_image(image, max_num=12)
+        num_patches_list.append(pixel_value.shape[0])
+        pixel_values.append(pixel_value)
+
+        question = "<image>\n" + prompt[i]
+        questions.append(question)
+
+    pixel_values = torch.cat(pixel_values, dim=0)
+
+    # Chat Template information for prompt preprocessing
+    messages: List[List[str]] = []
+    roles = ("<|im_start|>user\n", "<|im_start|>assistant\n")
+    prompt = processor(pixel_values, questions, messages, roles, num_patches_list=num_patches_list)
+
+    inputs = tokenizer(prompt, return_tensors="pt")
+    batch_size, prompt_len = inputs["input_ids"].shape
+    inputs["pixel_values"] = pixel_values.clone()
+
+    generation_config = dict(max_new_tokens=max_gen_len, do_sample=False)
+    generation_config["eos_token_id"] = tokenizer.convert_tokens_to_ids("<|im_end|>\n".strip())
+
+    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
+        model_config["model_name"],
+        kv_offload=kv_offload,
+        config=config,
+    )
+
+    streamer = TextStreamer(processor.tokenizer)
+    qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False)
+
+    if not get_available_device_id():
+        pytest.skip("No available devices to run model on Cloud AI 100")
+
+    inputs = processor(images=image, text=prompt, return_tensors="pt")
+    if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl":
+        inputs = qeff_model.model.prepare_inputs_for_generation(
+            inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size
+        )
+    if "pixel_values" in inputs:
+        inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
+
+    qeff_model.compile(
+        num_patches=1,
         num_devices=num_devices,
         prefill_seq_len=prompt_len,
         ctx_len=ctx_len,
         mxfp6=False,
         enable_qnn=enable_qnn,
         qnn_config=qnn_config,
-        onnx_path=without_sub_func_onnx,
     )
 
-    print("Output With Subfunction Not Enabled:")
     output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer)
-    tokens_no_sub = output.generated_ids[:, :-1]
-
-    assert (tokens_sub == tokens_no_sub).all(), "Tokens don't match for pytorch HF output and QPC output"
+    print("Output With Subfunction Enabled:\n", output)
     return
 
 
@@ -357,7 +500,7 @@ def test_image_text_to_text_subfunction(
     ``Mandatory`` Args:
         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
     """
-    test_image_text_to_text_subfunction_core(
+    check_image_text_to_text_subfunction_core(
         model_name=model_name,
         prompt_len=prompt_len,
         ctx_len=ctx_len,
@@ -369,3 +512,45 @@ def test_image_text_to_text_subfunction(
         batch_size=batch_size,
         kv_offload=kv_offload,
     )
+
+
+@pytest.mark.on_qaic
+@pytest.mark.multimodal
+@pytest.mark.parametrize(
+    "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", molmo_model_config
+)
+def test_image_text_to_text_subfunction_molmo(
+    model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer
+):
+    check_image_text_to_text_subfunction_molmo(
+        model_name=model_name,
+        prompt_len=prompt_len,
+        ctx_len=ctx_len,
+        max_gen_len=NEW_GENERATION_TOKENS,
+        img_url=img_url,
+        query=query,
+        n_layer=n_layer,
+        batch_size=batch_size,
+        kv_offload=kv_offload,
+    )
+
+
+@pytest.mark.on_qaic
+@pytest.mark.multimodal
+@pytest.mark.parametrize(
+    "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", intern_model_config
+)
+def test_image_text_to_text_subfunction_internvl(
+    model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer
+):
+    check_image_text_to_text_subfunction_internvl(
+        model_name=model_name,
+        prompt_len=prompt_len,
+        ctx_len=ctx_len,
+        max_gen_len=NEW_GENERATION_TOKENS,
+        img_url=img_url,
+        query=query,
+        n_layer=n_layer,
+        batch_size=batch_size,
+        kv_offload=kv_offload,
+    )

From 5fd672db940f87eb6e44688604a130db751396f7 Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Mon, 19 Jan 2026 08:23:42 +0000
Subject: [PATCH 09/12] Changed test file for subfunction with VLMs

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 .../test_subfunction_vlm.py                   | 424 +-----------------
 1 file changed, 24 insertions(+), 400 deletions(-)

diff --git a/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py
index e683ea859..9e98ab7d7 100644
--- a/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py
+++ b/tests/transformers/models/image_text_to_text/test_subfunction_vlm.py
@@ -5,27 +5,23 @@
 #
 # ----------------------------------------------------------------------------
 
-from io import BytesIO
-from typing import List, Optional
+from typing import Optional
 
+import onnx
 import pytest
 import requests
 import torch
 from PIL import Image
 from transformers import (
     AutoConfig,
-    AutoModelForCausalLM,
     AutoModelForImageTextToText,
     AutoProcessor,
-    AutoTokenizer,
-    TextStreamer,
 )
 
-from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM, QEFFAutoModelForImageTextToText
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForImageTextToText
 from QEfficient.utils import hf_download
 from QEfficient.utils._utils import get_num_layers_vlm
 from QEfficient.utils.device_utils import get_available_device_id
-from QEfficient.utils.test_utils import InternProcessor
 
 NEW_GENERATION_TOKENS = 10
 test_models_config = [
@@ -41,95 +37,6 @@
     # text_prompt,
     # number of layers of the model,
     # ),
-    (
-        "llava-hf/llava-1.5-7b-hf",
-        True,
-        1,
-        784,
-        1024,
-        336,
-        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
-        "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
-        1,
-    ),
-    # (
-    #     "llava-hf/llava-1.5-7b-hf",
-    #     True,
-    #     1,
-    #     784,
-    #     1024,
-    #     336,
-    #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
-    #     "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
-    #     1,
-    # ),
-    # Disabled in CI due to performance issues
-    (
-        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-        True,
-        1,
-        128,
-        3072,
-        336,
-        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
-        "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
-        4,
-    ),
-    # (
-    #     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-    #     False,
-    #     1,
-    #     128,
-    #     3072,
-    #     336,
-    #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
-    #     "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
-    #     4,
-    # ),
-    (
-        "google/gemma-3-4b-it",
-        True,
-        1,
-        128,
-        3072,
-        896,
-        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
-        "Can you describe the image in detail.",
-        1,
-    ),
-    # (
-    #     "google/gemma-3-4b-it",
-    #     True,
-    #     1,
-    #     128,
-    #     3072,
-    #     896,
-    #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
-    #     "Can you describe the image in detail.",
-    #     1,
-    # ),
-    (
-        "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
-        True,
-        1,
-        128,
-        4096,
-        1540,
-        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
-        "Can you describe the image in detail.",
-        1,
-    ),
-    # (
-    #     "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
-    #     True,
-    #     1,
-    #     128,
-    #     4096,
-    #     1540,
-    #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
-    #     "Can you describe the image in detail.",
-    #     1,
-    # ),
     (
         "Qwen/Qwen2.5-VL-3B-Instruct",
         True,
@@ -141,64 +48,6 @@
         "Can you describe the image in detail.",
         1,
     ),
-    (
-        "meta-llama/Llama-3.2-11B-Vision-Instruct",
-        True,
-        1,
-        32,
-        512,
-        560,
-        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg",
-        "Explain this image",
-        7,
-    ),
-]
-
-intern_model_config = [
-    (
-        "OpenGVLab/InternVL2_5-1B",
-        True,
-        1,
-        384,
-        512,
-        "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg",
-        "Please describe the image in detail.",
-        2,
-    ),
-    (
-        "OpenGVLab/InternVL3_5-1B",
-        True,
-        1,
-        384,
-        512,
-        "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg",
-        "Please describe the image in detail.",
-        2,
-    ),
-    # (
-    #     "OpenGVLab/InternVL2_5-1B",
-    #     False,
-    #     1,
-    #     384,
-    #     512,
-    #     "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg",
-    #     "Please describe the image in detail.",
-    #     2,
-    # ), # commented becuase QNN Convertor is not supported for this model yet.
-]
-
-molmo_model_config = [
-    # Disabled in CI due to HF issues
-    (
-        "allenai/Molmo-7B-D-0924",
-        True,
-        1,
-        128,
-        4096,
-        "https://picsum.photos/id/237/536/354",
-        "Can you describe the image in detail.",
-        2,
-    ),
 ]
 
 
@@ -207,42 +56,23 @@ def load_image_text_to_text_model(model_config):
         repo_id=model_config._name_or_path,
         ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"],
     )
-    try:
-        model_hf = AutoModelForImageTextToText.from_pretrained(
-            model_path,
-            low_cpu_mem_usage=False,
-            config=model_config,
-        )
-    except ValueError:
-        model_hf = AutoModelForCausalLM.from_pretrained(
-            model_path,
-            low_cpu_mem_usage=False,
-            trust_remote_code=True,
-            config=model_config,
-        )
+
+    model_hf = AutoModelForImageTextToText.from_pretrained(
+        model_path,
+        low_cpu_mem_usage=False,
+        config=model_config,
+    )
     params = sum(p.numel() for p in model_hf.parameters())
     model_hf.eval()
     return model_hf, params
 
 
-def set_num_layers(config, n_layer=1):
-    ## -1 indicates use all the layers of the model.
-    if n_layer == -1:
-        return config
-    elif hasattr(config, "model_type") and "mllama" in config.model_type:
-        config.text_config.num_hidden_layers = n_layer
-        config.text_config.cross_attention_layers = [
-            x for x in config.text_config.cross_attention_layers if x < n_layer
-        ]
-    elif hasattr(config, "text_config"):
-        config.text_config.num_hidden_layers = n_layer
-        config.vision_config.num_hidden_layers = n_layer
-    elif hasattr(config, "llm_config"):
-        config.llm_config.num_hidden_layers = n_layer
-        config.vision_config.num_hidden_layers = n_layer
-    else:
-        config.num_hidden_layers = n_layer
-    return config
+def has_QwenLayer_function(onnx_path):
+    """Check if the ONNX model contains a QEffQwen2_5_VLDecoderLayer function definition."""
+    model = onnx.load(onnx_path, load_external_data=False)
+    function_names = [f.name for f in model.functions]
+    QwenLayer_functions = [name for name in function_names if "QEffQwen2_5_VLDecoderLayer" in name]
+    return len(QwenLayer_functions) > 0, QwenLayer_functions
 
 
 def check_image_text_to_text_subfunction_core(
@@ -263,14 +93,13 @@ def check_image_text_to_text_subfunction_core(
     model_config = {"model_name": model_name}
     model_config["img_size"] = img_size
     config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True, padding=True)
-    config = set_num_layers(config, n_layer=n_layer)
+    config.text_config.num_hidden_layers = n_layer
+    config.vision_config.num_hidden_layers = n_layer
     model_hf, _ = load_image_text_to_text_model(config)
     processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True)
 
     n_layer = get_num_layers_vlm(config)
     image = Image.open(requests.get(img_url, stream=True).raw)
-    if model_name == "mistralai/Mistral-Small-3.1-24B-Instruct-2503":
-        image = image.resize((1540, 1540))
 
     conversation = [
         {
@@ -286,14 +115,13 @@ def check_image_text_to_text_subfunction_core(
     inputs = processor(images=image, text=prompt, return_tensors="pt")
     if "pixel_values" in inputs:
         inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
-    streamer = TextStreamer(processor.tokenizer)
     qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
         model_config["model_name"],
         kv_offload=kv_offload,
         config=config,
     )
 
-    qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False)
+    with_sub_func_onnx = qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False)
 
     if not get_available_device_id():
         pytest.skip("No available devices to run model on Cloud AI 100")
@@ -306,174 +134,15 @@ def check_image_text_to_text_subfunction_core(
     if "pixel_values" in inputs:
         inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
 
-    qeff_model.compile(
-        img_size=model_config["img_size"],
-        num_devices=num_devices,
-        prefill_seq_len=prompt_len,
-        ctx_len=ctx_len,
-        mxfp6=False,
-        enable_qnn=enable_qnn,
-        qnn_config=qnn_config,
-    )
-
-    output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer)
-    print("Output With Subfunction Enabled:\n", output)
-    return
-
-
-def check_image_text_to_text_subfunction_molmo(
-    model_name: str,
-    img_url: str,
-    query: str,
-    prompt_len: int,
-    ctx_len: int,
-    max_gen_len: int = 20,
-    batch_size: int = 1,
-    n_layer: int = 1,
-    kv_offload: bool = False,
-    num_devices: int = 1,
-    enable_qnn: Optional[bool] = False,
-    qnn_config: Optional[str] = None,
-):
-    model_config = {"model_name": model_name}
-
-    config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True)
-    config._attn_implementation = "eager"
-    config = set_num_layers(config, n_layer=n_layer)
-    model_hf, _ = load_image_text_to_text_model(config)
-    n_layer = (n_layer, n_layer)
-
-    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True)
-    img = requests.get(img_url, stream=True)
-    image = Image.open(BytesIO(img.content)).convert("RGB")
-    image = image.resize((536, 354))
-
-    inputs = processor.process(images=[image], text=query)
-    inputs = {k: v.unsqueeze(0) for k, v in inputs.items()}
-
-    batch_size, prompt_len = inputs["input_ids"].shape
-    inputs["attention_mask"] = torch.ones((inputs["input_ids"].shape), dtype=torch.int64)
-    valid = inputs["image_input_idx"] > 0
-    valid = valid.reshape(1, -1)
-    inputs["valid_idx"] = torch.nonzero(valid)[:, 1].unsqueeze(0)
-    inputs["pixel_values"] = inputs.pop("images")
-
-    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
-        model_config["model_name"],
-        kv_offload=kv_offload,
-        config=config,
-    )
-
-    qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False)
-
-    if not get_available_device_id():
-        pytest.skip("No available devices to run model on Cloud AI 100")
-
-    if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl":
-        inputs = qeff_model.model.prepare_inputs_for_generation(
-            inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size
-        )
-    if "pixel_values" in inputs:
-        inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
-
-    qeff_model.compile(
-        num_devices=num_devices,
-        prefill_seq_len=prompt_len,
-        ctx_len=ctx_len,
-        mxfp6=False,
-    )
-
-    streamer = TextStreamer(processor.tokenizer)
-    output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer)
-    print("Output With Subfunction Enabled:\n", output)
-    return
-
-
-def check_image_text_to_text_subfunction_internvl(
-    model_name: str,
-    img_url: str,
-    query: str,
-    prompt_len: int,
-    ctx_len: int,
-    max_gen_len: int = 20,
-    batch_size: int = 1,
-    n_layer: int = 1,
-    kv_offload: bool = False,
-    num_devices: int = 1,
-    enable_qnn: Optional[bool] = False,
-    qnn_config: Optional[str] = None,
-):
-    model_config = {"model_name": model_name}
-
-    config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True)
-    config._attn_implementation = "eager"
-    config = set_num_layers(config, n_layer=n_layer)
-    model_hf = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        low_cpu_mem_usage=False,
-        trust_remote_code=True,
-        config=config,
-    )
-    n_layer = get_num_layers_vlm(config)
-
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
-    processor = InternProcessor(model_hf, tokenizer)
-
-    prompt = [query]
-    img_url = [img_url]
-    pixel_values = []
-    num_patches_list = []
-    questions = []
-    for i in range(len(prompt)):
-        img = requests.get(img_url[i], stream=True)
-        image = Image.open(BytesIO(img.content)).convert("RGB")
-
-        image = image.resize((448, 448))
-
-        # preprocess the resized image
-        pixel_value = processor.load_image(image, max_num=12)
-        num_patches_list.append(pixel_value.shape[0])
-        pixel_values.append(pixel_value)
-
-        question = "<image>\n" + prompt[i]
-        questions.append(question)
-
-    pixel_values = torch.cat(pixel_values, dim=0)
-
-    # Chat Template information for prompt preprocessing
-    messages: List[List[str]] = []
-    roles = ("<|im_start|>user\n", "<|im_start|>assistant\n")
-    prompt = processor(pixel_values, questions, messages, roles, num_patches_list=num_patches_list)
-
-    inputs = tokenizer(prompt, return_tensors="pt")
-    batch_size, prompt_len = inputs["input_ids"].shape
-    inputs["pixel_values"] = pixel_values.clone()
-
-    generation_config = dict(max_new_tokens=max_gen_len, do_sample=False)
-    generation_config["eos_token_id"] = tokenizer.convert_tokens_to_ids("<|im_end|>\n".strip())
-
-    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
-        model_config["model_name"],
-        kv_offload=kv_offload,
-        config=config,
+    # Verify that the model with subfunctions has QEffQwen2_5_VLDecoderLayer function definition
+    has_qwenlayer, qwenlayer_names = has_QwenLayer_function(with_sub_func_onnx[-1])
+    assert has_qwenlayer, (
+        "Model exported with use_onnx_subfunctions=True should contain QEffQwen2_5_VLDecoderLayer function definition"
     )
-
-    streamer = TextStreamer(processor.tokenizer)
-    qeff_model.export(use_onnx_subfunctions=True, offload_pt_weights=False)
-
-    if not get_available_device_id():
-        pytest.skip("No available devices to run model on Cloud AI 100")
-
-    inputs = processor(images=image, text=prompt, return_tensors="pt")
-    if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl":
-        inputs = qeff_model.model.prepare_inputs_for_generation(
-            inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size
-        )
-    if "pixel_values" in inputs:
-        inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
+    print(f"\nQwenLayer functions found: {qwenlayer_names}")
 
     qeff_model.compile(
-        num_patches=1,
+        img_size=model_config["img_size"],
         num_devices=num_devices,
         prefill_seq_len=prompt_len,
         ctx_len=ctx_len,
@@ -481,9 +150,6 @@ def check_image_text_to_text_subfunction_internvl(
         enable_qnn=enable_qnn,
         qnn_config=qnn_config,
     )
-
-    output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer)
-    print("Output With Subfunction Enabled:\n", output)
     return
 
 
@@ -512,45 +178,3 @@ def test_image_text_to_text_subfunction(
         batch_size=batch_size,
         kv_offload=kv_offload,
     )
-
-
-@pytest.mark.on_qaic
-@pytest.mark.multimodal
-@pytest.mark.parametrize(
-    "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", molmo_model_config
-)
-def test_image_text_to_text_subfunction_molmo(
-    model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer
-):
-    check_image_text_to_text_subfunction_molmo(
-        model_name=model_name,
-        prompt_len=prompt_len,
-        ctx_len=ctx_len,
-        max_gen_len=NEW_GENERATION_TOKENS,
-        img_url=img_url,
-        query=query,
-        n_layer=n_layer,
-        batch_size=batch_size,
-        kv_offload=kv_offload,
-    )
-
-
-@pytest.mark.on_qaic
-@pytest.mark.multimodal
-@pytest.mark.parametrize(
-    "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", intern_model_config
-)
-def test_image_text_to_text_subfunction_internvl(
-    model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer
-):
-    check_image_text_to_text_subfunction_internvl(
-        model_name=model_name,
-        prompt_len=prompt_len,
-        ctx_len=ctx_len,
-        max_gen_len=NEW_GENERATION_TOKENS,
-        img_url=img_url,
-        query=query,
-        n_layer=n_layer,
-        batch_size=batch_size,
-        kv_offload=kv_offload,
-    )

From dca8322240050fdddbb59c36e36bf183f36e3aa9 Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Mon, 19 Jan 2026 09:55:09 +0000
Subject: [PATCH 10/12] Made Minor Fixes

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 .../transformers/models/mistral3/modeling_mistral3.py       | 2 +-
 QEfficient/utils/export_utils.py                            | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py
index d1391a71a..a8fb34baf 100644
--- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py
+++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py
@@ -184,7 +184,7 @@ def get_submodules_for_export(self) -> Type[nn.Module]:
             This method should return the *class object* (not an instance).
             Downstream code can use this to find/build subfunctions for repeated blocks.
         """
-        return self.model.language_model.layers[0].__class__
+        return {self.model.language_model.layers[0].__class__}
 
     def forward(
         self,
diff --git a/QEfficient/utils/export_utils.py b/QEfficient/utils/export_utils.py
index bba282b99..dbb3aca01 100644
--- a/QEfficient/utils/export_utils.py
+++ b/QEfficient/utils/export_utils.py
@@ -182,9 +182,9 @@ def _setup_onnx_subfunctions(qeff_model, args, kwargs):
     qeff_model._onnx_transforms.append(CustomOpTransform)
 
     # TODO: Handle this in the modelling class QEFFTransformersBase,remove from here. Refer diffusers implementation
-    decoder_layer_classes = qeff_model.model.get_submodules_for_export()
-    if decoder_layer_classes:
-        kwargs["export_modules_as_functions"] = decoder_layer_classes
+    submodule_classes = qeff_model.model.get_submodules_for_export()
+    if submodule_classes:
+        kwargs["export_modules_as_functions"] = submodule_classes
     return args, kwargs
 
 

From 1407f61f8b5a383c7911f2435bb1831295e061d3 Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Mon, 19 Jan 2026 10:27:30 +0000
Subject: [PATCH 11/12] Added support of subfunction to mllama

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 .../models/mllama/modeling_mllama.py          | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py
index 74de1c6c1..3cba022b4 100644
--- a/QEfficient/transformers/models/mllama/modeling_mllama.py
+++ b/QEfficient/transformers/models/mllama/modeling_mllama.py
@@ -7,7 +7,7 @@
 
 """PyTorch Mllama model."""
 
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Type, Union
 
 import torch
 import torch.nn.functional as F
@@ -749,6 +749,15 @@ def __init__(self, model):
         self.model = model
         self.cross_attention_layers = self.model.config.get_text_config().cross_attention_layers
 
+    def get_submodules_for_export(self) -> Type[nn.Module]:
+        """
+        Return the set of classes used as the repeated layers across the model for subfunction extraction.
+        Notes:
+            This method should return a set of *class objects* (not instances).
+            Downstream code can use this to find/build subfunctions for repeated blocks.
+        """
+        return {self.model.vision_model.transformer.layers[0].__class__}
+
     def forward(
         self,
         pixel_values: Optional[torch.FloatTensor] = None,
@@ -861,6 +870,15 @@ def get_qeff_vision_encoder(self):
     def get_qeff_language_decoder(self):
         return self
 
+    def get_submodules_for_export(self) -> Type[nn.Module]:
+        """
+        Return the set of classes used as the repeated layers across the model for subfunction extraction.
+        Notes:
+            This method should return a set of *class objects* (not instances).
+            Downstream code can use this to find/build subfunctions for repeated blocks.
+        """
+        return {QEffMllamaSelfAttentionDecoderLayer}
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

From 129be5bffd30b4721e0f7e902fe5d24499f53a52 Mon Sep 17 00:00:00 2001
From: Abhishek Kumar Singh <sabhis@qti.qualcomm.com>
Date: Wed, 21 Jan 2026 21:58:34 +0530
Subject: [PATCH 12/12] Update torch_patches.py

Signed-off-by: Abhishek Kumar Singh <sabhis@qti.qualcomm.com>
---
 QEfficient/utils/torch_patches.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/QEfficient/utils/torch_patches.py b/QEfficient/utils/torch_patches.py
index 1752b5979..cec5455d7 100644
--- a/QEfficient/utils/torch_patches.py
+++ b/QEfficient/utils/torch_patches.py
@@ -39,7 +39,6 @@ def _track_module_attributes_forward_hook(module, input, output):
             if hasattr(module, attr_name):
                 onnx_attrs = getattr(module, attr_name)
                 delattr(module, attr_name)
-            # FIX: use empty dict to avoid type mismatch
             try:
                 _C._jit_pass_onnx_track_scope_attributes(graph, onnx_attrs)
             except Exception as e: