From f0e4c8cf473eb0cebb0d9aee2017acf6e87224fb Mon Sep 17 00:00:00 2001
From: Amit Raj <quic_amitraj@quicinc.com>
Date: Mon, 26 May 2025 10:32:37 +0000
Subject: [PATCH 01/14] Working code: embedding pooling, initial stage

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
---
 QEfficient/base/pytorch_transforms.py         |   1 +
 .../transformers/embeddings/__init__.py       |   6 +
 .../embeddings/embedding_utils.py             | 157 ++++++++++++++++++
 .../transformers/models/modeling_auto.py      |  16 +-
 .../transformers/models/pytorch_transforms.py |  13 ++
 .../models/test_embedding_models.py           |  56 +++----
 6 files changed, 209 insertions(+), 40 deletions(-)
 create mode 100644 QEfficient/transformers/embeddings/__init__.py
 create mode 100644 QEfficient/transformers/embeddings/embedding_utils.py

diff --git a/QEfficient/base/pytorch_transforms.py b/QEfficient/base/pytorch_transforms.py
index 2883339e3..91ea5788a 100644
--- a/QEfficient/base/pytorch_transforms.py
+++ b/QEfficient/base/pytorch_transforms.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # ----------------------------------------------------------------------------
+import json
 from types import MethodType
 from typing import Callable, Dict, Tuple, Type
 
diff --git a/QEfficient/transformers/embeddings/__init__.py b/QEfficient/transformers/embeddings/__init__.py
new file mode 100644
index 000000000..d647b73a6
--- /dev/null
+++ b/QEfficient/transformers/embeddings/__init__.py
@@ -0,0 +1,6 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
diff --git a/QEfficient/transformers/embeddings/embedding_utils.py b/QEfficient/transformers/embeddings/embedding_utils.py
new file mode 100644
index 000000000..a6b991e9a
--- /dev/null
+++ b/QEfficient/transformers/embeddings/embedding_utils.py
@@ -0,0 +1,157 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import json
+import os
+import types
+from typing import Optional
+from unittest import result
+import torch
+# from QEfficient.transformers.models.pytorch_transforms import Embedding_Transform
+from QEfficient.utils import hf_download
+from huggingface_hub import hf_hub_download
+
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output  # First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+def average_pool(last_hidden_states: torch.Tensor,
+                 attention_mask: torch.Tensor) -> torch.Tensor:
+    last_hidden = last_hidden_states[0].masked_fill(~attention_mask[..., None].bool(), 0.0)
+    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+
+def cls_pooling(token_embeddings,attention_mask):
+    return token_embeddings[:, 0]
+
+def max_pooling(token_embeddings,attention_mask):
+    return torch.max(token_embeddings, 1)[0]
+
+def min_pooling(token_embeddings,attention_mask):
+    return torch.min(token_embeddings, 1)[0]
+
+POOLING_MAP={
+    "mean":mean_pooling,
+    "avg":average_pool,
+    "cls":cls_pooling,
+    "max":max_pooling,
+    "min":min_pooling,
+}
+# def define_pooling(modules_json_path):
+#     dir_name=os.path.join(os.path.dirname(modules_json_path),"1_Pooling/config.json")
+#     if os.path.exists(dir_name):
+#         with open(dir_name) as fIn:
+#             pooling_config = json.load(fIn)
+#             pooling=[POOLING_MAP[k] for k,v in pooling_config.items() if v is True and k in POOLING_MAP]
+#             return pooling    
+#     else:
+#         print("Pooling config not found")
+#         return None
+
+# def get_modules(repo_id):
+#     modules_json_path=hf_download(repo_id,filename="modules.json")
+#     with open(modules_json_path) as fIn:
+#         modules_json = json.load(fIn)
+#     for module in modules_json:
+#         if module["type"] == "Pooling":
+#             pooling=define_pooling(modules_json_path)
+            
+import torch.nn as nn
+
+class PooledModel(nn.Module):
+    def __init__(self, base_model, pooling_fn):
+        super().__init__()
+        self.config=base_model.config
+        self.base_model = base_model
+        self.pooling_fn = pooling_fn
+
+    def forward(self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs):
+        output = self.base_model(input_ids, attention_mask, **kwargs)
+        # attention_mask = kwargs.get('attention_mask', None)
+        return self.pooling_fn(output, attention_mask)
+
+ 
+def patch_model_with_pooling(model, pooling):
+    def custom_forward(self, *args, **kwargs):
+        output=self.base_forward(*args, **kwargs)
+        return pooling[0](output, kwargs['attention_mask'])
+    
+    model.base_forward=model.forward
+    model.forward=types.MethodType(custom_forward, model)
+
+                
+            
+            
+                                  
+                
+                
+            
+            
+            
+    # modules = {}
+    # for module in modules_json:
+    #     modules[module["name"]] = module["type"]
+    # return modules
+
+
+
+
+def get_modules_json_path(model_name_or_path):
+    if os.path.isdir(model_name_or_path):
+       # It's a local path
+       local_json_path = os.path.join(model_name_or_path, "module.json")
+       if os.path.isfile(local_json_path):
+           return local_json_path   
+    else:
+       # It's a Hugging Face model ID
+        try:
+           json_path = hf_hub_download(repo_id=model_name_or_path, filename="modules.json", token="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")
+           return json_path
+        except Exception as e:
+           print(f"Error: {e}")
+           return None
+    
+# def embedding_transform_temp(func):
+#     def wrapper(self,model, **kwargs):
+#         model_name_or_path = kwargs['pretrained_model_name_or_path']
+#         modules_json_path = get_modules_json_path(model_name_or_path)
+#         if modules_json_path is not None:
+#             with open(modules_json_path) as fIn:
+#                 modules_json = json.load(fIn)
+#                 for module in modules_json:
+#                     if "Pooling" in module["type"]:
+#                         pooling=average_pool
+#                         model=PooledModel(args[1],pooling)
+#         result = func(, **kwargs)
+#         return result
+#     return wrapper
+
+
+def embedding_transform(func):
+    def wrapper(self,model, **kwargs):
+        if kwargs.get('pooling') is not None:
+            pooling=kwargs['pooling']
+            pooling_method=POOLING_MAP[pooling]
+            model=PooledModel(model,pooling_method)
+        result = func(self,model, **kwargs)
+        return result
+    return wrapper
+
+# def embedding_transform_temp(func):
+#     def wrapper(self,model, **kwargs):
+#         model_name_or_path = kwargs['pretrained_model_name_or_path']
+#         modules_json_path = get_modules_json_path(model_name_or_path)
+#         if modules_json_path is not None:
+#             with open(modules_json_path) as fIn:
+#                 modules_json = json.load(fIn)
+#                 for module in modules_json:
+#                     if "Pooling" in module["type"]:
+#                         pooling=average_pool
+#                         model=PooledModel(args[1],pooling)
+#         result = func(, **kwargs)
+#         return result
+#     return wrapper
\ No newline at end of file
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index dcb0f2306..b6fe24ccc 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -5,6 +5,7 @@
 #
 # ----------------------------------------------------------------------------
 
+from curses.ascii import EM
 import hashlib
 import warnings
 from pathlib import Path
@@ -35,6 +36,7 @@
     calculate_latency,
     get_compilation_dims,
 )
+from QEfficient.transformers.embeddings.embedding_utils import embedding_transform
 from QEfficient.transformers.models.pytorch_transforms import (
     CustomOpsTransform,
     KVCacheModuleMethodMapperTransform,
@@ -157,15 +159,17 @@ class QEFFAutoModel(QEFFTransformersBase):
     _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
+    @embedding_transform
     def __init__(self, model: nn.Module, **kwargs):
         super().__init__(model)
-        self.model.config.use_cache = True
-        self.num_layers = model.config.num_hidden_layers
+        self.model.base_model.config.use_cache=True
+        # self.model.config.use_cache = True
+        self.num_layers = self.model.base_model.config.num_hidden_layers
         self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None)
 
     @classmethod
     @with_replaced_quantizers
-    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path,pooling, *args, **kwargs):
         """
         This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModel.
         Once the model is initialized, you can use other methods such as export, compile, and generate on the same object.
@@ -214,7 +218,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
                 model, kv_offload=kv_offload
             )
 
-        return cls(model, pretrained_model_name_or_path=pretrained_model_name_or_path)
+        return cls(model, pretrained_model_name_or_path=pretrained_model_name_or_path, pooling=pooling, **kwargs)
 
     @property
     def model_hash(self) -> str:
@@ -380,13 +384,13 @@ def cloud_ai_100_feature_generate(
         inputs = dict(input_ids=input_ids, attention_mask=attention_mask)
 
         outputs = {
-            "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[2]).astype(
+            "output": np.random.randn(self.batch_size, self.qpc_session.bindings[2].dims[1]).astype(
                 np.float32
             ),
         }
         self.qpc_session.set_buffers(outputs)
         outputs = self.qpc_session.run(inputs)
-        outputs = outputs["output"][:, :input_ids_len, :]
+        # outputs = outputs["output"][:, :input_ids_len, :]
         return outputs
 
     def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]) -> List[torch.Tensor]:
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index edac05248..f04a15857 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -524,3 +524,16 @@ class KVCacheModuleMethodMapperTransform(ModuleMethodMapperTransform):
         "InternVisionEmbeddings": {"forward": QEffInternVisionEmbeddings.forward},
     }
     _match_class_replace_method = {}
+
+# class Embedding_Transform(PytorchTransform):
+#     def apply(self, model: nn.Module, modules_json_path) -> Tuple[nn.Module, bool]:
+#         transformed = False
+#         with open(modules_json_path) as fIn:
+#             modules_json = json.load(fIn)
+#         for module in modules_json:
+#             if module["type"] == "Pooling":
+#                 pooling=define_pooling(modules_json_path)
+#                 model=patch_model_with_pooling(model,pooling)
+#                 transformed = True
+#             # if module["type"] == "Normalization":
+#         return model, transformed
\ No newline at end of file
diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py
index 71b2ec314..e849253f7 100644
--- a/tests/transformers/models/test_embedding_models.py
+++ b/tests/transformers/models/test_embedding_models.py
@@ -11,8 +11,10 @@
 import numpy as np
 import onnxruntime as ort
 import pytest
+import torch
 from transformers import AutoModel, AutoTokenizer
 
+from QEfficient.transformers.embeddings.embedding_utils import average_pool
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModel
 from QEfficient.utils._utils import create_json
 from QEfficient.utils.constants import Constants, QnnConstants
@@ -20,8 +22,8 @@
 embed_test_models = [
     # model_name, architecture
     "sentence-transformers/multi-qa-mpnet-base-cos-v1",  # MPNetForMaskedLM
-    "BAAI/bge-reranker-v2-m3",  # XLMRobertaForSequenceClassification
-    "BAAI/bge-small-en-v1.5",  # BertModel
+    # "BAAI/bge-reranker-v2-m3",  # XLMRobertaForSequenceClassification
+    # "BAAI/bge-small-en-v1.5",  # BertModel
 ]
 
 
@@ -33,26 +35,32 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     qnn_config: Optional[str] = None,
 ):
     # Prepare input
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, token="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")
     inputs = tokenizer("My name is", return_tensors="pt")
 
     # Original PyTorch model
     pt_model = AutoModel.from_pretrained(
         model_name,
-        num_hidden_layers=n_layer,
+        # num_hidden_layers=n_layer,
         attn_implementation="eager",
         trust_remote_code=True,
+        token="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"
     )
 
     pt_outputs = pt_model(**inputs)
     pt_embeddings = pt_outputs[0][0].detach().numpy()
+    
+    pt_pooled_embedding = average_pool(pt_outputs.last_hidden_state, inputs["attention_mask"])
+    
+    
     # Pytorch transformed model
-    qeff_model = QEFFAutoModel(pt_model, pretrained_model_name_or_path=model_name)
+    qeff_model = QEFFAutoModel(pt_model, pretrained_model_name_or_path=model_name, pooling="avg")
     qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
-    qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy()
-    mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings))
-    print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad)
-    assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}"
+    
+    
+    mad = torch.mean(torch.abs(pt_pooled_embedding - qeff_pt_outputs))
+    # print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad)
+    # assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}"
 
     onnx_model = qeff_model.export()
     ort_session = ort.InferenceSession(str(onnx_model))
@@ -69,7 +77,7 @@ def check_embed_pytorch_vs_ort_vs_ai100(
 
     pt_embeddings = pt_outputs[0][0].detach().numpy()
     onnx_embeddings = onnx_outputs[0]
-    mad = np.mean(np.abs(pt_embeddings - onnx_embeddings))
+    mad = np.mean(np.abs(pt_pooled_embedding.detach().numpy() - onnx_embeddings))
     print("Mad for onnx and PyTorch is ", mad)
     assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}"
 
@@ -81,32 +89,12 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     ai100_output = qeff_model.generate(inputs=inputs)
 
     # Compare ONNX and AI 100 outputs
-    mad = np.mean(np.abs(ai100_output - onnx_outputs[0]))
+    mad = np.mean(np.abs(ai100_output['output'] - onnx_outputs[0]))
     print("Mad for onnx and AI 100 output is ", mad)
     assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}"
     assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
 
 
-@pytest.mark.on_qaic
-@pytest.mark.parametrize("model_name", embed_test_models)
-def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name):
-    """
-    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.
-    """
-    check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1)
-
-
-@pytest.mark.on_qaic
-@pytest.mark.qnn
-@pytest.mark.parametrize("model_name", embed_test_models)
-def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name):
-    """
-    QNN Compilation path test.
-    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.
-    """
-    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
-    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
-
-    check_embed_pytorch_vs_ort_vs_ai100(
-        model_name=model_name, seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path
-    )
+model_name="intfloat/e5-large"
+check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1)
+

From 6ac599753a38c032a385e90a1f5562139a5061ad Mon Sep 17 00:00:00 2001
From: Amit Raj <quic_amitraj@quicinc.com>
Date: Mon, 26 May 2025 14:42:32 +0000
Subject: [PATCH 02/14] Working code: embedding pooling, initial stage

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
---
 QEfficient/transformers/embeddings/embedding_utils.py | 3 +--
 QEfficient/transformers/models/modeling_auto.py       | 2 +-
 examples/embedding_model.py                           | 5 +++--
 tests/transformers/models/test_embedding_models.py    | 6 ++----
 4 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/QEfficient/transformers/embeddings/embedding_utils.py b/QEfficient/transformers/embeddings/embedding_utils.py
index a6b991e9a..88404a6f1 100644
--- a/QEfficient/transformers/embeddings/embedding_utils.py
+++ b/QEfficient/transformers/embeddings/embedding_utils.py
@@ -71,7 +71,6 @@ def __init__(self, base_model, pooling_fn):
 
     def forward(self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs):
         output = self.base_model(input_ids, attention_mask, **kwargs)
-        # attention_mask = kwargs.get('attention_mask', None)
         return self.pooling_fn(output, attention_mask)
 
  
@@ -109,7 +108,7 @@ def get_modules_json_path(model_name_or_path):
     else:
        # It's a Hugging Face model ID
         try:
-           json_path = hf_hub_download(repo_id=model_name_or_path, filename="modules.json", token="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")
+           json_path = hf_hub_download(repo_id=model_name_or_path, filename="modules.json")
            return json_path
         except Exception as e:
            print(f"Error: {e}")
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index b6fe24ccc..5d6d3c7ce 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -384,7 +384,7 @@ def cloud_ai_100_feature_generate(
         inputs = dict(input_ids=input_ids, attention_mask=attention_mask)
 
         outputs = {
-            "output": np.random.randn(self.batch_size, self.qpc_session.bindings[2].dims[1]).astype(
+            "output": np.random.randn(*list(self.qpc_session.bindings[2].dims)).astype(
                 np.float32
             ),
         }
diff --git a/examples/embedding_model.py b/examples/embedding_model.py
index ecced4259..d5c429d14 100644
--- a/examples/embedding_model.py
+++ b/examples/embedding_model.py
@@ -21,14 +21,15 @@ def mean_pooling(model_output, attention_mask):
     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
 
+
 # Sentences we want sentence embeddings for
 sentences = "This is an example sentence"
 
 # Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-large", token="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")
 
 
-qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+qeff_model = AutoModel.from_pretrained("intfloat/e5-large", token="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")
 qeff_model.compile(num_cores=14)
 
 # Tokenize sentences
diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py
index e849253f7..cf0833953 100644
--- a/tests/transformers/models/test_embedding_models.py
+++ b/tests/transformers/models/test_embedding_models.py
@@ -35,7 +35,7 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     qnn_config: Optional[str] = None,
 ):
     # Prepare input
-    tokenizer = AutoTokenizer.from_pretrained(model_name, token="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")
+    tokenizer = AutoTokenizer.from_pretrained(model_name, )
     inputs = tokenizer("My name is", return_tensors="pt")
 
     # Original PyTorch model
@@ -44,9 +44,7 @@ def check_embed_pytorch_vs_ort_vs_ai100(
         # num_hidden_layers=n_layer,
         attn_implementation="eager",
         trust_remote_code=True,
-        token="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"
-    )
-
+        )
     pt_outputs = pt_model(**inputs)
     pt_embeddings = pt_outputs[0][0].detach().numpy()
     

From e609838df84f56d8f3648031dcc1715e7cf9e527 Mon Sep 17 00:00:00 2001
From: Amit Raj <quic_amitraj@quicinc.com>
Date: Tue, 27 May 2025 08:13:07 +0000
Subject: [PATCH 03/14] Code cleanup and formatting

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
---
 QEfficient/base/pytorch_transforms.py         |   1 -
 .../embeddings/embedding_utils.py             | 167 +++++-------------
 .../transformers/models/modeling_auto.py      |  21 +--
 .../transformers/models/pytorch_transforms.py |  13 --
 4 files changed, 53 insertions(+), 149 deletions(-)

diff --git a/QEfficient/base/pytorch_transforms.py b/QEfficient/base/pytorch_transforms.py
index 91ea5788a..2883339e3 100644
--- a/QEfficient/base/pytorch_transforms.py
+++ b/QEfficient/base/pytorch_transforms.py
@@ -4,7 +4,6 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # ----------------------------------------------------------------------------
-import json
 from types import MethodType
 from typing import Callable, Dict, Tuple, Type
 
diff --git a/QEfficient/transformers/embeddings/embedding_utils.py b/QEfficient/transformers/embeddings/embedding_utils.py
index 88404a6f1..cc0eaefb9 100644
--- a/QEfficient/transformers/embeddings/embedding_utils.py
+++ b/QEfficient/transformers/embeddings/embedding_utils.py
@@ -5,152 +5,77 @@
 #
 # -----------------------------------------------------------------------------
 
-import json
-import os
-import types
+from logging import warning
 from typing import Optional
-from unittest import result
+
 import torch
-# from QEfficient.transformers.models.pytorch_transforms import Embedding_Transform
-from QEfficient.utils import hf_download
-from huggingface_hub import hf_hub_download
+import torch.nn as nn
+
 
 def mean_pooling(model_output, attention_mask):
-    token_embeddings = model_output  # First element of model_output contains all token embeddings
+    token_embeddings = model_output 
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+# def mean_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+#     # Apply the attention mask to the hidden states
+#     masked_hidden = last_hidden_states[0] * attention_mask[..., None]
+    
+#     # Sum the masked hidden states along the sequence dimension
+#     sum_hidden = masked_hidden.sum(dim=1)
+    
+#     # Count the number of valid (non-masked) tokens
+#     valid_token_count = attention_mask.sum(dim=1)[..., None]
+    
+#     # Compute the mean by dividing summed hidden states by the count of valid tokens
+#     return sum_hidden / valid_token_count
+
 
-def average_pool(last_hidden_states: torch.Tensor,
-                 attention_mask: torch.Tensor) -> torch.Tensor:
+def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
     last_hidden = last_hidden_states[0].masked_fill(~attention_mask[..., None].bool(), 0.0)
     return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
 
-def cls_pooling(token_embeddings,attention_mask):
-    return token_embeddings[:, 0]
 
-def max_pooling(token_embeddings,attention_mask):
+def max_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0]  
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    token_embeddings[input_mask_expanded == 0] = -1e9  
     return torch.max(token_embeddings, 1)[0]
 
-def min_pooling(token_embeddings,attention_mask):
-    return torch.min(token_embeddings, 1)[0]
 
-POOLING_MAP={
-    "mean":mean_pooling,
-    "avg":average_pool,
-    "cls":cls_pooling,
-    "max":max_pooling,
-    "min":min_pooling,
+def cls_pooling(token_embeddings, attention_mask):
+    return token_embeddings[:, 0]
+
+
+POOLING_MAP = {
+    "mean": mean_pooling,
+    "avg": average_pool,
+    "cls": cls_pooling,
+    "max": max_pooling,
 }
-# def define_pooling(modules_json_path):
-#     dir_name=os.path.join(os.path.dirname(modules_json_path),"1_Pooling/config.json")
-#     if os.path.exists(dir_name):
-#         with open(dir_name) as fIn:
-#             pooling_config = json.load(fIn)
-#             pooling=[POOLING_MAP[k] for k,v in pooling_config.items() if v is True and k in POOLING_MAP]
-#             return pooling    
-#     else:
-#         print("Pooling config not found")
-#         return None
-
-# def get_modules(repo_id):
-#     modules_json_path=hf_download(repo_id,filename="modules.json")
-#     with open(modules_json_path) as fIn:
-#         modules_json = json.load(fIn)
-#     for module in modules_json:
-#         if module["type"] == "Pooling":
-#             pooling=define_pooling(modules_json_path)
-            
-import torch.nn as nn
+
 
 class PooledModel(nn.Module):
     def __init__(self, base_model, pooling_fn):
         super().__init__()
-        self.config=base_model.config
+        self.config = base_model.config
         self.base_model = base_model
         self.pooling_fn = pooling_fn
 
-    def forward(self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs):
+    def forward(
+        self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs
+    ):
+        warning("")
         output = self.base_model(input_ids, attention_mask, **kwargs)
-        return self.pooling_fn(output, attention_mask)
-
- 
-def patch_model_with_pooling(model, pooling):
-    def custom_forward(self, *args, **kwargs):
-        output=self.base_forward(*args, **kwargs)
-        return pooling[0](output, kwargs['attention_mask'])
-    
-    model.base_forward=model.forward
-    model.forward=types.MethodType(custom_forward, model)
-
-                
-            
-            
-                                  
-                
-                
-            
-            
-            
-    # modules = {}
-    # for module in modules_json:
-    #     modules[module["name"]] = module["type"]
-    # return modules
-
-
-
-
-def get_modules_json_path(model_name_or_path):
-    if os.path.isdir(model_name_or_path):
-       # It's a local path
-       local_json_path = os.path.join(model_name_or_path, "module.json")
-       if os.path.isfile(local_json_path):
-           return local_json_path   
-    else:
-       # It's a Hugging Face model ID
-        try:
-           json_path = hf_hub_download(repo_id=model_name_or_path, filename="modules.json")
-           return json_path
-        except Exception as e:
-           print(f"Error: {e}")
-           return None
-    
-# def embedding_transform_temp(func):
-#     def wrapper(self,model, **kwargs):
-#         model_name_or_path = kwargs['pretrained_model_name_or_path']
-#         modules_json_path = get_modules_json_path(model_name_or_path)
-#         if modules_json_path is not None:
-#             with open(modules_json_path) as fIn:
-#                 modules_json = json.load(fIn)
-#                 for module in modules_json:
-#                     if "Pooling" in module["type"]:
-#                         pooling=average_pool
-#                         model=PooledModel(args[1],pooling)
-#         result = func(, **kwargs)
-#         return result
-#     return wrapper
+        return self.pooling_fn(output[0], attention_mask)
 
 
 def embedding_transform(func):
-    def wrapper(self,model, **kwargs):
-        if kwargs.get('pooling') is not None:
-            pooling=kwargs['pooling']
-            pooling_method=POOLING_MAP[pooling]
-            model=PooledModel(model,pooling_method)
-        result = func(self,model, **kwargs)
+    def wrapper(self, model, **kwargs):
+        if kwargs.get("pooling") is not None:
+            pooling = kwargs["pooling"]
+            pooling_method = POOLING_MAP[pooling]
+            model = PooledModel(model, pooling_method)
+        result = func(self, model, **kwargs)
         return result
-    return wrapper
 
-# def embedding_transform_temp(func):
-#     def wrapper(self,model, **kwargs):
-#         model_name_or_path = kwargs['pretrained_model_name_or_path']
-#         modules_json_path = get_modules_json_path(model_name_or_path)
-#         if modules_json_path is not None:
-#             with open(modules_json_path) as fIn:
-#                 modules_json = json.load(fIn)
-#                 for module in modules_json:
-#                     if "Pooling" in module["type"]:
-#                         pooling=average_pool
-#                         model=PooledModel(args[1],pooling)
-#         result = func(, **kwargs)
-#         return result
-#     return wrapper
\ No newline at end of file
+    return wrapper
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 5d6d3c7ce..e5e873d35 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -5,7 +5,6 @@
 #
 # ----------------------------------------------------------------------------
 
-from curses.ascii import EM
 import hashlib
 import warnings
 from pathlib import Path
@@ -162,14 +161,14 @@ class QEFFAutoModel(QEFFTransformersBase):
     @embedding_transform
     def __init__(self, model: nn.Module, **kwargs):
         super().__init__(model)
-        self.model.base_model.config.use_cache=True
+        self.model.base_model.config.use_cache = True
         # self.model.config.use_cache = True
-        self.num_layers = self.model.base_model.config.num_hidden_layers
+        # self.num_layers = self.model.base_model.config.num_hidden_layers
         self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None)
 
     @classmethod
     @with_replaced_quantizers
-    def from_pretrained(cls, pretrained_model_name_or_path,pooling, *args, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **kwargs):
         """
         This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModel.
         Once the model is initialized, you can use other methods such as export, compile, and generate on the same object.
@@ -203,13 +202,9 @@ def from_pretrained(cls, pretrained_model_name_or_path,pooling, *args, **kwargs)
         if kwargs.get("low_cpu_mem_usage", None):
             logger.warning("Updating low_cpu_mem_usage=False")
 
-        kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False, "add_pooling_layer": False})
-        try:
-            model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
-            warnings.warn("Removing pooling layer from the model if exist")
-        except TypeError:
-            kwargs.pop("add_pooling_layer", None)
-            model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
+        
+        model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
 
         # This is support models that should be classified to in a different auto class but transformers load them via this class
         kv_offload = kwargs.pop("kv_offload", None)
@@ -384,9 +379,7 @@ def cloud_ai_100_feature_generate(
         inputs = dict(input_ids=input_ids, attention_mask=attention_mask)
 
         outputs = {
-            "output": np.random.randn(*list(self.qpc_session.bindings[2].dims)).astype(
-                np.float32
-            ),
+            "output": np.random.randn(*list(self.qpc_session.bindings[2].dims)).astype(np.float32),
         }
         self.qpc_session.set_buffers(outputs)
         outputs = self.qpc_session.run(inputs)
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index f04a15857..edac05248 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -524,16 +524,3 @@ class KVCacheModuleMethodMapperTransform(ModuleMethodMapperTransform):
         "InternVisionEmbeddings": {"forward": QEffInternVisionEmbeddings.forward},
     }
     _match_class_replace_method = {}
-
-# class Embedding_Transform(PytorchTransform):
-#     def apply(self, model: nn.Module, modules_json_path) -> Tuple[nn.Module, bool]:
-#         transformed = False
-#         with open(modules_json_path) as fIn:
-#             modules_json = json.load(fIn)
-#         for module in modules_json:
-#             if module["type"] == "Pooling":
-#                 pooling=define_pooling(modules_json_path)
-#                 model=patch_model_with_pooling(model,pooling)
-#                 transformed = True
-#             # if module["type"] == "Normalization":
-#         return model, transformed
\ No newline at end of file

From 9528ee845fb520783e2192c254c1e25df6ebdec2 Mon Sep 17 00:00:00 2001
From: Amit Raj <quic_amitraj@quicinc.com>
Date: Thu, 29 May 2025 08:08:38 +0000
Subject: [PATCH 04/14] Major-changes-1

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
---
 QEfficient/__init__.py                        |   7 +-
 QEfficient/base/modeling_qeff.py              |   6 -
 .../embeddings/embedding_utils.py             |  36 ++----
 .../transformers/models/modeling_auto.py      |  12 +-
 QEfficient/utils/__init__.py                  |   1 +
 QEfficient/utils/_utils.py                    |   6 +
 .../models/test_embedding_models.py           | 114 +++++++++++++++---
 7 files changed, 126 insertions(+), 56 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 8e60d698b..be4b86321 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -6,16 +6,21 @@
 # -----------------------------------------------------------------------------
 
 import os
+import warnings
+
+from QEfficient.utils import custom_format_warning
 
 # For faster downloads via hf_transfer
 # This code is put above import statements as this needs to be executed before
 # hf_transfer is imported (will happen on line 15 via leading imports)
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-
 # Placeholder for all non-transformer models registered in QEfficient
 import QEfficient.utils.model_registery  # noqa: F401
 from QEfficient.utils.logging_utils import logger
 
+# custom warning for the better logging experience
+warnings.formatwarning = custom_format_warning
+
 
 def check_qaic_sdk():
     """Check if QAIC SDK is installed"""
diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py
index cf53a8c70..e956b95a4 100644
--- a/QEfficient/base/modeling_qeff.py
+++ b/QEfficient/base/modeling_qeff.py
@@ -11,7 +11,6 @@
 import logging
 import shutil
 import subprocess
-import warnings
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Dict, List, Optional
@@ -59,11 +58,6 @@ def __init__(self, model: torch.nn.Module) -> None:
             self.model, transformed = transform.apply(self.model)
             any_transformed = any_transformed or transformed
 
-        if not any_transformed:
-            warnings.warn(f"No transforms applied to model: {self.model_name}. It may be an unsupported model!")
-        else:
-            logger.info(f"Pytorch transforms applied to model: {self.model_name}")
-
     @property
     @abstractmethod
     def model_name(self) -> str: ...
diff --git a/QEfficient/transformers/embeddings/embedding_utils.py b/QEfficient/transformers/embeddings/embedding_utils.py
index cc0eaefb9..57f8d5fe9 100644
--- a/QEfficient/transformers/embeddings/embedding_utils.py
+++ b/QEfficient/transformers/embeddings/embedding_utils.py
@@ -5,29 +5,16 @@
 #
 # -----------------------------------------------------------------------------
 
-from logging import warning
+import warnings
 from typing import Optional
 
 import torch
 import torch.nn as nn
 
 
-def mean_pooling(model_output, attention_mask):
-    token_embeddings = model_output 
-    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-# def mean_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-#     # Apply the attention mask to the hidden states
-#     masked_hidden = last_hidden_states[0] * attention_mask[..., None]
-    
-#     # Sum the masked hidden states along the sequence dimension
-#     sum_hidden = masked_hidden.sum(dim=1)
-    
-#     # Count the number of valid (non-masked) tokens
-#     valid_token_count = attention_mask.sum(dim=1)[..., None]
-    
-#     # Compute the mean by dividing summed hidden states by the count of valid tokens
-#     return sum_hidden / valid_token_count
+def mean_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
+    return torch.sum(last_hidden_states * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
 
 def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
@@ -35,15 +22,14 @@ def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor)
     return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
 
 
-def max_pooling(model_output, attention_mask):
-    token_embeddings = model_output[0]  
-    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-    token_embeddings[input_mask_expanded == 0] = -1e9  
-    return torch.max(token_embeddings, 1)[0]
+def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
+    last_hidden_states[input_mask_expanded == 0] = -1e9
+    return torch.max(last_hidden_states, 1)[0]
 
 
-def cls_pooling(token_embeddings, attention_mask):
-    return token_embeddings[:, 0]
+def cls_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    return last_hidden_states[:, 0]
 
 
 POOLING_MAP = {
@@ -64,7 +50,6 @@ def __init__(self, base_model, pooling_fn):
     def forward(
         self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs
     ):
-        warning("")
         output = self.base_model(input_ids, attention_mask, **kwargs)
         return self.pooling_fn(output[0], attention_mask)
 
@@ -75,6 +60,7 @@ def wrapper(self, model, **kwargs):
             pooling = kwargs["pooling"]
             pooling_method = POOLING_MAP[pooling]
             model = PooledModel(model, pooling_method)
+            warnings.warn(f"Pooling method {pooling} is applied to the model.")
         result = func(self, model, **kwargs)
         return result
 
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index e5e873d35..bd35115c6 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -175,8 +175,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **k
 
         This API can also be used as exception for VLM model since transformers support loading InternChatVL models via AutoModel API we support it via AutoModelForCausalLM API
         Args:
-            :pretrained_name_or_path (str): Model card name from HuggingFace or local path to model directory.
-            :args, kwargs: Additional arguments to pass to transformers.AutoModel.
+            pretrained_model_name_or_path (str): The name or path of the pre-trained model.
+            pooling (Optional[str], optional): The pooling method to use. Defaults to None.
+                Options:
+                    - "mean": Mean pooling
+                    - "max": Max pooling
+                    - "cls": CLS token pooling
+                    - "avg": Average pooling
 
         .. code-block:: python
 
@@ -203,7 +208,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **k
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
-        
+
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
 
         # This is support models that should be classified to in a different auto class but transformers load them via this class
@@ -366,7 +371,6 @@ def cloud_ai_100_feature_generate(
             self.batch_size = self.qpc_session.bindings[0].dims[0]
             self.seq_len = self.qpc_session.bindings[0].dims[1]
         # Prepare input
-        input_ids_len = inputs["input_ids"].shape[1]
         input_ids = np.array(
             torch.nn.functional.pad(inputs["input_ids"], (0, self.seq_len - inputs["input_ids"].size(1)), "constant", 0)
         )
diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py
index f73998302..8294a3d0a 100755
--- a/QEfficient/utils/__init__.py
+++ b/QEfficient/utils/__init__.py
@@ -11,6 +11,7 @@
 )
 from QEfficient.utils._utils import (  # noqa: F401
     check_and_assign_cache_dir,
+    custom_format_warning,
     dump_qconfig,
     get_num_layers_from_config,
     get_num_layers_vlm,
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index f8bc5753c..f8adff236 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -662,3 +662,9 @@ def filter_kwargs(func, kwargs):
     """
     valid_args = inspect.signature(func).parameters
     return {key: value for key, value in kwargs.items() if key in valid_args}
+
+
+def custom_format_warning(msg, category, *args, **kwargs):
+    YELLOW = "\033[93m"
+    RESET = "\033[0m"
+    return f"{YELLOW}[Warning]: {msg}{RESET}\n"
diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py
index cf0833953..7804f0bdb 100644
--- a/tests/transformers/models/test_embedding_models.py
+++ b/tests/transformers/models/test_embedding_models.py
@@ -14,16 +14,20 @@
 import torch
 from transformers import AutoModel, AutoTokenizer
 
-from QEfficient.transformers.embeddings.embedding_utils import average_pool
+from QEfficient.transformers.embeddings.embedding_utils import mean_pooling
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModel
 from QEfficient.utils._utils import create_json
 from QEfficient.utils.constants import Constants, QnnConstants
 
 embed_test_models = [
     # model_name, architecture
-    "sentence-transformers/multi-qa-mpnet-base-cos-v1",  # MPNetForMaskedLM
+    # "sentence-transformers/multi-qa-mpnet-base-cos-v1",  # MPNetForMaskedLM
     # "BAAI/bge-reranker-v2-m3",  # XLMRobertaForSequenceClassification
     # "BAAI/bge-small-en-v1.5",  # BertModel
+    # "jinaai/jina-embeddings-v2-base-en",
+    # "intfloat/e5-large"
+    "sentence-transformers/gtr-t5-large"
+    # ]
 ]
 
 
@@ -35,30 +39,37 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     qnn_config: Optional[str] = None,
 ):
     # Prepare input
-    tokenizer = AutoTokenizer.from_pretrained(model_name, )
+    tokenizer = AutoTokenizer.from_pretrained(model_name, token="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")
     inputs = tokenizer("My name is", return_tensors="pt")
 
     # Original PyTorch model
     pt_model = AutoModel.from_pretrained(
         model_name,
-        # num_hidden_layers=n_layer,
+        num_hidden_layers=n_layer,
         attn_implementation="eager",
         trust_remote_code=True,
-        )
+        token="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",
+    )
+
     pt_outputs = pt_model(**inputs)
     pt_embeddings = pt_outputs[0][0].detach().numpy()
-    
-    pt_pooled_embedding = average_pool(pt_outputs.last_hidden_state, inputs["attention_mask"])
-    
-    
+
+    # checking without pooling
     # Pytorch transformed model
-    qeff_model = QEFFAutoModel(pt_model, pretrained_model_name_or_path=model_name, pooling="avg")
+    qeff_model = QEFFAutoModel.from_pretrained(
+        pretrained_model_name_or_path=model_name,
+        attn_implementation="eager",
+        num_hidden_layers=n_layer,
+        trust_remote_code=True,
+        token="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",
+    )
     qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
-    
-    
-    mad = torch.mean(torch.abs(pt_pooled_embedding - qeff_pt_outputs))
-    # print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad)
-    # assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}"
+    qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy()
+
+    # ipdb.set_trace()
+    mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings))
+    print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad)
+    assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}"
 
     onnx_model = qeff_model.export()
     ort_session = ort.InferenceSession(str(onnx_model))
@@ -68,6 +79,49 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     attention_mask = np.array(inputs["attention_mask"])
 
     onnx_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
+
+    onnx_outputs = ort_session.run(None, onnx_inputs)
+    pt_embeddings = pt_outputs[0][0].detach().numpy()
+    onnx_embeddings = onnx_outputs[0]
+    mad = np.mean(np.abs(pt_embeddings - onnx_embeddings))
+    print("Mad for onnx and PyTorch is ", mad)
+    assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}"
+
+    qeff_model.compile(
+        num_cores=14,
+        enable_qnn=enable_qnn,
+        qnn_config=qnn_config,
+    )
+    ai100_output = qeff_model.generate(inputs=inputs)
+
+    # Compare ONNX and AI 100 outputs
+    mad = np.mean(np.abs(ai100_output["output"][:, : inputs["input_ids"].shape[1], :] - onnx_outputs[0]))
+    print("Mad for onnx and AI 100 output is ", mad)
+    assert mad <= 10**-2, f"MAD is too high for onnx and Pytorch: {mad}"
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
+
+    # pt_pooled_embedding = average_pool(pt_outputs.last_hidden_state, inputs["attention_mask"])
+    pt_pooled_embedding = mean_pooling(pt_outputs.last_hidden_state, inputs["attention_mask"])
+
+    # Pytorch transformed model
+    qeff_model = QEFFAutoModel.from_pretrained(
+        pretrained_model_name_or_path=model_name,
+        pooling="mean",
+        num_hidden_layers=n_layer,
+        token="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",
+        attn_implementation="eager",
+        trust_remote_code=True,
+    )
+    inputs = tokenizer("My name is", return_tensors="pt")
+    qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
+
+    mad = torch.mean(torch.abs(pt_pooled_embedding - qeff_pt_outputs))
+    print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad)
+    assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}"
+
+    onnx_model = qeff_model.export()
+    ort_session = ort.InferenceSession(str(onnx_model))
+
     # Run inference
     onnx_outputs = ort_session.run(None, onnx_inputs)
 
@@ -87,12 +141,32 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     ai100_output = qeff_model.generate(inputs=inputs)
 
     # Compare ONNX and AI 100 outputs
-    mad = np.mean(np.abs(ai100_output['output'] - onnx_outputs[0]))
+    mad = np.mean(np.abs(ai100_output["output"] - onnx_outputs[0]))
     print("Mad for onnx and AI 100 output is ", mad)
-    assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}"
+    assert mad <= 10**-2, f"MAD is too high for onnx and Pytorch: {mad}"
     assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
 
 
-model_name="intfloat/e5-large"
-check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1)
-
+@pytest.mark.on_qaic
+@pytest.mark.parametrize("model_name", embed_test_models)
+def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name):
+    """
+    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.
+    """
+    check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1)
+
+
+@pytest.mark.on_qaic
+@pytest.mark.qnn
+@pytest.mark.parametrize("model_name", embed_test_models)
+def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name):
+    """
+    QNN Compilation path test.
+    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.
+    """
+    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+    check_embed_pytorch_vs_ort_vs_ai100(
+        model_name=model_name, seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path
+    )

From 5c9d1a112be8b9f9bcf4ff53f3ca6cf2e57dc66e Mon Sep 17 00:00:00 2001
From: Amit Raj <quic_amitraj@quicinc.com>
Date: Thu, 29 May 2025 09:09:12 +0000
Subject: [PATCH 05/14] Made pytorch transform instead of method for pooling

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
---
 QEfficient/base/modeling_qeff.py              |  6 ++++
 .../embeddings/embedding_utils.py             | 15 +--------
 .../transformers/models/modeling_auto.py      |  8 +++--
 .../transformers/models/pytorch_transforms.py | 14 ++++++++
 .../models/test_embedding_models.py           | 32 +++++++++----------
 5 files changed, 43 insertions(+), 32 deletions(-)

diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py
index e956b95a4..cf53a8c70 100644
--- a/QEfficient/base/modeling_qeff.py
+++ b/QEfficient/base/modeling_qeff.py
@@ -11,6 +11,7 @@
 import logging
 import shutil
 import subprocess
+import warnings
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Dict, List, Optional
@@ -58,6 +59,11 @@ def __init__(self, model: torch.nn.Module) -> None:
             self.model, transformed = transform.apply(self.model)
             any_transformed = any_transformed or transformed
 
+        if not any_transformed:
+            warnings.warn(f"No transforms applied to model: {self.model_name}. It may be an unsupported model!")
+        else:
+            logger.info(f"Pytorch transforms applied to model: {self.model_name}")
+
     @property
     @abstractmethod
     def model_name(self) -> str: ...
diff --git a/QEfficient/transformers/embeddings/embedding_utils.py b/QEfficient/transformers/embeddings/embedding_utils.py
index 57f8d5fe9..e7a90fc51 100644
--- a/QEfficient/transformers/embeddings/embedding_utils.py
+++ b/QEfficient/transformers/embeddings/embedding_utils.py
@@ -51,17 +51,4 @@ def forward(
         self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs
     ):
         output = self.base_model(input_ids, attention_mask, **kwargs)
-        return self.pooling_fn(output[0], attention_mask)
-
-
-def embedding_transform(func):
-    def wrapper(self, model, **kwargs):
-        if kwargs.get("pooling") is not None:
-            pooling = kwargs["pooling"]
-            pooling_method = POOLING_MAP[pooling]
-            model = PooledModel(model, pooling_method)
-            warnings.warn(f"Pooling method {pooling} is applied to the model.")
-        result = func(self, model, **kwargs)
-        return result
-
-    return wrapper
+        return self.pooling_fn(output[0], attention_mask)
\ No newline at end of file
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index bd35115c6..a4b9a0465 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -35,7 +35,6 @@
     calculate_latency,
     get_compilation_dims,
 )
-from QEfficient.transformers.embeddings.embedding_utils import embedding_transform
 from QEfficient.transformers.models.pytorch_transforms import (
     CustomOpsTransform,
     KVCacheModuleMethodMapperTransform,
@@ -43,6 +42,7 @@
     SpDTransform,
     VlmKVOffloadTransform,
     VlmNoKVOffloadTransform,
+    EmbeddingTransform,
 )
 from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers
 from QEfficient.transformers.quantizers.quant_transforms import (
@@ -158,10 +158,14 @@ class QEFFAutoModel(QEFFTransformersBase):
     _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
-    @embedding_transform
     def __init__(self, model: nn.Module, **kwargs):
         super().__init__(model)
+        
+        # Make Embedding specific transforms like pooling
+        self.model, _= EmbeddingTransform.apply(self.model, **kwargs)
+        
         self.model.base_model.config.use_cache = True
+        
         # self.model.config.use_cache = True
         # self.num_layers = self.model.base_model.config.num_hidden_layers
         self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None)
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index edac05248..3fd8803fe 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -7,6 +7,7 @@
 
 from types import MethodType
 from typing import Optional, Tuple
+import warnings
 
 from torch import nn
 from transformers.models.codegen.modeling_codegen import (
@@ -145,6 +146,7 @@
 
 from QEfficient.base.pytorch_transforms import ModuleMappingTransform, ModuleMethodMapperTransform
 from QEfficient.customop import CustomRMSNormAIC, GemmaCustomRMSNormAIC
+from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP, PooledModel
 from QEfficient.transformers.models.codegen.modeling_codegen import (
     QEffCodeGenAttention,
     QeffCodeGenBlock,
@@ -524,3 +526,15 @@ class KVCacheModuleMethodMapperTransform(ModuleMethodMapperTransform):
         "InternVisionEmbeddings": {"forward": QEffInternVisionEmbeddings.forward},
     }
     _match_class_replace_method = {}
+
+class EmbeddingTransform:    
+    @classmethod
+    def apply(cls, model: nn.Module, **kwargs) -> Tuple[nn.Module, bool]:
+        transformed = False
+        if kwargs.get("pooling") is not None:
+            pooling = kwargs["pooling"]
+            pooling_method = POOLING_MAP[pooling]
+            model = PooledModel(model, pooling_method)
+            warnings.warn(f"Pooling method {pooling} is applied to the model.")
+        return model, transformed
+           
diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py
index 7804f0bdb..3abcce852 100644
--- a/tests/transformers/models/test_embedding_models.py
+++ b/tests/transformers/models/test_embedding_models.py
@@ -24,9 +24,9 @@
     # "sentence-transformers/multi-qa-mpnet-base-cos-v1",  # MPNetForMaskedLM
     # "BAAI/bge-reranker-v2-m3",  # XLMRobertaForSequenceClassification
     # "BAAI/bge-small-en-v1.5",  # BertModel
-    # "jinaai/jina-embeddings-v2-base-en",
+    "jinaai/jina-embeddings-v2-base-en",
     # "intfloat/e5-large"
-    "sentence-transformers/gtr-t5-large"
+    # "sentence-transformers/gtr-t5-large"
     # ]
 ]
 
@@ -156,17 +156,17 @@ def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name):
     check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1)
 
 
-@pytest.mark.on_qaic
-@pytest.mark.qnn
-@pytest.mark.parametrize("model_name", embed_test_models)
-def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name):
-    """
-    QNN Compilation path test.
-    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.
-    """
-    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
-    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
-
-    check_embed_pytorch_vs_ort_vs_ai100(
-        model_name=model_name, seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path
-    )
+# @pytest.mark.on_qaic
+# @pytest.mark.qnn
+# @pytest.mark.parametrize("model_name", embed_test_models)
+# def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name):
+#     """
+#     QNN Compilation path test.
+#     Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.
+#     """
+#     qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+#     create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+#     check_embed_pytorch_vs_ort_vs_ai100(
+#         model_name=model_name, seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path
+#     )

From 93fcc66db4de52dbf02b90ba949f3185c289a998 Mon Sep 17 00:00:00 2001
From: Amit Raj <quic_amitraj@quicinc.com>
Date: Thu, 29 May 2025 09:34:26 +0000
Subject: [PATCH 06/14] Updated tests and example script

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
---
 .../embeddings/embedding_utils.py             |  3 +-
 .../transformers/models/modeling_auto.py      | 12 +++--
 .../transformers/models/pytorch_transforms.py | 10 +++--
 examples/embedding_model.py                   | 25 ++---------
 .../models/test_embedding_models.py           | 45 ++++++-------------
 5 files changed, 30 insertions(+), 65 deletions(-)

diff --git a/QEfficient/transformers/embeddings/embedding_utils.py b/QEfficient/transformers/embeddings/embedding_utils.py
index e7a90fc51..1adf2908e 100644
--- a/QEfficient/transformers/embeddings/embedding_utils.py
+++ b/QEfficient/transformers/embeddings/embedding_utils.py
@@ -5,7 +5,6 @@
 #
 # -----------------------------------------------------------------------------
 
-import warnings
 from typing import Optional
 
 import torch
@@ -51,4 +50,4 @@ def forward(
         self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs
     ):
         output = self.base_model(input_ids, attention_mask, **kwargs)
-        return self.pooling_fn(output[0], attention_mask)
\ No newline at end of file
+        return self.pooling_fn(output[0], attention_mask)
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index a4b9a0465..8e58085fc 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -37,12 +37,12 @@
 )
 from QEfficient.transformers.models.pytorch_transforms import (
     CustomOpsTransform,
+    EmbeddingTransform,
     KVCacheModuleMethodMapperTransform,
     KVCacheTransform,
     SpDTransform,
     VlmKVOffloadTransform,
     VlmNoKVOffloadTransform,
-    EmbeddingTransform,
 )
 from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers
 from QEfficient.transformers.quantizers.quant_transforms import (
@@ -160,14 +160,12 @@ class QEFFAutoModel(QEFFTransformersBase):
 
     def __init__(self, model: nn.Module, **kwargs):
         super().__init__(model)
-        
+
         # Make Embedding specific transforms like pooling
-        self.model, _= EmbeddingTransform.apply(self.model, **kwargs)
-        
+        self.model, _ = EmbeddingTransform.apply(self.model, **kwargs)
+
         self.model.base_model.config.use_cache = True
-        
-        # self.model.config.use_cache = True
-        # self.num_layers = self.model.base_model.config.num_hidden_layers
+
         self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None)
 
     @classmethod
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index 3fd8803fe..faf38b34d 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -5,9 +5,9 @@
 #
 # -----------------------------------------------------------------------------
 
+import warnings
 from types import MethodType
 from typing import Optional, Tuple
-import warnings
 
 from torch import nn
 from transformers.models.codegen.modeling_codegen import (
@@ -527,7 +527,12 @@ class KVCacheModuleMethodMapperTransform(ModuleMethodMapperTransform):
     }
     _match_class_replace_method = {}
 
-class EmbeddingTransform:    
+
+class EmbeddingTransform:
+    """
+    Apply Embedding transform to the embedding model.
+    """
+
     @classmethod
     def apply(cls, model: nn.Module, **kwargs) -> Tuple[nn.Module, bool]:
         transformed = False
@@ -537,4 +542,3 @@ def apply(cls, model: nn.Module, **kwargs) -> Tuple[nn.Module, bool]:
             model = PooledModel(model, pooling_method)
             warnings.warn(f"Pooling method {pooling} is applied to the model.")
         return model, transformed
-           
diff --git a/examples/embedding_model.py b/examples/embedding_model.py
index d5c429d14..d67f3eaa2 100644
--- a/examples/embedding_model.py
+++ b/examples/embedding_model.py
@@ -8,39 +8,22 @@
 # This is the work example of the Embedding model with the AI 100
 # For more information, visit: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
 
-import torch
-import torch.nn.functional as F
 from transformers import AutoTokenizer
 
 from QEfficient import QEFFAutoModel as AutoModel
 
-
-def mean_pooling(model_output, attention_mask):
-    token_embeddings = model_output  # First element of model_output contains all token embeddings
-    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-
-
 # Sentences we want sentence embeddings for
 sentences = "This is an example sentence"
 
 # Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-large", token="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")
+tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
 
 
-qeff_model = AutoModel.from_pretrained("intfloat/e5-large", token="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")
+qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling="mean")
 qeff_model.compile(num_cores=14)
 
 # Tokenize sentences
 encoded_input = tokenizer(sentences, return_tensors="pt")
-qeff_output = torch.tensor(qeff_model.generate(encoded_input))
-
-# Perform pooling
-sentence_embeddings = mean_pooling(qeff_output, encoded_input["attention_mask"])
-
-# Normalize embeddings
-sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+sentence_embeddings = qeff_model.generate(encoded_input)
 
-print("Sentence embeddings:")
-print(sentence_embeddings)
+print("Sentence embeddings:", sentence_embeddings)
diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py
index 3abcce852..7d5b414fd 100644
--- a/tests/transformers/models/test_embedding_models.py
+++ b/tests/transformers/models/test_embedding_models.py
@@ -16,18 +16,13 @@
 
 from QEfficient.transformers.embeddings.embedding_utils import mean_pooling
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModel
-from QEfficient.utils._utils import create_json
-from QEfficient.utils.constants import Constants, QnnConstants
+from QEfficient.utils.constants import Constants
 
 embed_test_models = [
     # model_name, architecture
-    # "sentence-transformers/multi-qa-mpnet-base-cos-v1",  # MPNetForMaskedLM
-    # "BAAI/bge-reranker-v2-m3",  # XLMRobertaForSequenceClassification
-    # "BAAI/bge-small-en-v1.5",  # BertModel
-    "jinaai/jina-embeddings-v2-base-en",
-    # "intfloat/e5-large"
-    # "sentence-transformers/gtr-t5-large"
-    # ]
+    "sentence-transformers/multi-qa-mpnet-base-cos-v1",  # MPNetForMaskedLM
+    "BAAI/bge-reranker-v2-m3",  # XLMRobertaForSequenceClassification
+    "BAAI/bge-small-en-v1.5",  # BertModel
 ]
 
 
@@ -39,7 +34,7 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     qnn_config: Optional[str] = None,
 ):
     # Prepare input
-    tokenizer = AutoTokenizer.from_pretrained(model_name, token="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")
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
     inputs = tokenizer("My name is", return_tensors="pt")
 
     # Original PyTorch model
@@ -48,25 +43,14 @@ def check_embed_pytorch_vs_ort_vs_ai100(
         num_hidden_layers=n_layer,
         attn_implementation="eager",
         trust_remote_code=True,
-        token="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",
     )
 
     pt_outputs = pt_model(**inputs)
     pt_embeddings = pt_outputs[0][0].detach().numpy()
-
-    # checking without pooling
     # Pytorch transformed model
-    qeff_model = QEFFAutoModel.from_pretrained(
-        pretrained_model_name_or_path=model_name,
-        attn_implementation="eager",
-        num_hidden_layers=n_layer,
-        trust_remote_code=True,
-        token="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",
-    )
+    qeff_model = QEFFAutoModel(pt_model, pretrained_model_name_or_path=model_name)
     qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
     qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy()
-
-    # ipdb.set_trace()
     mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings))
     print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad)
     assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}"
@@ -79,8 +63,11 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     attention_mask = np.array(inputs["attention_mask"])
 
     onnx_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
-
+    # Run inference
     onnx_outputs = ort_session.run(None, onnx_inputs)
+
+    # Compare Transformed PyTorch and ONNX outputs
+
     pt_embeddings = pt_outputs[0][0].detach().numpy()
     onnx_embeddings = onnx_outputs[0]
     mad = np.mean(np.abs(pt_embeddings - onnx_embeddings))
@@ -100,18 +87,12 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     assert mad <= 10**-2, f"MAD is too high for onnx and Pytorch: {mad}"
     assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
 
-    # pt_pooled_embedding = average_pool(pt_outputs.last_hidden_state, inputs["attention_mask"])
+    # Testing pooled model
     pt_pooled_embedding = mean_pooling(pt_outputs.last_hidden_state, inputs["attention_mask"])
 
     # Pytorch transformed model
-    qeff_model = QEFFAutoModel.from_pretrained(
-        pretrained_model_name_or_path=model_name,
-        pooling="mean",
-        num_hidden_layers=n_layer,
-        token="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",
-        attn_implementation="eager",
-        trust_remote_code=True,
-    )
+    qeff_model = QEFFAutoModel(pt_model, pretrained_model_name_or_path=model_name, pooling="mean")
+
     inputs = tokenizer("My name is", return_tensors="pt")
     qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
 

From b108344ec9e26dfd486129be64e20073f029c05f Mon Sep 17 00:00:00 2001
From: Amit Raj <quic_amitraj@quicinc.com>
Date: Thu, 29 May 2025 09:43:15 +0000
Subject: [PATCH 07/14] Minor fixes in the tests

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
---
 .../models/test_embedding_models.py           | 31 ++++++++++---------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py
index 7d5b414fd..bf33d464f 100644
--- a/tests/transformers/models/test_embedding_models.py
+++ b/tests/transformers/models/test_embedding_models.py
@@ -16,7 +16,8 @@
 
 from QEfficient.transformers.embeddings.embedding_utils import mean_pooling
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModel
-from QEfficient.utils.constants import Constants
+from QEfficient.utils._utils import create_json
+from QEfficient.utils.constants import Constants, QnnConstants
 
 embed_test_models = [
     # model_name, architecture
@@ -137,17 +138,17 @@ def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name):
     check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1)
 
 
-# @pytest.mark.on_qaic
-# @pytest.mark.qnn
-# @pytest.mark.parametrize("model_name", embed_test_models)
-# def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name):
-#     """
-#     QNN Compilation path test.
-#     Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.
-#     """
-#     qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
-#     create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
-
-#     check_embed_pytorch_vs_ort_vs_ai100(
-#         model_name=model_name, seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path
-#     )
+@pytest.mark.on_qaic
+@pytest.mark.qnn
+@pytest.mark.parametrize("model_name", embed_test_models)
+def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name):
+    """
+    QNN Compilation path test.
+    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.
+    """
+    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+    check_embed_pytorch_vs_ort_vs_ai100(
+        model_name=model_name, seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path
+    )

From fb0948bc53cf1d4a1c5db973a77c484242b34283 Mon Sep 17 00:00:00 2001
From: Amit Raj <quic_amitraj@quicinc.com>
Date: Sun, 1 Jun 2025 06:57:20 +0000
Subject: [PATCH 08/14] Added support of list of seq_len at compile and
 generate will pick the closest seq_len based on input_ids_len

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
---
 .../transformers/models/modeling_auto.py      | 22 +++++++++++++------
 examples/embedding_model.py                   | 10 +++++----
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 8e58085fc..0e23477ea 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -278,7 +278,7 @@ def compile(
         onnx_path: Optional[str] = None,
         compile_dir: Optional[str] = None,
         *,
-        seq_len: int = 32,
+        seq_len: Union[int, List[int]] = 32,
         batch_size: int = 1,
         num_devices: int = 1,
         num_cores: int = 16,  # FIXME: Make this mandatory arg
@@ -293,7 +293,7 @@ def compile(
         ``Optional`` Args:
             :onnx_path (str, optional): Path to pre-exported onnx model.
             :compile_dir (str, optional): Path for saving the qpc generated.
-            :seq_len (int, optional): The length of the prompt should be less that ``seq_len``. ``Defaults to 32``.
+            :seq_len (Union[int, List[int]]): The length of the prompt should be less than ``seq_len``. ``Defaults to 32``.
             :batch_size (int, optional): Batch size. ``Defaults to 1``.
             :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1.
             :num_cores (int): Number of cores used to compile the model.
@@ -310,7 +310,7 @@ def compile(
         """
 
         specializations = [
-            {"batch_size": batch_size, "seq_len": seq_len},
+            {"batch_size": batch_size, "seq_len": sl} for sl in (seq_len if isinstance(seq_len, list) else [seq_len])
         ]
 
         return self._compile(
@@ -371,10 +371,19 @@ def cloud_ai_100_feature_generate(
         if self.qpc_session is None:
             self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids)
             self.batch_size = self.qpc_session.bindings[0].dims[0]
-            self.seq_len = self.qpc_session.bindings[0].dims[1]
-        # Prepare input
+
+        # Dynamically switch to the first allowed seq_len that can hold input_ids_len
+        input_ids_len = inputs["input_ids"].shape[1]
+
+        for allowed_shape in self.qpc_session.allowed_shapes:
+            seq_len_allowed = allowed_shape[1][1][1]
+
+            if seq_len_allowed > input_ids_len:
+                self.seq_len = seq_len_allowed
+                break
+
         input_ids = np.array(
-            torch.nn.functional.pad(inputs["input_ids"], (0, self.seq_len - inputs["input_ids"].size(1)), "constant", 0)
+            torch.nn.functional.pad(inputs["input_ids"], (0, self.seq_len - input_ids_len), "constant", 0)
         )
         attention_mask = np.array(
             torch.nn.functional.pad(
@@ -389,7 +398,6 @@ def cloud_ai_100_feature_generate(
         }
         self.qpc_session.set_buffers(outputs)
         outputs = self.qpc_session.run(inputs)
-        # outputs = outputs["output"][:, :input_ids_len, :]
         return outputs
 
     def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]) -> List[torch.Tensor]:
diff --git a/examples/embedding_model.py b/examples/embedding_model.py
index d67f3eaa2..dcb63bed4 100644
--- a/examples/embedding_model.py
+++ b/examples/embedding_model.py
@@ -1,8 +1,8 @@
 # -----------------------------------------------------------------------------
-#
+
 # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
 # SPDX-License-Identifier: BSD-3-Clause
-#
+
 # -----------------------------------------------------------------------------
 
 # This is the work example of the Embedding model with the AI 100
@@ -18,12 +18,14 @@
 # Load model from HuggingFace Hub
 tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
 
-
+# if pooling is set to "mean", the model will use the mean pooling method else no pooling will be applied
 qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling="mean")
-qeff_model.compile(num_cores=14)
+
+qeff_model.compile(num_cores=16, seq_len=32)
 
 # Tokenize sentences
 encoded_input = tokenizer(sentences, return_tensors="pt")
+
 sentence_embeddings = qeff_model.generate(encoded_input)
 
 print("Sentence embeddings:", sentence_embeddings)

From 158f6a6e02912664c69f64b10be5738c07dec308 Mon Sep 17 00:00:00 2001
From: Amit Raj <quic_amitraj@quicinc.com>
Date: Mon, 2 Jun 2025 11:26:19 +0000
Subject: [PATCH 09/14] Added QAIC and QNN tests for pooling and multiple
 seq_len

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
---
 .../embeddings/embedding_utils.py             |   4 +
 .../transformers/models/modeling_auto.py      |   3 +
 .../transformers/models/pytorch_transforms.py |   2 +-
 examples/embedding_model.py                   |   3 +-
 .../models/test_embedding_models.py           | 135 +++++++++++-------
 5 files changed, 92 insertions(+), 55 deletions(-)

diff --git a/QEfficient/transformers/embeddings/embedding_utils.py b/QEfficient/transformers/embeddings/embedding_utils.py
index 1adf2908e..e96bb11de 100644
--- a/QEfficient/transformers/embeddings/embedding_utils.py
+++ b/QEfficient/transformers/embeddings/embedding_utils.py
@@ -40,6 +40,10 @@ def cls_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor)
 
 
 class PooledModel(nn.Module):
+    """
+    Adds pooling functionality to embedding model.
+    """
+
     def __init__(self, base_model, pooling_fn):
         super().__init__()
         self.config = base_model.config
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 0e23477ea..478ec45df 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -382,6 +382,9 @@ def cloud_ai_100_feature_generate(
                 self.seq_len = seq_len_allowed
                 break
 
+        # To handle single seq_len as we can't fetch allowed shapes for single seq_len
+        self.seq_len = self.qpc_session.bindings[0].dims[1] if not hasattr(self, "seq_len") else self.seq_len
+
         input_ids = np.array(
             torch.nn.functional.pad(inputs["input_ids"], (0, self.seq_len - input_ids_len), "constant", 0)
         )
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index faf38b34d..662d34735 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -530,7 +530,7 @@ class KVCacheModuleMethodMapperTransform(ModuleMethodMapperTransform):
 
 class EmbeddingTransform:
     """
-    Apply Embedding transform to the embedding model.
+    Apply Embedding transform to the model.
     """
 
     @classmethod
diff --git a/examples/embedding_model.py b/examples/embedding_model.py
index dcb63bed4..3c1914536 100644
--- a/examples/embedding_model.py
+++ b/examples/embedding_model.py
@@ -18,9 +18,10 @@
 # Load model from HuggingFace Hub
 tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
 
-# if pooling is set to "mean", the model will use the mean pooling method else no pooling will be applied
+# If pooling is not set, model will generate default output
 qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling="mean")
 
+# Here list of seq_len also can be used
 qeff_model.compile(num_cores=16, seq_len=32)
 
 # Tokenize sentences
diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py
index bf33d464f..53a179e64 100644
--- a/tests/transformers/models/test_embedding_models.py
+++ b/tests/transformers/models/test_embedding_models.py
@@ -14,16 +14,14 @@
 import torch
 from transformers import AutoModel, AutoTokenizer
 
-from QEfficient.transformers.embeddings.embedding_utils import mean_pooling
+from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModel
 from QEfficient.utils._utils import create_json
 from QEfficient.utils.constants import Constants, QnnConstants
 
 embed_test_models = [
-    # model_name, architecture
-    "sentence-transformers/multi-qa-mpnet-base-cos-v1",  # MPNetForMaskedLM
-    "BAAI/bge-reranker-v2-m3",  # XLMRobertaForSequenceClassification
-    "BAAI/bge-small-en-v1.5",  # BertModel
+    {"model_name": "jinaai/jina-embeddings-v2-base-code", "pooling": "mean"},
+    {"model_name": "sentence-transformers/nli-bert-base-cls-pooling", "pooling": "cls"},
 ]
 
 
@@ -33,6 +31,7 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     n_layer: int = 1,
     enable_qnn: Optional[bool] = False,
     qnn_config: Optional[str] = None,
+    pooling: Optional[str] = None,
 ):
     # Prepare input
     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -46,16 +45,27 @@ def check_embed_pytorch_vs_ort_vs_ai100(
         trust_remote_code=True,
     )
 
+    # Original PyTorch model output
     pt_outputs = pt_model(**inputs)
-    pt_embeddings = pt_outputs[0][0].detach().numpy()
-    # Pytorch transformed model
-    qeff_model = QEFFAutoModel(pt_model, pretrained_model_name_or_path=model_name)
+    pooling_method = POOLING_MAP[pooling] if pooling else None
+    pt_embeddings = (
+        pooling_method(pt_outputs.last_hidden_state, inputs["attention_mask"])
+        if pooling
+        else pt_outputs.last_hidden_state
+    )
+
+    # QEff transformed PyTorch model
+    qeff_model = QEFFAutoModel(pt_model, pretrained_model_name_or_path=model_name, pooling=pooling)
+
+    # QEff transformed PyTorch model output
     qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
-    qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy()
-    mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings))
+    qeff_pt_embeddings = qeff_pt_outputs if pooling else qeff_pt_outputs[0]
+
+    mad = torch.mean(torch.abs(pt_embeddings - qeff_pt_embeddings))
     print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad)
     assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}"
 
+    # ONNX session load
     onnx_model = qeff_model.export()
     ort_session = ort.InferenceSession(str(onnx_model))
 
@@ -64,14 +74,12 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     attention_mask = np.array(inputs["attention_mask"])
 
     onnx_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
+
     # Run inference
     onnx_outputs = ort_session.run(None, onnx_inputs)
 
     # Compare Transformed PyTorch and ONNX outputs
-
-    pt_embeddings = pt_outputs[0][0].detach().numpy()
-    onnx_embeddings = onnx_outputs[0]
-    mad = np.mean(np.abs(pt_embeddings - onnx_embeddings))
+    mad = torch.mean(torch.abs(pt_embeddings - torch.tensor(onnx_outputs[0])))
     print("Mad for onnx and PyTorch is ", mad)
     assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}"
 
@@ -81,74 +89,95 @@ def check_embed_pytorch_vs_ort_vs_ai100(
         qnn_config=qnn_config,
     )
     ai100_output = qeff_model.generate(inputs=inputs)
+    qeff_ai100_embeddings = (
+        ai100_output["output"] if pooling else ai100_output["output"][:, : inputs["input_ids"].shape[1], :]
+    )
 
     # Compare ONNX and AI 100 outputs
-    mad = np.mean(np.abs(ai100_output["output"][:, : inputs["input_ids"].shape[1], :] - onnx_outputs[0]))
+    mad = np.mean(np.abs(qeff_ai100_embeddings - onnx_outputs[0]))
     print("Mad for onnx and AI 100 output is ", mad)
     assert mad <= 10**-2, f"MAD is too high for onnx and Pytorch: {mad}"
     assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
 
-    # Testing pooled model
-    pt_pooled_embedding = mean_pooling(pt_outputs.last_hidden_state, inputs["attention_mask"])
 
-    # Pytorch transformed model
-    qeff_model = QEFFAutoModel(pt_model, pretrained_model_name_or_path=model_name, pooling="mean")
+@pytest.mark.on_qaic
+@pytest.mark.parametrize("model", embed_test_models)
+def test_embed_model_pytorch_vs_onnx_vs_ai100(model):
+    """
+    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.
+    """
+    check_embed_pytorch_vs_ort_vs_ai100(model_name=model["model_name"], seq_len=32, n_layer=1)
 
-    inputs = tokenizer("My name is", return_tensors="pt")
-    qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
 
-    mad = torch.mean(torch.abs(pt_pooled_embedding - qeff_pt_outputs))
-    print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad)
-    assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}"
+@pytest.mark.on_qaic
+@pytest.mark.parametrize("model", embed_test_models)
+def test_embed_model_pytorch_vs_onnx_vs_ai100_pooling(model):
+    """
+    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with pooling.
+    """
+    check_embed_pytorch_vs_ort_vs_ai100(model_name=model["model_name"], seq_len=32, n_layer=1, pooling=model["pooling"])
 
-    onnx_model = qeff_model.export()
-    ort_session = ort.InferenceSession(str(onnx_model))
 
-    # Run inference
-    onnx_outputs = ort_session.run(None, onnx_inputs)
+@pytest.mark.on_qaic
+@pytest.mark.parametrize("model", embed_test_models[:1])
+def test_embed_model_pytorch_vs_onnx_vs_ai100_multiple_seq_len(model):
+    """
+    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with multiple seq_len.
+    """
+    check_embed_pytorch_vs_ort_vs_ai100(model_name=model["model_name"], seq_len=[32, 20], n_layer=1)
 
-    # Compare Transformed PyTorch and ONNX outputs
 
-    pt_embeddings = pt_outputs[0][0].detach().numpy()
-    onnx_embeddings = onnx_outputs[0]
-    mad = np.mean(np.abs(pt_pooled_embedding.detach().numpy() - onnx_embeddings))
-    print("Mad for onnx and PyTorch is ", mad)
-    assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}"
-
-    qeff_model.compile(
-        num_cores=14,
-        enable_qnn=enable_qnn,
-        qnn_config=qnn_config,
-    )
-    ai100_output = qeff_model.generate(inputs=inputs)
-
-    # Compare ONNX and AI 100 outputs
-    mad = np.mean(np.abs(ai100_output["output"] - onnx_outputs[0]))
-    print("Mad for onnx and AI 100 output is ", mad)
-    assert mad <= 10**-2, f"MAD is too high for onnx and Pytorch: {mad}"
-    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
+##########  QNN TESTS ##############
 
 
 @pytest.mark.on_qaic
+@pytest.mark.qnn
 @pytest.mark.parametrize("model_name", embed_test_models)
-def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name):
+def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name):
     """
+    QNN Compilation path test.
     Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.
     """
-    check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1)
+    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+    check_embed_pytorch_vs_ort_vs_ai100(
+        model_name=model_name, seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path
+    )
+
+
+@pytest.mark.on_qaic
+@pytest.mark.qnn
+@pytest.mark.parametrize("model", embed_test_models)
+def test_embed_model_pytorch_vs_onnx_vs_ai100_pooling_qnn(model):
+    """
+    QNN Compilation path test.
+    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with pooling.
+    """
+    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+    check_embed_pytorch_vs_ort_vs_ai100(
+        model_name=model["model_name"],
+        seq_len=32,
+        n_layer=1,
+        pooling=model["pooling"],
+        enable_qnn=True,
+        qnn_config=qnn_config_json_path,
+    )
 
 
 @pytest.mark.on_qaic
 @pytest.mark.qnn
-@pytest.mark.parametrize("model_name", embed_test_models)
-def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name):
+@pytest.mark.parametrize("model", [embed_test_models[0]])
+def test_embed_model_pytorch_vs_onnx_vs_ai100_multiple_seq_len_qnn(model):
     """
     QNN Compilation path test.
-    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.
+    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with multiple seq_len.
     """
     qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
     create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
 
     check_embed_pytorch_vs_ort_vs_ai100(
-        model_name=model_name, seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path
+        model_name=model["model_name"], seq_len=[32, 20], n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path
     )

From 6f445e6174d0f6d5eac8b4ca12b1b3014db94e1c Mon Sep 17 00:00:00 2001
From: Amit Raj <quic_amitraj@quicinc.com>
Date: Mon, 9 Jun 2025 10:11:48 +0000
Subject: [PATCH 10/14] Addressed comments and added support for pooling as a
 method as well

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
---
 .../transformers/embeddings/embedding_utils.py       | 11 +++++++++++
 QEfficient/transformers/models/modeling_auto.py      |  6 +++---
 QEfficient/transformers/models/pytorch_transforms.py | 11 ++++++-----
 examples/embedding_model.py                          | 12 +++++++++---
 4 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/QEfficient/transformers/embeddings/embedding_utils.py b/QEfficient/transformers/embeddings/embedding_utils.py
index e96bb11de..36827e60e 100644
--- a/QEfficient/transformers/embeddings/embedding_utils.py
+++ b/QEfficient/transformers/embeddings/embedding_utils.py
@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
+import inspect
 from typing import Optional
 
 import torch
@@ -55,3 +56,13 @@ def forward(
     ):
         output = self.base_model(input_ids, attention_mask, **kwargs)
         return self.pooling_fn(output[0], attention_mask)
+
+def validate_user_pooling_function(user_function):
+    if not callable(user_function):
+        raise TypeError("Provided pooling function is not callable.")
+
+    sig = inspect.signature(user_function)
+    required_args = {"last_hidden_states", "attention_mask"}
+    if not required_args.issubset(sig.parameters.keys()):
+        raise ValueError(f"Pooling function must accept arguments: {required_args}")
+    return user_function
\ No newline at end of file
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 478ec45df..bfd86f80a 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -37,9 +37,9 @@
 )
 from QEfficient.transformers.models.pytorch_transforms import (
     CustomOpsTransform,
-    EmbeddingTransform,
     KVCacheModuleMethodMapperTransform,
     KVCacheTransform,
+    PoolingTransform,
     SpDTransform,
     VlmKVOffloadTransform,
     VlmNoKVOffloadTransform,
@@ -162,7 +162,7 @@ def __init__(self, model: nn.Module, **kwargs):
         super().__init__(model)
 
         # Make Embedding specific transforms like pooling
-        self.model, _ = EmbeddingTransform.apply(self.model, **kwargs)
+        self.model, _ = PoolingTransform.apply(self.model, **kwargs)
 
         self.model.base_model.config.use_cache = True
 
@@ -378,7 +378,7 @@ def cloud_ai_100_feature_generate(
         for allowed_shape in self.qpc_session.allowed_shapes:
             seq_len_allowed = allowed_shape[1][1][1]
 
-            if seq_len_allowed > input_ids_len:
+            if seq_len_allowed >= input_ids_len:
                 self.seq_len = seq_len_allowed
                 break
 
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index 662d34735..7b8df70f9 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -146,7 +146,7 @@
 
 from QEfficient.base.pytorch_transforms import ModuleMappingTransform, ModuleMethodMapperTransform
 from QEfficient.customop import CustomRMSNormAIC, GemmaCustomRMSNormAIC
-from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP, PooledModel
+from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP, PooledModel, validate_user_pooling_function
 from QEfficient.transformers.models.codegen.modeling_codegen import (
     QEffCodeGenAttention,
     QeffCodeGenBlock,
@@ -528,9 +528,10 @@ class KVCacheModuleMethodMapperTransform(ModuleMethodMapperTransform):
     _match_class_replace_method = {}
 
 
-class EmbeddingTransform:
+class PoolingTransform:
     """
-    Apply Embedding transform to the model.
+    Apply a pooling transformation to the model. When a ``pooling`` kwarg is provided, the model is wrapped in ``PooledModel``.
+    ``pooling`` may be a ``POOLING_MAP`` key (str) or a user-defined callable accepting ``last_hidden_states`` and ``attention_mask``.
     """
 
     @classmethod
@@ -538,7 +539,7 @@ def apply(cls, model: nn.Module, **kwargs) -> Tuple[nn.Module, bool]:
         transformed = False
         if kwargs.get("pooling") is not None:
             pooling = kwargs["pooling"]
-            pooling_method = POOLING_MAP[pooling]
+            pooling_method = POOLING_MAP[pooling] if isinstance(pooling,str) else validate_user_pooling_function(pooling)
             model = PooledModel(model, pooling_method)
-            warnings.warn(f"Pooling method {pooling} is applied to the model.")
+            warnings.warn(f"Pooling method {pooling.__name__} is applied to the model.")
         return model, transformed
diff --git a/examples/embedding_model.py b/examples/embedding_model.py
index 3c1914536..b373ec0b5 100644
--- a/examples/embedding_model.py
+++ b/examples/embedding_model.py
@@ -8,10 +8,16 @@
 # This is the work example of the Embedding model with the AI 100
 # For more information, visit: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
 
+import torch
 from transformers import AutoTokenizer
 
 from QEfficient import QEFFAutoModel as AutoModel
 
+def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
+    last_hidden_states[input_mask_expanded == 0] = -1e9
+    return torch.max(last_hidden_states, 1)[0]
+
 # Sentences we want sentence embeddings for
 sentences = "This is an example sentence"
 
@@ -19,10 +25,10 @@
 tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
 
 # If pooling is not set, model will generate default output
-qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling="mean")
+qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling=max_pooling)
 
-# Here list of seq_len also can be used
-qeff_model.compile(num_cores=16, seq_len=32)
+# Here seq_len can be list seq_len or single int
+qeff_model.compile(num_cores=16, seq_len=[32,64])
 
 # Tokenize sentences
 encoded_input = tokenizer(sentences, return_tensors="pt")

From b68e16ded951346bc47a7b19bccbd5716d4da059 Mon Sep 17 00:00:00 2001
From: Amit Raj <quic_amitraj@quicinc.com>
Date: Mon, 9 Jun 2025 10:12:55 +0000
Subject: [PATCH 11/14] Ruff check and format

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
---
 .../embeddings/embedding_utils.py             | 19 ++++++++++++++++++-
 .../transformers/models/pytorch_transforms.py |  4 +++-
 examples/embedding_model.py                   | 12 ++++++++++--
 3 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/QEfficient/transformers/embeddings/embedding_utils.py b/QEfficient/transformers/embeddings/embedding_utils.py
index 36827e60e..094b23588 100644
--- a/QEfficient/transformers/embeddings/embedding_utils.py
+++ b/QEfficient/transformers/embeddings/embedding_utils.py
@@ -57,7 +57,24 @@ def forward(
         output = self.base_model(input_ids, attention_mask, **kwargs)
         return self.pooling_fn(output[0], attention_mask)
 
+
 def validate_user_pooling_function(user_function):
+    """
+    Validate a user-provided pooling function to ensure it meets the required interface.
+
+    The function should take two arguments:
+    - last_hidden_states (torch.Tensor): The last hidden states of the model.
+    - attention_mask (torch.Tensor): The attention mask of the input sequence.
+
+    It should return a torch.Tensor representing the pooled output.
+
+    Args:
+        user_function (callable): The user-provided pooling function.
+
+    Raises:
+        ValueError: If the function lacks the required arguments (``TypeError`` is raised if it is not callable).
+    """
+
     if not callable(user_function):
         raise TypeError("Provided pooling function is not callable.")
 
@@ -65,4 +82,4 @@ def validate_user_pooling_function(user_function):
     required_args = {"last_hidden_states", "attention_mask"}
     if not required_args.issubset(sig.parameters.keys()):
         raise ValueError(f"Pooling function must accept arguments: {required_args}")
-    return user_function
\ No newline at end of file
+    return user_function
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index 7b8df70f9..b2325459d 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -539,7 +539,9 @@ def apply(cls, model: nn.Module, **kwargs) -> Tuple[nn.Module, bool]:
         transformed = False
         if kwargs.get("pooling") is not None:
             pooling = kwargs["pooling"]
-            pooling_method = POOLING_MAP[pooling] if isinstance(pooling,str) else validate_user_pooling_function(pooling)
+            pooling_method = (
+                POOLING_MAP[pooling] if isinstance(pooling, str) else validate_user_pooling_function(pooling)
+            )
             model = PooledModel(model, pooling_method)
             warnings.warn(f"Pooling method {pooling.__name__} is applied to the model.")
         return model, transformed
diff --git a/examples/embedding_model.py b/examples/embedding_model.py
index b373ec0b5..072020b16 100644
--- a/examples/embedding_model.py
+++ b/examples/embedding_model.py
@@ -13,22 +13,30 @@
 
 from QEfficient import QEFFAutoModel as AutoModel
 
+
 def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
     last_hidden_states[input_mask_expanded == 0] = -1e9
     return torch.max(last_hidden_states, 1)[0]
 
+
 # Sentences we want sentence embeddings for
 sentences = "This is an example sentence"
 
 # Load model from HuggingFace Hub
 tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
 
-# If pooling is not set, model will generate default output
+
+# You can specify the pooling strategy either as a string (e.g., "mean") or by passing a custom pooling function.
+# If no pooling is specified, the model will return its default output (typically token embeddings).
 qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling=max_pooling)
 
+# Example: Using mean pooling by specifying it as a string.
+# This will return sentence embeddings computed using mean pooling.
+# qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling="mean")
+
 # Here seq_len can be list seq_len or single int
-qeff_model.compile(num_cores=16, seq_len=[32,64])
+qeff_model.compile(num_cores=16, seq_len=[32, 64])
 
 # Tokenize sentences
 encoded_input = tokenizer(sentences, return_tensors="pt")

From cf59606d9ae59af95ee21ee3648886058b5b7021 Mon Sep 17 00:00:00 2001
From: Amit Raj <quic_amitraj@quicinc.com>
Date: Tue, 10 Jun 2025 12:45:14 +0000
Subject: [PATCH 12/14] Addressed comments

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
---
 .../embeddings/embedding_utils.py             | 40 ++++++++++++++++
 .../transformers/models/modeling_auto.py      | 46 +++++++++++++------
 .../transformers/models/pytorch_transforms.py | 18 ++++----
 examples/embedding_model.py                   |  6 ++-
 4 files changed, 84 insertions(+), 26 deletions(-)

diff --git a/QEfficient/transformers/embeddings/embedding_utils.py b/QEfficient/transformers/embeddings/embedding_utils.py
index 094b23588..dd68e5fb9 100644
--- a/QEfficient/transformers/embeddings/embedding_utils.py
+++ b/QEfficient/transformers/embeddings/embedding_utils.py
@@ -13,22 +13,62 @@
 
 
 def mean_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    """
+    Performs mean pooling on the last hidden states of a transformer model.
+
+    Args:
+        last_hidden_states (torch.Tensor): The last hidden states of the transformer model.
+        attention_mask (torch.Tensor): The attention mask used to mask out padding tokens.
+
+    Returns:
+        torch.Tensor: The mean pooled last hidden states.
+    """
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
     return torch.sum(last_hidden_states * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
 
 def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    """
+    Performs average pooling on the last hidden states of a transformer model.
+
+    Args:
+        last_hidden_states (torch.Tensor): The last hidden states of the transformer model.
+        attention_mask (torch.Tensor): The attention mask used to mask out padding tokens.
+
+    Returns:
+        torch.Tensor: The average pooled last hidden states.
+    """
     last_hidden = last_hidden_states[0].masked_fill(~attention_mask[..., None].bool(), 0.0)
     return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
 
 
 def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    """
+    Performs max pooling on the last hidden states of a transformer model.
+
+    Args:
+        last_hidden_states (torch.Tensor): The last hidden states of the transformer model.
+        attention_mask (torch.Tensor): The attention mask used to mask out padding tokens.
+
+    Returns:
+        torch.Tensor: The max pooled last hidden states.
+    """
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
     last_hidden_states[input_mask_expanded == 0] = -1e9
     return torch.max(last_hidden_states, 1)[0]
 
 
 def cls_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    """
+    Performs CLS pooling on the last hidden states of a transformer model.
+
+    Args:
+        last_hidden_states (torch.Tensor): The last hidden states of the transformer model.
+        attention_mask (torch.Tensor): The attention mask used to mask out padding tokens.
+
+    Returns:
+        torch.Tensor: The CLS pooled last hidden states.
+    """
     return last_hidden_states[:, 0]
 
 
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index bfd86f80a..ba15271b7 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -158,11 +158,12 @@ class QEFFAutoModel(QEFFTransformersBase):
     _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
-    def __init__(self, model: nn.Module, **kwargs):
+    def __init__(self, model: nn.Module, pooling=None, **kwargs):
         super().__init__(model)
 
-        # Make Embedding specific transforms like pooling
-        self.model, _ = PoolingTransform.apply(self.model, **kwargs)
+        # Make Embedding specific transforms like appending pooling
+        if pooling:
+            self.model, _ = PoolingTransform.apply(self.model, pooling)
 
         self.model.base_model.config.use_cache = True
 
@@ -178,12 +179,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **k
         This API can also be used as exception for VLM model since transformers support loading InternChatVL models via AutoModel API we support it via AutoModelForCausalLM API
         Args:
             pretrained_model_name_or_path (str): The name or path of the pre-trained model.
-            pooling (Optional[str], optional): The pooling method to use. Defaults to None.
-                Options:
-                    - "mean": Mean pooling
-                    - "max": Max pooling
-                    - "cls": CLS token pooling
-                    - "avg": Average pooling
+            pooling (Optional[Union[str, Callable]], optional): The pooling method to use. Defaults to None.
+            Options:
+                - "mean": Mean pooling
+                - "max": Max pooling
+                - "cls": CLS token pooling
+                - "avg": Average pooling
+                - Callable: A custom pooling function
+                - None: No pooling applied
 
         .. code-block:: python
 
@@ -191,7 +194,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **k
             from transformers import AutoTokenizer
 
             # Initialize the model using from_pretrained similar to transformers.AutoModel.
-            model = QEFFAutoModel.from_pretrained("model_name")
+            model = QEFFAutoModel.from_pretrained("model_name", pooling="mean")
 
             # Now you can directly compile the model for Cloud AI 100
             model.compile(num_cores=16)  # Considering you have a Cloud AI 100 SKU
@@ -309,6 +312,9 @@ def compile(
             :str: Path of the compiled ``qpc`` package.
         """
 
+        if isinstance(seq_len, list) and len(seq_len) >= 15:
+            warnings.warn("Recommended: `seq_len` should contain fewer than 15 items.")
+
         specializations = [
             {"batch_size": batch_size, "seq_len": sl} for sl in (seq_len if isinstance(seq_len, list) else [seq_len])
         ]
@@ -396,11 +402,21 @@ def cloud_ai_100_feature_generate(
 
         inputs = dict(input_ids=input_ids, attention_mask=attention_mask)
 
-        outputs = {
-            "output": np.random.randn(*list(self.qpc_session.bindings[2].dims)).astype(np.float32),
-        }
-        self.qpc_session.set_buffers(outputs)
-        outputs = self.qpc_session.run(inputs)
+        # TODO: Remove this try/except fallback once the compiler fix is available
+        try:
+            outputs = {
+                "output": np.random.randn(*list(self.qpc_session.bindings[2].dims)).astype(np.float32),
+            }
+            self.qpc_session.set_buffers(outputs)
+            outputs = self.qpc_session.run(inputs)
+        except Exception as e:
+            outputs = {
+                "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[1]).astype(
+                    np.float32
+                ),
+            }
+            self.qpc_session.set_buffers(outputs)
+            outputs = self.qpc_session.run(inputs)
         return outputs
 
     def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]) -> List[torch.Tensor]:
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index b2325459d..fe2a9729a 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -7,7 +7,7 @@
 
 import warnings
 from types import MethodType
-from typing import Optional, Tuple
+from typing import Callable, Optional, Tuple, Union
 
 from torch import nn
 from transformers.models.codegen.modeling_codegen import (
@@ -535,13 +535,13 @@ class PoolingTransform:
     """
 
     @classmethod
-    def apply(cls, model: nn.Module, **kwargs) -> Tuple[nn.Module, bool]:
+    def apply(cls, model: nn.Module, pooling: Union[str, Callable]) -> Tuple[nn.Module, bool]:
         transformed = False
-        if kwargs.get("pooling") is not None:
-            pooling = kwargs["pooling"]
-            pooling_method = (
-                POOLING_MAP[pooling] if isinstance(pooling, str) else validate_user_pooling_function(pooling)
-            )
-            model = PooledModel(model, pooling_method)
-            warnings.warn(f"Pooling method {pooling.__name__} is applied to the model.")
+        pooling_method = (
+            POOLING_MAP[pooling]
+            if isinstance(pooling, str) and pooling in POOLING_MAP
+            else validate_user_pooling_function(pooling)
+        )
+        model = PooledModel(model, pooling_method)
+        warnings.warn("Pooling is applied to the model.")
         return model, transformed
diff --git a/examples/embedding_model.py b/examples/embedding_model.py
index 072020b16..23c9cfb3d 100644
--- a/examples/embedding_model.py
+++ b/examples/embedding_model.py
@@ -33,10 +33,12 @@ def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor)
 
 # Example: Using mean pooling by specifying it as a string.
 # This will return sentence embeddings computed using mean pooling.
-# qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling="mean")
+# qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling="mean")
 
-# Here seq_len can be list seq_len or single int
+# Here seq_len can be either a single int or a list of ints
 qeff_model.compile(num_cores=16, seq_len=[32, 64])
+# qeff_model.compile(num_cores=16, seq_len=32)
+
 
 # Tokenize sentences
 encoded_input = tokenizer(sentences, return_tensors="pt")

From d93d9d03fb54b9f00213735cbc3e1229cb71396b Mon Sep 17 00:00:00 2001
From: Amit Raj <quic_amitraj@quicinc.com>
Date: Wed, 11 Jun 2025 09:04:08 +0000
Subject: [PATCH 13/14] lint fixed

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
---
 QEfficient/transformers/models/modeling_auto.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index ba15271b7..dc5570dc5 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -409,7 +409,7 @@ def cloud_ai_100_feature_generate(
             }
             self.qpc_session.set_buffers(outputs)
             outputs = self.qpc_session.run(inputs)
-        except Exception as e:
+        except Exception:
             outputs = {
                 "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[1]).astype(
                     np.float32

From 5678b78f70eca6746bd8374fe871b2df4829b5f9 Mon Sep 17 00:00:00 2001
From: Abukhoyer Shaik <abukhoye@qti.qualcomm.com>
Date: Wed, 11 Jun 2025 13:46:44 +0000
Subject: [PATCH 14/14] qnn tests fixed embedding

Signed-off-by: Abukhoyer Shaik <abukhoye@qti.qualcomm.com>
---
 tests/transformers/models/test_embedding_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py
index 53a179e64..2d110faeb 100644
--- a/tests/transformers/models/test_embedding_models.py
+++ b/tests/transformers/models/test_embedding_models.py
@@ -142,7 +142,7 @@ def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name):
     create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
 
     check_embed_pytorch_vs_ort_vs_ai100(
-        model_name=model_name, seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path
+        model_name=model_name["model_name"], seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path
     )