QEfficient/transformers/models/modeling_auto.py (5 changes: 3 additions & 2 deletions)

@@ -29,7 +29,7 @@

import QEfficient
from QEfficient.base.modeling_qeff import QEFFBaseModel
-from QEfficient.base.onnx_transforms import FP16ClipTransform
+from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform
from QEfficient.base.pytorch_transforms import SplitGateUpWeightsTransform
from QEfficient.generation.cloud_infer import QAICInferenceSession
from QEfficient.generation.text_generation_inference import (

@@ -243,7 +243,8 @@ class QEFFAutoModel(QEFFTransformersBase):

_hf_auto_class = AutoModel
_pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
-_onnx_transforms = [FP16ClipTransform]
+# FP16ClipTransform inlines external weights; without SplitTensorsTransform the saved protobuf exceeds 2 GB for large embedding models.
+_onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]

def __init__(self, model: nn.Module, pooling=None, **kwargs):
"""
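Reviewer note, not part of the diff: the behavior SplitTensorsTransform restores corresponds to ONNX's standard external-data save path. A minimal sketch, assuming placeholder file names and the 1024-byte threshold the new test cites as SIZE_THRESHOLD_DEFAULT:

import onnx

# Re-save an exported model with large initializers spilled to a sidecar file.
model = onnx.load("model.onnx")  # placeholder path, illustration only
onnx.save_model(
    model,
    "model_split.onnx",
    save_as_external_data=True,        # move initializers out of the protobuf
    all_tensors_to_one_file=True,      # single *.onnx.data sidecar
    location="model_split.onnx.data",  # written next to the output .onnx
    size_threshold=1024,               # only tensors above ~1 KB are spilled
)

Spilling the weights keeps the serialized ModelProto small regardless of total weight size, which is what keeps large embedding exports under the compiler's 2 GB parse limit.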
tests/unit_test/e2e/test_embedding_e2e.py (55 changes: 55 additions & 0 deletions)

@@ -334,3 +334,58 @@ def test_bert_ort_mean_pooled_embedding_matches_qeff(self, tmp_export_dir):
pt_top = int(pt_mean.argmax(-1))
ort_top = int(ort_mean.argmax(-1))
assert pt_top == ort_top, f"Mean-pooled embedding argmax mismatch: QEff={pt_top}, ORT={ort_top}"


@pytest.mark.embedding
@pytest.mark.onnx
@pytest.mark.slow
class TestEmbeddingOnnxExternalData:
"""
Regression for QRANIUMSW-60769:

Embedding models >2 GB (e.g. BAAI/bge-reranker-v2-m3, intfloat/multilingual-e5-large)
produced a PooledModel.onnx whose weights were inlined into the protobuf, so the
AIC compiler's 2 GB ModelProto parser rejected it with
``MODEL_LOADER_INVALID_PROTOBUF / Failed to parse ModelProto``.

The fix re-adds ``SplitTensorsTransform`` to ``QEFFAutoModel._onnx_transforms`` so
initializers above ``SIZE_THRESHOLD_DEFAULT`` (1024 bytes) are spilled to sidecar
``*.onnx.data`` files.

Tiny BERT still has multi-KB initializers (e.g. the vocab embedding is
``VOCAB_SIZE * HIDDEN_SIZE * 4`` bytes = 128 KB), so the split is observable on
CPU without downloading any large model.
"""

def test_embedding_onnx_spills_weights_to_external_data(self, tmp_export_dir):
import pathlib

import onnx
from onnx import TensorProto

model, _ = make_tiny_bert()
qeff_model = QEFFAutoModel(model)
onnx_path = qeff_model.export(export_dir=str(tmp_export_dir))

loaded = onnx.load(str(onnx_path), load_external_data=False)
external_inits = [init for init in loaded.graph.initializer if init.data_location == TensorProto.EXTERNAL]
assert external_inits, (
"QEFFAutoModel export should spill large initializers to external data "
"(SplitTensorsTransform missing from _onnx_transforms). See QRANIUMSW-60769."
)

export_root = pathlib.Path(str(onnx_path)).parent
sidecar_files = list(export_root.glob("*.onnx.data"))
assert sidecar_files, (
f"Expected at least one *.onnx.data sidecar next to {onnx_path}, "
f"found only: {sorted(p.name for p in export_root.iterdir())}"
)

def test_embedding_auto_model_declares_split_tensors_transform(self):
"""Guard: the class-level transform list must include SplitTensorsTransform."""
from QEfficient.base.onnx_transforms import SplitTensorsTransform

assert SplitTensorsTransform in QEFFAutoModel._onnx_transforms, (
"QEFFAutoModel._onnx_transforms must include SplitTensorsTransform so that "
">2 GB embedding exports stay under the AIC compiler's ModelProto parse limit."
)
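Reviewer sketch, not part of the diff: the failure mode this test class guards against can also be measured directly by approximating how large the protobuf would be if all weights stayed inlined. The PooledModel.onnx name comes from the ticket description and is a placeholder; the sum only counts raw tensor payloads, so it is an approximation.

import onnx

PROTOBUF_LIMIT = 2**31 - 1  # hard cap on a single serialized protobuf message

# load_external_data=True (the default) pulls any sidecar weights back into raw_data.
m = onnx.load("PooledModel.onnx")
# Approximate the fully-inlined size by summing raw tensor payloads.
inlined_bytes = sum(len(t.raw_data) for t in m.graph.initializer)
if inlined_bytes > PROTOBUF_LIMIT:
    print("saving without external data would trip MODEL_LOADER_INVALID_PROTOBUF")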
tests/unit_test/models/test_model_quickcheck.py (4 changes: 2 additions & 2 deletions)

@@ -395,7 +395,7 @@ def test_text_embedding_fp16_clip_transform_and_export(tmp_path):
transform_names = {transform.__name__ for transform in qeff_model._onnx_transforms}

assert "FP16ClipTransform" in transform_names
assert "SplitTensorsTransform" not in transform_names
assert "SplitTensorsTransform" in transform_names

inputs = tokenizer("hello world", return_tensors="pt")
onnx_path = _exported_onnx_path(qeff_model.export(tmp_path / "embedding-ai100"))

@@ -675,7 +675,7 @@ def test_proxy_toggle_onnx_transform_policy_for_embedding():
_skip_on_model_fetch_error(exc, model_id)

_assert_proxy_only_onnx_transform_policy(
-qeff_default, enable_proxy=False, always_on_transforms={"FP16ClipTransform"}
+qeff_default, enable_proxy=False, always_on_transforms={"FP16ClipTransform", "SplitTensorsTransform"}
)
_assert_proxy_only_onnx_transform_policy(qeff_proxy, enable_proxy=True)
