From 32c3d3ca1af6f5c1e70f2ddf85a2ac47c072fc49 Mon Sep 17 00:00:00 2001
From: Rishin Raj
Date: Tue, 28 Apr 2026 13:19:30 +0530
Subject: [PATCH] Add SplitTensorsTransform to QEFFAutoModel to prevent >2GB
 protobuf exports

Signed-off-by: Rishin Raj
---
 .../transformers/models/modeling_auto.py      |  5 +-
 tests/unit_test/e2e/test_embedding_e2e.py     | 55 +++++++++++++++++++
 .../unit_test/models/test_model_quickcheck.py |  4 +-
 3 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 10dc5ddd9..70099bc5d 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -29,7 +29,7 @@
 
 import QEfficient
 from QEfficient.base.modeling_qeff import QEFFBaseModel
-from QEfficient.base.onnx_transforms import FP16ClipTransform
+from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform
 from QEfficient.base.pytorch_transforms import SplitGateUpWeightsTransform
 from QEfficient.generation.cloud_infer import QAICInferenceSession
 from QEfficient.generation.text_generation_inference import (
@@ -243,7 +243,8 @@ class QEFFAutoModel(QEFFTransformersBase):
 
     _hf_auto_class = AutoModel
     _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
-    _onnx_transforms = [FP16ClipTransform]
+    # FP16Clip inlines external weights; without SplitTensorsTransform the saved protobuf exceeds 2 GB for large embedders.
+    _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
     def __init__(self, model: nn.Module, pooling=None, **kwargs):
         """
diff --git a/tests/unit_test/e2e/test_embedding_e2e.py b/tests/unit_test/e2e/test_embedding_e2e.py
index 0c7558fe0..fcaabfafa 100644
--- a/tests/unit_test/e2e/test_embedding_e2e.py
+++ b/tests/unit_test/e2e/test_embedding_e2e.py
@@ -334,3 +334,58 @@ def test_bert_ort_mean_pooled_embedding_matches_qeff(self, tmp_export_dir):
         pt_top = int(pt_mean.argmax(-1))
         ort_top = int(ort_mean.argmax(-1))
         assert pt_top == ort_top, f"Mean-pooled embedding argmax mismatch: QEff={pt_top}, ORT={ort_top}"
+
+
+@pytest.mark.embedding
+@pytest.mark.onnx
+@pytest.mark.slow
+class TestEmbeddingOnnxExternalData:
+    """
+    Regression for QRANIUMSW-60769:
+
+    Embedding models >2 GB (e.g. BAAI/bge-reranker-v2-m3, intfloat/multilingual-e5-large)
+    produced a PooledModel.onnx whose weights were inlined into the protobuf, so the
+    AIC compiler's 2 GB ModelProto parser rejected it with
+    ``MODEL_LOADER_INVALID_PROTOBUF / Failed to parse ModelProto``.
+
+    The fix re-adds ``SplitTensorsTransform`` to ``QEFFAutoModel._onnx_transforms`` so
+    initializers above ``SIZE_THRESHOLD_DEFAULT`` (1024 bytes) are spilled to sidecar
+    ``*.onnx.data`` files.
+
+    Tiny BERT still has multi-KB initializers (e.g. the vocab embedding is
+    ``VOCAB_SIZE * HIDDEN_SIZE * 4`` bytes = 128 KB), so the split is observable on
+    CPU without downloading any large model.
+    """
+
+    def test_embedding_onnx_spills_weights_to_external_data(self, tmp_export_dir):
+        import pathlib
+
+        import onnx
+        from onnx import TensorProto
+
+        model, _ = make_tiny_bert()
+        qeff_model = QEFFAutoModel(model)
+        onnx_path = qeff_model.export(export_dir=str(tmp_export_dir))
+
+        loaded = onnx.load(str(onnx_path), load_external_data=False)
+        external_inits = [init for init in loaded.graph.initializer if init.data_location == TensorProto.EXTERNAL]
+        assert external_inits, (
+            "QEFFAutoModel export should spill large initializers to external data "
+            "(SplitTensorsTransform missing from _onnx_transforms). See QRANIUMSW-60769."
+        )
+
+        export_root = pathlib.Path(str(onnx_path)).parent
+        sidecar_files = list(export_root.glob("*.onnx.data"))
+        assert sidecar_files, (
+            f"Expected at least one *.onnx.data sidecar next to {onnx_path}, "
+            f"found only: {sorted(p.name for p in export_root.iterdir())}"
+        )
+
+    def test_embedding_auto_model_declares_split_tensors_transform(self):
+        """Guard: the class-level transform list must include SplitTensorsTransform."""
+        from QEfficient.base.onnx_transforms import SplitTensorsTransform
+
+        assert SplitTensorsTransform in QEFFAutoModel._onnx_transforms, (
+            "QEFFAutoModel._onnx_transforms must include SplitTensorsTransform so that "
+            ">2 GB embedding exports stay under the AIC compiler's ModelProto parse limit."
+        )
diff --git a/tests/unit_test/models/test_model_quickcheck.py b/tests/unit_test/models/test_model_quickcheck.py
index 5ee941e45..b7376543b 100644
--- a/tests/unit_test/models/test_model_quickcheck.py
+++ b/tests/unit_test/models/test_model_quickcheck.py
@@ -395,7 +395,7 @@ def test_text_embedding_fp16_clip_transform_and_export(tmp_path):
 
     transform_names = {transform.__name__ for transform in qeff_model._onnx_transforms}
     assert "FP16ClipTransform" in transform_names
-    assert "SplitTensorsTransform" not in transform_names
+    assert "SplitTensorsTransform" in transform_names
 
     inputs = tokenizer("hello world", return_tensors="pt")
     onnx_path = _exported_onnx_path(qeff_model.export(tmp_path / "embedding-ai100"))
@@ -675,7 +675,7 @@ def test_proxy_toggle_onnx_transform_policy_for_embedding():
 
         _skip_on_model_fetch_error(exc, model_id)
 
     _assert_proxy_only_onnx_transform_policy(
-        qeff_default, enable_proxy=False, always_on_transforms={"FP16ClipTransform"}
+        qeff_default, enable_proxy=False, always_on_transforms={"FP16ClipTransform", "SplitTensorsTransform"}
     )
     _assert_proxy_only_onnx_transform_policy(qeff_proxy, enable_proxy=True)