`QEfficient/transformers/models/modeling_auto.py` (6 additions, 1 deletion)

```diff
@@ -1328,14 +1328,19 @@ def export(
                 kv_offload=True,
                 continuous_batching=self.continuous_batching,
                 comp_ctx_lengths=self.comp_ctx_lengths_decode,
+                prefill_seq_len=prefill_seq_len,
             )
             dynamic_axes = self.model.get_onnx_dynamic_axes(
                 kv_offload=True,
                 continuous_batching=self.continuous_batching,
                 comp_ctx_lengths=self.comp_ctx_lengths_decode,
             )
         except TypeError:
-            inputs = self.model.get_dummy_inputs(kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode)
+            inputs = self.model.get_dummy_inputs(
+                kv_offload=True,
+                comp_ctx_lengths=self.comp_ctx_lengths_decode,
+                prefill_seq_len=prefill_seq_len,
+            )
             dynamic_axes = self.model.get_onnx_dynamic_axes(
                 kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode
             )
```
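The `except TypeError` fallback keeps export working for model classes whose `get_dummy_inputs` does not accept the newer keyword arguments (here, `continuous_batching` is dropped on retry). A minimal sketch of the pattern, using hypothetical stand-in functions rather than QEfficient APIs:

```python
# Sketch of the backward-compatible kwarg pattern; `legacy_get_dummy_inputs`
# is a hypothetical stand-in, not a QEfficient API.
def legacy_get_dummy_inputs(kv_offload=True, comp_ctx_lengths=None, prefill_seq_len=None):
    # An older implementation: no `continuous_batching` parameter.
    return {"kv_offload": kv_offload, "prefill_seq_len": prefill_seq_len}

def build_inputs(get_dummy_inputs, prefill_seq_len=None):
    try:
        # Preferred path: pass every kwarg the exporter knows about.
        return get_dummy_inputs(
            kv_offload=True, continuous_batching=False, prefill_seq_len=prefill_seq_len
        )
    except TypeError:
        # Older signatures reject unknown kwargs; retry without them.
        return get_dummy_inputs(kv_offload=True, prefill_seq_len=prefill_seq_len)

print(build_inputs(legacy_get_dummy_inputs, prefill_seq_len=128))
```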
`QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py` (10 additions, 5 deletions)

```diff
@@ -848,8 +848,13 @@ def get_dummy_inputs(
         continuous_batching: bool = False,
         **kwargs,
     ):
+        prefill_seq_len = kwargs.get("prefill_seq_len")
+        if prefill_seq_len is None:
+            prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
+        prefill_seq_len = int(prefill_seq_len)
+
         inputs_shapes = {}
-        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
+        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len)
         # vision_size = 1024
         vision_size = 187
         inputs_shapes["vision_embeds"] = (
@@ -861,7 +866,7 @@
         inputs_shapes["position_ids"] = (
             3,
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
-            constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
+            prefill_seq_len,
         )
         inputs_shapes["pixel_values"] = (748, 1536)
         inputs_shapes["image_idx"] = (1, 1)
@@ -881,8 +886,8 @@
         lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=torch.float32)
         lang_inputs["position_ids"] = (
             (
-                torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64)
-                .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
+                torch.arange(prefill_seq_len, dtype=torch.int64)
+                .view(1, prefill_seq_len)
                 .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1)
             )
             .unsqueeze(0)
@@ -898,7 +903,7 @@
         kv_cache_shape = get_padding_shape_from_config(
             config=self.model.config.text_config,
             batch_size=fbs if continuous_batching else bs,
-            seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
+            seq_len=prefill_seq_len,
         )

         lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)]
```
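With this change, `get_dummy_inputs` sizes its example tensors from the requested prefill length instead of the fixed export constant. A shapes-only illustration of the intended effect (the two constants are stand-ins for QEfficient's export defaults):

```python
# Shapes-only sketch of how prefill_seq_len drives the dummy inputs above.
# The two constants are illustrative stand-ins for QEfficient's defaults.
ONNX_EXPORT_EXAMPLE_BATCH_SIZE = 1
ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32  # assumed default when no prefill_seq_len is given

def dummy_shapes(prefill_seq_len=None):
    if prefill_seq_len is None:
        prefill_seq_len = ONNX_EXPORT_EXAMPLE_SEQ_LEN
    prefill_seq_len = int(prefill_seq_len)
    return {
        "input_ids": (ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len),
        # Qwen3-VL position ids carry a leading mrope axis: (3, batch, seq)
        "position_ids": (3, ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len),
    }

print(dummy_shapes())     # falls back to the export default
print(dummy_shapes(128))  # custom prefill length -> (1, 128) input_ids
```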
`QEfficient/utils/generate_inputs.py` (4 additions, 2 deletions)

```diff
@@ -364,8 +364,9 @@ def update_vlm_ort_outputs(self, ort_outputs):
         Return:
             updated_outputs (Dict): Updated past_key_values, logits, pixel_values
         """
+        num_layers = self.n_layer[0] if isinstance(self.n_layer, (list, tuple)) else self.n_layer
         present_key_values = []
-        for i in range(self.n_layer[0]):
+        for i in range(num_layers):
             if "past_key." + str(i) + "_RetainedState" in ort_outputs:
                 present_key_values.append(ort_outputs["past_key." + str(i) + "_RetainedState"])
             if "past_value." + str(i) + "_RetainedState" in ort_outputs:
@@ -397,7 +398,8 @@ def update_vlm_ort_inputs(self, inputs, ort_outputs):
         updated_inputs = {}
         updated_inputs["input_ids"] = ort_outputs["logits"].argmax(-1)
         updated_inputs["position_ids"] = np.max(inputs["position_ids"], axis=1, keepdims=True) + 1
-        for i in range(self.n_layer[0]):
+        num_layers = self.n_layer[0] if isinstance(self.n_layer, (list, tuple)) else self.n_layer
+        for i in range(num_layers):
             updated_inputs["past_key." + str(i)] = ort_outputs["past_key_values"][i * 2]
             updated_inputs["past_value." + str(i)] = ort_outputs["past_key_values"][i * 2 + 1]
         if "pixel_values_RetainedState" in ort_outputs.keys():
```
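Both helpers now tolerate `self.n_layer` arriving either as a plain integer layer count or as a list/tuple whose first element is that count. The normalization idiom in isolation:

```python
# The n_layer normalization added above, in isolation.
def normalize_layers(n_layer):
    # Accept either 24 or (24, ...) / [24, ...].
    return n_layer[0] if isinstance(n_layer, (list, tuple)) else n_layer

assert normalize_layers(24) == 24
assert normalize_layers((24, 8)) == 24
assert normalize_layers([36]) == 36
```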
`QEfficient/utils/test_utils.py` (2 additions, 0 deletions)

```diff
@@ -472,6 +472,8 @@ class ModelConfig:
     "Qwen/Qwen2.5-VL-3B-Instruct",
     "Qwen/Qwen3-VL-30B-A3B-Instruct",
     "Qwen/Qwen3-VL-2B-Instruct",
+    "Qwen/Qwen3-VL-Reranker-2B",
+    "Qwen/Qwen3-VL-Reranker-8B",
 }

 EXTERNAL_MODELS = {
```
`docs/source/validate.md` (7 additions, 0 deletions)

```diff
@@ -84,6 +84,13 @@
 | **Qwen2_5_VLForConditionalGeneration** | Qwen2.5-VL | [Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) | ✔️ | ✔️ | ✕ | ✔️ |
 | **Mistral3ForConditionalGeneration** | Mistral3 | [mistralai/Mistral-Small-3.1-24B-Instruct-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503) | ✕ | ✔️ | ✕ | ✕ |

+### Vision-Language Reranker Models (Text + Image Scoring)
+**QEff Auto Class:** `QEFFAutoModelForImageTextToText`
+
+| Architecture | Model Family | Representative Models | QEff Single QPC | QEff Dual QPC | vLLM Single QPC | vLLM Dual QPC |
+|--------------|--------------|-----------------------|-----------------|---------------|-----------------|---------------|
+| **Qwen3VLForConditionalGeneration** | Qwen3-VL Reranker | [Qwen/Qwen3-VL-Reranker-2B](https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B)<br>[Qwen/Qwen3-VL-Reranker-8B](https://huggingface.co/Qwen/Qwen3-VL-Reranker-8B) | ✕ | ✔️ | ✕ | ✕ |
+

 **Dual QPC:**
```
`examples/image_text_to_text/README.md` (6 additions, 0 deletions)

````diff
@@ -100,12 +100,18 @@ Some models have specialized examples demonstrating advanced features:
 |-------|----------|
 | **Llama-4** | [models/llama4/](models/llama4/) |
 | **Qwen** | [models/qwen_vl/](models/qwen_vl/) |
+| **Qwen3-VL Reranker** | [models/qwen3vl/reranker/](models/qwen3vl/reranker/) |
 | **Mistral** | [models/mistral_vision/](models/mistral_vision/) |
 | **Gemma** | [models/gemma_vision/](models/gemma_vision/) |
 | **Granite** | [models/granite_vision/](models/granite_vision/) |
 | **InternVL** | [models/internvl/](models/internvl/) |
 | **Molmo** | [models/molmo/](models/molmo/) |

+Example command for the Qwen3-VL reranker:
+```bash
+python models/qwen3vl/reranker/qwen3_vl_reranker.py
+```
+

 ## Documentation
 - **Full Guide**: [VLM Documentation](../../docs/source/quick_start.md#vision-language-models)
````
`examples/image_text_to_text/models/qwen3vl/reranker/README.md` (new file, 52 additions)
# Qwen3-VL Reranker Inference

This directory contains an AI 100 example that runs Qwen3-VL reranker models with QEfficient and prints per-document relevance scores.

Supported models:
- `Qwen/Qwen3-VL-Reranker-2B`
- `Qwen/Qwen3-VL-Reranker-8B`

## What this example does

- Loads a Qwen3-VL reranker from Hugging Face (or a local snapshot path).
- Uses QEff dual-QPC execution (vision encoder + language model).
- Runs the same query against multiple text/image documents.
- Prints one score per document in input order.
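For orientation, the flow follows QEfficient's usual image-text-to-text path. A condensed sketch (not the actual script: the compile arguments and scoring step are simplified assumptions, so treat `qwen3_vl_reranker.py` as the source of truth):

```python
# Condensed sketch of the reranker flow; see qwen3_vl_reranker.py for the
# real implementation. Compile kwargs and scoring are simplified assumptions.
from QEfficient import QEFFAutoModelForImageTextToText

model_name = "Qwen/Qwen3-VL-Reranker-2B"

# kv_offload=True selects dual-QPC execution (separate vision and language QPCs).
model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, kv_offload=True)

# Compile both QPCs for AI 100.
model.compile(num_cores=16, num_devices=1, ctx_len=2048)

# The script then processes one query against each candidate document
# (text and/or image), runs it through both QPCs, and prints one
# relevance score per document, in input order.
```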

## Required package

- `qwen-vl-utils>=0.0.14`

```bash
pip install "qwen-vl-utils>=0.0.14"
```

## Script

- `qwen3_vl_reranker.py`

## Run

```bash
python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \
--model-name Qwen/Qwen3-VL-Reranker-2B
```

Or run the 8B model:

```bash
python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \
--model-name Qwen/Qwen3-VL-Reranker-8B
```

With compile parameters:

```bash
python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \
--model-name Qwen/Qwen3-VL-Reranker-2B \
--ctx-len 2048 \
--num-cores 16 \
--num-devices 1 \
--compile-prefill-seq-len 4096 \
--mxfp6-matmul
```
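If the CLI flags map one-to-one onto QEfficient compile keywords (a plausible reading of the flag names, but verify against `qwen3_vl_reranker.py`), the equivalent programmatic calls would look like:

```python
# Hypothetical programmatic equivalent of the CLI invocation above; the
# flag-to-kwarg mapping is an assumption, not taken from the script.
from QEfficient import QEFFAutoModelForImageTextToText

model = QEFFAutoModelForImageTextToText.from_pretrained(
    "Qwen/Qwen3-VL-Reranker-2B", kv_offload=True
)
model.compile(
    ctx_len=2048,          # --ctx-len
    num_cores=16,          # --num-cores
    num_devices=1,         # --num-devices
    prefill_seq_len=4096,  # --compile-prefill-seq-len
    mxfp6_matmul=True,     # --mxfp6-matmul
)
```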