diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 0995cc5bd..21847f25d 100644 --- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -383,11 +383,7 @@ def forward( key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - # cos, sin = position_embeddings - # kv_seq_len = key_states.shape[-2] - # kv_seq_len = past_key_value.get_seq_length() past_seen_tokens = past_key_values.get_seq_length(self.layer_idx) if past_key_values is not None else 0 - # cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos_cached, sin_cached) blocking_config = getattr(self, "attn_blocking_config", AttentionBlockingConfig()) @@ -502,7 +498,7 @@ def forward( residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states[0] + hidden_states = residual + hidden_states outputs = (hidden_states,) @@ -669,7 +665,7 @@ class QEffQwen3VLDecoderWrapper(nn.Module): def __init__(self, model): super().__init__() self.model = model - self.language_model = self.model.model + self.language_model = self.model.model.language_model def get_submodules_for_export(self) -> Type[nn.Module]: """ diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index 976c80919..240d04d99 100644 --- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -497,7 +497,9 @@ def forward( residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states[0] + if isinstance(hidden_states, tuple): + hidden_states, _ = hidden_states + hidden_states = residual + hidden_states outputs = (hidden_states,) diff --git a/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py b/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py index 962daaa52..b6e78604a 100644 --- a/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py +++ b/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py @@ -84,8 +84,6 @@ num_devices=4, height=354, width=536, - # height=1024, - # width=1024, mxfp6_matmul=True, mxint8_kv_cache=True, aic_enable_depth_first=True, @@ -95,9 +93,6 @@ ### IMAGE + TEXT ### image_url = "https://picsum.photos/id/237/536/354" - # image_url = ( - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png" - # ) image = Image.open(requests.get(image_url, stream=True).raw) @@ -111,16 +106,6 @@ }, ] - # messages_2 = [ - # { - # "role": "user", - # "content": [ - # {"type": "image", "image": image}, - # {"type": "text", "text": "Describe about the color of the dog."}, - # ], - # }, - # ] - messages = [messages_1] * batch_size texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages] diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json index 1162400e4..24829fe5f 100644 --- a/tests/configs/image_text_model_configs.json +++ b/tests/configs/image_text_model_configs.json @@ -170,7 +170,7 @@ "num_layers": 1, "img_url_list":[ "https://picsum.photos/id/237/536/354", - "https://picsum.photos/id/237/536/354" + "https://picsum.photos/id/238/536/354" ], "text_prompt_list": [ "Can you describe the image in detail?", @@ -235,7 +235,7 @@ "num_layers": 1, "img_url_list":[ "https://picsum.photos/id/237/536/354", - "https://picsum.photos/id/237/536/354" + "https://picsum.photos/id/238/536/354" ], "text_prompt_list": [ "Can you describe the image in detail?", @@ -292,7 +292,7 @@ "num_layers": 1, "img_url_list":[ "https://picsum.photos/id/237/536/354", - "https://picsum.photos/id/237/536/354" + "https://picsum.photos/id/238/536/354" ], "text_prompt_list": [ "Can you describe the image in detail.",