5 changes: 3 additions & 2 deletions QEfficient/__init__.py
@@ -29,8 +29,9 @@
QEFFCommonLoader,
)
from QEfficient.compile.compile_helper import compile
from QEfficient.diffusers.pipelines.flux.pipeline_flux import QEffFluxPipeline
from QEfficient.diffusers.pipelines.wan.pipeline_wan import QEffWanPipeline

# from QEfficient.diffusers.pipelines.flux.pipeline_flux import QEffFluxPipeline
# from QEfficient.diffusers.pipelines.wan.pipeline_wan import QEffWanPipeline
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.peft import QEffAutoPeftModelForCausalLM
12 changes: 10 additions & 2 deletions QEfficient/transformers/cache_utils.py
@@ -10,7 +10,16 @@
from typing import Any, Dict, List, Optional, Tuple

import torch
from transformers.cache_utils import DynamicCache, DynamicLayer, EncoderDecoderCache, HybridCache, HybridChunkedCache
from transformers.cache_utils import Cache, DynamicCache, DynamicLayer, EncoderDecoderCache

try:
from transformers.cache_utils import HybridCache, HybridChunkedCache
except ImportError:
# Newer transformers builds may not expose these cache helpers. Fall back to
# DynamicCache so QEfficient can still import in environments where the
# hybrid-cache-specific models are not exercised.
HybridCache = DynamicCache
HybridChunkedCache = DynamicCache

from QEfficient.customop import (
CtxGatherFunc,
@@ -309,7 +318,6 @@ class QEffDynamicCache(DynamicCache):
def __init__(self, ddp_cache_data: Optional[Iterable[tuple[torch.Tensor, torch.Tensor]]] = None, *args, **kwargs):
# Remove layer_classes if present to avoid duplicate argument
kwargs.pop("layer_classes", None)
from transformers.cache_utils import Cache # Import here to avoid circular import

Cache.__init__(self, layer_classes=QEffDynamicLayer, *args, **kwargs)
if ddp_cache_data is not None:
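Note on the cache_utils.py change: the try/except block aliases the hybrid cache classes to DynamicCache when the installed transformers build no longer exposes them. A minimal, illustrative sketch of the same pattern in isolation, useful for checking which cache classes a given environment provides (the `has_hybrid` flag is only for this example):

```python
# Sketch: probe which cache classes the installed transformers exposes,
# mirroring the fallback used in cache_utils.py. Purely illustrative.
import transformers
from transformers.cache_utils import DynamicCache

try:
    from transformers.cache_utils import HybridCache, HybridChunkedCache
    has_hybrid = True
except ImportError:
    # Alias so downstream imports keep working; paths that genuinely need
    # hybrid-cache semantics should be treated as unsupported here.
    HybridCache = HybridChunkedCache = DynamicCache
    has_hybrid = False

print(f"transformers {transformers.__version__}: hybrid caches available = {has_hybrid}")
```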
14 changes: 9 additions & 5 deletions QEfficient/transformers/models/modeling_auto.py
@@ -2310,8 +2310,11 @@ def cloud_ai_100_generate(
if "pixel_values_RetainedState" in qpc_session.output_names:
inputs["pixel_values"] = inputs["pixel_values"].astype("float16")

inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1)
inputs["image_idx"] = np.array([[0]])
inputs["position_ids"] = np.repeat(
np.where(inputs["attention_mask"], np.arange(padded_len), -1)[np.newaxis, ...], 4, axis=0
)

# inputs["image_idx"] = np.array([[0]])

if self.comp_ctx_lengths_prefill is not None:
list_of_comp_ctx_lengths_prefill = [
@@ -2334,18 +2337,19 @@
chunk_inputs["comp_ctx_lengths"] = list_of_comp_ctx_lengths_prefill[prefill_ccl_id]

chunk_inputs["input_ids"] = inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len]
chunk_inputs["position_ids"] = inputs["position_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len]
chunk_inputs["position_ids"] = inputs["position_ids"][..., i * prefill_seq_len : (i + 1) * prefill_seq_len]
outputs = qpc_session.run(chunk_inputs)

if self._write_io_dir is not None:
write_io_files(chunk_inputs, outputs, self._write_io_dir, "prefill", "aic_batch_io", True, False)

chunk_inputs["image_idx"] = outputs["image_idx_output"]
# chunk_inputs["image_idx"] = outputs["image_idx_output"]

prefill_time = perf_counter() - prefill_start
# Get first token
inputs["input_ids"] = outputs["logits"].argmax(2)
inputs["position_ids"] = input_len.numpy()
# inputs["position_ids"] = input_len.numpy()
inputs["position_ids"] = np.max(inputs["position_ids"], axis=-1, keepdims=True) + 1

if "cross_attention_mask" in inputs:
bs, _, num_images, img_tiles = inputs["cross_attention_mask"].shape
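Note on the modeling_auto.py change: prefill position IDs are now built as one 2-D mask-derived array repeated along a new leading axis (the factor 4 is hard-coded in the diff; its interpretation, e.g. multi-axis rope indices, is an assumption here), chunked prefill slices along the last axis, and the decode step derives the next position from the running maximum. A small numpy sketch with made-up shapes, assuming `attention_mask` is 1 for valid tokens:

```python
import numpy as np

# Toy shapes: batch of 1, padded prompt length of 8, 5 real tokens.
padded_len = 8
attention_mask = np.array([[1, 1, 1, 1, 1, 0, 0, 0]])

# Prefill: valid tokens get their index, padding gets -1, then the array is
# repeated along a new leading axis (4 copies, as in the change above).
position_ids = np.repeat(
    np.where(attention_mask, np.arange(padded_len), -1)[np.newaxis, ...], 4, axis=0
)
print(position_ids.shape)  # (4, 1, 8)

# Chunked prefill slices along the last (sequence) axis, hence the `...` index.
chunk = position_ids[..., 0:4]
print(chunk.shape)  # (4, 1, 4)

# Decode: the next position is one past the largest valid position seen so far.
next_position_ids = np.max(position_ids, axis=-1, keepdims=True) + 1
print(next_position_ids.shape, next_position_ids[0, 0, 0])  # (4, 1, 1) 5
```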
33 changes: 32 additions & 1 deletion QEfficient/transformers/models/pytorch_transforms.py
@@ -176,7 +176,7 @@
Qwen2_5_VLVisionAttention,
)
from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
Qwen2RMSNorm as Qwen2_5RMSNorm,
Qwen2_5_VLRMSNorm as Qwen2_5RMSNorm,
)
from transformers.models.qwen3.modeling_qwen3 import (
Qwen3Attention,
@@ -185,6 +185,17 @@
Qwen3Model,
Qwen3RMSNorm,
)
from transformers.models.qwen3_5.modeling_qwen3_5 import (
Qwen3_5Attention,
Qwen3_5DecoderLayer,
Qwen3_5DynamicCache,
Qwen3_5ForConditionalGeneration,
Qwen3_5GatedDeltaNet,
Qwen3_5Model,
Qwen3_5RMSNorm,
Qwen3_5RMSNormGated,
Qwen3_5TextModel,
)
from transformers.models.qwen3_moe.modeling_qwen3_moe import (
Qwen3MoeAttention,
Qwen3MoeDecoderLayer,
@@ -424,6 +435,16 @@
QEffQwen3ForCausalLM,
QEffQwen3Model,
)
from QEfficient.transformers.models.qwen3_5.modeling_qwen3_5 import (
QEffQwen3_5Attention,
QEffQwen3_5DecoderLayer,
QEffQwen3_5DynamicCache,
QEffQwen3_5ForConditionalGeneration,
QEffQwen3_5GatedDeltaNet,
QEffQwen3_5GatedDeltaNetCustomRMSNormAIC,
QEffQwen3_5Model,
QEffQwen3_5TextModel,
)
from QEfficient.transformers.models.qwen3_moe.modeling_qwen3_moe import (
QEffPrefillChunkedQwen3MoeSparseMoeBlock,
QEffQwen3MoeAttention,
@@ -480,6 +501,8 @@ class CustomOpsTransform(ModuleMappingTransform):
Qwen3MoeRMSNorm: CustomRMSNormAIC,
Gemma3RMSNorm: QEffGemma3CustomRMSNormAIC,
Olmo2RMSNorm: CustomRMSNormAIC,
Qwen3_5RMSNorm: GemmaCustomRMSNormAIC,
Qwen3_5RMSNormGated: QEffQwen3_5GatedDeltaNetCustomRMSNormAIC,
}


@@ -621,6 +644,14 @@ class KVCacheTransform(ModuleMappingTransform):
Qwen3DecoderLayer: QEffQwen3DecoderLayer,
Qwen3Model: QEffQwen3Model,
Qwen3ForCausalLM: QEffQwen3ForCausalLM,
# Qwen3_5
Qwen3_5DynamicCache: QEffQwen3_5DynamicCache,
Qwen3_5GatedDeltaNet: QEffQwen3_5GatedDeltaNet,
Qwen3_5DecoderLayer: QEffQwen3_5DecoderLayer,
Qwen3_5TextModel: QEffQwen3_5TextModel,
Qwen3_5Model: QEffQwen3_5Model,
Qwen3_5ForConditionalGeneration: QEffQwen3_5ForConditionalGeneration,
Qwen3_5Attention: QEffQwen3_5Attention,
# Qwen2.5 VL
Qwen2_5_VLForConditionalGeneration: QEffQwen_2_5_vl_ForConditionalGeneration,
Qwen2_5_VLModel: QEffQwen2_5_VLModel,
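Note on the pytorch_transforms.py change: the new Qwen3_5 entries extend the class-to-class mapping tables consumed by CustomOpsTransform and KVCacheTransform. A minimal, self-contained sketch of how a mapping-based transform can rewrite a model in place; this illustrates the idea only and is not QEfficient's actual ModuleMappingTransform implementation:

```python
import torch.nn as nn

class OriginalNorm(nn.LayerNorm):
    pass

class PatchedNorm(nn.LayerNorm):
    """Stand-in for a QEff/Custom* replacement class with matching attributes."""
    def forward(self, x):
        # Replacement forward; real transforms swap in AIC-friendly kernels.
        return super().forward(x)

# Mapping table in the spirit of CustomOpsTransform / KVCacheTransform.
_module_mapping = {OriginalNorm: PatchedNorm}

def apply_mapping(model: nn.Module) -> nn.Module:
    # Swap the class of every matching submodule in place; parameters and
    # buffers are untouched, only the bound methods change.
    for module in model.modules():
        if type(module) in _module_mapping:
            module.__class__ = _module_mapping[type(module)]
    return model

model = nn.Sequential(nn.Linear(4, 4), OriginalNorm(4))
apply_mapping(model)
print(type(model[1]).__name__)  # PatchedNorm
```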
6 changes: 6 additions & 0 deletions QEfficient/transformers/models/qwen3_5/__init__.py
@@ -0,0 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------