Merged
Changes from all commits
2 changes: 1 addition & 1 deletion examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
@@ -132,7 +132,7 @@ Tensor rotateHalf(Tensor x, nn::Module* m, const std::string& qdq_name_in_pytorc
}

using vi32 = std::vector<int32_t>;
-#define CONV2D_PROPERTY vi32{1, 1}, vi32{1, 1}, vi32{0, 0}, vi32{1, 1}, false, aops::Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G32
+#define CONV2D_PROPERTY vi32{1, 1}, vi32{1, 1}, vi32{0, 0}, vi32{1, 1}, false, aops::Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G16

// Using Conv2D to replace Linear.
// Conv2D Filter Weight is [1, 1, In, Out]
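For intuition (not part of the diff): with activations laid out [N, 1, Tokens, In] and the filter laid out [1, 1, In, Out] as the comment above says, a 1x1 Conv2D applies the same [In x Out] weight matrix at every position, which is exactly a Linear layer. A minimal generic sketch:

// Generic illustration: a 1x1 Conv2D with filter [1, 1, In, Out] applies the
// same [In x Out] weight matrix at every spatial position, i.e. a Linear layer.
void conv1x1_as_linear(const float* x, const float* w, float* y,
                       int positions, int in_c, int out_c) {
  for (int p = 0; p < positions; ++p) {
    for (int o = 0; o < out_c; ++o) {
      float acc = 0.0f;
      for (int i = 0; i < in_c; ++i) acc += x[p * in_c + i] * w[i * out_c + o];
      y[p * out_c + o] = acc;
    }
  }
}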
4 changes: 2 additions & 2 deletions examples/qwen3_qnn_aot/qnn_aot_cfg_1.7B.json
@@ -23,15 +23,15 @@
"method": "LPBQ",
"sym": true,
"precision": "w4a16",
"block_size": 32
"block_size": 16
}
},
"linear": {
"fallback": {
"method": "LPBQ",
"sym": true,
"precision": "w4a16",
"block_size": 32
"block_size": 16
}
},
"kv_cache": {
1 change: 1 addition & 0 deletions mllm/backends/cpu/ops/Conv2DOp.cpp
@@ -61,6 +61,7 @@ void CPUConv2DOp::load(const ParameterFile::ptr_t& ploader) {
weight_ = packed_weight;
break;
}
+case aops::Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G16:
case aops::Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G32:
case aops::Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G64: {
break;
3 changes: 2 additions & 1 deletion mllm/backends/qnn/aot/QnnWrappersAPI.cpp
@@ -172,6 +172,7 @@ void QnnAOTNodeTensor::setupComplexTensorQuantization(const ir::tensor::TensorVa
break;
}
case ir::linalg::QuantizationSpecType::kLPBQ: {
+MLLM_INFO("Solving LPBQ quantization for tensor: {}", v->tensor_.name());
// This LPBQ Type is for Conv2D Only !!! Linear has diff layout cmp with conv2d

auto cfg = std::static_pointer_cast<ir::linalg::QuantizationSpecLPBQ>(quant_spec);
@@ -182,7 +183,7 @@
MLLM_RT_ASSERT_EQ(num_scale_offsets, cfg->scale_level_1_fp.size(-1));
MLLM_RT_ASSERT_EQ(cfg->scale_level_0_int.dtype(), kUInt8);
for (int i = 0; i < num_scale_offsets; ++i) {
-scale_offsets[i].scale = cfg->scale_level_1_fp.at<float>({0, 0, 0, i});
+scale_offsets[i].scale = cfg->scale_level_1_fp.at<float>({i});
scale_offsets[i].offset = 0;
}
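For context (an assumption about the scheme, not code from the PR): LPBQ stores scales at two levels, a per-block unsigned-integer scale (scale_level_0_int, kUInt8 above) and a per-output-channel float scale (scale_level_1_fp), and the effective scale of a block is their product. A minimal sketch, assuming level-0 scales are laid out [channels][blocks_per_channel]:

#include <cstdint>

// Hedged sketch: effective LPBQ scale for one block of one output channel.
// The flat [channels][blocks_per_channel] layout is an assumption.
float lpbq_effective_scale(const uint8_t* level0, const float* level1,
                           int channel, int block, int blocks_per_channel) {
  return static_cast<float>(level0[channel * blocks_per_channel + block]) * level1[channel];
}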

2 changes: 1 addition & 1 deletion mllm/backends/qnn/aot/passes/LLM2QnnLoweringPass.cpp
@@ -83,7 +83,7 @@ uint8_t LLM2QnnLoweringPass::run(const ir::node_ptr_t& op) {
for (auto& region_op : model_op->getTopRegion()->ops()) {
if (auto sub_graph_op = std::dynamic_pointer_cast<ir::graph::SubGraphOp>(region_op)) {
auto symbol_attr = sub_graph_op->getSymbolAttr();
-if (symbol_attr) { subgraphs[symbol_attr->str()] = sub_graph_op; }
+if (symbol_attr && symbol_attr->str() != "init") { subgraphs[symbol_attr->str()] = sub_graph_op; }
}
}

3 changes: 2 additions & 1 deletion mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp
@@ -317,8 +317,9 @@ bool LLMQuantRecipeConv2DPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr
ir::linalg::QuantizationSpecLPBQ::ptr_t weight_quant_spec = nullptr;

if (precision == "w4a16") {
+// HWIO
weight_quant_spec =
-ir::linalg::QuantizationSpecLPBQ::create(-8, 7, block_size, 0, 4, kUInt4, kFloat32, Tensor::nil(), Tensor::nil());
+ir::linalg::QuantizationSpecLPBQ::create(-7, 7, block_size, 3, 4, kInt4, kFloat32, Tensor::nil(), Tensor::nil());

// output sym int16
auto out_quant_spec = ir::linalg::QuantizationSpecAsymPerTensor::create(0, 65536 - 1, kUInt16, kFloat32, kInt32,
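For reference, the updated weight spec above moves from the full signed-int4 range [-8, 7] (tagged kUInt4) to the sign-symmetric grid [-7, 7] (tagged kInt4), leaving the -8 code point unused so positive and negative weights share the same magnitude range. A generic sketch of symmetric int4 quantization over that grid (an illustration, not this pass's actual kernel):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Generic illustration: symmetric int4 quantization over [-7, 7].
int8_t quantize_sym_int4(float x, float scale) {
  int q = static_cast<int>(std::lround(x / scale));
  return static_cast<int8_t>(std::clamp(q, -7, 7));
}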
2 changes: 1 addition & 1 deletion mllm/backends/qnn/aot/passes/SplitLLMGraphPass.cpp
@@ -197,7 +197,7 @@ uint8_t SplitLLMGraphPass::run(const ir::node_ptr_t& op) {
if (name == "model." + std::to_string(i) + ".s" + std::to_string(__global_seq_len)) { matched = true; }
if (name == "model") { matched = true; }
}
-if (!matched) { wvw.removeOp(sub_g_op); }
+if (!matched && name != "init") { wvw.removeOp(sub_g_op); }
return ir::IRWriter::WalkResult::WALK_CONTINUE;
});
}
10 changes: 10 additions & 0 deletions mllm/backends/qnn/aot/visitor/Conv2D.cpp
@@ -94,6 +94,16 @@ bool QnnAOTConv2DPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op)
qnn_op_node->emplaceParamTensor(pad_amount_param);
}

+  // Add params: dilation
+  {
+    auto dilation_param = QNNParamTensorWrapper::create("dilation", base_op->getName() + ".dilation", QNN_DATATYPE_UINT_32,
+                                                        std::vector<uint32_t>{2});
+    uint32_t* data = static_cast<uint32_t*>(dilation_param->alloc());
+    data[0] = 1;
+    data[1] = 1;
+    qnn_op_node->emplaceParamTensor(dilation_param);
+  }
Comment on lines +97 to +105
⚠️ Potential issue | 🟠 Major

Stride, padding, and dilation parameters are hardcoded.

All three are hardcoded to default values instead of being read from real_linear_op->options(). The bias handling (line 58) shows the correct pattern: query real_linear_op->options() for the configured values. Update stride, padding, and dilation the same way so Conv2D behaves correctly when non-default convolution parameters are configured.

🤖 Prompt for AI Agents
In `@mllm/backends/qnn/aot/visitor/Conv2D.cpp` around lines 97-105: the
dilation/stride/padding blocks (currently creating dilation_param, stride, and
padding with hardcoded uint32_t {1}/default values) must read the actual
convolution settings from real_linear_op->options() instead of using fixed
values; update the code that creates QNNParamTensorWrapper instances (e.g.,
dilation_param, stride_param, padding_param) to query real_linear_op->options()
for dilation, strides, and pads, allocate the param tensors via alloc(), and
fill the uint32_t data array with those option values before calling
qnn_op_node->emplaceParamTensor(...) so Conv2D uses the configured parameters
(following the same pattern used for the bias handling around
real_linear_op->options()).
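A minimal sketch of the suggested fix for the dilation block, reusing the wrapper calls from the diff; the options accessor field name (opts.dilation) is an assumption, not the verified Conv2DOpOptions API:

// Hedged sketch: read dilation from the op's options instead of hardcoding it.
{
  const auto& opts = real_linear_op->options();  // same pattern as the bias handling
  auto dilation_param = QNNParamTensorWrapper::create("dilation", base_op->getName() + ".dilation",
                                                      QNN_DATATYPE_UINT_32, std::vector<uint32_t>{2});
  uint32_t* data = static_cast<uint32_t*>(dilation_param->alloc());
  data[0] = static_cast<uint32_t>(opts.dilation[0]);  // assumed field name
  data[1] = static_cast<uint32_t>(opts.dilation[1]);  // assumed field name
  qnn_op_node->emplaceParamTensor(dilation_param);
}

Stride and padding would follow the same shape, swapping in the corresponding option values.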


// Register this op node into one graph.
env->captureAOTNodeOp(qnn_context_name, qnn_graph_name, qnn_op_node);

5 changes: 2 additions & 3 deletions mllm/backends/qnn/aot_rt/PromptProcessor.cpp
@@ -65,8 +65,7 @@ void PromptProcessor<T>::init_io() {
output_tensors_.reserve(1 + 2 * config_.num_layers);

// 1. Logits
-// DBG:
-auto logits = Tensor::empty({1, 1, config_.ar_len, 2048}, kUInt16, kQNN).alloc();
+auto logits = Tensor::empty({1, 1, config_.ar_len, config_.vocab_size}, kUInt16, kQNN).alloc();
logits.setName("logits");
output_tensors_.push_back(logits);

@@ -132,7 +131,7 @@ int64_t PromptProcessor<T>::prefill(const std::vector<int64_t>& prompt_tokens, i

prepare_io(prompt_tokens, processed_tokens, current_pos);

-auto module_input = input_tensors_;
+std::vector<Tensor> module_input = input_tensors_;
output_tensors_ = (*module_)(module_input);

int32_t n_update = chunk_size;
3 changes: 2 additions & 1 deletion mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp
@@ -1,9 +1,10 @@
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#include "mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp"
#include <algorithm>
#include <cstring>

#include "mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp"
#include "mllm/core/DataTypes.hpp"
#include "mllm/core/DeviceTypes.hpp"
#include "mllm/preprocessor/tokenizers/Unicode.hpp"
6 changes: 4 additions & 2 deletions mllm/core/aops/Conv2DOp.cpp
@@ -78,7 +78,8 @@ void Conv2DOp::reshape(const std::vector<Tensor>& inputs, std::vector<Tensor>& o

// CHECK if in Qualcomm DSP shape. Inputs is [N, H, W, C], Filter Weight is [N, H, In, Out]
if (options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G32
-|| options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G64) {
+|| options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G64
+|| options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G16) {
in_channels = ishape[3];
in_height = ishape[1];
in_width = ishape[2];
@@ -112,7 +113,8 @@ void Conv2DOp::reshape(const std::vector<Tensor>& inputs, std::vector<Tensor>& o
auto new_shape = std::vector<int32_t>{batch, out_channels, h_out, w_out};

if (options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G32
-|| options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G64) {
+|| options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G64
+|| options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G16) {
new_shape = std::vector<int32_t>{batch, h_out, w_out, out_channels};
}
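For reference (not part of the diff), the h_out and w_out used above presumably follow the standard convolution output-size formula; a generic sketch:

// Generic illustration: standard convolution output dimension.
// out = floor((in + 2*pad - dilation*(kernel - 1) - 1) / stride) + 1
int conv_out_dim(int in, int kernel, int stride, int pad, int dilation) {
  return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}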

13 changes: 11 additions & 2 deletions mllm/core/aops/Conv2DOp.hpp
@@ -12,6 +12,7 @@ enum class Conv2DOpImplType {
kDefault = 0,

// LPBQ
+kQNN_LPBQ_w4a16o16_G16,
kQNN_LPBQ_w4a16o16_G32,
kQNN_LPBQ_w4a16o16_G64,
};
@@ -28,7 +29,11 @@ struct Conv2DOpOptions : public BaseOpOptions<Conv2DOpOptions> {
};

inline Conv2DOpImplType str2Conv2DOpImplType(const std::string& str) {
-static const std::unordered_map<std::string, Conv2DOpImplType> map = {{"Default", Conv2DOpImplType::kDefault}};
+static const std::unordered_map<std::string, Conv2DOpImplType> map = {
+    {"Default", Conv2DOpImplType::kDefault},
+    {"QNN_LPBQ_w4a16o16_G16", Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G16},
+    {"QNN_LPBQ_w4a16o16_G32", Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G32},
+    {"QNN_LPBQ_w4a16o16_G64", Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G64}};

auto it = map.find(str);
if (it != map.end()) { return it->second; }
@@ -38,7 +43,11 @@ inline Conv2DOpImplType str2Conv2DOpImplType(const std::string& str) {
}

inline std::string Conv2DOpImplType2Str(Conv2DOpImplType type) {
-static const std::unordered_map<Conv2DOpImplType, std::string> map = {{Conv2DOpImplType::kDefault, "Default"}};
+static const std::unordered_map<Conv2DOpImplType, std::string> map = {
+    {Conv2DOpImplType::kDefault, "Default"},
+    {Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G16, "QNN_LPBQ_w4a16o16_G16"},
+    {Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G32, "QNN_LPBQ_w4a16o16_G32"},
+    {Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G64, "QNN_LPBQ_w4a16o16_G64"}};

auto it = map.find(type);
if (it != map.end()) return it->second;
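Before this change both maps carried only the "Default" entry, so the LPBQ variants could not round-trip through their string names (for example when an impl type is serialized into a model config and parsed back). A small illustrative check; the mllm::aops namespace is an assumption:

#include <cassert>
#include "mllm/core/aops/Conv2DOp.hpp"

void roundtrip_check() {
  using namespace mllm::aops;  // assumed namespace for these helpers
  auto t = str2Conv2DOpImplType("QNN_LPBQ_w4a16o16_G16");
  assert(Conv2DOpImplType2Str(t) == "QNN_LPBQ_w4a16o16_G16");
}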
7 changes: 5 additions & 2 deletions pymllm/backends/qualcomm/transformers/core/embedding.py
@@ -124,9 +124,12 @@ def freeze_weight(self):
f"Class: {class_name}, Instance: {instance_class_name}, Weight Quantized: scale={self.weight_fake_quant.scale}, zp={self.weight_fake_quant.zero_point}"
)

-def disable_quant(self):
+def disable_fakequant(self):
"""Completely turn off quantization noise and return to floating point mode"""
-self.weight_fake_quant.disable_fakequant()
+self.weight_fake_quant.disable_fake_quant()

+def enable_fakequant(self):
+self.weight_fake_quant.enable_fake_quant()

def extra_repr(self):
s = f"{self.num_embeddings}, {self.embedding_dim}"
142 changes: 142 additions & 0 deletions pymllm/backends/qualcomm/transformers/core/observer.py
@@ -1,5 +1,147 @@
import torch
from typing import Tuple
from torchao.quantization.pt2e import UniformQuantizationObserverBase
from torchao.quantization.pt2e import FakeQuantize, MappingType, PerBlock
from torchao.quantization.pt2e._affine_quantization import (
_get_reduction_params,
AffineQuantizedMinMaxObserver,
choose_qparams_affine_with_min_max,
)
from torchao.quantization.quant_primitives import _fake_quantize_affine


# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
class PerBlockParamObserver(AffineQuantizedMinMaxObserver):
def __init__(
self,
dtype: torch.dtype,
block_size: torch.Size,
quant_min=None,
quant_max=None,
eps=torch.finfo(torch.float32).eps, # noqa: B008
**kwargs,
):
super().__init__(
mapping_type=MappingType.SYMMETRIC,
target_dtype=dtype,
granularity=PerBlock,
quant_min=quant_min,
quant_max=quant_max,
eps=eps,
**kwargs,
)
self.dtype = dtype
self.block_size = block_size
# TODO: expand this when QNN starts to support more configurations
self.bitwidth_of_scale = 4
self.num_steps = 2**self.bitwidth_of_scale
self.calibrated = False

def forward(self, input: torch.Tensor):
if input.numel() == 0 or self.calibrated:
return input

input_detached = input.detach()
self.original_dtype = input_detached.dtype
shape_for_reduction, reduction_dims = _get_reduction_params(
self.block_size, input_detached.size()
)
input_detached = input_detached.view(shape_for_reduction)
min_val = torch.amin(input_detached, dim=reduction_dims)
max_val = torch.amax(input_detached, dim=reduction_dims)
if not hasattr(self, "min_val") or not hasattr(self, "max_val"):
self.min_val = min_val
self.max_val = max_val
else:
assert self.min_val.shape == min_val.shape, (
f"Can't update existing min_val - shape mismatch, self.min_val:{self.min_val.shape} != min_val:{min_val.shape}"
)
assert self.max_val.shape == max_val.shape, (
f"Can't update existing max_val - shape mismatch, self.max_val {self.max_val.shape} != max_val:{max_val.shape}"
)
min_val = torch.min(self.min_val, min_val)
max_val = torch.max(self.max_val, max_val)
self.min_val.copy_(min_val)
self.max_val.copy_(max_val)

self.calibrated = True
return input

def calculate_qparams(self) -> Tuple[torch.Tensor, torch.Tensor]:
assert hasattr(self, "min_val") and hasattr(self, "max_val"), (
"Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
)
return choose_qparams_affine_with_min_max(
self.min_val,
self.max_val,
self.mapping_type,
[],
self.target_dtype,
self.quant_min,
self.quant_max,
self.eps,
self.scale_dtype,
self.zero_point_dtype,
self.preserve_zero,
self.zero_point_domain,
)


class PerBlockParamFakeQuantize(FakeQuantize):
def __init__(
self,
dtype: torch.dtype = torch.int8,
block_size: torch.Size = None,
quant_min: int = None,
quant_max: int = None,
eps: float = torch.finfo(torch.float32).eps, # noqa: B008
**kwargs,
):
super().__init__()
assert block_size is not None, (
"block_size must be provided for per-block quantization"
)

self.activation_post_process = PerBlockParamObserver(
dtype=dtype,
block_size=block_size,
quant_min=quant_min,
quant_max=quant_max,
eps=eps,
**kwargs,
)
self.dtype = dtype
self.block_size = block_size
self.quant_min = quant_min if quant_min is not None else torch.iinfo(dtype).min
self.quant_max = quant_max if quant_max is not None else torch.iinfo(dtype).max
self.eps = eps

def forward(self, x: torch.Tensor) -> torch.Tensor:
if x.numel() == 0:
return x

self.activation_post_process(x)
scale, zero_point = self.activation_post_process.calculate_qparams()

return _fake_quantize_affine(
x,
self.block_size,
scale,
zero_point,
quant_dtype=self.dtype,
quant_min=self.quant_min,
quant_max=self.quant_max,
)

def calculate_qparams(self) -> Tuple[torch.Tensor, torch.Tensor]:
return self.activation_post_process.calculate_qparams()

def convert(self, model, observer_node):
self.activation_post_process.convert(model, observer_node)


class ConcatObserver(UniformQuantizationObserverBase):