Merged
Changes from all commits
2 changes: 1 addition & 1 deletion examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
@@ -132,7 +132,7 @@ Tensor rotateHalf(Tensor x, nn::Module* m, const std::string& qdq_name_in_pytorc
}

using vi32 = std::vector<int32_t>;
-#define CONV2D_PROPERTY vi32{1, 1}, vi32{1, 1}, vi32{0, 0}, vi32{1, 1}, false, aops::Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G32
+#define CONV2D_PROPERTY vi32{1, 1}, vi32{1, 1}, vi32{0, 0}, vi32{1, 1}, false, aops::Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G16

// Using Conv2D to replace Linear.
// Conv2D Filter Weight is [1, 1, In, Out]
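For intuition (not part of the diff): with activations laid out [N, 1, Tokens, In] and the filter laid out [1, 1, In, Out] as the comment above says, a 1x1 Conv2D applies the same [In x Out] weight matrix at every position, which is exactly a Linear layer. A minimal generic sketch:

// Generic illustration: a 1x1 Conv2D with filter [1, 1, In, Out] applies the
// same [In x Out] weight matrix at every spatial position, i.e. a Linear layer.
void conv1x1_as_linear(const float* x, const float* w, float* y,
                       int positions, int in_c, int out_c) {
  for (int p = 0; p < positions; ++p) {
    for (int o = 0; o < out_c; ++o) {
      float acc = 0.0f;
      for (int i = 0; i < in_c; ++i) acc += x[p * in_c + i] * w[i * out_c + o];
      y[p * out_c + o] = acc;
    }
  }
}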
4 changes: 2 additions & 2 deletions examples/qwen3_qnn_aot/qnn_aot_cfg_1.7B.json
@@ -23,15 +23,15 @@
"method": "LPBQ",
"sym": true,
"precision": "w4a16",
"block_size": 32
"block_size": 16
}
},
"linear": {
"fallback": {
"method": "LPBQ",
"sym": true,
"precision": "w4a16",
"block_size": 32
"block_size": 16
}
},
"kv_cache": {
1 change: 1 addition & 0 deletions mllm/backends/cpu/ops/Conv2DOp.cpp
@@ -61,6 +61,7 @@ void CPUConv2DOp::load(const ParameterFile::ptr_t& ploader) {
weight_ = packed_weight;
break;
}
+case aops::Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G16:
case aops::Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G32:
case aops::Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G64: {
break;
3 changes: 2 additions & 1 deletion mllm/backends/qnn/aot/QnnWrappersAPI.cpp
@@ -172,6 +172,7 @@ void QnnAOTNodeTensor::setupComplexTensorQuantization(const ir::tensor::TensorVa
break;
}
case ir::linalg::QuantizationSpecType::kLPBQ: {
+MLLM_INFO("Solving LPBQ quantization for tensor: {}", v->tensor_.name());
// This LPBQ Type is for Conv2D Only !!! Linear has diff layout cmp with conv2d

auto cfg = std::static_pointer_cast<ir::linalg::QuantizationSpecLPBQ>(quant_spec);
@@ -182,7 +183,7 @@
MLLM_RT_ASSERT_EQ(num_scale_offsets, cfg->scale_level_1_fp.size(-1));
MLLM_RT_ASSERT_EQ(cfg->scale_level_0_int.dtype(), kUInt8);
for (int i = 0; i < num_scale_offsets; ++i) {
-scale_offsets[i].scale = cfg->scale_level_1_fp.at<float>({0, 0, 0, i});
+scale_offsets[i].scale = cfg->scale_level_1_fp.at<float>({i});
scale_offsets[i].offset = 0;
}
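For context (an assumption about the scheme, not code from the PR): LPBQ stores scales at two levels, a per-block unsigned-integer scale (scale_level_0_int, kUInt8 above) and a per-output-channel float scale (scale_level_1_fp), and the effective scale of a block is their product. A minimal sketch, assuming level-0 scales are laid out [channels][blocks_per_channel]:

#include <cstdint>

// Hedged sketch: effective LPBQ scale for one block of one output channel.
// The flat [channels][blocks_per_channel] layout is an assumption.
float lpbq_effective_scale(const uint8_t* level0, const float* level1,
                           int channel, int block, int blocks_per_channel) {
  return static_cast<float>(level0[channel * blocks_per_channel + block]) * level1[channel];
}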

2 changes: 1 addition & 1 deletion mllm/backends/qnn/aot/passes/LLM2QnnLoweringPass.cpp
@@ -83,7 +83,7 @@ uint8_t LLM2QnnLoweringPass::run(const ir::node_ptr_t& op) {
for (auto& region_op : model_op->getTopRegion()->ops()) {
if (auto sub_graph_op = std::dynamic_pointer_cast<ir::graph::SubGraphOp>(region_op)) {
auto symbol_attr = sub_graph_op->getSymbolAttr();
-if (symbol_attr) { subgraphs[symbol_attr->str()] = sub_graph_op; }
+if (symbol_attr && symbol_attr->str() != "init") { subgraphs[symbol_attr->str()] = sub_graph_op; }
}
}

3 changes: 2 additions & 1 deletion mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp
@@ -317,8 +317,9 @@ bool LLMQuantRecipeConv2DPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr
ir::linalg::QuantizationSpecLPBQ::ptr_t weight_quant_spec = nullptr;

if (precision == "w4a16") {
+// HWIO
weight_quant_spec =
-ir::linalg::QuantizationSpecLPBQ::create(-8, 7, block_size, 0, 4, kUInt4, kFloat32, Tensor::nil(), Tensor::nil());
+ir::linalg::QuantizationSpecLPBQ::create(-7, 7, block_size, 3, 4, kInt4, kFloat32, Tensor::nil(), Tensor::nil());

// output sym int16
auto out_quant_spec = ir::linalg::QuantizationSpecAsymPerTensor::create(0, 65536 - 1, kUInt16, kFloat32, kInt32,
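For reference, the updated weight spec above moves from the full signed-int4 range [-8, 7] (tagged kUInt4) to the sign-symmetric grid [-7, 7] (tagged kInt4), leaving the -8 code point unused so positive and negative weights share the same magnitude range. A generic sketch of symmetric int4 quantization over that grid (an illustration, not this pass's actual kernel):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Generic illustration: symmetric int4 quantization over [-7, 7].
int8_t quantize_sym_int4(float x, float scale) {
  int q = static_cast<int>(std::lround(x / scale));
  return static_cast<int8_t>(std::clamp(q, -7, 7));
}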
2 changes: 1 addition & 1 deletion mllm/backends/qnn/aot/passes/SplitLLMGraphPass.cpp
@@ -197,7 +197,7 @@ uint8_t SplitLLMGraphPass::run(const ir::node_ptr_t& op) {
if (name == "model." + std::to_string(i) + ".s" + std::to_string(__global_seq_len)) { matched = true; }
if (name == "model") { matched = true; }
}
-if (!matched) { wvw.removeOp(sub_g_op); }
+if (!matched && name != "init") { wvw.removeOp(sub_g_op); }
return ir::IRWriter::WalkResult::WALK_CONTINUE;
});
}
10 changes: 10 additions & 0 deletions mllm/backends/qnn/aot/visitor/Conv2D.cpp
@@ -94,6 +94,16 @@ bool QnnAOTConv2DPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op)
qnn_op_node->emplaceParamTensor(pad_amount_param);
}

+  // Add params: dilation
+  {
+    auto dilation_param = QNNParamTensorWrapper::create("dilation", base_op->getName() + ".dilation", QNN_DATATYPE_UINT_32,
+                                                        std::vector<uint32_t>{2});
+    uint32_t* data = static_cast<uint32_t*>(dilation_param->alloc());
+    data[0] = 1;
+    data[1] = 1;
+    qnn_op_node->emplaceParamTensor(dilation_param);
+  }
Comment on lines +97 to +105
⚠️ Potential issue | 🟠 Major

Stride, padding, and dilation parameters are hardcoded.

All three are hardcoded to default values instead of being read from real_linear_op->options(). The bias handling (line 58) shows the correct pattern: query real_linear_op->options() for the configured values. Update stride, padding, and dilation the same way so Conv2D behaves correctly when non-default convolution parameters are configured.

🤖 Prompt for AI Agents
In `@mllm/backends/qnn/aot/visitor/Conv2D.cpp` around lines 97-105: the
dilation/stride/padding blocks (currently creating dilation_param, stride, and
padding with hardcoded uint32_t {1}/default values) must read the actual
convolution settings from real_linear_op->options() instead of using fixed
values; update the code that creates QNNParamTensorWrapper instances (e.g.,
dilation_param, stride_param, padding_param) to query real_linear_op->options()
for dilation, strides, and pads, allocate the param tensors via alloc(), and
fill the uint32_t data array with those option values before calling
qnn_op_node->emplaceParamTensor(...) so Conv2D uses the configured parameters
(following the same pattern used for the bias handling around
real_linear_op->options()).
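A minimal sketch of the suggested fix for the dilation block, reusing the wrapper calls from the diff; the options accessor field name (opts.dilation) is an assumption, not the verified Conv2DOpOptions API:

// Hedged sketch: read dilation from the op's options instead of hardcoding it.
{
  const auto& opts = real_linear_op->options();  // same pattern as the bias handling
  auto dilation_param = QNNParamTensorWrapper::create("dilation", base_op->getName() + ".dilation",
                                                      QNN_DATATYPE_UINT_32, std::vector<uint32_t>{2});
  uint32_t* data = static_cast<uint32_t*>(dilation_param->alloc());
  data[0] = static_cast<uint32_t>(opts.dilation[0]);  // assumed field name
  data[1] = static_cast<uint32_t>(opts.dilation[1]);  // assumed field name
  qnn_op_node->emplaceParamTensor(dilation_param);
}

Stride and padding would follow the same shape, swapping in the corresponding option values.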


// Register this op node into one graph.
env->captureAOTNodeOp(qnn_context_name, qnn_graph_name, qnn_op_node);

5 changes: 2 additions & 3 deletions mllm/backends/qnn/aot_rt/PromptProcessor.cpp
@@ -65,8 +65,7 @@ void PromptProcessor<T>::init_io() {
output_tensors_.reserve(1 + 2 * config_.num_layers);

// 1. Logits
-// DBG:
-auto logits = Tensor::empty({1, 1, config_.ar_len, 2048}, kUInt16, kQNN).alloc();
+auto logits = Tensor::empty({1, 1, config_.ar_len, config_.vocab_size}, kUInt16, kQNN).alloc();
logits.setName("logits");
output_tensors_.push_back(logits);

@@ -132,7 +131,7 @@ int64_t PromptProcessor<T>::prefill(const std::vector<int64_t>& prompt_tokens, i

prepare_io(prompt_tokens, processed_tokens, current_pos);

-auto module_input = input_tensors_;
+std::vector<Tensor> module_input = input_tensors_;
output_tensors_ = (*module_)(module_input);

int32_t n_update = chunk_size;
3 changes: 2 additions & 1 deletion mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp
@@ -1,9 +1,10 @@
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#include "mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp"
#include <algorithm>
#include <cstring>

#include "mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp"
#include "mllm/core/DataTypes.hpp"
#include "mllm/core/DeviceTypes.hpp"
#include "mllm/preprocessor/tokenizers/Unicode.hpp"
6 changes: 4 additions & 2 deletions mllm/core/aops/Conv2DOp.cpp
@@ -78,7 +78,8 @@ void Conv2DOp::reshape(const std::vector<Tensor>& inputs, std::vector<Tensor>& o

// CHECK if in Qualcomm DSP shape. Inputs is [N, H, W, C], Filter Weight is [N, H, In, Out]
if (options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G32
-|| options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G64) {
+|| options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G64
+|| options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G16) {
in_channels = ishape[3];
in_height = ishape[1];
in_width = ishape[2];
@@ -112,7 +113,8 @@ void Conv2DOp::reshape(const std::vector<Tensor>& inputs, std::vector<Tensor>& o
auto new_shape = std::vector<int32_t>{batch, out_channels, h_out, w_out};

if (options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G32
-|| options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G64) {
+|| options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G64
+|| options_.impl_type == Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G16) {
new_shape = std::vector<int32_t>{batch, h_out, w_out, out_channels};
}
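For reference (not part of the diff), the h_out and w_out used above presumably follow the standard convolution output-size formula; a generic sketch:

// Generic illustration: standard convolution output dimension.
// out = floor((in + 2*pad - dilation*(kernel - 1) - 1) / stride) + 1
int conv_out_dim(int in, int kernel, int stride, int pad, int dilation) {
  return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}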

13 changes: 11 additions & 2 deletions mllm/core/aops/Conv2DOp.hpp
@@ -12,6 +12,7 @@ enum class Conv2DOpImplType {
kDefault = 0,

// LPBQ
+kQNN_LPBQ_w4a16o16_G16,
kQNN_LPBQ_w4a16o16_G32,
kQNN_LPBQ_w4a16o16_G64,
};
@@ -28,7 +29,11 @@ struct Conv2DOpOptions : public BaseOpOptions<Conv2DOpOptions> {
};

inline Conv2DOpImplType str2Conv2DOpImplType(const std::string& str) {
-static const std::unordered_map<std::string, Conv2DOpImplType> map = {{"Default", Conv2DOpImplType::kDefault}};
+static const std::unordered_map<std::string, Conv2DOpImplType> map = {
+    {"Default", Conv2DOpImplType::kDefault},
+    {"QNN_LPBQ_w4a16o16_G16", Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G16},
+    {"QNN_LPBQ_w4a16o16_G32", Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G32},
+    {"QNN_LPBQ_w4a16o16_G64", Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G64}};

auto it = map.find(str);
if (it != map.end()) { return it->second; }
@@ -38,7 +43,11 @@ inline Conv2DOpImplType str2Conv2DOpImplType(const std::string& str) {
}

inline std::string Conv2DOpImplType2Str(Conv2DOpImplType type) {
-static const std::unordered_map<Conv2DOpImplType, std::string> map = {{Conv2DOpImplType::kDefault, "Default"}};
+static const std::unordered_map<Conv2DOpImplType, std::string> map = {
+    {Conv2DOpImplType::kDefault, "Default"},
+    {Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G16, "QNN_LPBQ_w4a16o16_G16"},
+    {Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G32, "QNN_LPBQ_w4a16o16_G32"},
+    {Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G64, "QNN_LPBQ_w4a16o16_G64"}};

auto it = map.find(type);
if (it != map.end()) return it->second;
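Before this change both maps carried only the "Default" entry, so the LPBQ variants could not round-trip through their string names (for example when an impl type is serialized into a model config and parsed back). A small illustrative check; the mllm::aops namespace is an assumption:

#include <cassert>
#include "mllm/core/aops/Conv2DOp.hpp"

void roundtrip_check() {
  using namespace mllm::aops;  // assumed namespace for these helpers
  auto t = str2Conv2DOpImplType("QNN_LPBQ_w4a16o16_G16");
  assert(Conv2DOpImplType2Str(t) == "QNN_LPBQ_w4a16o16_G16");
}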
7 changes: 5 additions & 2 deletions pymllm/backends/qualcomm/transformers/core/embedding.py
@@ -124,9 +124,12 @@ def freeze_weight(self):
f"Class: {class_name}, Instance: {instance_class_name}, Weight Quantized: scale={self.weight_fake_quant.scale}, zp={self.weight_fake_quant.zero_point}"
)

-def disable_quant(self):
+def disable_fakequant(self):
"""Completely turn off quantization noise and return to floating point mode"""
-self.weight_fake_quant.disable_fakequant()
+self.weight_fake_quant.disable_fake_quant()

+def enable_fakequant(self):
+self.weight_fake_quant.enable_fake_quant()

def extra_repr(self):
s = f"{self.num_embeddings}, {self.embedding_dim}"
142 changes: 142 additions & 0 deletions pymllm/backends/qualcomm/transformers/core/observer.py
@@ -1,5 +1,147 @@
import torch
from typing import Tuple
from torchao.quantization.pt2e import UniformQuantizationObserverBase
from torchao.quantization.pt2e import FakeQuantize, MappingType, PerBlock
from torchao.quantization.pt2e._affine_quantization import (
_get_reduction_params,
AffineQuantizedMinMaxObserver,
choose_qparams_affine_with_min_max,
)
from torchao.quantization.quant_primitives import _fake_quantize_affine


# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
class PerBlockParamObserver(AffineQuantizedMinMaxObserver):
def __init__(
self,
dtype: torch.dtype,
block_size: torch.Size,
quant_min=None,
quant_max=None,
eps=torch.finfo(torch.float32).eps, # noqa: B008
**kwargs,
):
super().__init__(
mapping_type=MappingType.SYMMETRIC,
target_dtype=dtype,
granularity=PerBlock,
quant_min=quant_min,
quant_max=quant_max,
eps=eps,
**kwargs,
)
self.dtype = dtype
self.block_size = block_size
# TODO: expand this when QNN starts to support more configurations
self.bitwidth_of_scale = 4
self.num_steps = 2**self.bitwidth_of_scale
self.calibrated = False

def forward(self, input: torch.Tensor):
if input.numel() == 0 or self.calibrated:
return input

input_detached = input.detach()
self.original_dtype = input_detached.dtype
shape_for_reduction, reduction_dims = _get_reduction_params(
self.block_size, input_detached.size()
)
input_detached = input_detached.view(shape_for_reduction)
min_val = torch.amin(input_detached, dim=reduction_dims)
max_val = torch.amax(input_detached, dim=reduction_dims)
if not hasattr(self, "min_val") or not hasattr(self, "max_val"):
self.min_val = min_val
self.max_val = max_val
else:
assert self.min_val.shape == min_val.shape, (
f"Can't update existing min_val - shape mismatch, self.min_val:{self.min_val.shape} != min_val:{min_val.shape}"
)
assert self.max_val.shape == max_val.shape, (
f"Can't update existing max_val - shape mismatch, self.max_val {self.max_val.shape} != max_val:{max_val.shape}"
)
min_val = torch.min(self.min_val, min_val)
max_val = torch.max(self.max_val, max_val)
self.min_val.copy_(min_val)
self.max_val.copy_(max_val)

self.calibrated = True
return input

def calculate_qparams(self) -> Tuple[torch.Tensor, torch.Tensor]:
assert hasattr(self, "min_val") and hasattr(self, "max_val"), (
"Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
)
return choose_qparams_affine_with_min_max(
self.min_val,
self.max_val,
self.mapping_type,
[],
self.target_dtype,
self.quant_min,
self.quant_max,
self.eps,
self.scale_dtype,
self.zero_point_dtype,
self.preserve_zero,
self.zero_point_domain,
)


class PerBlockParamFakeQuantize(FakeQuantize):
def __init__(
self,
dtype: torch.dtype = torch.int8,
block_size: torch.Size = None,
quant_min: int = None,
quant_max: int = None,
eps: float = torch.finfo(torch.float32).eps, # noqa: B008
**kwargs,
):
super().__init__()
assert block_size is not None, (
"block_size must be provided for per-block quantization"
)

self.activation_post_process = PerBlockParamObserver(
dtype=dtype,
block_size=block_size,
quant_min=quant_min,
quant_max=quant_max,
eps=eps,
**kwargs,
)
self.dtype = dtype
self.block_size = block_size
self.quant_min = quant_min if quant_min is not None else torch.iinfo(dtype).min
self.quant_max = quant_max if quant_max is not None else torch.iinfo(dtype).max
self.eps = eps

def forward(self, x: torch.Tensor) -> torch.Tensor:
if x.numel() == 0:
return x

self.activation_post_process(x)
scale, zero_point = self.activation_post_process.calculate_qparams()

return _fake_quantize_affine(
x,
self.block_size,
scale,
zero_point,
quant_dtype=self.dtype,
quant_min=self.quant_min,
quant_max=self.quant_max,
)

def calculate_qparams(self) -> Tuple[torch.Tensor, torch.Tensor]:
return self.activation_post_process.calculate_qparams()

def convert(self, model, observer_node):
self.activation_post_process.convert(model, observer_node)


class ConcatObserver(UniformQuantizationObserverBase):