feat(qualcomm): Qnn AOT Lowering Passes #584
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,69 @@ | ||
| // Copyright (c) MLLM Team. | ||
| // Licensed under the MIT License. | ||
|
|
||
| #include "mllm/backends/cpu/ops/GatherOp.hpp" | ||
| #include "mllm/core/Tensor.hpp" | ||
|
|
||
| namespace mllm::cpu { | ||
|
|
||
| CPUGatherOp::CPUGatherOp(const aops::GatherOpOptions& options) : aops::GatherOp(options) {} | ||
|
|
||
| void CPUGatherOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) { | ||
| auto& table = inputs[0]; | ||
| auto& indices = inputs[1]; | ||
| auto& output = outputs[0]; | ||
|
|
||
| int dim = options_.dim; | ||
| if (dim < 0) dim += table.shape().size(); | ||
|
|
||
| int64_t outer_size = 1; | ||
| for (int i = 0; i < dim; ++i) outer_size *= table.shape()[i]; | ||
|
|
||
| int64_t inner_size = 1; | ||
| for (int i = dim + 1; i < table.shape().size(); ++i) inner_size *= table.shape()[i]; | ||
|
|
||
| int64_t dim_size = table.shape()[dim]; | ||
| int64_t indices_count = indices.numel(); | ||
|
|
||
| size_t data_type_size = 4; | ||
| switch (table.dtype()) { | ||
| case MLLM_TYPE_F32: data_type_size = sizeof(float); break; | ||
| case MLLM_TYPE_F16: data_type_size = sizeof(mllm_fp16_t); break; | ||
| case MLLM_TYPE_I32: data_type_size = sizeof(int32_t); break; | ||
| default: MLLM_ERROR("GatherOp table type not supported: {}", (int)table.dtype()); | ||
| } | ||
|
|
||
| const uint8_t* table_ptr = table.ptr<uint8_t>(); | ||
| uint8_t* output_ptr = output.ptr<uint8_t>(); | ||
|
|
||
| const int32_t* indices_i32 = indices.dtype() == MLLM_TYPE_I32 ? indices.ptr<int32_t>() : nullptr; | ||
| const float* indices_f32 = !indices_i32 && indices.dtype() == MLLM_TYPE_F32 ? indices.ptr<float>() : nullptr; | ||
|
|
||
| if (!indices_i32 && !indices_f32) { | ||
| MLLM_ERROR("GatherOp indices type not supported: {}", (int)indices.dtype()); | ||
| return; | ||
| } | ||
|
|
||
| // FIXME: parallel | ||
| for (int64_t o = 0; o < outer_size; ++o) { | ||
| for (int64_t i = 0; i < indices_count; ++i) { | ||
| int64_t idx = 0; | ||
| if (indices_i32) { | ||
| idx = indices_i32[i]; | ||
| } else if (indices_f32) { | ||
| idx = (int64_t)indices_f32[i]; | ||
| } | ||
|
|
||
| if (idx < 0) idx += dim_size; | ||
|
|
||
| if (idx < 0 || idx >= dim_size) { continue; } | ||
|
|
||
| int64_t src_offset = (o * dim_size + idx) * inner_size * data_type_size; | ||
| int64_t dst_offset = (o * indices_count + i) * inner_size * data_type_size; | ||
|
|
||
| std::memcpy(output_ptr + dst_offset, table_ptr + src_offset, inner_size * data_type_size); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| } // namespace mllm::cpu | ||
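For readers of the diff, the index arithmetic above is easier to follow on flat buffers: the table is treated as [outer_size, dim_size, inner_size] and the output as [outer_size, indices_count, inner_size], so the output shape is the table shape with dimension dim replaced by the number of indices. A minimal sketch of the same loop on raw float arrays (illustration only, no bounds handling):
#include <cstdint>
// What CPUGatherOp::forward computes, written out on plain float buffers.
void gather_f32(const float* table, const int32_t* indices, float* out,
                int64_t outer_size, int64_t dim_size, int64_t inner_size, int64_t indices_count) {
  for (int64_t o = 0; o < outer_size; ++o) {
    for (int64_t i = 0; i < indices_count; ++i) {
      int64_t idx = indices[i];
      if (idx < 0) idx += dim_size;  // same negative-index adjustment as the op
      const float* src = table + (o * dim_size + idx) * inner_size;
      float* dst = out + (o * indices_count + i) * inner_size;
      for (int64_t k = 0; k < inner_size; ++k) dst[k] = src[k];
    }
  }
}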
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| // Copyright (c) MLLM Team. | ||
| // Licensed under the MIT License. | ||
|
|
||
| #pragma once | ||
|
|
||
| #include "mllm/core/BaseOp.hpp" | ||
| #include "mllm/core/aops/GatherOp.hpp" | ||
|
|
||
| namespace mllm::cpu { | ||
|
|
||
| class CPUGatherOp final : public aops::GatherOp { | ||
| public: | ||
| explicit CPUGatherOp(const aops::GatherOpOptions& options); | ||
|
|
||
| void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override; | ||
| }; | ||
|
|
||
| class CPUGatherOpFactory : public TypedOpFactory<OpTypes::kGather, aops::GatherOpOptions> { | ||
| public: | ||
| std::shared_ptr<BaseOp> createOpImpl(const aops::GatherOpOptions& options) override { | ||
| return std::make_shared<CPUGatherOp>(options); | ||
| } | ||
| }; | ||
|
|
||
| } // namespace mllm::cpu |
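A rough sketch of how the factory would be exercised, given pre-allocated table, indices, and output tensors. Whether GatherOpOptions exposes dim as a public field and how factories are registered with the backend is not shown in this diff, so treat the details as assumptions:
// Sketch only, not the project's actual registration path.
aops::GatherOpOptions opts;
opts.dim = 0;  // assumes a public `dim` field (the .cpp reads options_.dim)
std::shared_ptr<BaseOp> op = CPUGatherOpFactory().createOpImpl(opts);
// inputs[0] = lookup table, inputs[1] = indices, outputs[0] = destination (per CPUGatherOp::forward)
std::vector<Tensor> inputs = {table, indices};
std::vector<Tensor> outputs = {output};
std::static_pointer_cast<CPUGatherOp>(op)->forward(inputs, outputs);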
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,7 @@ | |
| #include "mllm/core/DataTypes.hpp" | ||
| #include "mllm/core/DeviceTypes.hpp" | ||
| #include "mllm/engine/Context.hpp" | ||
| #include "mllm/mllm.hpp" | ||
| #include "mllm/utils/Common.hpp" | ||
| #include "mllm/utils/Log.hpp" | ||
| #include "mllm/compile/ir/tensor/Value.hpp" | ||
|
|
@@ -303,11 +304,11 @@ Qnn_DataType_t mllmDataTypeToQnnDataType(DataTypes dtype) { | |
| Qnn_DataType_t ret = QNN_DATATYPE_UNDEFINED; | ||
| switch (dtype) { | ||
| case kInt8: { | ||
| ret = QNN_DATATYPE_INT_8; | ||
| ret = QNN_DATATYPE_SFIXED_POINT_8; | ||
| break; | ||
| } | ||
| case kInt16: { | ||
| ret = QNN_DATATYPE_INT_16; | ||
| ret = QNN_DATATYPE_UFIXED_POINT_16; | ||
| break; | ||
|
Comment on lines 310 to 312
Contributor

Critical: Incorrect type mapping for kInt16. Line 311 maps the signed kInt16 type to QNN_DATATYPE_UFIXED_POINT_16, an unsigned fixed-point type; a signed 16-bit integer should map to QNN_DATATYPE_SFIXED_POINT_16.

🔎 Proposed fix
  case kInt16: {
-   ret = QNN_DATATYPE_UFIXED_POINT_16;
+   ret = QNN_DATATYPE_SFIXED_POINT_16;
    break;
  }
||
| } | ||
| case kInt32: { | ||
|
|
@@ -319,11 +320,11 @@ Qnn_DataType_t mllmDataTypeToQnnDataType(DataTypes dtype) { | |
| break; | ||
| } | ||
| case kUInt8: { | ||
| ret = QNN_DATATYPE_UINT_8; | ||
| ret = QNN_DATATYPE_UFIXED_POINT_8; | ||
| break; | ||
| } | ||
| case kUInt16: { | ||
| ret = QNN_DATATYPE_UINT_16; | ||
| ret = QNN_DATATYPE_UFIXED_POINT_16; | ||
| break; | ||
| } | ||
| case kUInt32: { | ||
|
|
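Summarizing the mapping after this change, signed integer dtypes go to SFIXED_POINT and unsigned ones to UFIXED_POINT. A small sanity check, assuming the function and enums are visible to a test (the kInt16 line reflects the reviewer's proposed fix above, not the diff as posted):
#include <cassert>
// Expected quantized dtype mapping for mllmDataTypeToQnnDataType.
assert(mllmDataTypeToQnnDataType(kInt8)   == QNN_DATATYPE_SFIXED_POINT_8);
assert(mllmDataTypeToQnnDataType(kInt16)  == QNN_DATATYPE_SFIXED_POINT_16);  // per the review comment
assert(mllmDataTypeToQnnDataType(kUInt8)  == QNN_DATATYPE_UFIXED_POINT_8);
assert(mllmDataTypeToQnnDataType(kUInt16) == QNN_DATATYPE_UFIXED_POINT_16);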
@@ -449,7 +450,8 @@ std::shared_ptr<QNNTensorWrapper> QNNTensorWrapper::create(const std::string& na | |
| // in this case, the tensor may be a placeholder(input/output except for graph IO) | ||
| // it will be allocated to QNN shared buffer via QNNTensorWrapper::alloc() later | ||
| MLLM_RT_ASSERT(!name.empty()); | ||
| if (type != QNN_TENSOR_TYPE_STATIC) { MLLM_RT_ASSERT(tensor.device() == kQNN); } | ||
| // in AOT case, the tensor is all on CPU (TODO: handle this) | ||
| // if (type != QNN_TENSOR_TYPE_STATIC) { MLLM_RT_ASSERT(tensor.device() == kQNN); } | ||
|
Comment on lines +453 to +454
Contributor

Add proper device validation instead of commenting out the assertion. The device type assertion has been commented out for AOT handling, but this removes an important safety check that could catch device placement errors. Instead of removing the check entirely, add conditional logic to handle both runtime and AOT cases properly.

🔎 Proposed fix
- // in AOT case, the tensor is all on CPU (TODO: handle this)
- // if (type != QNN_TENSOR_TYPE_STATIC) { MLLM_RT_ASSERT(tensor.device() == kQNN); }
+ // In AOT case, tensors may be on CPU; in runtime, non-static tensors must be on QNN
+ if (type != QNN_TENSOR_TYPE_STATIC) {
+   // TODO: Replace with proper AOT detection mechanism
+   bool isAOT = (tensor.device() == kCPU);
+   if (!isAOT) { MLLM_RT_ASSERT(tensor.device() == kQNN); }
+ }
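One alternative to inferring AOT mode from device placement, as floated above, is to have the caller state it explicitly. The extra parameter below is hypothetical and not part of this PR; the rest of the signature follows the call sites visible in this diff:
// Hypothetical variant: the caller passes aot_mode instead of the code guessing from tensor.device().
static std::shared_ptr<QNNTensorWrapper> create(const std::string& name, Qnn_TensorType_t type,
                                                const Tensor& tensor, Qnn_QuantizeParams_t quantize,
                                                bool aot_mode = false) {
  MLLM_RT_ASSERT(!name.empty());
  if (!aot_mode && type != QNN_TENSOR_TYPE_STATIC) { MLLM_RT_ASSERT(tensor.device() == kQNN); }
  // ... unchanged from the current implementation
}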
||
|
|
||
| Qnn_DataType_t dataType = mllmDataTypeToQnnDataType(tensor.dtype()); | ||
|
|
||
|
|
@@ -467,11 +469,6 @@ std::shared_ptr<QNNTensorWrapper> QNNTensorWrapper::createStaticTensor(const std | |
| Qnn_QuantizeParams_t quantize) { | ||
| MLLM_RT_ASSERT(!name.empty() && tensor.rank() > 0 && !tensor.isNil()); | ||
|
|
||
| // mllm currently support float16/float32/sfixed8(int8) as static tensor (weight) data type | ||
| // uint8 and int32 is caused by QNNLinear which uses Conv2d | ||
| MLLM_RT_ASSERT(tensor.dtype() == kFloat16 || tensor.dtype() == kFloat32 || tensor.dtype() == kInt8 || tensor.dtype() == kUInt8 | ||
| || tensor.dtype() == kInt32); | ||
|
|
||
| std::shared_ptr<QNNTensorWrapper> tensorWrapper = QNNTensorWrapper::create(name, QNN_TENSOR_TYPE_STATIC, tensor, quantize); | ||
|
|
||
| tensorWrapper->isAlloc_ = true; | ||
|
|
@@ -618,4 +615,75 @@ void propagateQuantScale(const Tensor& input, Tensor& output) { | |
| } | ||
| } | ||
|
|
||
| void __printQnnTensor(const Qnn_Tensor_t* tensor) { | ||
|
Contributor

Fix reserved identifier violation in the function name. The function name __printQnnTensor begins with a double underscore, which is reserved for the implementation in C++; rename it to a non-reserved identifier.

🔎 Proposed fix
-void __printQnnTensor(const Qnn_Tensor_t* tensor) {
+void printQnnTensor(const Qnn_Tensor_t* tensor) {

Don't forget to update the header declaration as well.

🪛 Clang (14.0.6): [error] 618-618: declaration uses identifier '__printQnnTensor', which is a reserved identifier (bugprone-reserved-identifier,-warnings-as-errors)
||
| if (tensor == nullptr) { | ||
| MLLM_ERROR("Tensor is null"); | ||
| return; | ||
| } | ||
| if (tensor->version != QNN_TENSOR_VERSION_2) { | ||
| MLLM_ERROR("Only Qnn_TensorV2_t is supported"); | ||
| return; | ||
| } | ||
|
|
||
| const Qnn_TensorV2_t& t = tensor->v2; | ||
|
|
||
| std::string tensor_type = ""; | ||
|
|
||
| switch (t.type) { | ||
| case QNN_TENSOR_TYPE_APP_READ: tensor_type = "APP_READ"; break; | ||
| case QNN_TENSOR_TYPE_APP_WRITE: tensor_type = "APP_WRITE"; break; | ||
| case QNN_TENSOR_TYPE_NATIVE: tensor_type = "APP_NATIVE"; break; | ||
| case QNN_TENSOR_TYPE_STATIC: tensor_type = "STATIC"; break; | ||
| default: tensor_type = "UNKNOWN"; | ||
| } | ||
|
|
||
| std::string dtype_str; | ||
| switch (t.dataType) { | ||
| case QNN_DATATYPE_INT_8: dtype_str = "INT_8"; break; | ||
| case QNN_DATATYPE_INT_16: dtype_str = "INT_16"; break; | ||
| case QNN_DATATYPE_INT_32: dtype_str = "INT_32"; break; | ||
| case QNN_DATATYPE_INT_64: dtype_str = "INT_64"; break; | ||
| case QNN_DATATYPE_UINT_8: dtype_str = "UINT_8"; break; | ||
| case QNN_DATATYPE_UINT_16: dtype_str = "UINT_16"; break; | ||
| case QNN_DATATYPE_UINT_32: dtype_str = "UINT_32"; break; | ||
| case QNN_DATATYPE_UINT_64: dtype_str = "UINT_64"; break; | ||
| case QNN_DATATYPE_FLOAT_16: dtype_str = "FLOAT_16"; break; | ||
| case QNN_DATATYPE_FLOAT_32: dtype_str = "FLOAT_32"; break; | ||
| case QNN_DATATYPE_FLOAT_64: dtype_str = "FLOAT_64"; break; | ||
| case QNN_DATATYPE_SFIXED_POINT_4: dtype_str = "SFIXED_POINT_4"; break; | ||
| case QNN_DATATYPE_SFIXED_POINT_8: dtype_str = "SFIXED_POINT_8"; break; | ||
| case QNN_DATATYPE_SFIXED_POINT_16: dtype_str = "SFIXED_POINT_16"; break; | ||
| case QNN_DATATYPE_SFIXED_POINT_32: dtype_str = "SFIXED_POINT_32"; break; | ||
| case QNN_DATATYPE_UFIXED_POINT_4: dtype_str = "UFIXED_POINT_4"; break; | ||
| case QNN_DATATYPE_UFIXED_POINT_8: dtype_str = "UFIXED_POINT_8"; break; | ||
| case QNN_DATATYPE_UFIXED_POINT_16: dtype_str = "UFIXED_POINT_16"; break; | ||
| case QNN_DATATYPE_UFIXED_POINT_32: dtype_str = "UFIXED_POINT_32"; break; | ||
| case QNN_DATATYPE_BOOL_8: dtype_str = "BOOL_8"; break; | ||
| case QNN_DATATYPE_STRING: dtype_str = "STRING"; break; | ||
| default: dtype_str = "UNKNOWN"; break; | ||
| } | ||
|
|
||
| std::string shape_str = "["; | ||
| for (uint32_t i = 0; i < t.rank; ++i) { | ||
| shape_str += std::to_string(t.dimensions[i]); | ||
| if (i < t.rank - 1) shape_str += ", "; | ||
| } | ||
| shape_str += "]"; | ||
|
|
||
| std::string quant_str = "None"; | ||
| if (t.quantizeParams.encodingDefinition == QNN_DEFINITION_DEFINED) { | ||
| if (t.quantizeParams.quantizationEncoding == QNN_QUANTIZATION_ENCODING_SCALE_OFFSET) { | ||
| quant_str = "Scale: " + std::to_string(t.quantizeParams.scaleOffsetEncoding.scale) | ||
| + ", Offset: " + std::to_string(t.quantizeParams.scaleOffsetEncoding.offset); | ||
| } else if (t.quantizeParams.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { | ||
| quant_str = "Axis Scale Offset (Axis: " + std::to_string(t.quantizeParams.axisScaleOffsetEncoding.axis) + ")"; | ||
| } else if (t.quantizeParams.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION) { | ||
| quant_str = "Blockwise Expansion (axis:" + std::to_string(t.quantizeParams.blockwiseExpansion->axis) | ||
| + ", blockSize:" + std::to_string(t.quantizeParams.blockwiseExpansion->numBlocksPerAxis) + ")"; | ||
| } | ||
| } | ||
|
|
||
| MLLM_INFO("Tensor: {}, Type:{}, Shape: {}, Dtype: {}, Quant: {}", t.name, tensor_type, shape_str, dtype_str, quant_str); | ||
| } | ||
|
|
||
| } // namespace mllm::qnn | ||
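For context, a typical debug call site might look like the following. The getQnnTensor() accessor name is an assumption, and the example log line simply follows the MLLM_INFO format string above with illustrative values:
// Dump a wrapper's underlying Qnn_Tensor_t while building the graph (accessor name assumed).
__printQnnTensor(tensorWrapper->getQnnTensor());
// Produces a line shaped like:
// Tensor: <name>, Type:APP_WRITE, Shape: [1, 128, 4096], Dtype: UFIXED_POINT_16, Quant: Scale: 0.003906, Offset: 0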
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -6,7 +6,6 @@ | |||||||
| #include "QnnTypes.h" | ||||||||
| #include "mllm/core/Tensor.hpp" | ||||||||
|
|
||||||||
| #include <map> | ||||||||
| #include <memory> | ||||||||
| #include <string> | ||||||||
| #include <vector> | ||||||||
|
|
@@ -89,6 +88,8 @@ bool freeQnnTensor(Qnn_Tensor_t& tensor); | |||||||
|
|
||||||||
| bool freeQnnTensors(Qnn_Tensor_t*& tensors, uint32_t numTensors); | ||||||||
|
|
||||||||
| void __printQnnTensor(const Qnn_Tensor_t* tensor); // for debug use | ||||||||
|
Comment on lines +91 to +92
Contributor

Reserved identifier violation: rename the debug function. The declared name __printQnnTensor uses a double-underscore prefix, which is reserved for the implementation; rename it and keep the definition and all call sites in sync with the new name.

🔎 Proposed fix
-void __printQnnTensor(const Qnn_Tensor_t* tensor); // for debug use
+void printQnnTensorDebug(const Qnn_Tensor_t* tensor); // for debug use

🪛 Clang (14.0.6): [error] 91-91: declaration uses identifier '__printQnnTensor', which is a reserved identifier (bugprone-reserved-identifier,-warnings-as-errors)
||||||||
| inline void __mllmQnnLoggerCallback(const char* fmt, QnnLog_Level_t level, uint64_t times_tamp, va_list argp) { | ||||||||
| const char* level_str = ""; | ||||||||
| const char* color_start = ""; | ||||||||
|
|
@@ -277,9 +278,12 @@ QNNParamScalarWrapper::QNNParamScalarWrapper(const std::string& name, T value) : | |||||||
| if constexpr (std::is_same_v<T, bool>) { | ||||||||
| qnnParam_.scalarParam.dataType = QNN_DATATYPE_BOOL_8; | ||||||||
| qnnParam_.scalarParam.bool8Value = static_cast<uint8_t>(value); | ||||||||
| } else if constexpr (std::is_same_v<T, uint32_t> || std::is_same_v<T, int32_t>) { | ||||||||
| } else if constexpr (std::is_same_v<T, uint32_t>) { | ||||||||
| qnnParam_.scalarParam.dataType = QNN_DATATYPE_UINT_32; | ||||||||
| qnnParam_.scalarParam.uint32Value = static_cast<uint32_t>(value); | ||||||||
| } else if constexpr (std::is_same_v<T, int32_t>) { | ||||||||
| qnnParam_.scalarParam.dataType = QNN_DATATYPE_INT_32; | ||||||||
| qnnParam_.scalarParam.int32Value = static_cast<int32_t>(value); | ||||||||
| } else if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) { | ||||||||
| qnnParam_.scalarParam.dataType = QNN_DATATYPE_FLOAT_32; | ||||||||
| qnnParam_.scalarParam.floatValue = static_cast<float>(value); | ||||||||
|
|
||||||||
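The split of the int32_t branch matters for negative scalar parameters (for example a negative axis): previously an int32_t value was routed through the uint32 field. A small illustration, assuming the constructor is used as declared in this header:
// Before this change, int32_t fell into the uint32 branch:
QNNParamScalarWrapper axis_param("axis", static_cast<int32_t>(-1));
// old behavior: dataType = QNN_DATATYPE_UINT_32, uint32Value = 4294967295 (wrapped)
// new behavior: dataType = QNN_DATATYPE_INT_32,  int32Value  = -1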
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,7 @@ QnnAOTNodeTensor::QnnAOTNodeTensor(const ir::tensor::TensorValue::ptr_t& v, bool | |
| } else { | ||
| tensor_wrapper_ = mllm::qnn::QNNTensorWrapper::create(name, type, v->tensor_, quant); | ||
| } | ||
| setupComplexTensorQuantization(v); // per-channel and LPBQ cases | ||
| } | ||
|
|
||
| Qnn_TensorType_t QnnAOTNodeTensor::parseQnnTensorTypeFromIR(const ir::tensor::TensorValue::ptr_t& v) { | ||
|
|
@@ -90,7 +91,7 @@ Qnn_TensorType_t QnnAOTNodeTensor::parseQnnTensorTypeFromIR(const ir::tensor::Te | |
|
|
||
| // Check Attribute. The Attribute priority is higher than tensor type | ||
| if (v->getAttr("qnn_graph_outputs")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; } | ||
| if (v->getAttr("qnn_graph_inputs")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READWRITE; } | ||
| if (v->getAttr("qnn_graph_inputs")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; } | ||
| if (v->getAttr("constant")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_STATIC; } | ||
|
|
||
| return ret_qnn_tensor_type; | ||
|
|
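As background for the qnn_graph_inputs change from APP_READWRITE to APP_WRITE, the usual QNN tensor-type semantics (worth verifying against QnnTypes.h) are summarized below; the names match those handled by the debug printer in this PR:
// QNN_TENSOR_TYPE_APP_WRITE : written by the application, read by the graph  -> graph input
// QNN_TENSOR_TYPE_APP_READ  : written by the graph, read by the application  -> graph output
// QNN_TENSOR_TYPE_NATIVE    : internal intermediate tensor
// QNN_TENSOR_TYPE_STATIC    : constant data (weights)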
@@ -109,7 +110,16 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten | |
| auto quant_spec = v->getAttr("quant_recipe")->cast_<ir::linalg::LinalgIRQuantizatonSpecAttr>()->spec_; | ||
|
|
||
| switch (quant_spec->type) { | ||
| case ir::linalg::QuantizationSpecType::kRaw: { | ||
| case ir::linalg::QuantizationSpecType::kRaw: | ||
| case ir::linalg::QuantizationSpecType::kSymPerChannel: | ||
| case ir::linalg::QuantizationSpecType::kLPBQ: { | ||
| break; | ||
| } | ||
| case ir::linalg::QuantizationSpecType::kAsymPerTensor: { | ||
| auto cfg = std::static_pointer_cast<ir::linalg::QuantizationSpecAsymPerTensor>(quant_spec); | ||
| ret.encodingDefinition = QNN_DEFINITION_DEFINED; | ||
| ret.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET; | ||
| ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item<float>(), .offset = cfg->zero_point.item<int32_t>()}; | ||
| break; | ||
| } | ||
| case ir::linalg::QuantizationSpecType::kSymPerTensor: { | ||
|
|
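For reference on the scale/offset encoding used by the new kAsymPerTensor branch: QNN-style scale-offset encodings are commonly defined so that the real value is recovered from the quantized value and the offset, and the sign convention of offset versus zero_point should be checked against the QNN documentation. A hedged sketch:
// Common scale-offset dequantization; the sign relationship between `offset` and the
// spec's zero_point is an assumption here, not something this diff establishes.
inline float dequantize(int32_t q, const Qnn_ScaleOffset_t& so) {
  return so.scale * static_cast<float>(q + so.offset);
}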
@@ -119,6 +129,19 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten | |
| ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item<float>(), .offset = 0}; | ||
| break; | ||
| } | ||
| default: { | ||
| MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't handle kNone type"); | ||
| } | ||
| } | ||
|
|
||
| return ret; | ||
| } | ||
|
|
||
| void QnnAOTNodeTensor::setupComplexTensorQuantization(const ir::tensor::TensorValue::ptr_t& v) { | ||
| MLLM_RT_ASSERT(v->getAttr("quant_recipe")); | ||
| auto quant_spec = v->getAttr("quant_recipe")->cast_<ir::linalg::LinalgIRQuantizatonSpecAttr>()->spec_; | ||
|
|
||
| switch (quant_spec->type) { | ||
| case ir::linalg::QuantizationSpecType::kSymPerChannel: { | ||
| auto cfg = std::static_pointer_cast<ir::linalg::QuantizationSpecSymPerChannel>(quant_spec); | ||
|
|
||
|
|
@@ -135,12 +158,6 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten | |
| tensor_wrapper_->setScaleOffsetQuantization(scale_offsets, cfg->ch_axis); | ||
| break; | ||
| } | ||
| case ir::linalg::QuantizationSpecType::kSymPerBlock: | ||
| case ir::linalg::QuantizationSpecType::kAsymPerTensor: | ||
| case ir::linalg::QuantizationSpecType::kAsymPerChannel: | ||
| case ir::linalg::QuantizationSpecType::kAsymPerBlock: { | ||
| MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't handle [kSymPerBlock, kAsymPerTensor, kAsymPerChannel, kAsymPerBlock] type"); | ||
| } | ||
| case ir::linalg::QuantizationSpecType::kLPBQ: { | ||
| auto cfg = std::static_pointer_cast<ir::linalg::QuantizationSpecLPBQ>(quant_spec); | ||
|
|
||
|
|
@@ -150,28 +167,23 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten | |
| MLLM_RT_ASSERT_EQ(num_scale_offsets, cfg->scale_level_1_fp.size(0)); | ||
| MLLM_RT_ASSERT_EQ(cfg->scale_level_0_int.dtype(), kUInt8); | ||
| for (int i = 0; i < num_scale_offsets; ++i) { | ||
| scale_offsets[i].scale = cfg->scale_level_1_fp.at<float>({i}); | ||
| scale_offsets[i].scale = cfg->scale_level_1_fp.at<float>({i, 0, 0}); | ||
| scale_offsets[i].offset = 0; | ||
| } | ||
|
|
||
| Qnn_BlockwiseExpansion_t blockwise_expansion; | ||
| blockwise_expansion.axis = cfg->ch_axis; | ||
| blockwise_expansion.axis = cfg->ch_axis; | ||
| blockwise_expansion.scaleOffsets = nullptr; // Will be set by setBlockwiseQuantization | ||
| blockwise_expansion.numBlocksPerAxis = v->tensor_.size(cfg->ch_axis) / cfg->block_size; | ||
| blockwise_expansion.numBlocksPerAxis = v->tensor_.size(1) / cfg->block_size; | ||
| blockwise_expansion.blockScaleBitwidth = 12; // 12 bits for 4 to 16 expansion | ||
|
Comment on lines 174 to 178
Contributor

🧩 Analysis chain: repository searches for ch_axis, blockwise_expansion, and QuantizationSpecLPBQ::create show that the LPBQ spec is created in a single place with ch_axis = 0, so the axis field and the block count must describe the same dimension.

Mismatch between blockwise expansion axis and numBlocksPerAxis calculation. blockwise_expansion.axis is set to cfg->ch_axis, while numBlocksPerAxis is computed from the hard-coded dimension v->tensor_.size(1); the block count should be taken along the same axis that the axis field names.

Suggested fix
  blockwise_expansion.axis = cfg->ch_axis;
  blockwise_expansion.scaleOffsets = nullptr; // Will be set by setBlockwiseQuantization
- blockwise_expansion.numBlocksPerAxis = v->tensor_.size(1) / cfg->block_size;
+ blockwise_expansion.numBlocksPerAxis = v->tensor_.size(cfg->ch_axis) / cfg->block_size;
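To make the point concrete with illustrative numbers: for a weight shaped {4096, 11008} with ch_axis = 0 and block_size = 64, the block count should come from the axis named in the axis field:
// Illustrative numbers only; per the suggested fix, blocks are counted along ch_axis.
//   numBlocksPerAxis = 4096 / 64 = 64
// whereas the hard-coded size(1) would yield 11008 / 64 = 172 and disagree with `axis`.
int64_t num_blocks_per_axis = v->tensor_.size(cfg->ch_axis) / cfg->block_size;  // assumes divisibility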
||
| blockwise_expansion.blockScaleStorageType = QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8; | ||
| blockwise_expansion.blocksScale8 = cfg->scale_level_0_int.ptr<mllm_uint8_t>(); | ||
|
|
||
| tensor_wrapper_->setBlockwiseQuantization(blockwise_expansion, scale_offsets); | ||
| break; | ||
| } | ||
| default: { | ||
| MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't handle kNone type"); | ||
| } | ||
| default: break; | ||
| } | ||
|
|
||
| return ret; | ||
| } | ||
|
|
||
| // QnnAOTNodeOperation implementations | ||
|
|
||
Silent skip of out-of-bounds indices may leave output uninitialized. (This refers to the if (idx < 0 || idx >= dim_size) { continue; } check in CPUGatherOp::forward.) When the index is still out of range after the negative-index adjustment, the code silently skips the copy, leaving that portion of the output tensor uninitialized. This could result in garbage values or undefined behavior downstream. Consider logging a warning and zero-filling the skipped slice, as in the proposed fix.

🔎 Proposed fix: Log warning and zero-fill
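A minimal sketch of the warn-and-zero-fill handling the comment suggests, assuming a MLLM_WARN macro exists alongside the MLLM_ERROR/MLLM_INFO macros seen elsewhere in this PR (the committable suggestion itself is collapsed above):
// Inside the inner loop of CPUGatherOp::forward, computing dst_offset before the bounds check:
int64_t dst_offset = (o * indices_count + i) * inner_size * data_type_size;
if (idx < 0 || idx >= dim_size) {
  MLLM_WARN("GatherOp index {} out of range [0, {})", idx, dim_size);    // MLLM_WARN is assumed
  std::memset(output_ptr + dst_offset, 0, inner_size * data_type_size);  // zero-fill the skipped slice
  continue;
}
int64_t src_offset = (o * dim_size + idx) * inner_size * data_type_size;
std::memcpy(output_ptr + dst_offset, table_ptr + src_offset, inner_size * data_type_size);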