10 changes: 7 additions & 3 deletions examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
@@ -336,10 +336,14 @@ class Qwen3Text final : public nn::Module {
// Quantization
x = x.to(kUInt16PerTensorAsy);

auto position_ids = inputs[1];
const auto& position_ids = inputs[1];
auto causal_mask = inputs[2];
auto llm_embedding_sin = ptq::QDQ_ROPE(this, rope_sin_(), "sin_embedding_input_qdq")[{{0}, position_ids, {kAll}}];
auto llm_embedding_cos = ptq::QDQ_ROPE(this, rope_cos_(), "cos_embedding_input_qdq")[{{0}, position_ids, {kAll}}];

auto llm_embedding_sin =
nn::functional::gather(ptq::QDQ_ROPE(this, rope_sin_(), "sin_embedding_input_qdq"), 1, position_ids);

auto llm_embedding_cos =
nn::functional::gather(ptq::QDQ_ROPE(this, rope_cos_(), "cos_embedding_input_qdq"), 1, position_ids);

std::vector<Tensor> keys;
std::vector<Tensor> values;
4 changes: 3 additions & 1 deletion mllm/backends/cpu/CPUBackend.cpp
@@ -23,6 +23,7 @@
#include "mllm/backends/cpu/ops/FlashAttention2Op.hpp"
#include "mllm/backends/cpu/ops/FlashAttn2WithSinkAndSwaOp.hpp"
#include "mllm/backends/cpu/ops/GELUOp.hpp"
#include "mllm/backends/cpu/ops/GatherOp.hpp"
#include "mllm/backends/cpu/ops/InterpolateOp.hpp"
#include "mllm/backends/cpu/ops/LayerNorm2DOp.hpp"
#include "mllm/backends/cpu/ops/MaskedScatterOp.hpp"
@@ -81,7 +82,8 @@ CPUBackend::CPUBackend() : Backend(kCPU, createCPUAllocator()) {
CPUMeanOpFactory, CPUKVCacheOpFactory, CPUPagedAttnOpFactory, CPUScatter2ShardsOpFactory, CPURadixAttnOpFactory,
CPUConv2DOpFactory, CPULayerNorm2DOpFactory, CPUInterpolateOpFactory, CPUPadOpFactory, CPUMaskedScatterOpFactory,
CPUArgsortOpFactory, CPUCloneOpFactory, CPUAvgPool1dOpFactory, CPUFlashAttention2SwaSinkOpFactory,
CPURadixAttnRelaxOpFactory, CPURadixAttnSwaSinkOpFactory, CPUEqualOpFactory, CPUWhereOpFactory>();
CPURadixAttnRelaxOpFactory, CPURadixAttnSwaSinkOpFactory, CPUEqualOpFactory, CPUWhereOpFactory,
CPUGatherOpFactory>();
}

CPUBackend::~CPUBackend() {
69 changes: 69 additions & 0 deletions mllm/backends/cpu/ops/GatherOp.cpp
@@ -0,0 +1,69 @@
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#include "mllm/backends/cpu/ops/GatherOp.hpp"
#include "mllm/core/Tensor.hpp"

namespace mllm::cpu {

CPUGatherOp::CPUGatherOp(const aops::GatherOpOptions& options) : aops::GatherOp(options) {}

void CPUGatherOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
auto& table = inputs[0];
auto& indices = inputs[1];
auto& output = outputs[0];

int dim = options_.dim;
if (dim < 0) dim += table.shape().size();

int64_t outer_size = 1;
for (int i = 0; i < dim; ++i) outer_size *= table.shape()[i];

int64_t inner_size = 1;
for (int i = dim + 1; i < table.shape().size(); ++i) inner_size *= table.shape()[i];

int64_t dim_size = table.shape()[dim];
int64_t indices_count = indices.numel();

size_t data_type_size = 4;
switch (table.dtype()) {
case MLLM_TYPE_F32: data_type_size = sizeof(float); break;
case MLLM_TYPE_F16: data_type_size = sizeof(mllm_fp16_t); break;
case MLLM_TYPE_I32: data_type_size = sizeof(int32_t); break;
default: MLLM_ERROR("GatherOp table type not supported: {}", (int)table.dtype());
}

const uint8_t* table_ptr = table.ptr<uint8_t>();
uint8_t* output_ptr = output.ptr<uint8_t>();

const int32_t* indices_i32 = indices.dtype() == MLLM_TYPE_I32 ? indices.ptr<int32_t>() : nullptr;
const float* indices_f32 = !indices_i32 && indices.dtype() == MLLM_TYPE_F32 ? indices.ptr<float>() : nullptr;

if (!indices_i32 && !indices_f32) {
MLLM_ERROR("GatherOp indices type not supported: {}", (int)indices.dtype());
return;
}

// FIXME: parallel
for (int64_t o = 0; o < outer_size; ++o) {
for (int64_t i = 0; i < indices_count; ++i) {
int64_t idx = 0;
if (indices_i32) {
idx = indices_i32[i];
} else if (indices_f32) {
idx = (int64_t)indices_f32[i];
}

if (idx < 0) idx += dim_size;

if (idx < 0 || idx >= dim_size) { continue; }
Comment on lines +57 to +59
⚠️ Potential issue | 🟠 Major

Silent skip of out-of-bounds indices may leave output uninitialized.

When idx < 0 || idx >= dim_size after negative index adjustment, the code silently skips the copy, leaving that portion of the output tensor uninitialized. This could result in garbage values or undefined behavior downstream.

Consider one of the following:

  1. Logging a warning/error for out-of-bounds indices.
  2. Zero-initializing the output tensor before the loop.
  3. Raising an error for invalid indices.
🔎 Proposed fix: Log warning and zero-fill
-      if (idx < 0 || idx >= dim_size) { continue; }
+      if (idx < 0 || idx >= dim_size) {
+        MLLM_WARN("GatherOp: index {} out of bounds [0, {}), filling with zeros", idx, dim_size);
+        int64_t dst_offset = (o * indices_count + i) * inner_size * data_type_size;
+        std::memset(output_ptr + dst_offset, 0, inner_size * data_type_size);
+        continue;
+      }
📝 Committable suggestion


Suggested change
-      if (idx < 0) idx += dim_size;
-      if (idx < 0 || idx >= dim_size) { continue; }
+      if (idx < 0) idx += dim_size;
+      if (idx < 0 || idx >= dim_size) {
+        MLLM_WARN("GatherOp: index {} out of bounds [0, {}), filling with zeros", idx, dim_size);
+        int64_t dst_offset = (o * indices_count + i) * inner_size * data_type_size;
+        std::memset(output_ptr + dst_offset, 0, inner_size * data_type_size);
+        continue;
+      }
🤖 Prompt for AI Agents
In @mllm/backends/cpu/ops/GatherOp.cpp around lines 57 - 59, the loop silently skips copies when
"idx < 0 || idx >= dim_size", leaving parts of the output uninitialized; change the Gather implementation
to zero-initialize the output buffer/tensor (the destination written by the gather) before the loop and,
when an index is out of bounds, either log a warning including the offending "idx" and "dim_size" or raise
an explicit error instead of continuing; ensure the zero-fill happens once before the loop and all
invalid-index cases are handled with a clear log or error.


int64_t src_offset = (o * dim_size + idx) * inner_size * data_type_size;
int64_t dst_offset = (o * indices_count + i) * inner_size * data_type_size;

std::memcpy(output_ptr + dst_offset, table_ptr + src_offset, inner_size * data_type_size);
}
}
}

} // namespace mllm::cpu
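For reference, a minimal, self-contained C++ sketch of the gather-along-dim indexing this op implements (output[o][i][*] = table[o][indices[i]][*]); it uses plain std::vector instead of the mllm Tensor API, and the shapes, values, and names are made up purely for illustration:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Toy table of shape [2, 4, 3]: outer_size = 2, dim_size = 4 (dim = 1), inner_size = 3.
  const int64_t outer_size = 2, dim_size = 4, inner_size = 3;
  std::vector<float> table(outer_size * dim_size * inner_size);
  for (size_t k = 0; k < table.size(); ++k) table[k] = static_cast<float>(k);

  // Gather rows 3 and 0 along dim 1, mirroring the int32 index path above.
  std::vector<int32_t> indices = {3, 0};
  const int64_t indices_count = static_cast<int64_t>(indices.size());
  std::vector<float> output(outer_size * indices_count * inner_size);

  for (int64_t o = 0; o < outer_size; ++o) {
    for (int64_t i = 0; i < indices_count; ++i) {
      int64_t idx = indices[i];
      if (idx < 0) idx += dim_size;              // negative indices wrap around
      if (idx < 0 || idx >= dim_size) continue;  // out-of-range indices are skipped, as in the op above
      for (int64_t j = 0; j < inner_size; ++j) {
        output[(o * indices_count + i) * inner_size + j] = table[(o * dim_size + idx) * inner_size + j];
      }
    }
  }

  // First gathered row of the first outer slice is table row 3: prints "9 10 11".
  for (int64_t j = 0; j < inner_size; ++j) std::printf("%g ", output[j]);
  std::printf("\n");
  return 0;
}

This mirrors the byte-offset arithmetic in CPUGatherOp::forward, just with element indexing instead of memcpy over raw bytes.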
25 changes: 25 additions & 0 deletions mllm/backends/cpu/ops/GatherOp.hpp
@@ -0,0 +1,25 @@
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#pragma once

#include "mllm/core/BaseOp.hpp"
#include "mllm/core/aops/GatherOp.hpp"

namespace mllm::cpu {

class CPUGatherOp final : public aops::GatherOp {
public:
explicit CPUGatherOp(const aops::GatherOpOptions& options);

void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
};

class CPUGatherOpFactory : public TypedOpFactory<OpTypes::kGather, aops::GatherOpOptions> {
public:
std::shared_ptr<BaseOp> createOpImpl(const aops::GatherOpOptions& options) override {
return std::make_shared<CPUGatherOp>(options);
}
};

} // namespace mllm::cpu
88 changes: 78 additions & 10 deletions mllm/backends/qnn/QNNUtils.cpp
@@ -5,6 +5,7 @@
#include "mllm/core/DataTypes.hpp"
#include "mllm/core/DeviceTypes.hpp"
#include "mllm/engine/Context.hpp"
#include "mllm/mllm.hpp"
#include "mllm/utils/Common.hpp"
#include "mllm/utils/Log.hpp"
#include "mllm/compile/ir/tensor/Value.hpp"
@@ -303,11 +304,11 @@ Qnn_DataType_t mllmDataTypeToQnnDataType(DataTypes dtype) {
Qnn_DataType_t ret = QNN_DATATYPE_UNDEFINED;
switch (dtype) {
case kInt8: {
ret = QNN_DATATYPE_INT_8;
ret = QNN_DATATYPE_SFIXED_POINT_8;
break;
}
case kInt16: {
ret = QNN_DATATYPE_INT_16;
ret = QNN_DATATYPE_UFIXED_POINT_16;
break;
Comment on lines 310 to 312
⚠️ Potential issue | 🔴 Critical

Critical: Incorrect type mapping for kInt16.

Line 311 maps kInt16 (signed 16-bit integer) to QNN_DATATYPE_UFIXED_POINT_16 (unsigned fixed-point 16-bit). This is a sign mismatch that will cause incorrect value interpretation and potential data corruption.

🔎 Proposed fix
     case kInt16: {
-      ret = QNN_DATATYPE_UFIXED_POINT_16;
+      ret = QNN_DATATYPE_SFIXED_POINT_16;
       break;
     }
🤖 Prompt for AI Agents
In @mllm/backends/qnn/QNNUtils.cpp around lines 310 - 312, The case handling for
kInt16 incorrectly maps the signed 16-bit integer to
QNN_DATATYPE_UFIXED_POINT_16 (unsigned); update the mapping in the switch (case
kInt16) to use the signed 16-bit fixed-point/type variant (e.g.,
QNN_DATATYPE_FIXED_POINT_16 or the correct QNN signed-16 enum value used
elsewhere in the codebase) so that kInt16 maps to the signed QNN datatype
instead of QNN_DATATYPE_UFIXED_POINT_16.

}
case kInt32: {
@@ -319,11 +320,11 @@ Qnn_DataType_t mllmDataTypeToQnnDataType(DataTypes dtype) {
break;
}
case kUInt8: {
ret = QNN_DATATYPE_UINT_8;
ret = QNN_DATATYPE_UFIXED_POINT_8;
break;
}
case kUInt16: {
ret = QNN_DATATYPE_UINT_16;
ret = QNN_DATATYPE_UFIXED_POINT_16;
break;
}
case kUInt32: {
@@ -449,7 +450,8 @@ std::shared_ptr<QNNTensorWrapper> QNNTensorWrapper::create(const std::string& na
// in this case, the tensor may be a placeholder(input/output except for graph IO)
// it will be allocated to QNN shared buffer via QNNTensorWrapper::alloc() later
MLLM_RT_ASSERT(!name.empty());
if (type != QNN_TENSOR_TYPE_STATIC) { MLLM_RT_ASSERT(tensor.device() == kQNN); }
// in AOT case, the tensor is all on CPU (TODO: handle this)
// if (type != QNN_TENSOR_TYPE_STATIC) { MLLM_RT_ASSERT(tensor.device() == kQNN); }
Comment on lines +453 to +454
⚠️ Potential issue | 🟠 Major

Add proper device validation instead of commenting out the assertion.

The device type assertion has been commented out for AOT handling, but this removes an important safety check that could catch device placement errors. Instead of removing the check entirely, add conditional logic to handle both runtime and AOT cases properly.

🔎 Proposed fix
-  // in AOT case, the tensor is all on CPU (TODO: handle this)
-  // if (type != QNN_TENSOR_TYPE_STATIC) { MLLM_RT_ASSERT(tensor.device() == kQNN); }
+  // In AOT case, tensors may be on CPU; in runtime, non-static tensors must be on QNN
+  if (type != QNN_TENSOR_TYPE_STATIC) {
+    // TODO: Replace with proper AOT detection mechanism
+    bool isAOT = (tensor.device() == kCPU);
+    if (!isAOT) {
+      MLLM_RT_ASSERT(tensor.device() == kQNN);
+    }
+  }
🤖 Prompt for AI Agents
In @mllm/backends/qnn/QNNUtils.cpp around lines 453 - 454, Replace the
commented-out device assertion with proper conditional validation: keep the
existing runtime check for non-AOT tensors by reinstating
MLLM_RT_ASSERT(tensor.device() == kQNN) when type != QNN_TENSOR_TYPE_STATIC, and
add an explicit check for the AOT/static case (e.g., assert tensor.device() ==
kCPU or allow kCPU/kQNN as appropriate) to validate expected placement; locate
the assertion around the variables type, QNN_TENSOR_TYPE_STATIC and
tensor.device() in QNNUtils.cpp and implement the conditional MLLM_RT_ASSERT
branches to handle both runtime and AOT scenarios.


Qnn_DataType_t dataType = mllmDataTypeToQnnDataType(tensor.dtype());

@@ -467,11 +469,6 @@ std::shared_ptr<QNNTensorWrapper> QNNTensorWrapper::createStaticTensor(const std
Qnn_QuantizeParams_t quantize) {
MLLM_RT_ASSERT(!name.empty() && tensor.rank() > 0 && !tensor.isNil());

// mllm currently support float16/float32/sfixed8(int8) as static tensor (weight) data type
// uint8 and int32 is caused by QNNLinear which uses Conv2d
MLLM_RT_ASSERT(tensor.dtype() == kFloat16 || tensor.dtype() == kFloat32 || tensor.dtype() == kInt8 || tensor.dtype() == kUInt8
|| tensor.dtype() == kInt32);

std::shared_ptr<QNNTensorWrapper> tensorWrapper = QNNTensorWrapper::create(name, QNN_TENSOR_TYPE_STATIC, tensor, quantize);

tensorWrapper->isAlloc_ = true;
@@ -618,4 +615,75 @@ void propagateQuantScale(const Tensor& input, Tensor& output) {
}
}

void __printQnnTensor(const Qnn_Tensor_t* tensor) {
⚠️ Potential issue | 🔴 Critical

Fix reserved identifier violation in function name.

The function name __printQnnTensor uses a leading double underscore, which is reserved for implementation use in C++. This violates the C++ standard and could conflict with compiler or standard library identifiers.

🔎 Proposed fix
-void __printQnnTensor(const Qnn_Tensor_t* tensor) {
+void printQnnTensor(const Qnn_Tensor_t* tensor) {

Don't forget to update the header declaration in mllm/backends/qnn/QNNUtils.hpp as well.

Committable suggestion skipped: line range outside the PR's diff.

🧰 Tools
🪛 Clang (14.0.6)

[error] 618-618: declaration uses identifier '__printQnnTensor', which is a reserved identifier

(bugprone-reserved-identifier,-warnings-as-errors)

🤖 Prompt for AI Agents
In @mllm/backends/qnn/QNNUtils.cpp at line 618, The function name uses a
reserved leading double underscore (__printQnnTensor); rename it to a
non-reserved identifier (e.g., printQnnTensor or Qnn_PrintTensor) and update its
declaration in qnnutils.hpp and any call sites to the new name, ensuring
linkage/extern "C" attributes remain unchanged; also run a project-wide search
for __printQnnTensor to fix all references and rebuild to verify no symbol
mismatches.

if (tensor == nullptr) {
MLLM_ERROR("Tensor is null");
return;
}
if (tensor->version != QNN_TENSOR_VERSION_2) {
MLLM_ERROR("Only Qnn_TensorV2_t is supported");
return;
}

const Qnn_TensorV2_t& t = tensor->v2;

std::string tensor_type = "";

switch (t.type) {
case QNN_TENSOR_TYPE_APP_READ: tensor_type = "APP_READ"; break;
case QNN_TENSOR_TYPE_APP_WRITE: tensor_type = "APP_WRITE"; break;
case QNN_TENSOR_TYPE_NATIVE: tensor_type = "APP_NATIVE"; break;
case QNN_TENSOR_TYPE_STATIC: tensor_type = "STATIC"; break;
default: tensor_type = "UNKNOWN";
}

std::string dtype_str;
switch (t.dataType) {
case QNN_DATATYPE_INT_8: dtype_str = "INT_8"; break;
case QNN_DATATYPE_INT_16: dtype_str = "INT_16"; break;
case QNN_DATATYPE_INT_32: dtype_str = "INT_32"; break;
case QNN_DATATYPE_INT_64: dtype_str = "INT_64"; break;
case QNN_DATATYPE_UINT_8: dtype_str = "UINT_8"; break;
case QNN_DATATYPE_UINT_16: dtype_str = "UINT_16"; break;
case QNN_DATATYPE_UINT_32: dtype_str = "UINT_32"; break;
case QNN_DATATYPE_UINT_64: dtype_str = "UINT_64"; break;
case QNN_DATATYPE_FLOAT_16: dtype_str = "FLOAT_16"; break;
case QNN_DATATYPE_FLOAT_32: dtype_str = "FLOAT_32"; break;
case QNN_DATATYPE_FLOAT_64: dtype_str = "FLOAT_64"; break;
case QNN_DATATYPE_SFIXED_POINT_4: dtype_str = "SFIXED_POINT_4"; break;
case QNN_DATATYPE_SFIXED_POINT_8: dtype_str = "SFIXED_POINT_8"; break;
case QNN_DATATYPE_SFIXED_POINT_16: dtype_str = "SFIXED_POINT_16"; break;
case QNN_DATATYPE_SFIXED_POINT_32: dtype_str = "SFIXED_POINT_32"; break;
case QNN_DATATYPE_UFIXED_POINT_4: dtype_str = "UFIXED_POINT_4"; break;
case QNN_DATATYPE_UFIXED_POINT_8: dtype_str = "UFIXED_POINT_8"; break;
case QNN_DATATYPE_UFIXED_POINT_16: dtype_str = "UFIXED_POINT_16"; break;
case QNN_DATATYPE_UFIXED_POINT_32: dtype_str = "UFIXED_POINT_32"; break;
case QNN_DATATYPE_BOOL_8: dtype_str = "BOOL_8"; break;
case QNN_DATATYPE_STRING: dtype_str = "STRING"; break;
default: dtype_str = "UNKNOWN"; break;
}

std::string shape_str = "[";
for (uint32_t i = 0; i < t.rank; ++i) {
shape_str += std::to_string(t.dimensions[i]);
if (i < t.rank - 1) shape_str += ", ";
}
shape_str += "]";

std::string quant_str = "None";
if (t.quantizeParams.encodingDefinition == QNN_DEFINITION_DEFINED) {
if (t.quantizeParams.quantizationEncoding == QNN_QUANTIZATION_ENCODING_SCALE_OFFSET) {
quant_str = "Scale: " + std::to_string(t.quantizeParams.scaleOffsetEncoding.scale)
+ ", Offset: " + std::to_string(t.quantizeParams.scaleOffsetEncoding.offset);
} else if (t.quantizeParams.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
quant_str = "Axis Scale Offset (Axis: " + std::to_string(t.quantizeParams.axisScaleOffsetEncoding.axis) + ")";
} else if (t.quantizeParams.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION) {
quant_str = "Blockwise Expansion (axis:" + std::to_string(t.quantizeParams.blockwiseExpansion->axis)
+ ", blockSize:" + std::to_string(t.quantizeParams.blockwiseExpansion->numBlocksPerAxis) + ")";
}
}

MLLM_INFO("Tensor: {}, Type:{}, Shape: {}, Dtype: {}, Quant: {}", t.name, tensor_type, shape_str, dtype_str, quant_str);
}

} // namespace mllm::qnn
8 changes: 6 additions & 2 deletions mllm/backends/qnn/QNNUtils.hpp
@@ -6,7 +6,6 @@
#include "QnnTypes.h"
#include "mllm/core/Tensor.hpp"

#include <map>
#include <memory>
#include <string>
#include <vector>
@@ -89,6 +88,8 @@ bool freeQnnTensor(Qnn_Tensor_t& tensor);

bool freeQnnTensors(Qnn_Tensor_t*& tensors, uint32_t numTensors);

void __printQnnTensor(const Qnn_Tensor_t* tensor); // for debug use
⚠️ Potential issue | 🔴 Critical

Reserved identifier violation: rename the debug function.

The function name __printQnnTensor uses a double underscore prefix, which is reserved by the C++ standard. This can lead to undefined behavior or conflicts with compiler internals.

🔎 Proposed fix
-void __printQnnTensor(const Qnn_Tensor_t* tensor);  // for debug use
+void printQnnTensorDebug(const Qnn_Tensor_t* tensor);  // for debug use
📝 Committable suggestion


Suggested change
-void __printQnnTensor(const Qnn_Tensor_t* tensor);  // for debug use
+void printQnnTensorDebug(const Qnn_Tensor_t* tensor);  // for debug use
🧰 Tools
🪛 Clang (14.0.6)

[error] 91-91: declaration uses identifier '__printQnnTensor', which is a reserved identifier

(bugprone-reserved-identifier,-warnings-as-errors)

🤖 Prompt for AI Agents
In @mllm/backends/qnn/QNNUtils.hpp at line 91, The function name
__printQnnTensor uses a double-underscore prefix which is reserved; rename it to
a non-reserved identifier (e.g., printQnnTensor or debugPrintQnnTensor) in the
declaration and update the matching definition and all call sites to the new
name (ensure Qnn_Tensor_t reference stays unchanged and update any
header/implementation pairs and tests that reference __printQnnTensor).


Comment on lines +91 to +92
⚠️ Potential issue | 🔴 Critical

Reserved identifier violation: rename the debug function.

The function name __printQnnTensor uses a double underscore prefix, which is reserved by the C++ standard for compiler and standard library implementations. This can lead to undefined behavior or conflicts with compiler internals.

🔎 Proposed fix
-void __printQnnTensor(const Qnn_Tensor_t* tensor);  // for debug use
+void printQnnTensorDebug(const Qnn_Tensor_t* tensor);  // for debug use

Note: Ensure the function definition and all call sites are updated to match the new name.

📝 Committable suggestion


Suggested change
-void __printQnnTensor(const Qnn_Tensor_t* tensor);  // for debug use
+void printQnnTensorDebug(const Qnn_Tensor_t* tensor);  // for debug use
🧰 Tools
🪛 Clang (14.0.6)

[error] 91-91: declaration uses identifier '__printQnnTensor', which is a reserved identifier

(bugprone-reserved-identifier,-warnings-as-errors)

🤖 Prompt for AI Agents
In @mllm/backends/qnn/QNNUtils.hpp around lines 91 - 92, The debug function name
__printQnnTensor uses reserved leading underscores; rename the function (e.g.,
printQnnTensor or QnnPrintTensor) and update its declaration void
__printQnnTensor(const Qnn_Tensor_t* tensor), its definition, and every call
site to the new identifier so the prototype, implementation, and usages remain
consistent and avoid reserved-identifier UB.

inline void __mllmQnnLoggerCallback(const char* fmt, QnnLog_Level_t level, uint64_t times_tamp, va_list argp) {
const char* level_str = "";
const char* color_start = "";
@@ -277,9 +278,12 @@ QNNParamScalarWrapper::QNNParamScalarWrapper(const std::string& name, T value) :
if constexpr (std::is_same_v<T, bool>) {
qnnParam_.scalarParam.dataType = QNN_DATATYPE_BOOL_8;
qnnParam_.scalarParam.bool8Value = static_cast<uint8_t>(value);
} else if constexpr (std::is_same_v<T, uint32_t> || std::is_same_v<T, int32_t>) {
} else if constexpr (std::is_same_v<T, uint32_t>) {
qnnParam_.scalarParam.dataType = QNN_DATATYPE_UINT_32;
qnnParam_.scalarParam.uint32Value = static_cast<uint32_t>(value);
} else if constexpr (std::is_same_v<T, int32_t>) {
qnnParam_.scalarParam.dataType = QNN_DATATYPE_INT_32;
qnnParam_.scalarParam.int32Value = static_cast<int32_t>(value);
} else if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
qnnParam_.scalarParam.dataType = QNN_DATATYPE_FLOAT_32;
qnnParam_.scalarParam.floatValue = static_cast<float>(value);
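A side note on the QNNParamScalarWrapper change above: routing int32_t through its own INT_32 branch matters for negative scalar parameters (for example a negative axis), because declaring the value as UINT_32 makes the consumer reinterpret the bit pattern. A tiny standalone C++ sketch of that reinterpretation, independent of the QNN API (the variable name and the -1 value are illustrative assumptions):

#include <cstdint>
#include <cstdio>

int main() {
  int32_t axis = -1;  // hypothetical negative scalar parameter
  // Stored through an unsigned member and tagged UINT_32, the same bits read back as a huge unsigned value.
  uint32_t as_u32 = static_cast<uint32_t>(axis);
  std::printf("as INT_32: %d, misread as UINT_32: %u\n", axis, as_u32);
  return 0;
}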
44 changes: 28 additions & 16 deletions mllm/backends/qnn/aot/QnnWrappersAPI.cpp
@@ -31,6 +31,7 @@ QnnAOTNodeTensor::QnnAOTNodeTensor(const ir::tensor::TensorValue::ptr_t& v, bool
} else {
tensor_wrapper_ = mllm::qnn::QNNTensorWrapper::create(name, type, v->tensor_, quant);
}
setupComplexTensorQuantization(v); // per-channel and LPBQ cases
}

Qnn_TensorType_t QnnAOTNodeTensor::parseQnnTensorTypeFromIR(const ir::tensor::TensorValue::ptr_t& v) {
@@ -90,7 +91,7 @@

// Check Attribute. The Attribute priority is higher than tensor type
if (v->getAttr("qnn_graph_outputs")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; }
if (v->getAttr("qnn_graph_inputs")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READWRITE; }
if (v->getAttr("qnn_graph_inputs")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; }
if (v->getAttr("constant")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_STATIC; }

return ret_qnn_tensor_type;
@@ -109,7 +110,16 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten
auto quant_spec = v->getAttr("quant_recipe")->cast_<ir::linalg::LinalgIRQuantizatonSpecAttr>()->spec_;

switch (quant_spec->type) {
case ir::linalg::QuantizationSpecType::kRaw: {
case ir::linalg::QuantizationSpecType::kRaw:
case ir::linalg::QuantizationSpecType::kSymPerChannel:
case ir::linalg::QuantizationSpecType::kLPBQ: {
break;
}
case ir::linalg::QuantizationSpecType::kAsymPerTensor: {
auto cfg = std::static_pointer_cast<ir::linalg::QuantizationSpecAsymPerTensor>(quant_spec);
ret.encodingDefinition = QNN_DEFINITION_DEFINED;
ret.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET;
ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item<float>(), .offset = cfg->zero_point.item<int32_t>()};
break;
}
case ir::linalg::QuantizationSpecType::kSymPerTensor: {
@@ -119,6 +129,19 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten
ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item<float>(), .offset = 0};
break;
}
default: {
MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't handle kNone type");
}
}

return ret;
}

void QnnAOTNodeTensor::setupComplexTensorQuantization(const ir::tensor::TensorValue::ptr_t& v) {
MLLM_RT_ASSERT(v->getAttr("quant_recipe"));
auto quant_spec = v->getAttr("quant_recipe")->cast_<ir::linalg::LinalgIRQuantizatonSpecAttr>()->spec_;

switch (quant_spec->type) {
case ir::linalg::QuantizationSpecType::kSymPerChannel: {
auto cfg = std::static_pointer_cast<ir::linalg::QuantizationSpecSymPerChannel>(quant_spec);

@@ -135,12 +158,6 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten
tensor_wrapper_->setScaleOffsetQuantization(scale_offsets, cfg->ch_axis);
break;
}
case ir::linalg::QuantizationSpecType::kSymPerBlock:
case ir::linalg::QuantizationSpecType::kAsymPerTensor:
case ir::linalg::QuantizationSpecType::kAsymPerChannel:
case ir::linalg::QuantizationSpecType::kAsymPerBlock: {
MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't handle [kSymPerBlock, kAsymPerTensor, kAsymPerChannel, kAsymPerBlock] type");
}
case ir::linalg::QuantizationSpecType::kLPBQ: {
auto cfg = std::static_pointer_cast<ir::linalg::QuantizationSpecLPBQ>(quant_spec);

@@ -150,28 +167,23 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten
MLLM_RT_ASSERT_EQ(num_scale_offsets, cfg->scale_level_1_fp.size(0));
MLLM_RT_ASSERT_EQ(cfg->scale_level_0_int.dtype(), kUInt8);
for (int i = 0; i < num_scale_offsets; ++i) {
scale_offsets[i].scale = cfg->scale_level_1_fp.at<float>({i});
scale_offsets[i].scale = cfg->scale_level_1_fp.at<float>({i, 0, 0});
scale_offsets[i].offset = 0;
}

Qnn_BlockwiseExpansion_t blockwise_expansion;
blockwise_expansion.axis = cfg->ch_axis;
blockwise_expansion.axis = cfg->ch_axis;
blockwise_expansion.scaleOffsets = nullptr; // Will be set by setBlockwiseQuantization
blockwise_expansion.numBlocksPerAxis = v->tensor_.size(cfg->ch_axis) / cfg->block_size;
blockwise_expansion.numBlocksPerAxis = v->tensor_.size(1) / cfg->block_size;
blockwise_expansion.blockScaleBitwidth = 12; // 12 bits for 4 to 16 expansion
Comment on lines 174 to 178
⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# First, understand the structure of QuantizationSpecLPBQ and ch_axis
rg -n "ch_axis" --type cpp -B 2 -A 2 | head -80

Repository: UbiquitousLearning/mllm

Length of output: 6583


🏁 Script executed:

# Check how blockwise_expansion is used and what axis/numBlocksPerAxis mean
rg -n "blockwise_expansion" --type cpp -B 3 -A 3 | head -100

Repository: UbiquitousLearning/mllm

Length of output: 1458


🏁 Script executed:

# Look for LPBQ configuration and how ch_axis is set
rg -n "QuantizationSpecLPBQ\|kLPBQ" --type cpp -B 2 -A 5 | head -150

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Check the tensor dimensions in context of weight tensors
rg -n "v->tensor_\.size\(" --type cpp -B 2 -A 2 | head -100

Repository: UbiquitousLearning/mllm

Length of output: 1627


🏁 Script executed:

# Search for where LPBQ ch_axis is set in LLMQuantRecipePass
rg -n "QuantizationSpecLPBQ::create" --type cpp -B 5 -A 15 | head -150

Repository: UbiquitousLearning/mllm

Length of output: 4470


🏁 Script executed:

# Look for Linear layer quantization setup - typically ch_axis would be set there
rg -n "Linear\|MatMul" mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp -B 2 -A 8 | head -200

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Check what values ch_axis can have - look at initialization and usage patterns
rg -n "ch_axis\s*=" --type cpp -B 1 -A 1 | grep -v ".hpp" | head -100

Repository: UbiquitousLearning/mllm

Length of output: 81


🏁 Script executed:

# Found ch_axis = 0 at line 797. Let's verify if this is the only place LPBQ is created
rg -n "QuantizationSpecLPBQ::create" --type cpp -A 2 | head -50

Repository: UbiquitousLearning/mllm

Length of output: 831


🏁 Script executed:

# Check weight tensor dimensions - for Linear layers, shapes should be [out_features, in_features]
rg -n "shape\|size\(0\)|size\(1\)" mllm/backends/qnn/aot/QnnWrappersAPI.cpp | head -50

Repository: UbiquitousLearning/mllm

Length of output: 153


🏁 Script executed:

# Verify: does the code document what axis the blockwise quantization should be along?
rg -n "blockwise\|block_size" mllm/backends/qnn/aot/QnnWrappersAPI.cpp -B 2 -A 2 | head -80

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Let's check if there are any tests or comments explaining the relationship between ch_axis and tensor dims
rg -n "blockwise\|block_size" mllm/compile/ir/linalg/ -B 2 -A 2 | head -100

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Check the full LPBQ quantization spec to understand semantics
cat mllm/compile/ir/linalg/Attribute.hpp | grep -A 30 "struct QuantizationSpecLPBQ"

Repository: UbiquitousLearning/mllm

Length of output: 1463


🏁 Script executed:

# Let's check the relationship more carefully - look at how scale_level_0_int is used
rg -n "scale_level_0_int" --type cpp -B 2 -A 2 | head -100

Repository: UbiquitousLearning/mllm

Length of output: 3644


🏁 Script executed:

# Check dimensions of weight tensors in Linear quantization - what are size(0) and size(1)?
rg -n "Linear.*weight\|register.*weight" mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp -B 3 -A 3 | head -80

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Look for any documentation or schema about blockwise expansion axis semantics
rg -n "Qnn_BlockwiseExpansion" --type cpp -B 5 -A 10 | head -150

Repository: UbiquitousLearning/mllm

Length of output: 5137


Mismatch between blockwise expansion axis and numBlocksPerAxis calculation.

blockwise_expansion.axis is set to cfg->ch_axis (line 175), but numBlocksPerAxis is computed using hardcoded v->tensor_.size(1) (line 177). At line 165, the code correctly uses v->tensor_.size(cfg->ch_axis) for other LPBQ fields; this pattern should be consistent. If ch_axis is 0 (as set in LLMQuantRecipePass.cpp:797), then numBlocksPerAxis should use size(0), not size(1):

Suggested fix
  blockwise_expansion.axis = cfg->ch_axis;
  blockwise_expansion.scaleOffsets = nullptr;  // Will be set by setBlockwiseQuantization
- blockwise_expansion.numBlocksPerAxis = v->tensor_.size(1) / cfg->block_size;
+ blockwise_expansion.numBlocksPerAxis = v->tensor_.size(cfg->ch_axis) / cfg->block_size;
🤖 Prompt for AI Agents
In @mllm/backends/qnn/aot/QnnWrappersAPI.cpp around lines 174 - 178, The
blockwise expansion sets blockwise_expansion.axis = cfg->ch_axis but incorrectly
computes blockwise_expansion.numBlocksPerAxis using v->tensor_.size(1); change
the calculation to use the configured channel axis like the other LPBQ fields
(i.e., compute numBlocksPerAxis as v->tensor_.size(cfg->ch_axis)) so the axis
and block count are consistent when cfg->ch_axis is not 1; update the code
around Qnn_BlockwiseExpansion_t blockwise_expansion and setBlockwiseQuantization
usage to use v->tensor_.size(cfg->ch_axis).

blockwise_expansion.blockScaleStorageType = QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8;
blockwise_expansion.blocksScale8 = cfg->scale_level_0_int.ptr<mllm_uint8_t>();

tensor_wrapper_->setBlockwiseQuantization(blockwise_expansion, scale_offsets);
break;
}
default: {
MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't handle kNone type");
}
default: break;
}

return ret;
}

// QnnAOTNodeOperation implementations