10 changes: 7 additions & 3 deletions examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
@@ -336,10 +336,14 @@ class Qwen3Text final : public nn::Module {
// Quantization
x = x.to(kUInt16PerTensorAsy);

auto position_ids = inputs[1];
const auto& position_ids = inputs[1];
auto causal_mask = inputs[2];
auto llm_embedding_sin = ptq::QDQ_ROPE(this, rope_sin_(), "sin_embedding_input_qdq")[{{0}, position_ids, {kAll}}];
auto llm_embedding_cos = ptq::QDQ_ROPE(this, rope_cos_(), "cos_embedding_input_qdq")[{{0}, position_ids, {kAll}}];

auto llm_embedding_sin =
nn::functional::gather(ptq::QDQ_ROPE(this, rope_sin_(), "sin_embedding_input_qdq"), 1, position_ids);

auto llm_embedding_cos =
nn::functional::gather(ptq::QDQ_ROPE(this, rope_cos_(), "cos_embedding_input_qdq"), 1, position_ids);

std::vector<Tensor> keys;
std::vector<Tensor> values;
4 changes: 3 additions & 1 deletion mllm/backends/cpu/CPUBackend.cpp
@@ -23,6 +23,7 @@
#include "mllm/backends/cpu/ops/FlashAttention2Op.hpp"
#include "mllm/backends/cpu/ops/FlashAttn2WithSinkAndSwaOp.hpp"
#include "mllm/backends/cpu/ops/GELUOp.hpp"
#include "mllm/backends/cpu/ops/GatherOp.hpp"
#include "mllm/backends/cpu/ops/InterpolateOp.hpp"
#include "mllm/backends/cpu/ops/LayerNorm2DOp.hpp"
#include "mllm/backends/cpu/ops/MaskedScatterOp.hpp"
@@ -81,7 +82,8 @@ CPUBackend::CPUBackend() : Backend(kCPU, createCPUAllocator()) {
CPUMeanOpFactory, CPUKVCacheOpFactory, CPUPagedAttnOpFactory, CPUScatter2ShardsOpFactory, CPURadixAttnOpFactory,
CPUConv2DOpFactory, CPULayerNorm2DOpFactory, CPUInterpolateOpFactory, CPUPadOpFactory, CPUMaskedScatterOpFactory,
CPUArgsortOpFactory, CPUCloneOpFactory, CPUAvgPool1dOpFactory, CPUFlashAttention2SwaSinkOpFactory,
CPURadixAttnRelaxOpFactory, CPURadixAttnSwaSinkOpFactory, CPUEqualOpFactory, CPUWhereOpFactory>();
CPURadixAttnRelaxOpFactory, CPURadixAttnSwaSinkOpFactory, CPUEqualOpFactory, CPUWhereOpFactory,
CPUGatherOpFactory>();
}

CPUBackend::~CPUBackend() {
69 changes: 69 additions & 0 deletions mllm/backends/cpu/ops/GatherOp.cpp
@@ -0,0 +1,69 @@
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#include "mllm/backends/cpu/ops/GatherOp.hpp"
#include "mllm/core/Tensor.hpp"

namespace mllm::cpu {

CPUGatherOp::CPUGatherOp(const aops::GatherOpOptions& options) : aops::GatherOp(options) {}

void CPUGatherOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) {
auto& table = inputs[0];
auto& indices = inputs[1];
auto& output = outputs[0];

int dim = options_.dim;
if (dim < 0) dim += table.shape().size();

int64_t outer_size = 1;
for (int i = 0; i < dim; ++i) outer_size *= table.shape()[i];

int64_t inner_size = 1;
for (int i = dim + 1; i < table.shape().size(); ++i) inner_size *= table.shape()[i];

int64_t dim_size = table.shape()[dim];
int64_t indices_count = indices.numel();

size_t data_type_size = 4;
switch (table.dtype()) {
case MLLM_TYPE_F32: data_type_size = sizeof(float); break;
case MLLM_TYPE_F16: data_type_size = sizeof(mllm_fp16_t); break;
case MLLM_TYPE_I32: data_type_size = sizeof(int32_t); break;
default: MLLM_ERROR("GatherOp table type not supported: {}", (int)table.dtype());
}

const uint8_t* table_ptr = table.ptr<uint8_t>();
uint8_t* output_ptr = output.ptr<uint8_t>();

const int32_t* indices_i32 = indices.dtype() == MLLM_TYPE_I32 ? indices.ptr<int32_t>() : nullptr;
const float* indices_f32 = !indices_i32 && indices.dtype() == MLLM_TYPE_F32 ? indices.ptr<float>() : nullptr;

if (!indices_i32 && !indices_f32) {
MLLM_ERROR("GatherOp indices type not supported: {}", (int)indices.dtype());
return;
}

// FIXME: parallel
for (int64_t o = 0; o < outer_size; ++o) {
for (int64_t i = 0; i < indices_count; ++i) {
int64_t idx = 0;
if (indices_i32) {
idx = indices_i32[i];
} else if (indices_f32) {
idx = (int64_t)indices_f32[i];
}

if (idx < 0) idx += dim_size;

if (idx < 0 || idx >= dim_size) { continue; }
Comment on lines +57 to +59
⚠️ Potential issue | 🟠 Major

Silent skip of out-of-bounds indices may leave output uninitialized.

When idx < 0 || idx >= dim_size after negative index adjustment, the code silently skips the copy, leaving that portion of the output tensor uninitialized. This could result in garbage values or undefined behavior downstream.

Consider one of the following:

  1. Logging a warning/error for out-of-bounds indices.
  2. Zero-initializing the output tensor before the loop.
  3. Raising an error for invalid indices.
🔎 Proposed fix: Log warning and zero-fill
-      if (idx < 0 || idx >= dim_size) { continue; }
+      if (idx < 0 || idx >= dim_size) {
+        MLLM_WARN("GatherOp: index {} out of bounds [0, {}), filling with zeros", idx, dim_size);
+        int64_t dst_offset = (o * indices_count + i) * inner_size * data_type_size;
+        std::memset(output_ptr + dst_offset, 0, inner_size * data_type_size);
+        continue;
+      }
📝 Committable suggestion


Suggested change
-      if (idx < 0) idx += dim_size;
-      if (idx < 0 || idx >= dim_size) { continue; }
+      if (idx < 0) idx += dim_size;
+      if (idx < 0 || idx >= dim_size) {
+        MLLM_WARN("GatherOp: index {} out of bounds [0, {}), filling with zeros", idx, dim_size);
+        int64_t dst_offset = (o * indices_count + i) * inner_size * data_type_size;
+        std::memset(output_ptr + dst_offset, 0, inner_size * data_type_size);
+        continue;
+      }
🤖 Prompt for AI Agents
In @mllm/backends/cpu/ops/GatherOp.cpp around lines 57 - 59, the loop silently skips copies when
"idx < 0 || idx >= dim_size", leaving parts of the output uninitialized; change the Gather implementation
to zero-initialize the output buffer/tensor (the destination written by the gather) before the loop and,
when an index is out of bounds, either log a warning including the offending "idx" and "dim_size" or raise
an explicit error instead of continuing; ensure the zero-fill happens once before the loop and all
invalid-index cases are handled with a clear log or error.


int64_t src_offset = (o * dim_size + idx) * inner_size * data_type_size;
int64_t dst_offset = (o * indices_count + i) * inner_size * data_type_size;

std::memcpy(output_ptr + dst_offset, table_ptr + src_offset, inner_size * data_type_size);
}
}
}

} // namespace mllm::cpu
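For reference, a minimal, self-contained C++ sketch of the gather-along-dim indexing this op implements (output[o][i][*] = table[o][indices[i]][*]); it uses plain std::vector instead of the mllm Tensor API, and the shapes, values, and names are made up purely for illustration:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Toy table of shape [2, 4, 3]: outer_size = 2, dim_size = 4 (dim = 1), inner_size = 3.
  const int64_t outer_size = 2, dim_size = 4, inner_size = 3;
  std::vector<float> table(outer_size * dim_size * inner_size);
  for (size_t k = 0; k < table.size(); ++k) table[k] = static_cast<float>(k);

  // Gather rows 3 and 0 along dim 1, mirroring the int32 index path above.
  std::vector<int32_t> indices = {3, 0};
  const int64_t indices_count = static_cast<int64_t>(indices.size());
  std::vector<float> output(outer_size * indices_count * inner_size);

  for (int64_t o = 0; o < outer_size; ++o) {
    for (int64_t i = 0; i < indices_count; ++i) {
      int64_t idx = indices[i];
      if (idx < 0) idx += dim_size;              // negative indices wrap around
      if (idx < 0 || idx >= dim_size) continue;  // out-of-range indices are skipped, as in the op above
      for (int64_t j = 0; j < inner_size; ++j) {
        output[(o * indices_count + i) * inner_size + j] = table[(o * dim_size + idx) * inner_size + j];
      }
    }
  }

  // First gathered row of the first outer slice is table row 3: prints "9 10 11".
  for (int64_t j = 0; j < inner_size; ++j) std::printf("%g ", output[j]);
  std::printf("\n");
  return 0;
}

This mirrors the byte-offset arithmetic in CPUGatherOp::forward, just with element indexing instead of memcpy over raw bytes.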
25 changes: 25 additions & 0 deletions mllm/backends/cpu/ops/GatherOp.hpp
@@ -0,0 +1,25 @@
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#pragma once

#include "mllm/core/BaseOp.hpp"
#include "mllm/core/aops/GatherOp.hpp"

namespace mllm::cpu {

class CPUGatherOp final : public aops::GatherOp {
public:
explicit CPUGatherOp(const aops::GatherOpOptions& options);

void forward(const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs) override;
};

class CPUGatherOpFactory : public TypedOpFactory<OpTypes::kGather, aops::GatherOpOptions> {
public:
std::shared_ptr<BaseOp> createOpImpl(const aops::GatherOpOptions& options) override {
return std::make_shared<CPUGatherOp>(options);
}
};

} // namespace mllm::cpu
88 changes: 78 additions & 10 deletions mllm/backends/qnn/QNNUtils.cpp
@@ -5,6 +5,7 @@
#include "mllm/core/DataTypes.hpp"
#include "mllm/core/DeviceTypes.hpp"
#include "mllm/engine/Context.hpp"
#include "mllm/mllm.hpp"
#include "mllm/utils/Common.hpp"
#include "mllm/utils/Log.hpp"
#include "mllm/compile/ir/tensor/Value.hpp"
@@ -303,11 +304,11 @@ Qnn_DataType_t mllmDataTypeToQnnDataType(DataTypes dtype) {
Qnn_DataType_t ret = QNN_DATATYPE_UNDEFINED;
switch (dtype) {
case kInt8: {
ret = QNN_DATATYPE_INT_8;
ret = QNN_DATATYPE_SFIXED_POINT_8;
break;
}
case kInt16: {
ret = QNN_DATATYPE_INT_16;
ret = QNN_DATATYPE_UFIXED_POINT_16;
break;
Comment on lines 310 to 312
⚠️ Potential issue | 🔴 Critical

Critical: Incorrect type mapping for kInt16.

Line 311 maps kInt16 (signed 16-bit integer) to QNN_DATATYPE_UFIXED_POINT_16 (unsigned fixed-point 16-bit). This is a sign mismatch that will cause incorrect value interpretation and potential data corruption.

🔎 Proposed fix
     case kInt16: {
-      ret = QNN_DATATYPE_UFIXED_POINT_16;
+      ret = QNN_DATATYPE_SFIXED_POINT_16;
       break;
     }
🤖 Prompt for AI Agents
In @mllm/backends/qnn/QNNUtils.cpp around lines 310 - 312, The case handling for
kInt16 incorrectly maps the signed 16-bit integer to
QNN_DATATYPE_UFIXED_POINT_16 (unsigned); update the mapping in the switch (case
kInt16) to use the signed 16-bit fixed-point/type variant (e.g.,
QNN_DATATYPE_FIXED_POINT_16 or the correct QNN signed-16 enum value used
elsewhere in the codebase) so that kInt16 maps to the signed QNN datatype
instead of QNN_DATATYPE_UFIXED_POINT_16.

}
case kInt32: {
@@ -319,11 +320,11 @@ Qnn_DataType_t mllmDataTypeToQnnDataType(DataTypes dtype) {
break;
}
case kUInt8: {
ret = QNN_DATATYPE_UINT_8;
ret = QNN_DATATYPE_UFIXED_POINT_8;
break;
}
case kUInt16: {
ret = QNN_DATATYPE_UINT_16;
ret = QNN_DATATYPE_UFIXED_POINT_16;
break;
}
case kUInt32: {
@@ -449,7 +450,8 @@ std::shared_ptr<QNNTensorWrapper> QNNTensorWrapper::create(const std::string& na
// in this case, the tensor may be a placeholder(input/output except for graph IO)
// it will be allocated to QNN shared buffer via QNNTensorWrapper::alloc() later
MLLM_RT_ASSERT(!name.empty());
if (type != QNN_TENSOR_TYPE_STATIC) { MLLM_RT_ASSERT(tensor.device() == kQNN); }
// in AOT case, the tensor is all on CPU (TODO: handle this)
// if (type != QNN_TENSOR_TYPE_STATIC) { MLLM_RT_ASSERT(tensor.device() == kQNN); }
Comment on lines +453 to +454
⚠️ Potential issue | 🟠 Major

Add proper device validation instead of commenting out the assertion.

The device type assertion has been commented out for AOT handling, but this removes an important safety check that could catch device placement errors. Instead of removing the check entirely, add conditional logic to handle both runtime and AOT cases properly.

🔎 Proposed fix
-  // in AOT case, the tensor is all on CPU (TODO: handle this)
-  // if (type != QNN_TENSOR_TYPE_STATIC) { MLLM_RT_ASSERT(tensor.device() == kQNN); }
+  // In AOT case, tensors may be on CPU; in runtime, non-static tensors must be on QNN
+  if (type != QNN_TENSOR_TYPE_STATIC) {
+    // TODO: Replace with proper AOT detection mechanism
+    bool isAOT = (tensor.device() == kCPU);
+    if (!isAOT) {
+      MLLM_RT_ASSERT(tensor.device() == kQNN);
+    }
+  }
🤖 Prompt for AI Agents
In @mllm/backends/qnn/QNNUtils.cpp around lines 453 - 454, Replace the
commented-out device assertion with proper conditional validation: keep the
existing runtime check for non-AOT tensors by reinstating
MLLM_RT_ASSERT(tensor.device() == kQNN) when type != QNN_TENSOR_TYPE_STATIC, and
add an explicit check for the AOT/static case (e.g., assert tensor.device() ==
kCPU or allow kCPU/kQNN as appropriate) to validate expected placement; locate
the assertion around the variables type, QNN_TENSOR_TYPE_STATIC and
tensor.device() in QNNUtils.cpp and implement the conditional MLLM_RT_ASSERT
branches to handle both runtime and AOT scenarios.


Qnn_DataType_t dataType = mllmDataTypeToQnnDataType(tensor.dtype());

@@ -467,11 +469,6 @@ std::shared_ptr<QNNTensorWrapper> QNNTensorWrapper::createStaticTensor(const std
Qnn_QuantizeParams_t quantize) {
MLLM_RT_ASSERT(!name.empty() && tensor.rank() > 0 && !tensor.isNil());

// mllm currently support float16/float32/sfixed8(int8) as static tensor (weight) data type
// uint8 and int32 is caused by QNNLinear which uses Conv2d
MLLM_RT_ASSERT(tensor.dtype() == kFloat16 || tensor.dtype() == kFloat32 || tensor.dtype() == kInt8 || tensor.dtype() == kUInt8
|| tensor.dtype() == kInt32);

std::shared_ptr<QNNTensorWrapper> tensorWrapper = QNNTensorWrapper::create(name, QNN_TENSOR_TYPE_STATIC, tensor, quantize);

tensorWrapper->isAlloc_ = true;
@@ -618,4 +615,75 @@ void propagateQuantScale(const Tensor& input, Tensor& output) {
}
}

void __printQnnTensor(const Qnn_Tensor_t* tensor) {
⚠️ Potential issue | 🔴 Critical

Fix reserved identifier violation in function name.

The function name __printQnnTensor uses a leading double underscore, which is reserved for implementation use in C++. This violates the C++ standard and could conflict with compiler or standard library identifiers.

🔎 Proposed fix
-void __printQnnTensor(const Qnn_Tensor_t* tensor) {
+void printQnnTensor(const Qnn_Tensor_t* tensor) {

Don't forget to update the header declaration in mllm/backends/qnn/QNNUtils.hpp as well.

Committable suggestion skipped: line range outside the PR's diff.

🧰 Tools
🪛 Clang (14.0.6)

[error] 618-618: declaration uses identifier '__printQnnTensor', which is a reserved identifier

(bugprone-reserved-identifier,-warnings-as-errors)

🤖 Prompt for AI Agents
In @mllm/backends/qnn/QNNUtils.cpp at line 618, The function name uses a
reserved leading double underscore (__printQnnTensor); rename it to a
non-reserved identifier (e.g., printQnnTensor or Qnn_PrintTensor) and update its
declaration in qnnutils.hpp and any call sites to the new name, ensuring
linkage/extern "C" attributes remain unchanged; also run a project-wide search
for __printQnnTensor to fix all references and rebuild to verify no symbol
mismatches.

if (tensor == nullptr) {
MLLM_ERROR("Tensor is null");
return;
}
if (tensor->version != QNN_TENSOR_VERSION_2) {
MLLM_ERROR("Only Qnn_TensorV2_t is supported");
return;
}

const Qnn_TensorV2_t& t = tensor->v2;

std::string tensor_type = "";

switch (t.type) {
case QNN_TENSOR_TYPE_APP_READ: tensor_type = "APP_READ"; break;
case QNN_TENSOR_TYPE_APP_WRITE: tensor_type = "APP_WRITE"; break;
case QNN_TENSOR_TYPE_NATIVE: tensor_type = "APP_NATIVE"; break;
case QNN_TENSOR_TYPE_STATIC: tensor_type = "STATIC"; break;
default: tensor_type = "UNKNOWN";
}

std::string dtype_str;
switch (t.dataType) {
case QNN_DATATYPE_INT_8: dtype_str = "INT_8"; break;
case QNN_DATATYPE_INT_16: dtype_str = "INT_16"; break;
case QNN_DATATYPE_INT_32: dtype_str = "INT_32"; break;
case QNN_DATATYPE_INT_64: dtype_str = "INT_64"; break;
case QNN_DATATYPE_UINT_8: dtype_str = "UINT_8"; break;
case QNN_DATATYPE_UINT_16: dtype_str = "UINT_16"; break;
case QNN_DATATYPE_UINT_32: dtype_str = "UINT_32"; break;
case QNN_DATATYPE_UINT_64: dtype_str = "UINT_64"; break;
case QNN_DATATYPE_FLOAT_16: dtype_str = "FLOAT_16"; break;
case QNN_DATATYPE_FLOAT_32: dtype_str = "FLOAT_32"; break;
case QNN_DATATYPE_FLOAT_64: dtype_str = "FLOAT_64"; break;
case QNN_DATATYPE_SFIXED_POINT_4: dtype_str = "SFIXED_POINT_4"; break;
case QNN_DATATYPE_SFIXED_POINT_8: dtype_str = "SFIXED_POINT_8"; break;
case QNN_DATATYPE_SFIXED_POINT_16: dtype_str = "SFIXED_POINT_16"; break;
case QNN_DATATYPE_SFIXED_POINT_32: dtype_str = "SFIXED_POINT_32"; break;
case QNN_DATATYPE_UFIXED_POINT_4: dtype_str = "UFIXED_POINT_4"; break;
case QNN_DATATYPE_UFIXED_POINT_8: dtype_str = "UFIXED_POINT_8"; break;
case QNN_DATATYPE_UFIXED_POINT_16: dtype_str = "UFIXED_POINT_16"; break;
case QNN_DATATYPE_UFIXED_POINT_32: dtype_str = "UFIXED_POINT_32"; break;
case QNN_DATATYPE_BOOL_8: dtype_str = "BOOL_8"; break;
case QNN_DATATYPE_STRING: dtype_str = "STRING"; break;
default: dtype_str = "UNKNOWN"; break;
}

std::string shape_str = "[";
for (uint32_t i = 0; i < t.rank; ++i) {
shape_str += std::to_string(t.dimensions[i]);
if (i < t.rank - 1) shape_str += ", ";
}
shape_str += "]";

std::string quant_str = "None";
if (t.quantizeParams.encodingDefinition == QNN_DEFINITION_DEFINED) {
if (t.quantizeParams.quantizationEncoding == QNN_QUANTIZATION_ENCODING_SCALE_OFFSET) {
quant_str = "Scale: " + std::to_string(t.quantizeParams.scaleOffsetEncoding.scale)
+ ", Offset: " + std::to_string(t.quantizeParams.scaleOffsetEncoding.offset);
} else if (t.quantizeParams.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
quant_str = "Axis Scale Offset (Axis: " + std::to_string(t.quantizeParams.axisScaleOffsetEncoding.axis) + ")";
} else if (t.quantizeParams.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION) {
quant_str = "Blockwise Expansion (axis:" + std::to_string(t.quantizeParams.blockwiseExpansion->axis)
+ ", blockSize:" + std::to_string(t.quantizeParams.blockwiseExpansion->numBlocksPerAxis) + ")";
}
}

MLLM_INFO("Tensor: {}, Type:{}, Shape: {}, Dtype: {}, Quant: {}", t.name, tensor_type, shape_str, dtype_str, quant_str);
}

} // namespace mllm::qnn
8 changes: 6 additions & 2 deletions mllm/backends/qnn/QNNUtils.hpp
@@ -6,7 +6,6 @@
#include "QnnTypes.h"
#include "mllm/core/Tensor.hpp"

#include <map>
#include <memory>
#include <string>
#include <vector>
@@ -89,6 +88,8 @@ bool freeQnnTensor(Qnn_Tensor_t& tensor);

bool freeQnnTensors(Qnn_Tensor_t*& tensors, uint32_t numTensors);

void __printQnnTensor(const Qnn_Tensor_t* tensor); // for debug use
⚠️ Potential issue | 🔴 Critical

Reserved identifier violation: rename the debug function.

The function name __printQnnTensor uses a double underscore prefix, which is reserved by the C++ standard. This can lead to undefined behavior or conflicts with compiler internals.

🔎 Proposed fix
-void __printQnnTensor(const Qnn_Tensor_t* tensor);  // for debug use
+void printQnnTensorDebug(const Qnn_Tensor_t* tensor);  // for debug use
📝 Committable suggestion


Suggested change
-void __printQnnTensor(const Qnn_Tensor_t* tensor);  // for debug use
+void printQnnTensorDebug(const Qnn_Tensor_t* tensor);  // for debug use
🧰 Tools
🪛 Clang (14.0.6)

[error] 91-91: declaration uses identifier '__printQnnTensor', which is a reserved identifier

(bugprone-reserved-identifier,-warnings-as-errors)

🤖 Prompt for AI Agents
In @mllm/backends/qnn/QNNUtils.hpp at line 91, The function name
__printQnnTensor uses a double-underscore prefix which is reserved; rename it to
a non-reserved identifier (e.g., printQnnTensor or debugPrintQnnTensor) in the
declaration and update the matching definition and all call sites to the new
name (ensure Qnn_Tensor_t reference stays unchanged and update any
header/implementation pairs and tests that reference __printQnnTensor).


Comment on lines +91 to +92
⚠️ Potential issue | 🔴 Critical

Reserved identifier violation: rename the debug function.

The function name __printQnnTensor uses a double underscore prefix, which is reserved by the C++ standard for compiler and standard library implementations. This can lead to undefined behavior or conflicts with compiler internals.

🔎 Proposed fix
-void __printQnnTensor(const Qnn_Tensor_t* tensor);  // for debug use
+void printQnnTensorDebug(const Qnn_Tensor_t* tensor);  // for debug use

Note: Ensure the function definition and all call sites are updated to match the new name.

📝 Committable suggestion


Suggested change
-void __printQnnTensor(const Qnn_Tensor_t* tensor);  // for debug use
+void printQnnTensorDebug(const Qnn_Tensor_t* tensor);  // for debug use
🧰 Tools
🪛 Clang (14.0.6)

[error] 91-91: declaration uses identifier '__printQnnTensor', which is a reserved identifier

(bugprone-reserved-identifier,-warnings-as-errors)

🤖 Prompt for AI Agents
In @mllm/backends/qnn/QNNUtils.hpp around lines 91 - 92, The debug function name
__printQnnTensor uses reserved leading underscores; rename the function (e.g.,
printQnnTensor or QnnPrintTensor) and update its declaration void
__printQnnTensor(const Qnn_Tensor_t* tensor), its definition, and every call
site to the new identifier so the prototype, implementation, and usages remain
consistent and avoid reserved-identifier UB.

inline void __mllmQnnLoggerCallback(const char* fmt, QnnLog_Level_t level, uint64_t times_tamp, va_list argp) {
const char* level_str = "";
const char* color_start = "";
@@ -277,9 +278,12 @@ QNNParamScalarWrapper::QNNParamScalarWrapper(const std::string& name, T value) :
if constexpr (std::is_same_v<T, bool>) {
qnnParam_.scalarParam.dataType = QNN_DATATYPE_BOOL_8;
qnnParam_.scalarParam.bool8Value = static_cast<uint8_t>(value);
} else if constexpr (std::is_same_v<T, uint32_t> || std::is_same_v<T, int32_t>) {
} else if constexpr (std::is_same_v<T, uint32_t>) {
qnnParam_.scalarParam.dataType = QNN_DATATYPE_UINT_32;
qnnParam_.scalarParam.uint32Value = static_cast<uint32_t>(value);
} else if constexpr (std::is_same_v<T, int32_t>) {
qnnParam_.scalarParam.dataType = QNN_DATATYPE_INT_32;
qnnParam_.scalarParam.int32Value = static_cast<int32_t>(value);
} else if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
qnnParam_.scalarParam.dataType = QNN_DATATYPE_FLOAT_32;
qnnParam_.scalarParam.floatValue = static_cast<float>(value);
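A side note on the QNNParamScalarWrapper change above: routing int32_t through its own INT_32 branch matters for negative scalar parameters (for example a negative axis), because declaring the value as UINT_32 makes the consumer reinterpret the bit pattern. A tiny standalone C++ sketch of that reinterpretation, independent of the QNN API (the variable name and the -1 value are illustrative assumptions):

#include <cstdint>
#include <cstdio>

int main() {
  int32_t axis = -1;  // hypothetical negative scalar parameter
  // Stored through an unsigned member and tagged UINT_32, the same bits read back as a huge unsigned value.
  uint32_t as_u32 = static_cast<uint32_t>(axis);
  std::printf("as INT_32: %d, misread as UINT_32: %u\n", axis, as_u32);
  return 0;
}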
44 changes: 28 additions & 16 deletions mllm/backends/qnn/aot/QnnWrappersAPI.cpp
@@ -31,6 +31,7 @@ QnnAOTNodeTensor::QnnAOTNodeTensor(const ir::tensor::TensorValue::ptr_t& v, bool
} else {
tensor_wrapper_ = mllm::qnn::QNNTensorWrapper::create(name, type, v->tensor_, quant);
}
setupComplexTensorQuantization(v); // per-channel and LPBQ cases
}

Qnn_TensorType_t QnnAOTNodeTensor::parseQnnTensorTypeFromIR(const ir::tensor::TensorValue::ptr_t& v) {
@@ -90,7 +91,7 @@

// Check Attribute. The Attribute priority is higher than tensor type
if (v->getAttr("qnn_graph_outputs")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; }
if (v->getAttr("qnn_graph_inputs")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_READWRITE; }
if (v->getAttr("qnn_graph_inputs")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; }
if (v->getAttr("constant")) { ret_qnn_tensor_type = QNN_TENSOR_TYPE_STATIC; }

return ret_qnn_tensor_type;
@@ -109,7 +110,16 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten
auto quant_spec = v->getAttr("quant_recipe")->cast_<ir::linalg::LinalgIRQuantizatonSpecAttr>()->spec_;

switch (quant_spec->type) {
case ir::linalg::QuantizationSpecType::kRaw: {
case ir::linalg::QuantizationSpecType::kRaw:
case ir::linalg::QuantizationSpecType::kSymPerChannel:
case ir::linalg::QuantizationSpecType::kLPBQ: {
break;
}
case ir::linalg::QuantizationSpecType::kAsymPerTensor: {
auto cfg = std::static_pointer_cast<ir::linalg::QuantizationSpecAsymPerTensor>(quant_spec);
ret.encodingDefinition = QNN_DEFINITION_DEFINED;
ret.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET;
ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item<float>(), .offset = cfg->zero_point.item<int32_t>()};
break;
}
case ir::linalg::QuantizationSpecType::kSymPerTensor: {
@@ -119,6 +129,19 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten
ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item<float>(), .offset = 0};
break;
}
default: {
MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't handle kNone type");
}
}

return ret;
}

void QnnAOTNodeTensor::setupComplexTensorQuantization(const ir::tensor::TensorValue::ptr_t& v) {
MLLM_RT_ASSERT(v->getAttr("quant_recipe"));
auto quant_spec = v->getAttr("quant_recipe")->cast_<ir::linalg::LinalgIRQuantizatonSpecAttr>()->spec_;

switch (quant_spec->type) {
case ir::linalg::QuantizationSpecType::kSymPerChannel: {
auto cfg = std::static_pointer_cast<ir::linalg::QuantizationSpecSymPerChannel>(quant_spec);

@@ -135,12 +158,6 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten
tensor_wrapper_->setScaleOffsetQuantization(scale_offsets, cfg->ch_axis);
break;
}
case ir::linalg::QuantizationSpecType::kSymPerBlock:
case ir::linalg::QuantizationSpecType::kAsymPerTensor:
case ir::linalg::QuantizationSpecType::kAsymPerChannel:
case ir::linalg::QuantizationSpecType::kAsymPerBlock: {
MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't handle [kSymPerBlock, kAsymPerTensor, kAsymPerChannel, kAsymPerBlock] type");
}
case ir::linalg::QuantizationSpecType::kLPBQ: {
auto cfg = std::static_pointer_cast<ir::linalg::QuantizationSpecLPBQ>(quant_spec);

@@ -150,28 +167,23 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten
MLLM_RT_ASSERT_EQ(num_scale_offsets, cfg->scale_level_1_fp.size(0));
MLLM_RT_ASSERT_EQ(cfg->scale_level_0_int.dtype(), kUInt8);
for (int i = 0; i < num_scale_offsets; ++i) {
scale_offsets[i].scale = cfg->scale_level_1_fp.at<float>({i});
scale_offsets[i].scale = cfg->scale_level_1_fp.at<float>({i, 0, 0});
scale_offsets[i].offset = 0;
}

Qnn_BlockwiseExpansion_t blockwise_expansion;
blockwise_expansion.axis = cfg->ch_axis;
blockwise_expansion.axis = cfg->ch_axis;
blockwise_expansion.scaleOffsets = nullptr; // Will be set by setBlockwiseQuantization
blockwise_expansion.numBlocksPerAxis = v->tensor_.size(cfg->ch_axis) / cfg->block_size;
blockwise_expansion.numBlocksPerAxis = v->tensor_.size(1) / cfg->block_size;
blockwise_expansion.blockScaleBitwidth = 12; // 12 bits for 4 to 16 expansion
Comment on lines 174 to 178
⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# First, understand the structure of QuantizationSpecLPBQ and ch_axis
rg -n "ch_axis" --type cpp -B 2 -A 2 | head -80

Repository: UbiquitousLearning/mllm

Length of output: 6583


🏁 Script executed:

# Check how blockwise_expansion is used and what axis/numBlocksPerAxis mean
rg -n "blockwise_expansion" --type cpp -B 3 -A 3 | head -100

Repository: UbiquitousLearning/mllm

Length of output: 1458


🏁 Script executed:

# Look for LPBQ configuration and how ch_axis is set
rg -n "QuantizationSpecLPBQ\|kLPBQ" --type cpp -B 2 -A 5 | head -150

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Check the tensor dimensions in context of weight tensors
rg -n "v->tensor_\.size\(" --type cpp -B 2 -A 2 | head -100

Repository: UbiquitousLearning/mllm

Length of output: 1627


🏁 Script executed:

# Search for where LPBQ ch_axis is set in LLMQuantRecipePass
rg -n "QuantizationSpecLPBQ::create" --type cpp -B 5 -A 15 | head -150

Repository: UbiquitousLearning/mllm

Length of output: 4470


🏁 Script executed:

# Look for Linear layer quantization setup - typically ch_axis would be set there
rg -n "Linear\|MatMul" mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp -B 2 -A 8 | head -200

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Check what values ch_axis can have - look at initialization and usage patterns
rg -n "ch_axis\s*=" --type cpp -B 1 -A 1 | grep -v ".hpp" | head -100

Repository: UbiquitousLearning/mllm

Length of output: 81


🏁 Script executed:

# Found ch_axis = 0 at line 797. Let's verify if this is the only place LPBQ is created
rg -n "QuantizationSpecLPBQ::create" --type cpp -A 2 | head -50

Repository: UbiquitousLearning/mllm

Length of output: 831


🏁 Script executed:

# Check weight tensor dimensions - for Linear layers, shapes should be [out_features, in_features]
rg -n "shape\|size\(0\)|size\(1\)" mllm/backends/qnn/aot/QnnWrappersAPI.cpp | head -50

Repository: UbiquitousLearning/mllm

Length of output: 153


🏁 Script executed:

# Verify: does the code document what axis the blockwise quantization should be along?
rg -n "blockwise\|block_size" mllm/backends/qnn/aot/QnnWrappersAPI.cpp -B 2 -A 2 | head -80

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Let's check if there are any tests or comments explaining the relationship between ch_axis and tensor dims
rg -n "blockwise\|block_size" mllm/compile/ir/linalg/ -B 2 -A 2 | head -100

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Check the full LPBQ quantization spec to understand semantics
cat mllm/compile/ir/linalg/Attribute.hpp | grep -A 30 "struct QuantizationSpecLPBQ"

Repository: UbiquitousLearning/mllm

Length of output: 1463


🏁 Script executed:

# Let's check the relationship more carefully - look at how scale_level_0_int is used
rg -n "scale_level_0_int" --type cpp -B 2 -A 2 | head -100

Repository: UbiquitousLearning/mllm

Length of output: 3644


🏁 Script executed:

# Check dimensions of weight tensors in Linear quantization - what are size(0) and size(1)?
rg -n "Linear.*weight\|register.*weight" mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp -B 3 -A 3 | head -80

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Look for any documentation or schema about blockwise expansion axis semantics
rg -n "Qnn_BlockwiseExpansion" --type cpp -B 5 -A 10 | head -150

Repository: UbiquitousLearning/mllm

Length of output: 5137


Mismatch between blockwise expansion axis and numBlocksPerAxis calculation.

blockwise_expansion.axis is set to cfg->ch_axis (line 175), but numBlocksPerAxis is computed using hardcoded v->tensor_.size(1) (line 177). At line 165, the code correctly uses v->tensor_.size(cfg->ch_axis) for other LPBQ fields; this pattern should be consistent. If ch_axis is 0 (as set in LLMQuantRecipePass.cpp:797), then numBlocksPerAxis should use size(0), not size(1):

Suggested fix
  blockwise_expansion.axis = cfg->ch_axis;
  blockwise_expansion.scaleOffsets = nullptr;  // Will be set by setBlockwiseQuantization
- blockwise_expansion.numBlocksPerAxis = v->tensor_.size(1) / cfg->block_size;
+ blockwise_expansion.numBlocksPerAxis = v->tensor_.size(cfg->ch_axis) / cfg->block_size;
🤖 Prompt for AI Agents
In @mllm/backends/qnn/aot/QnnWrappersAPI.cpp around lines 174 - 178, The
blockwise expansion sets blockwise_expansion.axis = cfg->ch_axis but incorrectly
computes blockwise_expansion.numBlocksPerAxis using v->tensor_.size(1); change
the calculation to use the configured channel axis like the other LPBQ fields
(i.e., compute numBlocksPerAxis as v->tensor_.size(cfg->ch_axis)) so the axis
and block count are consistent when cfg->ch_axis is not 1; update the code
around Qnn_BlockwiseExpansion_t blockwise_expansion and setBlockwiseQuantization
usage to use v->tensor_.size(cfg->ch_axis).

blockwise_expansion.blockScaleStorageType = QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8;
blockwise_expansion.blocksScale8 = cfg->scale_level_0_int.ptr<mllm_uint8_t>();

tensor_wrapper_->setBlockwiseQuantization(blockwise_expansion, scale_offsets);
break;
}
default: {
MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't handle kNone type");
}
default: break;
}

return ret;
}

// QnnAOTNodeOperation implementations