2 changes: 1 addition & 1 deletion examples/CMakeLists.txt
@@ -17,6 +17,6 @@ if(MLLM_TRACY_ENABLE)
add_subdirectory(tracy_example)
endif()

if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE OR MLLM_BUILD_QNN_BACKEND)
add_subdirectory(qwen3_qnn_aot)
endif()
13 changes: 10 additions & 3 deletions examples/qwen3_qnn_aot/CMakeLists.txt
@@ -1,3 +1,10 @@
add_executable(mllm-qwen3-aot-c compile.cpp)
target_link_libraries(mllm-qwen3-aot-c PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
target_include_directories(mllm-qwen3-aot-c PRIVATE ${MLLM_INCLUDE_DIR})
# AOT targets run on x86
if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
add_executable(mllm-qwen3-aot-c compile.cpp)
target_link_libraries(mllm-qwen3-aot-c PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
target_include_directories(mllm-qwen3-aot-c PRIVATE ${MLLM_INCLUDE_DIR})
endif()

add_executable(mllm-qwen3-aot-runner aot_run.cpp)
target_link_libraries(mllm-qwen3-aot-runner PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
target_include_directories(mllm-qwen3-aot-runner PRIVATE ${MLLM_INCLUDE_DIR})
64 changes: 64 additions & 0 deletions examples/qwen3_qnn_aot/aot_run.cpp
@@ -0,0 +1,64 @@
#include <iostream>
#include <fmt/core.h>
#include <mllm/mllm.hpp>
#include <string>
#include "mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp"
#include "mllm/models/qwen3/configuration_qwen3.hpp"
#include "mllm/models/qwen3/tokenization_qwen3.hpp"

using mllm::Argparse;
using namespace mllm::qnn::aot; // NOLINT

MLLM_MAIN({
auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
auto& model_path = Argparse::add<std::string>("-m|--model").help("Model path").def("qwen3_qnn.mllm");
auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer").help("Tokenizer path").def("tokenizer.json");
auto& config_path = Argparse::add<std::string>("-c|--config").help("Config path").required(true);
auto& temperature = Argparse::add<float>("--temperature").help("Temperature").def(0.8f);
auto& ar_len = Argparse::add<int>("--ar_len").help("Autoregressive length (chunk size)").def(128);

Argparse::parse(argc, argv);

mllm::initQnnBackend(model_path.get());

if (help.isSet()) {
Argparse::printHelp();
return 0;
}

auto qwen3_cfg = mllm::models::qwen3::Qwen3Config(config_path.get());

RunnerConfig config;
config.model_path = model_path.get();
config.temperature = temperature.get();
config.num_layers = qwen3_cfg.num_hidden_layers;
config.num_heads = qwen3_cfg.num_attention_heads;
config.head_dim = qwen3_cfg.head_dim;
config.vocab_size = qwen3_cfg.vocab_size;
config.context_len = 1024;
config.ar_len = ar_len.get();

auto tokenizer = mllm::models::qwen3::Qwen3Tokenizer(tokenizer_path.get());

std::string prompt_text;
fmt::print("💬 Prompt text (or 'exit/quit'): ");
std::getline(std::cin, prompt_text);

auto input_tensor = tokenizer.convertMessage({.prompt = prompt_text});

Runner runner(config, &tokenizer);
if (!runner.load()) {
std::cerr << "Failed to load model\n";
return 1;
}

std::vector<uint64_t> prompt_tokens;
auto sequence = input_tensor["sequence"];
int64_t* ptr = sequence.ptr<int64_t>();
for (int i = 0; i < sequence.shape()[1]; ++i) { prompt_tokens.push_back((uint64_t)ptr[i]); }

runner.generate(prompt_tokens, config.context_len, [](const std::string& token) { std::cout << token << std::flush; });
std::cout << "\n";

return 0;
});
4 changes: 2 additions & 2 deletions examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
@@ -242,7 +242,7 @@ class Qwen3Attention final : public nn::Module {
"k_rope_add_0_output_qdq");

// De-quantization and quantization again
key_states = key_states.to(kFloat16);
key_states = key_states.to(kFloat32);
key_states = key_states.to(kUInt8PerTensorSym);
key_states = ptq::QDQ_KV(this, key_states, "k_cast_to_int8_qdq");

@@ -251,7 +251,7 @@

// Handle KV Cache
value_states = ptq::QDQ(this, value_states, "v_cast_to_int16_qdq");
value_states = value_states.to(kFloat16);
value_states = value_states.to(kFloat32);
value_states = value_states.to(kUInt8PerTensorSym);
value_states = ptq::QDQ_KV(this, value_states, "v_cast_to_int8_qdq");

6 changes: 6 additions & 0 deletions mllm/backends/qnn/CMakeLists.txt
@@ -21,6 +21,12 @@ if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
list(APPEND MLLM_QNN_SRC ${MLLM_QUALCOMM_AOT_SRC})
endif()

file(GLOB_RECURSE MLLM_QUALCOMM_AOT_RT_SRC
${CMAKE_CURRENT_LIST_DIR}/aot_rt/*.hpp
${CMAKE_CURRENT_LIST_DIR}/aot_rt/*.cpp
)
list(APPEND MLLM_QNN_SRC ${MLLM_QUALCOMM_AOT_RT_SRC})

add_library(
MllmQNNBackend
SHARED
149 changes: 24 additions & 125 deletions mllm/backends/qnn/QNNBackend.cpp
@@ -55,24 +55,6 @@ QNNBackend::QNNBackend() : Backend(kQNN, createQNNAllocator()) {
MLLM_INFO("QNN backend supports early termination");
}

bool contextStatus = false;
// check if the qnn_context.bin file exists
if (!std::filesystem::exists("qnn_context.bin")) {
contextStatus = runtime_->createContext(context_, nullptr);
} else {
contextStatus = runtime_->retrieveContext(context_, qnnModels_, nullptr);

// fill qnnModelIndexMap_ info according to qnnModels_
for (size_t i = 0; i < qnnModels_.size(); i++) {
auto graphName = qnnModels_[i]->getQnnGraphName();
qnnModelIndexMap_.insert(std::make_pair(graphName, i));
}
}
if (!contextStatus) { MLLM_ERROR_EXIT(1, "Failed to create QNN context"); }

// init QNN Allocator
static_pointer_cast<QNNAllocator>(allocator_)->setQNNPointer(runtime_->qnnInterface, context_);

// set performance parameters for better performance on HTP
perf_ = QNNPerf::create(&runtime_->qnnInterface);
perf_->setPowerConfigBurst();
@@ -348,10 +330,10 @@ bool QNNRuntime::createContext(Qnn_ContextHandle_t& context, QnnContext_Config_t
return true;
}

bool QNNRuntime::retrieveContext(Qnn_ContextHandle_t& context, std::vector<std::shared_ptr<QNNModel>>& qnnModels,
QnnContext_Config_t** contextConfig) {
bool QNNRuntime::retrieveContext(const std::string& contextBinaryPath, Qnn_ContextHandle_t& context,
std::vector<std::shared_ptr<QNNModel>>& qnnModels, QnnContext_Config_t** contextConfig) {
// Read the binary from qnn_context.bin and get the size in byte
std::ifstream file(QNN_Context_File, std::ios::binary | std::ios::ate);
std::ifstream file(contextBinaryPath, std::ios::binary | std::ios::ate);
std::streamsize size = file.tellg();
file.seekg(0, std::ios::beg);

@@ -436,6 +418,25 @@ bool QNNRuntime::retrieveContext(Qnn_ContextHandle_t& context, std::vector<std::
return true;
}

bool QNNBackend::createContext() {
if (!runtime_->createContext(context_, nullptr)) { return false; }
// init QNN Allocator
static_pointer_cast<QNNAllocator>(allocator_)->setQNNPointer(runtime_->qnnInterface, context_);
return true;
}

bool QNNBackend::loadContext(const std::string& contextPath) {
if (!runtime_->retrieveContext(contextPath, context_, qnnModels_, nullptr)) { return false; }
// fill qnnModelIndexMap_ info according to qnnModels_
for (size_t i = 0; i < qnnModels_.size(); i++) {
auto graphName = qnnModels_[i]->getQnnGraphName();
qnnModelIndexMap_.insert(std::make_pair(graphName, i));
}
// init QNN Allocator
static_pointer_cast<QNNAllocator>(allocator_)->setQNNPointer(runtime_->qnnInterface, context_);
return true;
}

std::shared_ptr<QNNModel> QNNBackend::createQnnGraph(const std::string& graphName) {
// If the graph already exists, return the existing model
if (qnnModelIndexMap_.find(graphName) != qnnModelIndexMap_.end()) {
@@ -535,8 +536,6 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&
return;
}

// Prepare QNN input tensors by copying data from runtime inputs to graph input wrappers
// This handles the case where input tensor sizes may differ between prefill and decode phases
std::vector<Qnn_Tensor_t> qnn_inputs;
std::vector<Qnn_Tensor_t> qnn_outputs;
for (int i = 0; i < model->getGraphInputTensorWrappers().size(); i++) {
@@ -550,52 +549,8 @@
return;
}

if (wrapper_tensor.isNil()) {
MLLM_ERROR("Graph input wrapper {} for graph '{}' has no backing tensor", i, graphName);
return;
}

// Check for size mismatches (can occur in decode phase where inputs may be smaller)
size_t dst_bytes = wrapper_tensor.bytes();
size_t src_bytes = runtime_input.bytes();
if (dst_bytes != src_bytes) {
MLLM_WARN("Graph '{}' input tensor {} byte-size mismatch: wrapper={} bytes, runtime input={} bytes. Copying "
"min(dst, src), but this may truncate data.",
graphName, i, dst_bytes, src_bytes);
}

if (dst_bytes > 0) {
void* dst_ptr = wrapper_tensor.ptr<void>();
if (!dst_ptr) {
wrapper_tensor.alloc();
dst_ptr = wrapper_tensor.ptr<void>();
}

const void* src_ptr = runtime_input.ptr<void>();
size_t bytes_to_copy = std::min(dst_bytes, src_bytes);
if (!src_ptr) {
MLLM_ERROR("Runtime input tensor {} for graph '{}' has null data pointer", i, graphName);
return;
}
if (dst_ptr && src_ptr && dst_ptr != src_ptr) {
// Copy source data to destination buffer
// This ensures that the graph input wrapper has the correct data for execution
if (bytes_to_copy > 0) { std::memcpy(dst_ptr, src_ptr, bytes_to_copy); }

// If source is smaller than destination, zero out the remaining bytes
// This is important for decode phase where input tensors may be smaller than prefill
// For example, decode phase may use [1, 1] input while wrapper expects [1, 128]
// Note: In current implementation with full [1, 128] tensor, this should not trigger
// but it's kept as a safety measure for future optimizations
if (src_bytes < dst_bytes) {
size_t remaining_bytes = dst_bytes - src_bytes;
std::memset(static_cast<char*>(dst_ptr) + bytes_to_copy, 0, remaining_bytes);
// Only log if zero-padding actually occurs (unexpected case)
MLLM_WARN("[QNN graphExecute] Graph '{}' input tensor {}: zero-padded {} bytes (src={} bytes, dst={} bytes)",
graphName, i, remaining_bytes, src_bytes, dst_bytes);
}
}
}
// input wrapper is empty, set wrapper's dataContainer(mllm::Tensor)
if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_input); }
Comment on lines +552 to +553

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Search for usage patterns of __setDataContainer and isAlloc
rg -n -C3 '__setDataContainer|isAlloc' --type=cpp

Repository: UbiquitousLearning/mllm

Length of output: 3427


🏁 Script executed:

#!/bin/bash
# Get broader context around line 552-553 in QNNBackend.cpp
sed -n '520,580p' mllm/backends/qnn/QNNBackend.cpp

echo "---"
# Search for wrapper creation/reuse patterns
rg -n 'wrapper\s*=' mllm/backends/qnn/QNNBackend.cpp | head -20

Repository: UbiquitousLearning/mllm

Length of output: 2855


🏁 Script executed:

#!/bin/bash
# Search for wrapper lifecycle - how are input wrappers stored and reused?
rg -n 'getGraphInputTensorWrappers' mllm/backends/qnn/ --type=cpp

echo "---"
# Search for reset/cleanup methods in QNNTensorWrapper
rg -n 'reset|clear|cleanup' mllm/backends/qnn/QNNUtils.hpp --type=hpp

Repository: UbiquitousLearning/mllm

Length of output: 827


🏁 Script executed:

#!/bin/bash
# Search for reset/cleanup methods in QNNTensorWrapper without type specification
rg -n 'reset|clear|cleanup' mllm/backends/qnn/QNNUtils.hpp

echo "---"
# Look at the full QNNTensorWrapper class definition
sed -n '180,230p' mllm/backends/qnn/QNNUtils.hpp

Repository: UbiquitousLearning/mllm

Length of output: 2348


🏁 Script executed:

#!/bin/bash
# Check how wrappers are created and stored in the model
rg -n 'inputTensorWrappers_' mllm/backends/qnn/QNNModel.cpp -A2 -B2 | head -40

echo "---"
# Check if there's any mechanism to reset or recreate wrappers between executions
rg -n 'graphExecute' mllm/backends/qnn/QNNBackend.cpp | head -5

Repository: UbiquitousLearning/mllm

Length of output: 1164


🏁 Script executed:

#!/bin/bash
# Understand the alloc() method and whether it updates data
sed -n '484,496p' mllm/backends/qnn/QNNUtils.cpp

echo "---"
# Check the create method to understand initial wrapper state
rg -n 'QNNTensorWrapper::create' mllm/backends/qnn/QNNUtils.cpp -A15 | head -40

Repository: UbiquitousLearning/mllm

Length of output: 2599


🏁 Script executed:

#!/bin/bash
# Check the lifecycle of the wrapper between multiple executions
# Does graphExecute clear or reset wrappers?
rg -n 'clear|reset|new ' mllm/backends/qnn/QNNBackend.cpp | grep -i wrapper

echo "---"
# Check how the input wrapper is actually used after alloc() in graphExecute
sed -n '553,572p' mllm/backends/qnn/QNNBackend.cpp

Repository: UbiquitousLearning/mllm

Length of output: 1138


🏁 Script executed:

#!/bin/bash
# Final verification: Check if there's any mechanism to reset isAlloc_ or update data between executions
rg -n 'isAlloc_\s*=' mllm/backends/qnn/ --type=cpp

echo "---"
# Check comment or documentation about wrapper reuse
rg -n 'reuse|lifecycle|between execution' mllm/backends/qnn/QNNUtils.hpp

echo "---"
# Look for any destructor or cleanup in QNNModel
rg -n '~QNNModel|reset.*wrapper' mllm/backends/qnn/QNNModel.hpp

Repository: UbiquitousLearning/mllm

Length of output: 366


🏁 Script executed:

#!/bin/bash
# Final critical check: verify what happens on second execution
# Look at the actual behavior flow
echo "=== First Execution Flow ==="
echo "1. !wrapper->isAlloc() is TRUE (initial state: isAlloc_ = false)"
echo "2. Calls __setDataContainer(runtime_input) - sets dataContainer_ to runtime_input"
echo "3. Calls wrapper->alloc() - sets isAlloc_ = true"
echo ""
echo "=== Second Execution Flow ==="
echo "1. !wrapper->isAlloc() is FALSE (already allocated)"
echo "2. SKIPS __setDataContainer() - wrapper->dataContainer_ NOT updated"
echo "3. Calls wrapper->alloc() again - but dataContainer_ still has old data"
echo ""

# Verify the key insight: once isAlloc_ is true, it never becomes false
rg -n 'isAlloc_\s*=\s*false' mllm/backends/qnn/

Repository: UbiquitousLearning/mllm

Length of output: 575


Fix stale data bug when wrappers are reused across multiple graph executions.

The current logic skips data updates when wrapper->isAlloc() is true, but once a wrapper is allocated (first execution), isAlloc_ is never reset. On subsequent graphExecute calls with the same wrapper objects, the condition if (!wrapper->isAlloc()) evaluates to false, preventing __setDataContainer() from updating the wrapper's data. This causes stale input data from the previous execution to be reused.

To fix this, consider one of the following:

  1. Creating fresh wrappers for each execution, or
  2. Resetting/updating the wrapper's data unconditionally before allocation (e.g., check if input data changed rather than allocation status), or
  3. Adding a reset mechanism to clear isAlloc_ and dataContainer_ between executions.
🤖 Prompt for AI Agents
In `@mllm/backends/qnn/QNNBackend.cpp` around lines 552-553, the wrapper reuse
causes stale inputs because wrapper->isAlloc() prevents updating dataContainer_.
Change the flow so the wrapper's data container is refreshed on each graphExecute
call: either always call wrapper->__setDataContainer(runtime_input) before
allocation/usage (i.e., do not gate it on wrapper->isAlloc()), or add and invoke a
reset method that clears isAlloc_ and dataContainer_ on reuse (e.g., a
resetWrapper/resetAllocation helper that clears dataContainer_, sets isAlloc_ =
false, and is called at the start of graphExecute). Ensure __setDataContainer(),
isAlloc_, and dataContainer_ are the symbols updated so allocation uses the
current runtime_input.
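
A minimal sketch of option 2 above (refreshing the wrapper's data container unconditionally). The member names dataContainer_ and isAlloc_, and Tensor::isNil(), come from the diff in this PR; the __refreshDataContainer helper is hypothetical and not part of this change:

```cpp
// Hypothetical helper on QNNTensorWrapper (QNNUtils.hpp), next to __setDataContainer():
void __refreshDataContainer(const Tensor& tensor) {
  // Unlike __setDataContainer(), do not assert that dataContainer_ is nil:
  // overwrite whatever the previous execution left behind.
  dataContainer_ = tensor;
  isAlloc_ = !tensor.isNil();  // re-derive allocation state from the new tensor
}

// In QNNBackend::graphExecute(), replace the gated update
//   if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_input); }
// with an unconditional refresh so every execution sees the current input:
wrapper->__refreshDataContainer(runtime_input);
```

Option 3 (an explicit reset at the start of graphExecute) would behave the same way; the key point is that the data update must not be gated on isAlloc().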


// Allocate and register the wrapper tensor with QNN allocator
// QNNAllocator will handle registered memory descriptor when needed
@@ -617,74 +572,18 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&

if (ProfilingLevel::OFF != profilingLevel_) { extractBackendProfilingInfo(runtime_->profileHandle); }

// Debug: Print last output shape from QNN actual return order (before reordering)
// Uncomment below for debugging output order issues
// if (!qnn_output_tensors.empty()) {
// const auto& last_output = qnn_output_tensors.back();
// const auto& output_wrappers = model->getGraphOutputTensorWrappers();
// const auto& last_wrapper = output_wrappers.back();
// MLLM_INFO("[QNN Actual Return Order] Last output tensor '{}' shape: {}",
// last_wrapper->getName(), last_output.shape());
// }

// Reorder outputs according to MLLM expected order
const auto& expectedOrder = model->getExpectedOutputOrder();

// Resize outputs to match QNN output count first
outputs.resize(qnn_output_tensors.size()); // Ensure outputs has enough space for all QNN outputs
if (!expectedOrder.empty() && expectedOrder.size() == qnn_output_tensors.size()) {
// Debug: Log output order information
// Uncomment below for debugging output order issues
// MLLM_INFO("QNNBackend::graphExecute: Checking output order for graph '{}'", graphName);
// MLLM_INFO(" MLLM Expected Output Order ({} outputs):", expectedOrder.size());
// for (size_t i = 0; i < expectedOrder.size(); i++) {
// MLLM_INFO(" [{}] {}", i, expectedOrder[i]);
// }
// MLLM_INFO(" QNN Output Order ({} outputs):", model->getGraphOutputTensorWrappers().size());
// for (size_t i = 0; i < model->getGraphOutputTensorWrappers().size(); i++) {
// auto wrapper = model->getGraphOutputTensorWrappers()[i];
// MLLM_INFO(" [{}] {}", i, wrapper->getName());
// }

// Check if reordering is needed
// bool needs_reordering = false;
// std::vector<std::pair<size_t, int>> mismatches;
// for (size_t i = 0; i < expectedOrder.size(); i++) {
// const std::string& expected_name = expectedOrder[i];
// int qnn_index = model->getQnnOutputIndex(expected_name);
// if (qnn_index >= 0 && qnn_index < static_cast<int>(qnn_output_tensors.size())) {
// if (static_cast<int>(i) != qnn_index) {
// needs_reordering = true;
// mismatches.emplace_back(i, qnn_index);
// }
// }
// }

// Debug: Verification messages
// Uncomment below for debugging output order issues
// if (needs_reordering) {
// MLLM_INFO(" [VERIFICATION] QNN output order DIFFERS from MLLM expected order - REORDERING REQUIRED");
// for (const auto& [mllm_idx, qnn_idx] : mismatches) {
// MLLM_INFO(" Mismatch: MLLM[{}] expects '{}' but it's at QNN[{}]",
// mllm_idx, expectedOrder[mllm_idx], qnn_idx);
// }
// } else {
// MLLM_INFO(" [VERIFICATION] QNN output order MATCHES MLLM expected order - no reordering needed");
// }

// Reorder outputs according to expected order
for (size_t i = 0; i < expectedOrder.size(); i++) {
const std::string& expected_name = expectedOrder[i];
int qnn_index = model->getQnnOutputIndex(expected_name);
if (qnn_index >= 0 && qnn_index < static_cast<int>(qnn_output_tensors.size())) {
outputs[i] = qnn_output_tensors[qnn_index];
// Debug: Mapping information
// Uncomment below for debugging output order issues
// if (static_cast<int>(i) != qnn_index) {
// MLLM_INFO(" Mapping: MLLM[{}] = QNN[{}] (tensor: {}) [REORDERED]", i, qnn_index, expected_name);
// } else {
// MLLM_INFO(" Mapping: MLLM[{}] = QNN[{}] (tensor: {}) [SAME]", i, qnn_index, expected_name);
// }
} else {
MLLM_ERROR("QNNBackend::graphExecute: Failed to find QNN output index for tensor '{}' in graph '{}'", expected_name,
graphName);
7 changes: 5 additions & 2 deletions mllm/backends/qnn/QNNBackend.hpp
@@ -50,8 +50,8 @@ class QNNRuntime {
}

bool createContext(Qnn_ContextHandle_t& context, QnnContext_Config_t** contextConfig = nullptr);
bool retrieveContext(Qnn_ContextHandle_t& context, std::vector<std::shared_ptr<QNNModel>>& qnnModels,
QnnContext_Config_t** contextConfig = nullptr);
bool retrieveContext(const std::string& contextBinaryPath, Qnn_ContextHandle_t& context,
std::vector<std::shared_ptr<QNNModel>>& qnnModels, QnnContext_Config_t** contextConfig = nullptr);
Comment on lines +53 to +54

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# First, find and read the header file to confirm the signature change
find . -name "QNNBackend.hpp" -type f

Repository: UbiquitousLearning/mllm

Length of output: 101


🏁 Script executed:

# Search for all usages of retrieveContext in the codebase
rg -n "retrieveContext" --type=cpp --type=h -C 3

Repository: UbiquitousLearning/mllm

Length of output: 1879


🏁 Script executed:

# Search more comprehensively for all retrieveContext usages, including in tests and other files
rg -n "retrieveContext" --type=cpp --type=h --type=cc

Repository: UbiquitousLearning/mllm

Length of output: 93


🏁 Script executed:

# Read the header file to check for docstrings and API documentation
cat -n mllm/backends/qnn/QNNBackend.hpp | head -80

Repository: UbiquitousLearning/mllm

Length of output: 3618


🏁 Script executed:

# Also search in Python files in case there are bindings
rg -n "retrieveContext" --type=py

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Search more comprehensively across all source files
rg -n "retrieveContext" --type=cpp

Repository: UbiquitousLearning/mllm

Length of output: 464


🏁 Script executed:

# Check if there are any other file types that might use this API
find . -type f \( -name "*.cpp" -o -name "*.h" -o -name "*.hpp" \) | wc -l

Repository: UbiquitousLearning/mllm

Length of output: 71


🏁 Script executed:

# Look at the complete context around the method declaration to check for docstrings
sed -n '40,60p' mllm/backends/qnn/QNNBackend.hpp

Repository: UbiquitousLearning/mllm

Length of output: 874


🏁 Script executed:

# Check if createContext has docstrings (line 52)
sed -n '45,56p' mllm/backends/qnn/QNNBackend.hpp

Repository: UbiquitousLearning/mllm

Length of output: 686


🏁 Script executed:

# Check the complete implementation to understand the context better
sed -n '333,360p' mllm/backends/qnn/QNNBackend.cpp

Repository: UbiquitousLearning/mllm

Length of output: 1343


🏁 Script executed:

# Verify no other files in the project might be calling this method
find . -type f \( -name "*.cpp" -o -name "*.h" -o -name "*.hpp" \) -exec grep -l "QNNRuntime\|QNNBackend" {} \; | head -20

Repository: UbiquitousLearning/mllm

Length of output: 665


🏁 Script executed:

# Check if there are any examples or test files that might use the old API
find . -type f -name "*test*" -o -name "*example*" | grep -i qnn

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Double-check for any remaining usages of the old signature or other related calls
rg -n "retrieveContext\|loadContext\|createContext" mllm/backends/qnn/QNNBackend.cpp -A 2

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Check if the coding guideline about docstrings is being violated
# Look at similar public methods in other backends to see the pattern
find . -type f -name "*.hpp" -path "*/backends/*" -exec grep -l "bool.*(" {} \; | head -5 | xargs head -100

Repository: UbiquitousLearning/mllm

Length of output: 9600


🏁 Script executed:

# Verify the specific lines mentioned in the review (53-54) and surrounding context for any documentation
sed -n '48,58p' mllm/backends/qnn/QNNBackend.hpp

Repository: UbiquitousLearning/mllm

Length of output: 662


🏁 Script executed:

# Verify no trailing whitespace or other formatting issues in the header file
sed -n '52,55p' mllm/backends/qnn/QNNBackend.hpp | cat -A

Repository: UbiquitousLearning/mllm

Length of output: 383


🏁 Script executed:

# Check if there are any docstring patterns used elsewhere in QNN backend files
rg -n "///|//!" mllm/backends/qnn/QNNBackend.hpp | head -20

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Check Base Backend header for documentation patterns
rg -n "bool.*\(.*\);" mllm/backends/base/Backend.hpp -B 2 | head -30

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Check the entire QNNRuntime class declaration to see if there are any docstrings at all
sed -n '41,100p' mllm/backends/qnn/QNNBackend.hpp

Repository: UbiquitousLearning/mllm

Length of output: 2621


🏁 Script executed:

# Look at the implementation to see if docstrings are in the .cpp file instead
sed -n '325,340p' mllm/backends/qnn/QNNBackend.cpp

Repository: UbiquitousLearning/mllm

Length of output: 795


🏁 Script executed:

# Check how many methods in QNNRuntime lack documentation
grep -n "bool\|void" mllm/backends/qnn/QNNBackend.hpp | grep -A 1 "QNNRuntime" -m 20

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Check if these methods are newly added or modified (look at git context if available)
git log --oneline -1 mllm/backends/qnn/QNNBackend.hpp 2>/dev/null || echo "Git history not available"

Repository: UbiquitousLearning/mllm

Length of output: 125


🏁 Script executed:

# Verify the coding guideline compliance by checking if similar public methods in this file have docstrings
sed -n '85,130p' mllm/backends/qnn/QNNBackend.hpp

Repository: UbiquitousLearning/mllm

Length of output: 2112


Add docstrings to public API methods.

The retrieveContext and createContext methods (lines 52-54) lack docstrings. Per coding guidelines, public APIs must have clear docstrings explaining purpose, parameters, returns, and errors. The signature change itself is correct and the caller at QNNBackend.cpp:429 has been properly updated.

🤖 Prompt for AI Agents
In `@mllm/backends/qnn/QNNBackend.hpp` around lines 53-54, add clear docstrings
for the public methods retrieveContext and createContext: for each method,
describe its purpose, list and explain every parameter (e.g., contextBinaryPath,
context, qnnModels, and contextConfig for retrieveContext, and the relevant
parameters for createContext), state the return-value semantics (what true/false
indicates), and document possible errors or failure conditions and side effects
(e.g., ownership of pointers, when contextConfig may be modified, thread-safety
expectations). Place these comments immediately above the function declarations
so they become part of the public API docs.
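
As a possible starting point, a sketch only: the semantics are inferred from the implementations in QNNBackend.cpp, and details such as the config-forwarding behavior are assumptions rather than verified QNN SDK documentation.

```cpp
/// Creates a new, empty QNN context on the initialized backend.
/// @param context        Receives the Qnn_ContextHandle_t on success.
/// @param contextConfig  Optional context configuration forwarded to the QNN API; nullptr for defaults.
/// @return true on success, false if the QNN API fails to create the context.
bool createContext(Qnn_ContextHandle_t& context, QnnContext_Config_t** contextConfig = nullptr);

/// Restores a QNN context and its graphs from a serialized context binary.
/// @param contextBinaryPath  Path to the context binary produced by an earlier AOT compile.
/// @param context            Receives the deserialized Qnn_ContextHandle_t on success.
/// @param qnnModels          Populated with one QNNModel per graph recovered from the binary.
/// @param contextConfig      Optional context configuration forwarded to the QNN API; nullptr for defaults.
/// @return true on success, false if the binary cannot be read or deserialization fails.
bool retrieveContext(const std::string& contextBinaryPath, Qnn_ContextHandle_t& context,
                     std::vector<std::shared_ptr<QNNModel>>& qnnModels,
                     QnnContext_Config_t** contextConfig = nullptr);
```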


private:
QNN_INTERFACE_VER_TYPE qnnInterface;
@@ -87,6 +87,9 @@ class QNNBackend final : public Backend {
public:
QNNBackend();

bool loadContext(const std::string& contextPath);
bool createContext();

bool isWeightOnDevice() override { return false; }

// QNN Graph build interfaces
5 changes: 1 addition & 4 deletions mllm/backends/qnn/QNNUtils.cpp
@@ -483,10 +483,7 @@ std::shared_ptr<QNNTensorWrapper> QNNTensorWrapper::createStaticTensor(const std
}

void QNNTensorWrapper::alloc() {
if (isAlloc_) {
MLLM_WARN("Tensor {} has already been allocated.", name_);
return;
}
if (isAlloc_) { MLLM_WARN("Tensor {} has already been allocated.", name_); }
MLLM_RT_ASSERT(dataContainer_.device() == kQNN);

// if storage is not allocated, allocate it
7 changes: 7 additions & 0 deletions mllm/backends/qnn/QNNUtils.hpp
@@ -205,6 +205,13 @@ class QNNTensorWrapper {
Tensor& getDataContainer() { return dataContainer_; }
const std::vector<uint32_t>* getDimension() { return &dimensions_; }

bool isAlloc() { return isAlloc_; }
void __setDataContainer(const Tensor& tensor) {
MLLM_RT_ASSERT(dataContainer_.isNil())
dataContainer_ = tensor;
if (!tensor.isNil()) { isAlloc_ = true; }
}

// Helper to set complex quantization params and manage memory
void setScaleOffsetQuantization(const std::vector<Qnn_ScaleOffset_t>& scaleOffsets, int32_t axis);
void setBlockwiseQuantization(const Qnn_BlockwiseExpansion_t& blockwise, const std::vector<Qnn_ScaleOffset_t>& scaleOffsets);
8 changes: 7 additions & 1 deletion mllm/backends/qnn/Register.cpp
@@ -2,6 +2,7 @@
// Licensed under the MIT License.

#include <memory>
#include <filesystem>
#include "mllm/core/BaseOp.hpp"
#include "mllm/core/DeviceTypes.hpp"
#include "mllm/engine/Context.hpp"
@@ -13,12 +14,17 @@
namespace mllm {

// export initQnnBackend function to initialize QNN backend
void initQnnBackend() {
void initQnnBackend(const std::string& context_path) {
MLLM_RT_ASSERT(isQnnAvailable());
auto& ctx = Context::instance();

// 1. Register backend
auto backend = std::make_shared<qnn::QNNBackend>();
if (std::filesystem::exists(context_path)) {
if (!backend->loadContext(context_path)) { MLLM_ERROR_EXIT(1, "Failed to load QNN context from {}", context_path); }
} else {
if (!backend->createContext()) { MLLM_ERROR_EXIT(1, "Failed to create QNN context"); }
}
ctx.registerBackend(backend);

// 2. Initialize memory manager