diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index f963dc9e2..180c3cbe6 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -17,6 +17,6 @@ if(MLLM_TRACY_ENABLE)
   add_subdirectory(tracy_example)
 endif()
 
-if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
+if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE OR MLLM_BUILD_QNN_BACKEND)
   add_subdirectory(qwen3_qnn_aot)
 endif()
diff --git a/examples/qwen3_qnn_aot/CMakeLists.txt b/examples/qwen3_qnn_aot/CMakeLists.txt
index efc9f2db8..18041bdcb 100644
--- a/examples/qwen3_qnn_aot/CMakeLists.txt
+++ b/examples/qwen3_qnn_aot/CMakeLists.txt
@@ -1,3 +1,10 @@
-add_executable(mllm-qwen3-aot-c compile.cpp)
-target_link_libraries(mllm-qwen3-aot-c PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
-target_include_directories(mllm-qwen3-aot-c PRIVATE ${MLLM_INCLUDE_DIR})
+# AOT compile targets run on x86 only
+if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
+  add_executable(mllm-qwen3-aot-c compile.cpp)
+  target_link_libraries(mllm-qwen3-aot-c PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
+  target_include_directories(mllm-qwen3-aot-c PRIVATE ${MLLM_INCLUDE_DIR})
+endif()
+
+add_executable(mllm-qwen3-aot-runner aot_run.cpp)
+target_link_libraries(mllm-qwen3-aot-runner PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
+target_include_directories(mllm-qwen3-aot-runner PRIVATE ${MLLM_INCLUDE_DIR})
\ No newline at end of file
diff --git a/examples/qwen3_qnn_aot/aot_run.cpp b/examples/qwen3_qnn_aot/aot_run.cpp
new file mode 100644
index 000000000..56203bc14
--- /dev/null
+++ b/examples/qwen3_qnn_aot/aot_run.cpp
@@ -0,0 +1,64 @@
+#include <cstdint>
+#include <iostream>
+#include <string>
+#include <vector>
+#include "mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp"
+#include "mllm/models/qwen3/configuration_qwen3.hpp"
+#include "mllm/models/qwen3/tokenization_qwen3.hpp"
+
+using mllm::Argparse;
+using namespace mllm::qnn::aot;  // NOLINT
+
+MLLM_MAIN({
+  auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
+  auto& model_path = Argparse::add<std::string>("-m|--model").help("Model path").def("qwen3_qnn.mllm");
+  auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer").help("Tokenizer path").def("tokenizer.json");
+  auto& config_path = Argparse::add<std::string>("-c|--config").help("Config path").required(true);
+  auto& temperature = Argparse::add<float>("--temperature").help("Temperature").def(0.8f);
+  auto& ar_len = Argparse::add<int>("--ar_len").help("Autoregressive length (chunk size)").def(128);
+
+  Argparse::parse(argc, argv);
+
+  if (help.isSet()) {
+    Argparse::printHelp();
+    return 0;
+  }
+
+  mllm::initQnnBackend(model_path.get());
+
+  auto qwen3_cfg = mllm::models::qwen3::Qwen3Config(config_path.get());
+
+  RunnerConfig config;
+  config.model_path = model_path.get();
+  config.temperature = temperature.get();
+  config.num_layers = qwen3_cfg.num_hidden_layers;
+  config.num_heads = qwen3_cfg.num_attention_heads;
+  config.head_dim = qwen3_cfg.head_dim;
+  config.vocab_size = qwen3_cfg.vocab_size;
+  config.context_len = 1024;
+  config.ar_len = ar_len.get();
+
+  auto tokenizer = mllm::models::qwen3::Qwen3Tokenizer(tokenizer_path.get());
+
+  std::string prompt_text;
+  fmt::print("💬 Prompt text (or 'exit/quit'): ");
+  std::getline(std::cin, prompt_text);
+
+  auto input_tensor = tokenizer.convertMessage({.prompt = prompt_text});
+
+  Runner runner(config, &tokenizer);
+  if (!runner.load()) {
+    std::cerr << "Failed to load model\n";
+    return 1;
+  }
+
+  std::vector<uint64_t> prompt_tokens;
+  auto sequence = input_tensor["sequence"];
+  int64_t* ptr = sequence.ptr<int64_t>();
+  for (int i = 0; i < sequence.shape()[1]; ++i) { prompt_tokens.push_back((uint64_t)ptr[i]); }
+
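+  // Stream decoded tokens to stdout as they are produced; generation is
+  // bounded by config.context_len and stops early on an EOS token.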
+  runner.generate(prompt_tokens, config.context_len, [](const std::string& token) { std::cout << token << std::flush; });
+  std::cout << "\n";
+
+  return 0;
+});
\ No newline at end of file
diff --git a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
index ce9936e7b..9eed37267 100644
--- a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
+++ b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
@@ -242,7 +242,7 @@ class Qwen3Attention final : public nn::Module {
                            "k_rope_add_0_output_qdq");
 
     // De-quantization and quantization again
-    key_states = key_states.to(kFloat16);
+    key_states = key_states.to(kFloat32);
     key_states = key_states.to(kUInt8PerTensorSym);
     key_states = ptq::QDQ_KV(this, key_states, "k_cast_to_int8_qdq");
 
@@ -251,7 +251,7 @@ class Qwen3Attention final : public nn::Module {
 
     // Handle KV Cache
     value_states = ptq::QDQ(this, value_states, "v_cast_to_int16_qdq");
-    value_states = value_states.to(kFloat16);
+    value_states = value_states.to(kFloat32);
     value_states = value_states.to(kUInt8PerTensorSym);
     value_states = ptq::QDQ_KV(this, value_states, "v_cast_to_int8_qdq");
 
diff --git a/mllm/backends/qnn/CMakeLists.txt b/mllm/backends/qnn/CMakeLists.txt
index 0e4203e08..0ad833792 100644
--- a/mllm/backends/qnn/CMakeLists.txt
+++ b/mllm/backends/qnn/CMakeLists.txt
@@ -21,6 +21,12 @@ if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
   list(APPEND MLLM_QNN_SRC ${MLLM_QUALCOMM_AOT_SRC})
 endif()
 
+file(GLOB_RECURSE MLLM_QUALCOMM_AOT_RT_SRC
+  ${CMAKE_CURRENT_LIST_DIR}/aot_rt/*.hpp
+  ${CMAKE_CURRENT_LIST_DIR}/aot_rt/*.cpp
+)
+list(APPEND MLLM_QNN_SRC ${MLLM_QUALCOMM_AOT_RT_SRC})
+
 add_library(
   MllmQNNBackend
   SHARED
diff --git a/mllm/backends/qnn/QNNBackend.cpp b/mllm/backends/qnn/QNNBackend.cpp
index 5fe81efe4..abcdb6519 100644
--- a/mllm/backends/qnn/QNNBackend.cpp
+++ b/mllm/backends/qnn/QNNBackend.cpp
@@ -55,24 +55,6 @@ QNNBackend::QNNBackend() : Backend(kQNN, createQNNAllocator()) {
     MLLM_INFO("QNN backend supports early termination");
   }
 
-  bool contextStatus = false;
-  // check if the qnn_context.bin file exists
-  if (!std::filesystem::exists("qnn_context.bin")) {
-    contextStatus = runtime_->createContext(context_, nullptr);
-  } else {
-    contextStatus = runtime_->retrieveContext(context_, qnnModels_, nullptr);
-
-    // fill qnnModelIndexMap_ info according to qnnModels_
-    for (size_t i = 0; i < qnnModels_.size(); i++) {
-      auto graphName = qnnModels_[i]->getQnnGraphName();
-      qnnModelIndexMap_.insert(std::make_pair(graphName, i));
-    }
-  }
-  if (!contextStatus) { MLLM_ERROR_EXIT(1, "Failed to create QNN context"); }
-
-  // init QNN Allocator
-  static_pointer_cast<QNNAllocator>(allocator_)->setQNNPointer(runtime_->qnnInterface, context_);
-
   // set performance parameters for better performance on HTP
   perf_ = QNNPerf::create(&runtime_->qnnInterface);
   perf_->setPowerConfigBurst();
@@ -348,10 +330,10 @@ bool QNNRuntime::createContext(Qnn_ContextHandle_t& context, QnnContext_Config_t
   return true;
 }
 
-bool QNNRuntime::retrieveContext(Qnn_ContextHandle_t& context, std::vector<std::shared_ptr<QNNModel>>& qnnModels,
-                                 QnnContext_Config_t** contextConfig) {
+bool QNNRuntime::retrieveContext(const std::string& contextBinaryPath, Qnn_ContextHandle_t& context,
+                                 std::vector<std::shared_ptr<QNNModel>>& qnnModels, QnnContext_Config_t** contextConfig) {
   // Read the binary from qnn_context.bin and get the size in byte
-  std::ifstream file(QNN_Context_File, std::ios::binary | std::ios::ate);
+  std::ifstream file(contextBinaryPath, std::ios::binary | std::ios::ate);
   std::streamsize size = file.tellg();
   file.seekg(0, std::ios::beg);
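+  // Callers are expected to check that the file exists (initQnnBackend does);
+  // a missing file would otherwise surface here as tellg() == -1.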
 
@@ -436,6 +418,25 @@ bool QNNRuntime::retrieveContext(const std::string& contextBinaryPath, Qnn_Conte
   return true;
 }
 
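+// Context lifecycle is now explicit: createContext() starts an empty QNN
+// context for a fresh build, while loadContext() restores graphs from a
+// serialized context binary and rebuilds the graph-name -> model index map.
+// initQnnBackend() picks one of the two based on whether the binary exists.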
Copying " - "min(dst, src), but this may truncate data.", - graphName, i, dst_bytes, src_bytes); - } - - if (dst_bytes > 0) { - void* dst_ptr = wrapper_tensor.ptr(); - if (!dst_ptr) { - wrapper_tensor.alloc(); - dst_ptr = wrapper_tensor.ptr(); - } - - const void* src_ptr = runtime_input.ptr(); - size_t bytes_to_copy = std::min(dst_bytes, src_bytes); - if (!src_ptr) { - MLLM_ERROR("Runtime input tensor {} for graph '{}' has null data pointer", i, graphName); - return; - } - if (dst_ptr && src_ptr && dst_ptr != src_ptr) { - // Copy source data to destination buffer - // This ensures that the graph input wrapper has the correct data for execution - if (bytes_to_copy > 0) { std::memcpy(dst_ptr, src_ptr, bytes_to_copy); } - - // If source is smaller than destination, zero out the remaining bytes - // This is important for decode phase where input tensors may be smaller than prefill - // For example, decode phase may use [1, 1] input while wrapper expects [1, 128] - // Note: In current implementation with full [1, 128] tensor, this should not trigger - // but it's kept as a safety measure for future optimizations - if (src_bytes < dst_bytes) { - size_t remaining_bytes = dst_bytes - src_bytes; - std::memset(static_cast(dst_ptr) + bytes_to_copy, 0, remaining_bytes); - // Only log if zero-padding actually occurs (unexpected case) - MLLM_WARN("[QNN graphExecute] Graph '{}' input tensor {}: zero-padded {} bytes (src={} bytes, dst={} bytes)", - graphName, i, remaining_bytes, src_bytes, dst_bytes); - } - } - } + // input wrapper is empty, set wrapper's dataContainer(mllm::Tensor) + if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_input); } // Allocate and register the wrapper tensor with QNN allocator // QNNAllocator will handle registered memory descriptor when needed @@ -617,74 +572,18 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector& if (ProfilingLevel::OFF != profilingLevel_) { extractBackendProfilingInfo(runtime_->profileHandle); } - // Debug: Print last output shape from QNN actual return order (before reordering) - // Uncomment below for debugging output order issues - // if (!qnn_output_tensors.empty()) { - // const auto& last_output = qnn_output_tensors.back(); - // const auto& output_wrappers = model->getGraphOutputTensorWrappers(); - // const auto& last_wrapper = output_wrappers.back(); - // MLLM_INFO("[QNN Actual Return Order] Last output tensor '{}' shape: {}", - // last_wrapper->getName(), last_output.shape()); - // } - // Reorder outputs according to MLLM expected order const auto& expectedOrder = model->getExpectedOutputOrder(); // Resize outputs to match QNN output count first outputs.resize(qnn_output_tensors.size()); // Ensure outputs has enough space for all QNN outputs if (!expectedOrder.empty() && expectedOrder.size() == qnn_output_tensors.size()) { - // Debug: Log output order information - // Uncomment below for debugging output order issues - // MLLM_INFO("QNNBackend::graphExecute: Checking output order for graph '{}'", graphName); - // MLLM_INFO(" MLLM Expected Output Order ({} outputs):", expectedOrder.size()); - // for (size_t i = 0; i < expectedOrder.size(); i++) { - // MLLM_INFO(" [{}] {}", i, expectedOrder[i]); - // } - // MLLM_INFO(" QNN Output Order ({} outputs):", model->getGraphOutputTensorWrappers().size()); - // for (size_t i = 0; i < model->getGraphOutputTensorWrappers().size(); i++) { - // auto wrapper = model->getGraphOutputTensorWrappers()[i]; - // MLLM_INFO(" [{}] {}", i, wrapper->getName()); - // } - - // Check if 
reordering is needed - // bool needs_reordering = false; - // std::vector> mismatches; - // for (size_t i = 0; i < expectedOrder.size(); i++) { - // const std::string& expected_name = expectedOrder[i]; - // int qnn_index = model->getQnnOutputIndex(expected_name); - // if (qnn_index >= 0 && qnn_index < static_cast(qnn_output_tensors.size())) { - // if (static_cast(i) != qnn_index) { - // needs_reordering = true; - // mismatches.emplace_back(i, qnn_index); - // } - // } - // } - - // Debug: Verification messages - // Uncomment below for debugging output order issues - // if (needs_reordering) { - // MLLM_INFO(" [VERIFICATION] QNN output order DIFFERS from MLLM expected order - REORDERING REQUIRED"); - // for (const auto& [mllm_idx, qnn_idx] : mismatches) { - // MLLM_INFO(" Mismatch: MLLM[{}] expects '{}' but it's at QNN[{}]", - // mllm_idx, expectedOrder[mllm_idx], qnn_idx); - // } - // } else { - // MLLM_INFO(" [VERIFICATION] QNN output order MATCHES MLLM expected order - no reordering needed"); - // } - // Reorder outputs according to expected order for (size_t i = 0; i < expectedOrder.size(); i++) { const std::string& expected_name = expectedOrder[i]; int qnn_index = model->getQnnOutputIndex(expected_name); if (qnn_index >= 0 && qnn_index < static_cast(qnn_output_tensors.size())) { outputs[i] = qnn_output_tensors[qnn_index]; - // Debug: Mapping information - // Uncomment below for debugging output order issues - // if (static_cast(i) != qnn_index) { - // MLLM_INFO(" Mapping: MLLM[{}] = QNN[{}] (tensor: {}) [REORDERED]", i, qnn_index, expected_name); - // } else { - // MLLM_INFO(" Mapping: MLLM[{}] = QNN[{}] (tensor: {}) [SAME]", i, qnn_index, expected_name); - // } } else { MLLM_ERROR("QNNBackend::graphExecute: Failed to find QNN output index for tensor '{}' in graph '{}'", expected_name, graphName); diff --git a/mllm/backends/qnn/QNNBackend.hpp b/mllm/backends/qnn/QNNBackend.hpp index 95f212549..408b45117 100644 --- a/mllm/backends/qnn/QNNBackend.hpp +++ b/mllm/backends/qnn/QNNBackend.hpp @@ -50,8 +50,8 @@ class QNNRuntime { } bool createContext(Qnn_ContextHandle_t& context, QnnContext_Config_t** contextConfig = nullptr); - bool retrieveContext(Qnn_ContextHandle_t& context, std::vector>& qnnModels, - QnnContext_Config_t** contextConfig = nullptr); + bool retrieveContext(const std::string& contextBinaryPath, Qnn_ContextHandle_t& context, + std::vector>& qnnModels, QnnContext_Config_t** contextConfig = nullptr); private: QNN_INTERFACE_VER_TYPE qnnInterface; @@ -87,6 +87,9 @@ class QNNBackend final : public Backend { public: QNNBackend(); + bool loadContext(const std::string& contextPath); + bool createContext(); + bool isWeightOnDevice() override { return false; } // QNN Graph build interfaces diff --git a/mllm/backends/qnn/QNNUtils.cpp b/mllm/backends/qnn/QNNUtils.cpp index 6e2862dd4..271b67200 100644 --- a/mllm/backends/qnn/QNNUtils.cpp +++ b/mllm/backends/qnn/QNNUtils.cpp @@ -483,10 +483,7 @@ std::shared_ptr QNNTensorWrapper::createStaticTensor(const std } void QNNTensorWrapper::alloc() { - if (isAlloc_) { - MLLM_WARN("Tensor {} has already been allocated.", name_); - return; - } + if (isAlloc_) { MLLM_WARN("Tensor {} has already been allocated.", name_); } MLLM_RT_ASSERT(dataContainer_.device() == kQNN); // if storage is not allocated, allocate it diff --git a/mllm/backends/qnn/QNNUtils.hpp b/mllm/backends/qnn/QNNUtils.hpp index 3feed39ea..e74f27f4a 100644 --- a/mllm/backends/qnn/QNNUtils.hpp +++ b/mllm/backends/qnn/QNNUtils.hpp @@ -205,6 +205,13 @@ class QNNTensorWrapper { Tensor& 
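+  // __setDataContainer binds the runtime-provided tensor directly as this
+  // wrapper's backing storage, replacing the per-call memcpy that
+  // graphExecute used to perform for every graph input.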
+
   // Helper to set complex quantization params and manage memory
   void setScaleOffsetQuantization(const std::vector<Qnn_ScaleOffset_t>& scaleOffsets, int32_t axis);
   void setBlockwiseQuantization(const Qnn_BlockwiseExpansion_t& blockwise, const std::vector<Qnn_ScaleOffset_t>& scaleOffsets);
diff --git a/mllm/backends/qnn/Register.cpp b/mllm/backends/qnn/Register.cpp
index a36df64ba..158294f35 100644
--- a/mllm/backends/qnn/Register.cpp
+++ b/mllm/backends/qnn/Register.cpp
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include <memory>
+#include <filesystem>
 #include "mllm/core/BaseOp.hpp"
 #include "mllm/core/DeviceTypes.hpp"
 #include "mllm/engine/Context.hpp"
@@ -13,12 +14,17 @@
 namespace mllm {
 
 // export initQnnBackend function to initialize QNN backend
-void initQnnBackend() {
+void initQnnBackend(const std::string& context_path) {
   MLLM_RT_ASSERT(isQnnAvailable());
 
   auto& ctx = Context::instance();
 
   // 1. Register backend
   auto backend = std::make_shared<QNNBackend>();
+  if (std::filesystem::exists(context_path)) {
+    if (!backend->loadContext(context_path)) { MLLM_ERROR_EXIT(1, "Failed to load QNN context from {}", context_path); }
+  } else {
+    if (!backend->createContext()) { MLLM_ERROR_EXIT(1, "Failed to create QNN context"); }
+  }
   ctx.registerBackend(backend);
 
   // 2. Initialize memory manager
diff --git a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp
index 93709336e..829a47f2d 100644
--- a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp
+++ b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp
@@ -1,6 +1,7 @@
 // Copyright (c) MLLM Team.
 // Licensed under the MIT License.
 #include
+#include <fstream>
 
 #include
 
@@ -480,7 +481,37 @@ std::shared_ptr QnnAOTEnv::createContext(const std::string&
 }
 
 void QnnAOTEnv::saveContext(const std::string& name, const std::string& path) {
-  // TODO
+  if (contexts_.find(name) == contexts_.end()) {
+    MLLM_ERROR("QnnAOTEnv::saveContext Context {} not found", name);
+    return;
+  }
+  auto context = contexts_[name];
+
+  uint64_t binarySize = 0;
+  uint64_t writtenSize = 0;
+
+  auto status = qnn_htp_func_symbols_.qnn_interface_.contextGetBinarySize(context->qnn_ctx_handle_, &binarySize);
+  MLLM_RT_ASSERT_EQ(status, QNN_SUCCESS);
+
+  std::vector<uint8_t> binaryBuffer(binarySize);
+
+  status = qnn_htp_func_symbols_.qnn_interface_.contextGetBinary(
+      context->qnn_ctx_handle_, reinterpret_cast<void*>(binaryBuffer.data()), binarySize, &writtenSize);
+  MLLM_RT_ASSERT_EQ(status, QNN_SUCCESS);
+
+  if (binarySize < writtenSize) {
+    MLLM_ERROR("QNN context binary size mismatch: expected {} bytes, but wrote {} bytes.", binarySize, writtenSize);
+  }
+
+  std::ofstream file(path, std::ios::binary);
+  if (!file.is_open()) {
+    MLLM_ERROR("Failed to open file {} for writing QNN context.", path);
+    return;
+  }
+  file.write(reinterpret_cast<const char*>(binaryBuffer.data()), writtenSize);
+  file.close();
+
+  MLLM_INFO("QNN context {} saved to {} ({} bytes written)", name, path, writtenSize);
 }
 
 void QnnAOTEnv::destroyContext(const std::string& name) {
diff --git a/mllm/backends/qnn/aot/visitor/RMSNorm.cpp b/mllm/backends/qnn/aot/visitor/RMSNorm.cpp
index f27ff77ba..27f72e2e2 100644
--- a/mllm/backends/qnn/aot/visitor/RMSNorm.cpp
+++ b/mllm/backends/qnn/aot/visitor/RMSNorm.cpp
@@ -1,10 +1,13 @@
 // Copyright (c) MLLM Team.
 // Licensed under the MIT License.
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/Tensor.hpp"
 #include "mllm/utils/Common.hpp"
 #include "mllm/core/aops/RMSNormOp.hpp"
 #include "mllm/compile/ir/linalg/Op.hpp"
 #include "mllm/compile/ir/builtin/Attribute.hpp"
+#include "mllm/compile/ir/linalg/Attribute.hpp"
 #include "mllm/backends/qnn/aot/QnnWrappersAPI.hpp"
 #include "mllm/backends/qnn/aot/visitor/RMSNorm.hpp"
 #include "mllm/backends/qnn/aot/passes/AOTCompileContext.hpp"
@@ -40,6 +43,16 @@ bool QnnAOTRMSNormPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op)
   auto weight = writer.getContext()->lookupSymbolTable(a->getName() + ".weight")->outputs().front()->cast_<ir::tensor::TensorValue>();
 
+  // Fake bias: the QNN RmsNorm op expects a bias input in practice, even though the nn module docs treat it as optional
+  auto bias_tensor = mllm::Tensor::zeros(weight->tensor_.shape(), weight->tensor_.dtype());
+  auto bias_node = ir::tensor::TensorValue::build(writer.getContext().get(), bias_tensor);
+  bias_node->tensor_.setName(a->getName() + "_runtime_bias");
+
+  // fake bias quant recipe
+  auto quant_spec = mllm::ir::linalg::QuantizationSpecSymPerTensor::create(0, 0, kInt32, kFloat32, Tensor::ones({1}));
+  auto quant_attr = mllm::ir::linalg::LinalgIRQuantizatonSpecAttr::build(writer.getContext().get(), quant_spec);
+  bias_node->setAttr("quant_recipe", quant_attr);
+ +#include "mllm/backends/qnn/aot_rt/KVCacheManager.hpp" +#include +#include +#include "mllm/utils/Log.hpp" + +namespace mllm::qnn::aot { + +template +KVCacheManager::KVCacheManager(KVCacheConfig config) : config_(config) { + k_cache_.resize(config_.num_layers); + v_cache_.resize(config_.num_layers); + + // Calculate cache size + size_t cache_in_bytes = config_.num_layers * config_.num_heads * config_.head_dim * config_.max_cache_len * sizeof(T); + size_t cache_out_bytes = config_.num_layers * config_.num_heads * config_.head_dim * config_.max_ar_len * sizeof(T); + total_cache_size_ = 2 * (cache_in_bytes + cache_out_bytes); +} + +template +void KVCacheManager::initCache(mllm::Allocator* allocator, int32_t ar_len) { + cur_ar_len_ = ar_len; + const size_t max_in_cache_block_in_bytes = config_.max_cache_len * sizeof(T); + const size_t max_out_cache_block_in_bytes = config_.max_ar_len * sizeof(T); + + const size_t cache_in_bytes = config_.num_heads * config_.head_dim * max_in_cache_block_in_bytes; + const size_t cache_out_bytes = config_.num_heads * config_.head_dim * max_out_cache_block_in_bytes; + + // Directly use Storage created by QNNAllocator + // TODO: QNN shared buffer pool(custom mem) support + for (int layer = 0; layer < config_.num_layers; ++layer) { + // Allocate buffer for key cache and value cache + auto k_storage_in = std::make_shared(); + k_storage_in->size_ = cache_in_bytes; + allocator->alloc(k_storage_in); + memset(k_storage_in->ptr_, 0, cache_in_bytes); + + auto k_storage_out = std::make_shared(); + k_storage_out->size_ = cache_out_bytes; + allocator->alloc(k_storage_out); + memset(k_storage_out->ptr_, 0, cache_out_bytes); + + auto v_storage_in = std::make_shared(); + v_storage_in->size_ = cache_in_bytes; + allocator->alloc(v_storage_in); + memset(v_storage_in->ptr_, 0, cache_in_bytes); + + auto v_storage_out = std::make_shared(); + v_storage_out->size_ = cache_out_bytes; + allocator->alloc(v_storage_out); + memset(v_storage_out->ptr_, 0, cache_out_bytes); + + k_cache_[layer].buffer_storage = k_storage_in; + k_cache_[layer].output_buffer_storage = k_storage_out; + k_cache_[layer].buffer = reinterpret_cast(k_storage_in->ptr_); + k_cache_[layer].output_buffer = reinterpret_cast(k_storage_out->ptr_); + + v_cache_[layer].buffer_storage = v_storage_in; + v_cache_[layer].output_buffer_storage = v_storage_out; + v_cache_[layer].buffer = reinterpret_cast(v_storage_in->ptr_); + v_cache_[layer].output_buffer = reinterpret_cast(v_storage_out->ptr_); + } +} + +template +void KVCacheManager::initAttentionMask(uint16_t* attention_mask, const std::vector& attention_map, int32_t ar_len, + int32_t n_past) { + if (attention_map.size() > ar_len) { + MLLM_ERROR("The size of attention_map ({}) doesn't match with ar_len ({})", attention_map.size(), ar_len); + exit(1); + } + + uint16_t neg_val = 0; + uint16_t pos_val = 65535; + // Clear the attention mask + std::fill_n(attention_mask, ar_len * config_.context_len, neg_val); + + // SMART_MASK requires special handling of attention mask + uint16_t* past_ptr = attention_mask; + uint16_t* new_ptr = attention_mask + (config_.context_len - ar_len); + // All inputs will necessarily attend to n_past and itself + for (int i = 0; i < ar_len; i++) { + // Iterate across ar_len + if (attention_map[i] < 0) { + // If negative, attend to only past tokens + std::fill_n(past_ptr, n_past, pos_val); + } else { + // If positive, copy attention map from (relative to 0th input) parent + // Parent token index + const int32_t pidx = attention_map[i]; + uint16_t* 
+template <typename T>
+void KVCacheManager<T>::initAttentionMask(uint16_t* attention_mask, const std::vector<int32_t>& attention_map,
+                                          int32_t ar_len, int32_t n_past) {
+  if (attention_map.size() > ar_len) {
+    MLLM_ERROR("The size of attention_map ({}) doesn't match ar_len ({})", attention_map.size(), ar_len);
+    exit(1);
+  }
+
+  uint16_t neg_val = 0;
+  uint16_t pos_val = 65535;
+  // Clear the attention mask
+  std::fill_n(attention_mask, ar_len * config_.context_len, neg_val);
+
+  // SMART_MASK requires special handling of attention mask
+  uint16_t* past_ptr = attention_mask;
+  uint16_t* new_ptr = attention_mask + (config_.context_len - ar_len);
+  // All inputs will necessarily attend to n_past and itself
+  for (int i = 0; i < ar_len; i++) {
+    // Iterate across ar_len
+    if (attention_map[i] < 0) {
+      // If negative, attend to only past tokens
+      std::fill_n(past_ptr, n_past, pos_val);
+    } else {
+      // If positive, copy attention map from (relative to 0th input) parent
+      // Parent token index
+      const int32_t pidx = attention_map[i];
+      uint16_t* parent_ptr = attention_mask + pidx * config_.context_len;
+      std::memcpy(past_ptr, parent_ptr, config_.context_len * sizeof(uint16_t));
+    }
+    // Attend to itself
+    new_ptr[i] = pos_val;
+    past_ptr += config_.context_len;
+    new_ptr += config_.context_len;
+  }
+}
+
+template <typename T>
+void KVCacheManager<T>::initAttentionMask(uint16_t* attention_mask, const std::vector<int32_t>& attention_map,
+                                          int32_t ar_len, int32_t n_past, int32_t sliding_window,
+                                          const std::vector<int32_t>& position_offset) {
+  if (attention_map.size() > ar_len) {
+    MLLM_ERROR("The size of attention_map ({}) doesn't match ar_len ({})", attention_map.size(), ar_len);
+    exit(1);
+  }
+
+  uint16_t neg_val = 0;
+  uint16_t pos_val = 65535;
+  // Clear the attention mask
+  std::fill_n(attention_mask, ar_len * config_.context_len, neg_val);
+
+  // SMART_MASK requires special handling of attention mask
+  uint16_t* past_ptr = attention_mask;
+  uint16_t* new_ptr = attention_mask + (config_.context_len - ar_len);
+  // All inputs will necessarily attend to n_past and itself
+  for (int i = 0; i < ar_len; i++) {
+    // Iterate across ar_len
+    if (attention_map[i] < 0) {
+      // If negative, attend to only past tokens
+      std::fill_n(past_ptr, n_past, pos_val);
+    } else {
+      // If positive, copy attention map from (relative to 0th input) parent
+      // Parent token index
+      const int32_t pidx = attention_map[i];
+      uint16_t* parent_ptr = attention_mask + pidx * config_.context_len;
+      std::memcpy(past_ptr, parent_ptr, config_.context_len * sizeof(uint16_t));
+    }
+    // Attend to itself
+    new_ptr[i] = pos_val;
+
+    // mask by limitation of sliding_window
+    int32_t available_context_len =
+        position_offset.empty() ? sliding_window - (i + 1) - n_past : sliding_window - (position_offset[i] + 1) - n_past;
+    if (n_past > available_context_len) { std::fill_n(past_ptr, n_past - available_context_len, neg_val); }
+
+    past_ptr += config_.context_len;
+    new_ptr += config_.context_len;
+  }
+}
+
+template <typename T>
+void KVCacheManager<T>::updateAttentionMask(uint16_t* attention_mask, int32_t ar_len, int32_t n_past, int32_t n_update) {
+  uint16_t pos_val = 65535;
+  uint16_t* cur_ptr = attention_mask;
+  cur_ptr += n_past;
+
+  for (int i = 0; i < ar_len; i++) {
+    std::fill_n(cur_ptr, n_update, pos_val);
+    cur_ptr += config_.context_len;
+  }
+}
+
+template <typename T>
+void KVCacheManager<T>::updateAttentionMask(uint16_t* attention_mask, int32_t ar_len, int32_t n_past, int32_t n_update,
+                                            int32_t sliding_window, const std::vector<int32_t>& position_offset) {
+  uint16_t pos_val = 65535;
+  uint16_t neg_val = 0;
+  uint16_t* cur_ptr = attention_mask;
+  cur_ptr += n_past;
+
+  for (int i = 0; i < ar_len; i++) {
+    std::fill_n(cur_ptr, n_update, pos_val);
+    int32_t available_cache_len =
+        position_offset.empty() ? sliding_window - (i + 1) : sliding_window - (position_offset[i] + 1);
+    if (n_past + n_update > available_cache_len) {
+      std::fill_n(cur_ptr - n_past, n_past + n_update - available_cache_len, neg_val);
+    }
+    cur_ptr += config_.context_len;
+  }
+}
+
+template <typename T>
+void KVCacheManager<T>::rearrangeCache(int32_t ar_len_dst) {
+  // Don't need to rearrange if cur_ar_len_ is equal to target ar_len
+  if (cur_ar_len_ == ar_len_dst) return;
+  for (int layer = 0; layer < config_.num_layers; ++layer) {
+    rearrangeKey(k_cache_[layer], ar_len_dst);
+    rearrangeValue(v_cache_[layer], ar_len_dst);
+  }
+  // rearrange done.
+  cur_ar_len_ = ar_len_dst;
+}
+
+template <typename T>
+void KVCacheManager<T>::rearrangeKey(KVCache<T>& k_cache, int32_t ar_len_dst) {
+  const int32_t src_cache_num = (cur_ar_len_ == config_.context_len) ? config_.context_len : config_.context_len - cur_ar_len_;
+  const int32_t dst_cache_num = config_.context_len - ar_len_dst;
+  T* k_cache_in_read_ptr = k_cache.buffer;
+  T* k_cache_in_write_ptr = k_cache.buffer;
+
+  if (src_cache_num > dst_cache_num) {
+    // copy from first dimension
+    for (int i = 0; i < config_.head_dim * config_.num_heads; i++) {
+      std::memmove(k_cache_in_write_ptr, k_cache_in_read_ptr, dst_cache_num * sizeof(T));
+      k_cache_in_read_ptr += src_cache_num;
+      k_cache_in_write_ptr += dst_cache_num;
+    }
+  } else {
+    k_cache_in_read_ptr += (config_.head_dim * config_.num_heads - 1) * src_cache_num;
+    k_cache_in_write_ptr += (config_.head_dim * config_.num_heads - 1) * dst_cache_num;
+    // copy from last dimension
+    for (int i = 0; i < config_.head_dim * config_.num_heads; i++) {
+      std::memmove(k_cache_in_write_ptr, k_cache_in_read_ptr, src_cache_num * sizeof(T));
+      k_cache_in_read_ptr -= src_cache_num;
+      k_cache_in_write_ptr -= dst_cache_num;
+    }
+  }
+}
+
+template <typename T>
+void KVCacheManager<T>::rearrangeValue(KVCache<T>& v_cache, int32_t ar_len_dst) {
+  const int32_t src_cache_num = (cur_ar_len_ == config_.context_len) ? config_.context_len : config_.context_len - cur_ar_len_;
+  const int32_t dst_cache_num = config_.context_len - ar_len_dst;
+  T* v_cache_in_read_ptr = v_cache.buffer;
+  T* v_cache_in_write_ptr = v_cache.buffer;
+  if (src_cache_num > dst_cache_num) {
+    // copy from first dimension
+    for (int i = 0; i < config_.num_heads; i++) {
+      std::memmove(v_cache_in_write_ptr, v_cache_in_read_ptr, dst_cache_num * config_.head_dim * sizeof(T));
+      v_cache_in_read_ptr += src_cache_num * config_.head_dim;
+      v_cache_in_write_ptr += dst_cache_num * config_.head_dim;
+    }
+  } else {
+    v_cache_in_read_ptr += config_.head_dim * (config_.num_heads - 1) * src_cache_num;
+    v_cache_in_write_ptr += config_.head_dim * (config_.num_heads - 1) * dst_cache_num;
+    // copy from last dimension
+    for (int i = 0; i < config_.num_heads; i++) {
+      std::memmove(v_cache_in_write_ptr, v_cache_in_read_ptr, src_cache_num * config_.head_dim * sizeof(T));
+      v_cache_in_read_ptr -= src_cache_num * config_.head_dim;
+      v_cache_in_write_ptr -= dst_cache_num * config_.head_dim;
+    }
+  }
+}
+
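+// rearrangeCache repacks each layer in place when the AR length changes
+// (e.g. prefill ar_len -> decode ar_len = 1): the graphs read the cache as a
+// dense buffer with a per-row span of context_len - ar_len. Shrinking rows
+// walks forward; growing rows walks backward with memmove so no row is
+// overwritten before it has been read.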
+template <typename T>
+void KVCacheManager<T>::updateCache(int32_t ar_len, int32_t n_past, int32_t n_update, const std::vector<bool>& selected) {
+  if (cur_ar_len_ != ar_len) {
+    MLLM_ERROR("Current AR length ({}) is not matched with target AR length ({}). Please rearrange cache first.", cur_ar_len_,
+               ar_len);
+    exit(1);
+  }
+  for (int layer = 0; layer < config_.num_layers; ++layer) {
+    updateKey(k_cache_[layer], n_past, n_update, selected);
+    updateValue(v_cache_[layer], n_past, n_update, selected);
+  }
+}
+
+template <typename T>
+void KVCacheManager<T>::updateKey(KVCache<T>& k_cache, int32_t n_past, int32_t n_update, const std::vector<bool>& selected) {
+  T* write_ptr = k_cache.buffer;
+  T* read_ptr = k_cache.output_buffer;
+  const int32_t copy_size = n_update * sizeof(T);
+  const int32_t iter_size = (cur_ar_len_ == config_.context_len) ? config_.context_len : config_.context_len - cur_ar_len_;
+  const int32_t out_size = cur_ar_len_;
+  const int32_t past_size = n_past;
+  const int32_t n_iter = config_.head_dim * config_.num_heads;
+
+  write_ptr += past_size;
+  if (selected.empty()) {
+    for (int i = 0; i < n_iter; ++i) {
+      std::memcpy(write_ptr, read_ptr, copy_size);
+      write_ptr += iter_size;
+      read_ptr += out_size;
+    }
+  } else {
+    std::vector<int32_t> true_indices(n_update);
+    for (int i = 0, j = 0; i < selected.size() && j < n_update; ++i) {
+      if (selected[i]) { true_indices[j++] = i; }
+    }
+    for (int i = 0; i < n_iter; ++i) {
+      for (int j = 0; j < n_update; ++j) { write_ptr[j] = read_ptr[true_indices[j]]; }
+      write_ptr += iter_size;
+      read_ptr += out_size;
+    }
+  }
+}
+
+template <typename T>
+void KVCacheManager<T>::updateValue(KVCache<T>& v_cache, int32_t n_past, int32_t n_update, const std::vector<bool>& selected) {
+  T* write_ptr = v_cache.buffer;
+  T* read_ptr = v_cache.output_buffer;
+  const int32_t copy_size = n_update * config_.head_dim * sizeof(T);
+  const int32_t past_size = n_past * config_.head_dim;
+  const int32_t n_iter = config_.num_heads;
+  const int32_t iter_size = (cur_ar_len_ == config_.context_len) ? config_.context_len * config_.head_dim
+                                                                 : (config_.context_len - cur_ar_len_) * config_.head_dim;
+  const int32_t out_size = cur_ar_len_ * config_.head_dim;
+
+  write_ptr += past_size;
+
+  if (selected.empty()) {
+    for (int i = 0; i < n_iter; i++) {
+      std::memcpy(write_ptr, read_ptr, copy_size);
+      write_ptr += iter_size;
+      read_ptr += out_size;
+    }
+  } else {
+    for (int i = 0; i < n_iter; i++) {
+      auto wp = write_ptr;
+      auto rp = read_ptr;
+      int32_t update_cnt = 0;
+      for (auto sel : selected) {
+        if (sel) {
+          std::memcpy(wp, rp, config_.head_dim * sizeof(T));
+          wp += config_.head_dim;
+          update_cnt++;
+        }
+        rp += config_.head_dim;
+        if (update_cnt == n_update) break;
+      }
+      write_ptr += iter_size;
+      read_ptr += out_size;
+    }
+  }
+}
+
+// Explicit instantiations
+template class KVCacheManager<uint8_t>;
+template class KVCacheManager<uint16_t>;
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/KVCacheManager.hpp b/mllm/backends/qnn/aot_rt/KVCacheManager.hpp
new file mode 100644
index 000000000..fb85ff9ac
--- /dev/null
+++ b/mllm/backends/qnn/aot_rt/KVCacheManager.hpp
@@ -0,0 +1,69 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+#include "mllm/core/Storage.hpp"
+#include "mllm/backends/base/Allocator.hpp"
+
+namespace mllm::qnn::aot {
+
+template <typename T>
+struct KVCache {
+  std::shared_ptr<mllm::Storage> buffer_storage;
+  std::shared_ptr<mllm::Storage> output_buffer_storage;
+  T* buffer;
+  T* output_buffer;
+};
+
+struct KVCacheConfig {
+  int32_t context_len;
+  int64_t head_dim;
+  int32_t max_ar_len;
+  int32_t max_cache_len;
+  int64_t num_heads;
+  int64_t num_layers;
+};
+
+template <typename T>
+class KVCacheManager {
+ public:
+  explicit KVCacheManager(KVCacheConfig config);
+  ~KVCacheManager() = default;
+
+  void initCache(mllm::Allocator* allocator, int32_t ar_len);
+  void rearrangeCache(int32_t ar_len_dst);
+
+  void initAttentionMask(uint16_t* attention_mask, const std::vector<int32_t>& attention_map, int32_t ar_len, int32_t n_past);
+
+  void initAttentionMask(uint16_t* attention_mask, const std::vector<int32_t>& attention_map, int32_t ar_len, int32_t n_past,
+                         int32_t sliding_window, const std::vector<int32_t>& position_offset = {});
+
+  void updateAttentionMask(uint16_t* attention_mask, int32_t ar_len, int32_t n_past, int32_t n_update);
+
+  void updateAttentionMask(uint16_t* attention_mask, int32_t ar_len, int32_t n_past, int32_t n_update, int32_t sliding_window,
+                           const std::vector<int32_t>& position_offset = {});
+
+  void updateCache(int32_t ar_len, int32_t n_past, int32_t n_update, const std::vector<bool>& selected);
+
+  const std::vector<KVCache<T>>& getKCache() const { return k_cache_; }
+  const std::vector<KVCache<T>>& getVCache() const { return v_cache_; }
+  [[nodiscard]] size_t getTotalCacheSizeInBytes() const { return total_cache_size_; }
+
+ private:
+  void rearrangeKey(KVCache<T>& k_cache, int32_t ar_len_dst);
+  void rearrangeValue(KVCache<T>& v_cache, int32_t ar_len_dst);
+  void updateKey(KVCache<T>& k_cache, int32_t n_past, int32_t n_update, const std::vector<bool>& selected);
+  void updateValue(KVCache<T>& v_cache, int32_t n_past, int32_t n_update, const std::vector<bool>& selected);
+
+  KVCacheConfig config_;
+  size_t total_cache_size_ = 0;
+  int32_t cur_ar_len_ = 0;
+  std::vector<KVCache<T>> k_cache_;
+  std::vector<KVCache<T>> v_cache_;
+};
+
+}  // namespace mllm::qnn::aot
\ No newline at end of file
diff --git a/mllm/backends/qnn/aot_rt/PromptProcessor.cpp b/mllm/backends/qnn/aot_rt/PromptProcessor.cpp
new file mode 100644
index 000000000..b13c66a0d
--- /dev/null
+++ b/mllm/backends/qnn/aot_rt/PromptProcessor.cpp
@@ -0,0 +1,180 @@
+
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/qnn/aot_rt/PromptProcessor.hpp"
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/SlicePrimitives.hpp"
+#include "mllm/utils/Log.hpp"
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <numeric>
+#include <string>
+#include <vector>
+
+namespace mllm::qnn::aot {
+
+template <typename T>
+PromptProcessor<T>::PromptProcessor(KVCacheManager<T>* kv_manager, Config config)
+    : kv_manager_(kv_manager), config_(std::move(config)) {
+  std::string graph_name = "model.0.s" + std::to_string(config_.ar_len);
+  module_ = std::make_unique<QnnAOTModule>(config_.model_path, graph_name);
+  module_->to(kQNN);
+}
+
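+// The AOT bundle names each compiled graph "model.0.s<ar_len>": the prompt
+// processor binds the chunked prefill graph, while TokenGenerator binds the
+// single-token graph "model.0.s1".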
+template <typename T>
+void PromptProcessor<T>::init_io() {
+  input_tensors_.reserve(3 + 2 * config_.num_layers);
+
+  // 1. Input IDs
+  auto input_ids = Tensor::empty({1, config_.ar_len}, kInt32, kQNN).alloc();
+  input_ids.setName("input_ids");
+  input_tensors_.push_back(input_ids);
+
+  // // 2. Sliding Window Attention Mask
+  // input_tensors_.push_back(Tensor::empty({1, 1, config_.ar_len, config_.context_len}, kUInt16, kQNN).alloc());
+
+  // 3. Position IDs
+  auto pos_ids = Tensor::empty({config_.ar_len}, kInt32, kQNN).alloc();
+  pos_ids.setName("position_ids");
+  input_tensors_.push_back(pos_ids);
+
+  // 4. Attention Mask
+  auto attn_mask = Tensor::empty({1, 1, config_.ar_len, config_.context_len}, kUInt16, kQNN).alloc();
+  attn_mask.setName("attention_mask");
+  input_tensors_.push_back(attn_mask);
+
+  // 5. KV Caches
+  const auto& k_caches = kv_manager_->getKCache();
+  const auto& v_caches = kv_manager_->getVCache();
+  for (int l = 0; l < config_.num_layers; ++l) {
+    // K
+    auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.head_dim, config_.context_len - config_.ar_len},
+                                  config_.kv_dtype, kQNN);
+    k_tensor.impl()->storage()->ptr_ = k_caches[l].buffer;
+    k_tensor.impl()->storage()->mem_type_ = kManual;
+    k_tensor.setName("past_key_" + std::to_string(l));
+    input_tensors_.push_back(k_tensor);
+
+    // V
+    auto v_tensor = Tensor::empty({1, (int)config_.num_heads, config_.context_len - config_.ar_len, config_.head_dim},
+                                  config_.kv_dtype, kQNN);
+    v_tensor.impl()->storage()->ptr_ = v_caches[l].buffer;
+    v_tensor.impl()->storage()->mem_type_ = kManual;
+    v_tensor.setName("past_value_" + std::to_string(l));
+    input_tensors_.push_back(v_tensor);
+  }
+
+  // Output Tensors
+  output_tensors_.reserve(1 + 2 * config_.num_layers);
+
+  // 1. Logits
+  auto logits = Tensor::empty({1, 1, config_.ar_len, config_.vocab_size}, kUInt16, kQNN).alloc();
+  logits.setName("logits");
+  output_tensors_.push_back(logits);
+
+  // 2. KV Caches
+  for (int l = 0; l < config_.num_layers; ++l) {
+    // K Output
+    auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.head_dim, config_.ar_len}, config_.kv_dtype, kQNN);
+    k_tensor.impl()->storage()->ptr_ = k_caches[l].output_buffer;
+    k_tensor.impl()->storage()->mem_type_ = kManual;
+    k_tensor.setName("present_key_" + std::to_string(l));
+    output_tensors_.push_back(k_tensor);
+
+    // V Output
+    auto v_tensor = Tensor::empty({1, (int)config_.num_heads, config_.ar_len, config_.head_dim}, config_.kv_dtype, kQNN);
+    v_tensor.impl()->storage()->ptr_ = v_caches[l].output_buffer;
+    v_tensor.impl()->storage()->mem_type_ = kManual;
+    v_tensor.setName("present_value_" + std::to_string(l));
+    output_tensors_.push_back(v_tensor);
+  }
+}
+
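+// The past/present KV tensors use mem_type_ = kManual and point directly at
+// the KVCacheManager buffers, so the graph reads and writes the cache in
+// place and updateCache() only moves the freshly produced rows.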
+template <typename T>
+void PromptProcessor<T>::prepare_io(const std::vector<int64_t>& prompt_tokens, int64_t prompt_pos, int64_t start_pos) {
+  int64_t num_tokens = prompt_tokens.size();
+  int64_t chunk_size = std::min((int64_t)config_.ar_len, num_tokens - prompt_pos);
+
+  // 1. Input IDs
+  int32_t* input_ids_ptr = input_tensors_[0].ptr<int32_t>();
+  for (int i = 0; i < config_.ar_len; ++i) {
+    if (i < chunk_size) {
+      input_ids_ptr[i] = (int32_t)prompt_tokens[prompt_pos + i];
+    } else {
+      input_ids_ptr[i] = 0;  // Padding
+    }
+  }
+
+  // 2. Position IDs
+  int32_t* pos_ids_ptr = input_tensors_[1].ptr<int32_t>();
+  for (int i = 0; i < config_.ar_len; ++i) { pos_ids_ptr[i] = (int32_t)(start_pos + i); }
+
+  // 3. Attention Mask
+  // We need to re-calculate attention mask based on start_pos
+  std::vector<uint16_t> attn_mask_data(config_.ar_len * config_.context_len);
+  std::vector<int32_t> attention_map(config_.ar_len);
+  for (int i = 0; i < config_.ar_len; ++i) {
+    if (i == 0) {
+      attention_map[i] = -1;
+    } else {
+      attention_map[i] = i - 1;
+    }
+  }
+
+  kv_manager_->initAttentionMask(attn_mask_data.data(), attention_map, config_.ar_len, start_pos);
+
+  uint16_t* attn_mask_ptr = input_tensors_[2].ptr<uint16_t>();
+  for (size_t k = 0; k < attn_mask_data.size(); ++k) { attn_mask_ptr[k] = (uint16_t)attn_mask_data[k]; }
+}
+
+template <typename T>
+int64_t PromptProcessor<T>::prefill(const std::vector<int64_t>& prompt_tokens, int64_t start_pos) {
+  MLLM_INFO("perform prefill");
+
+  int64_t num_tokens = prompt_tokens.size();
+  int64_t current_pos = start_pos;
+  int64_t processed_tokens = 0;
+
+  // Ensure KV cache is arranged for ar_len
+  kv_manager_->rearrangeCache(config_.ar_len);
+
+  std::vector<int32_t> attention_map(config_.ar_len);
+  std::iota(attention_map.begin(), attention_map.end(), -1);
+  kv_manager_->initAttentionMask(input_tensors_[2].ptr<uint16_t>(),  // TODO: use a member rather than an index
+                                 attention_map, config_.ar_len, start_pos);
+
+  module_->setOutputTensors(output_tensors_);
+
+  while (processed_tokens < num_tokens) {
+    int64_t chunk_size = std::min((int64_t)config_.ar_len, num_tokens - processed_tokens);
+
+    prepare_io(prompt_tokens, processed_tokens, current_pos);
+
+    // Run forward
+    output_tensors_ = (*module_)(input_tensors_);
+
+    int32_t n_update = (int32_t)chunk_size;
+
+    kv_manager_->updateCache(config_.ar_len, current_pos, n_update, {});
+
+    kv_manager_->updateAttentionMask(input_tensors_[2].ptr<uint16_t>(), config_.ar_len, current_pos, n_update,
+                                     config_.sliding_window);
+
+    processed_tokens += chunk_size;
+    current_pos += chunk_size;
+  }
+
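+  // The final chunk is zero-padded to ar_len, so the logits row of the last
+  // real token is (num_tokens - 1) % ar_len; adding ar_len before the modulo
+  // does not change the residue.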
+  auto logits = output_tensors_[0][{kAll, (num_tokens + config_.ar_len - 1) % config_.ar_len, kAll}];
+
+  auto cur_token = module_->sampleGreedy(logits);
+
+  return cur_token;
+}
+
+// Explicit instantiations
+template class PromptProcessor<uint8_t>;
+template class PromptProcessor<uint16_t>;
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/PromptProcessor.hpp b/mllm/backends/qnn/aot_rt/PromptProcessor.hpp
new file mode 100644
index 000000000..c867f0f0c
--- /dev/null
+++ b/mllm/backends/qnn/aot_rt/PromptProcessor.hpp
@@ -0,0 +1,53 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/backends/qnn/aot_rt/QnnAOTModule.hpp"
+#include "mllm/backends/qnn/aot_rt/KVCacheManager.hpp"
+#include "mllm/core/Tensor.hpp"
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace mllm::qnn::aot {
+
+template <typename T>
+class PromptProcessor {
+ public:
+  struct Config {
+    std::string model_path;
+    int32_t context_len;
+    int64_t num_heads;
+    int64_t num_layers;
+    int32_t ar_len;
+    int32_t vocab_size;
+    int32_t head_dim;
+    bool use_int64_token;
+    int sliding_window;
+    DataTypes kv_dtype = kUInt8;
+  };
+
+  PromptProcessor(KVCacheManager<T>* kv_manager, Config config);
+
+  /**
+   * Prefill an LLM Module with the given text input.
+   * @param prompt_tokens The text prompt tokens to the LLM Module.
+   * @param start_pos The starting position in KV cache.
+   * @return The next token (or logits).
+   */
+  int64_t prefill(const std::vector<int64_t>& prompt_tokens, int64_t start_pos = 0);
+
+  void init_io();
+  void prepare_io(const std::vector<int64_t>& prompt_tokens, int64_t prompt_pos, int64_t start_pos);
+
+ private:
+  std::unique_ptr<QnnAOTModule> module_;
+  KVCacheManager<T>* kv_manager_;
+  Config config_;
+
+  std::vector<Tensor> input_tensors_;
+  std::vector<Tensor> output_tensors_;
+};
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/QnnAOTModule.cpp b/mllm/backends/qnn/aot_rt/QnnAOTModule.cpp
new file mode 100644
index 000000000..f1cf6eb1d
--- /dev/null
+++ b/mllm/backends/qnn/aot_rt/QnnAOTModule.cpp
@@ -0,0 +1,17 @@
+#include "mllm/backends/qnn/aot_rt/QnnAOTModule.hpp"
+#include "mllm/nn/Module.hpp"
+#include "mllm/utils/Log.hpp"
+#include "mllm/engine/Context.hpp"
+#include "mllm/backends/qnn/QNNBackend.hpp"
+
+namespace mllm::qnn::aot {
+
+QnnAOTModule::QnnAOTModule(const std::string& model_path, const std::string& graph_name)
+    : mllm::nn::Module(graph_name), model_path_(model_path), graph_name_(graph_name) {}
+
+std::vector<Tensor> QnnAOTModule::forward(const std::vector<Tensor>& inputs, const std::vector<AnyValue>& args) {
+  return output_tensors_;
+}
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp b/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp
new file mode 100644
index 000000000..0cfa464c5
--- /dev/null
+++ b/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "mllm/models/ARGeneration.hpp"
+#include "mllm/nn/Module.hpp"
+#include "mllm/utils/Common.hpp"
+
+#include <string>
+#include <vector>
+
+namespace mllm::qnn::aot {
+
+class QnnAOTModule : public mllm::nn::Module, public models::ARGeneration {
+ public:
+  QnnAOTModule(const std::string& model_path, const std::string& graph_name);
+
+  std::vector<Tensor> forward(const std::vector<Tensor>& inputs, const std::vector<AnyValue>& args) override;
+
+  models::ARGenerationOutputPast forward(const models::ARGenerationOutputPast& input,
+                                         const models::ARGenerationArgs& args) override {
+    NYI("ARGeneration forward is not implemented for QnnAOTModule");
+    return {};
+  };
+
+  void setOutputTensors(const std::vector<Tensor>& output_tensors) { output_tensors_ = output_tensors; }
+
+ private:
+  std::string model_path_;
+  std::string graph_name_;
+
+  std::vector<Tensor> output_tensors_;
+
+  std::string backend_path_;
+};
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp
index e69de29bb..6f0bcfd57 100644
--- a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp
+++ b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp
@@ -0,0 +1,104 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp"
+#include <memory>
+#include <unordered_set>
+#include "mllm/core/DeviceTypes.hpp"
+#include "mllm/preprocessor/tokenizers/Unicode.hpp"
+#include "mllm/utils/Log.hpp"
+
+namespace mllm::qnn::aot {
+Runner::Runner(const RunnerConfig& config, mllm::preprocessor::AutoTokenizer* tokenizer)
+    : config_(config), tokenizer_(tokenizer) {}
+
+bool Runner::load() {
+  // init KV cache manager
+  KVCacheConfig kv_config;
+  kv_config.context_len = config_.context_len;
+  kv_config.head_dim = config_.head_dim;
+
+  int32_t prompt_processor_ar_len = config_.ar_len;
+  int32_t token_generator_ar_len = 1;
+
+  if (prompt_processor_ar_len == config_.context_len) {
+    kv_config.max_cache_len = config_.context_len;
+  } else {
+    kv_config.max_cache_len = config_.context_len - std::min(token_generator_ar_len, prompt_processor_ar_len);
+  }
+  kv_config.max_ar_len = std::max(token_generator_ar_len, prompt_processor_ar_len);
+
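+  // The live cache only ever needs context_len - min(ar_len) entries per row,
+  // since a graph with AR length n reads at most context_len - n past
+  // positions; max_ar_len sizes the scratch output buffers shared by the
+  // prefill and decode graphs.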
+  kv_config.num_heads = config_.num_heads;
+  kv_config.num_layers = config_.num_layers;
+
+  kv_manager_ = std::make_unique<KVCacheManager<uint8_t>>(kv_config);
+
+  auto backend = mllm::Context::instance().getBackend(mllm::kQNN);
+  if (!backend) {
+    MLLM_ERROR("QNN Backend not found");
+    return false;
+  }
+
+  // init prompt processor(prefill)
+  PromptProcessor<uint8_t>::Config prefill_config;
+  prefill_config.model_path = config_.model_path;
+  prefill_config.context_len = config_.context_len;
+  prefill_config.num_heads = config_.num_heads;
+  prefill_config.num_layers = config_.num_layers;
+  prefill_config.ar_len = config_.ar_len;
+  prefill_config.vocab_size = config_.vocab_size;
+  prefill_config.head_dim = config_.head_dim;
+  prefill_config.use_int64_token = false;
+  prefill_config.sliding_window = config_.context_len;  // no sliding window for now
+
+  prompt_processor_ = std::make_unique<PromptProcessor<uint8_t>>(kv_manager_.get(), prefill_config);
+
+  // init token generator(decode)
+  TokenGenerator<uint8_t>::Config decode_config;
+  decode_config.model_path = config_.model_path;
+  decode_config.context_len = config_.context_len;
+  decode_config.num_heads = config_.num_heads;
+  decode_config.num_layers = config_.num_layers;
+  decode_config.vocab_size = config_.vocab_size;
+  decode_config.head_dim = config_.head_dim;
+  decode_config.use_int64_token = false;
+  decode_config.sliding_window = config_.context_len;
+
+  // TODO: EOS IDs should come from the tokenizer/config instead of being hard-coded (Qwen3 values)
+  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
+  eos_ids->insert(151643);
+  eos_ids->insert(151645);
+
+  token_generator_ =
+      std::make_unique<TokenGenerator<uint8_t>>(tokenizer_, kv_manager_.get(), std::move(eos_ids), decode_config);
+
+  kv_manager_->initCache(backend->allocator().get(), config_.ar_len);
+  prompt_processor_->init_io();
+  token_generator_->init_io();
+
+  return true;
+}
+
+void Runner::generate(std::vector<uint64_t>& prompt_tokens, int32_t seq_len,
+                      const std::function<void(const std::string&)>& token_callback) {
+  int64_t start_pos = 0;
+
+  std::vector<int64_t> prompt_tokens_i64;
+  prompt_tokens_i64.reserve(prompt_tokens.size());
+  for (auto t : prompt_tokens) prompt_tokens_i64.push_back((int64_t)t);
+
+  int64_t next_token = prompt_processor_->prefill(prompt_tokens_i64, start_pos);
+
+  prompt_tokens.push_back((uint64_t)next_token);
+  if (token_callback) {
+    std::wstring wstr = tokenizer_->detokenize(next_token);
+    std::string str = mllm::preprocessor::wideString2Utf8String(wstr);
+    token_callback(str);
+  }
+
+  int64_t cur_pos = prompt_tokens.size();
+
+  token_generator_->generate(prompt_tokens, cur_pos, seq_len, token_callback, false);
+}
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp
index e69de29bb..dc41ad09f 100644
--- a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp
+++ b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp
@@ -0,0 +1,45 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/backends/qnn/aot_rt/QnnAOTModule.hpp"
+#include "mllm/backends/qnn/aot_rt/KVCacheManager.hpp"
+#include "mllm/backends/qnn/aot_rt/PromptProcessor.hpp"
+#include "mllm/backends/qnn/aot_rt/TokenGenerator.hpp"
+#include "mllm/preprocessor/tokenizers/AutoTokenizer.hpp"
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace mllm::qnn::aot {
+
+struct RunnerConfig {
+  std::string model_path;
+  float temperature = 0.8f;
+  int num_layers = 28;
+  int num_heads = 12;
+  int head_dim = 128;
+  int vocab_size = 151936;
+  int context_len = 4096;
+  int ar_len = 128;  // Chunk size for prefill
+};
+
+class Runner {
+ public:
+  explicit Runner(const RunnerConfig& config, mllm::preprocessor::AutoTokenizer* tokenizer);
+  ~Runner() = default;
+
+  bool load();
+  void generate(std::vector<uint64_t>& prompt_tokens, int32_t seq_len,
+                const std::function<void(const std::string&)>& token_callback);
+
+ private:
+  RunnerConfig config_;
+  mllm::preprocessor::AutoTokenizer* tokenizer_;
+
+  std::unique_ptr<KVCacheManager<uint8_t>> kv_manager_;
+  std::unique_ptr<PromptProcessor<uint8_t>> prompt_processor_;
+  std::unique_ptr<TokenGenerator<uint8_t>> token_generator_;
+};
+
+}  // namespace mllm::qnn::aot
\ No newline at end of file
diff --git a/mllm/backends/qnn/aot_rt/TokenGenerator.cpp b/mllm/backends/qnn/aot_rt/TokenGenerator.cpp
new file mode 100644
index 000000000..98986ee41
--- /dev/null
+++ b/mllm/backends/qnn/aot_rt/TokenGenerator.cpp
@@ -0,0 +1,156 @@
+#include "mllm/backends/qnn/aot_rt/TokenGenerator.hpp"
+#include "mllm/preprocessor/tokenizers/Unicode.hpp"
+#include <cstdint>
+#include <string>
+
+namespace mllm::qnn::aot {
+
+template <typename T>
+TokenGenerator<T>::TokenGenerator(mllm::preprocessor::AutoTokenizer* tokenizer, KVCacheManager<T>* kv_manager,
+                                  std::unique_ptr<std::unordered_set<uint64_t>>&& eos_ids, Config config)
+    : tokenizer_(tokenizer), kv_manager_(kv_manager), eos_ids_(std::move(eos_ids)), config_(std::move(config)) {
+  std::string graph_name = "model.0.s1";
+  module_ = std::make_unique<QnnAOTModule>(config_.model_path, graph_name);
+  module_->to(kQNN);
+}
+
+template <typename T>
+void TokenGenerator<T>::init_io() {
+  input_tensors_.reserve(4 + 2 * config_.num_layers);
+
+  // 1. Input IDs
+  auto input_ids = Tensor::empty({1, 1, 1, 1}, kInt32, kQNN).alloc();
+  input_ids.setName("input_ids");
+  input_tensors_.push_back(input_ids);
+
+  // // 2. Sliding Window Attention Mask
+  // auto sliding_window_mask = Tensor::empty({1, 1, 1, config_.context_len}, kUInt16, kQNN).alloc();
+  // sliding_window_mask.setName("sliding_window_attention_mask");
+  // input_tensors_.push_back(sliding_window_mask);
+
+  // 3. Attention Mask
+  auto attn_mask = Tensor::empty({1, 1, 1, config_.context_len}, kUInt16, kQNN).alloc();
+  attn_mask.setName("attention_mask");
+  input_tensors_.push_back(attn_mask);
+
+  // 4. Position IDs
+  auto pos_ids = Tensor::empty({1, 1, 1, 1}, kInt32, kQNN).alloc();
+  pos_ids.setName("position_ids");
+  input_tensors_.push_back(pos_ids);
+
+  // 5. KV Caches
+  const auto& k_caches = kv_manager_->getKCache();
+  const auto& v_caches = kv_manager_->getVCache();
+  for (int l = 0; l < config_.num_layers; ++l) {
+    // K
+    auto k_tensor =
+        Tensor::empty({1, (int)config_.num_heads, config_.head_dim, config_.context_len - 1}, config_.kv_dtype, kQNN);
+    k_tensor.impl()->storage()->ptr_ = k_caches[l].buffer;
+    k_tensor.impl()->storage()->mem_type_ = kManual;
+    k_tensor.setName("past_key_" + std::to_string(l));
+    input_tensors_.push_back(k_tensor);
+
+    // V
+    auto v_tensor =
+        Tensor::empty({1, (int)config_.num_heads, config_.context_len - 1, config_.head_dim}, config_.kv_dtype, kQNN);
+    v_tensor.impl()->storage()->ptr_ = v_caches[l].buffer;
+    v_tensor.impl()->storage()->mem_type_ = kManual;
+    v_tensor.setName("past_value_" + std::to_string(l));
+    input_tensors_.push_back(v_tensor);
+  }
+
+  // Output Tensors
+  output_tensors_.reserve(1 + 2 * config_.num_layers);
+
+  // 1. Logits
+  auto logits = Tensor::empty({1, 1, 1, config_.vocab_size}, kUInt16, kQNN).alloc();
+  logits.setName("logits");
+  output_tensors_.push_back(logits);
+
+  // 2. KV Caches
+  for (int l = 0; l < config_.num_layers; ++l) {
+    // K Output
+    auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.head_dim, 1}, config_.kv_dtype, kQNN);
+    k_tensor.impl()->storage()->ptr_ = k_caches[l].output_buffer;
+    k_tensor.impl()->storage()->mem_type_ = kManual;
+    k_tensor.setName("present_key_" + std::to_string(l));
+    output_tensors_.push_back(k_tensor);
+
+    // V Output
+    auto v_tensor = Tensor::empty({1, (int)config_.num_heads, 1, config_.head_dim}, config_.kv_dtype, kQNN);
+    v_tensor.impl()->storage()->ptr_ = v_caches[l].output_buffer;
+    v_tensor.impl()->storage()->mem_type_ = kManual;
+    v_tensor.setName("present_value_" + std::to_string(l));
+    output_tensors_.push_back(v_tensor);
+  }
+}
+
+template <typename T>
+const std::vector<Tensor>& TokenGenerator<T>::get_all_logits() {
+  return token_all_logits_;
+}
+
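+// prepare_io only rewrites the scalar input id and position id and opens one
+// more column of the attention mask; the KV tensors alias the cache buffers,
+// so no per-step copies are required.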
+template <typename T>
+void TokenGenerator<T>::prepare_io(uint64_t cur_token, int64_t start_pos) {
+  // 1. Input IDs
+  int32_t* input_ids_ptr = input_tensors_[0].ptr<int32_t>();
+  input_ids_ptr[0] = (int32_t)cur_token;
+
+  // 2. Position IDs
+  int32_t* pos_ids_ptr = input_tensors_[2].ptr<int32_t>();
+  pos_ids_ptr[0] = (int32_t)start_pos;
+
+  // 3. Attention Mask
+  // Update attention mask for the current position
+  kv_manager_->updateAttentionMask(input_tensors_[1].ptr<uint16_t>(), 1, start_pos, 1, config_.sliding_window);
+}
+
+template <typename T>
+int64_t TokenGenerator<T>::generate(std::vector<uint64_t>& tokens, int64_t start_pos, int32_t seq_len,
+                                    const std::function<void(const std::string&)>& token_callback, bool dump_logits) {
+  int64_t current_pos = start_pos;
+  uint64_t next_token = tokens.back();
+  int64_t generated_count = 0;
+
+  // Ensure KV cache is arranged for decode (1 token)
+  kv_manager_->rearrangeCache(1);
+
+  module_->setOutputTensors(output_tensors_);
+
+  for (int i = 0; i < seq_len; ++i) {
+    if (current_pos >= config_.context_len) { break; }
+
+    prepare_io(next_token, current_pos);
+
+    output_tensors_ = (*module_)(input_tensors_);
+
+    // Update KV Cache
+    int32_t n_update = 1;
+    kv_manager_->updateCache(1, current_pos, n_update, {});
+
+    // Get logits
+    auto logits_tensor = output_tensors_[0];
+
+    // Sample
+    auto cur_token = module_->sampleGreedy(logits_tensor);
+
+    next_token = cur_token;
+    tokens.push_back(next_token);
+    current_pos++;
+    generated_count++;
+
+    if (token_callback) {
+      std::wstring wstr = tokenizer_->detokenize(next_token);
+      std::string str = mllm::preprocessor::wideString2Utf8String(wstr);
+      token_callback(str);
+    }
+
+    if (eos_ids_ && eos_ids_->count(next_token)) { break; }
+  }
+
+  return generated_count;
+}
+
+// Explicit instantiations
+template class TokenGenerator<uint8_t>;
+template class TokenGenerator<uint16_t>;
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/TokenGenerator.hpp b/mllm/backends/qnn/aot_rt/TokenGenerator.hpp
new file mode 100644
index 000000000..5c23da325
--- /dev/null
+++ b/mllm/backends/qnn/aot_rt/TokenGenerator.hpp
@@ -0,0 +1,56 @@
+#pragma once
+
+#include "mllm/backends/qnn/aot_rt/QnnAOTModule.hpp"
+#include "mllm/backends/qnn/aot_rt/KVCacheManager.hpp"
+#include "mllm/preprocessor/tokenizers/AutoTokenizer.hpp"
+#include "mllm/core/Tensor.hpp"
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+namespace mllm::qnn::aot {
+
+template <typename T>
+class TokenGenerator {
+ public:
+  struct Config {
+    std::string model_path;
+    int32_t context_len;
+    int64_t num_heads;
+    int64_t num_layers;
+    int32_t vocab_size;
+    int32_t head_dim;
+    bool use_int64_token;
+    int sliding_window;
+    DataTypes kv_dtype = kUInt8;
+  };
+
+  TokenGenerator(mllm::preprocessor::AutoTokenizer* tokenizer, KVCacheManager<T>* kv_manager,
+                 std::unique_ptr<std::unordered_set<uint64_t>>&& eos_ids, Config config);
+
+  virtual ~TokenGenerator() = default;
+
+  void init_io();
+
+  virtual const std::vector<Tensor>& get_all_logits();
+
+  virtual int64_t generate(std::vector<uint64_t>& tokens, int64_t start_pos, int32_t seq_len,
+                           const std::function<void(const std::string&)>& token_callback, bool dump_logits);
+
+ protected:
+  mllm::preprocessor::AutoTokenizer* tokenizer_;
+  std::unique_ptr<QnnAOTModule> module_;
+  KVCacheManager<T>* kv_manager_;
+  std::unique_ptr<std::unordered_set<uint64_t>> eos_ids_;
+  Config config_;
+
+  std::vector<Tensor> input_tensors_;
+  std::vector<Tensor> output_tensors_;
+  std::vector<Tensor> token_all_logits_;
+
+  void prepare_io(uint64_t cur_token, int64_t start_pos);
+};
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/utils/MaskGen.cpp b/mllm/backends/qnn/aot_rt/utils/MaskGen.cpp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/mllm/backends/qnn/aot_rt/utils/MaskGen.hpp b/mllm/backends/qnn/aot_rt/utils/MaskGen.hpp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/mllm/backends/qnn/aot_rt/utils/PositionIdGen.cpp b/mllm/backends/qnn/aot_rt/utils/PositionIdGen.cpp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/mllm/backends/qnn/aot_rt/utils/PositionIdGen.hpp b/mllm/backends/qnn/aot_rt/utils/PositionIdGen.hpp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/mllm/backends/qnn/aot_rt/utils/RoPEGen.cpp b/mllm/backends/qnn/aot_rt/utils/RoPEGen.cpp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/mllm/backends/qnn/aot_rt/utils/RoPEGen.hpp b/mllm/backends/qnn/aot_rt/utils/RoPEGen.hpp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/mllm/mllm.hpp b/mllm/mllm.hpp
index 4a07f0ee7..27ea0abe0 100644
--- a/mllm/mllm.hpp
+++ b/mllm/mllm.hpp
@@ -197,7 +197,7 @@ extern void initAscendBackend();
 
 bool isQnnAvailable();
 
-extern void initQnnBackend();
+extern void initQnnBackend(const std::string& context_path = "qnn_context.bin");
 
 void cleanThisThread();