2 changes: 2 additions & 0 deletions examples/CMakeLists.txt
@@ -19,4 +19,6 @@ endif()

if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE OR MLLM_BUILD_QNN_BACKEND)
add_subdirectory(qwen3_qnn_aot)
add_subdirectory(qwen2_qnn_aot)
add_subdirectory(llama_qnn_aot)
endif()
14 changes: 14 additions & 0 deletions examples/llama_qnn_aot/CMakeLists.txt
@@ -0,0 +1,14 @@
# AOT targets run on x86
if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
add_executable(mllm-llama-aot-c compile.cpp)
target_link_libraries(mllm-llama-aot-c PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
target_include_directories(mllm-llama-aot-c PRIVATE ${MLLM_INCLUDE_DIR})

add_executable(mllm-llama-aot-c-sha compile_sha.cpp)
target_link_libraries(mllm-llama-aot-c-sha PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
target_include_directories(mllm-llama-aot-c-sha PRIVATE ${MLLM_INCLUDE_DIR})
endif()

add_executable(mllm-llama-aot-runner aot_run.cpp)
target_link_libraries(mllm-llama-aot-runner PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
target_include_directories(mllm-llama-aot-runner PRIVATE ${MLLM_INCLUDE_DIR})
68 changes: 68 additions & 0 deletions examples/llama_qnn_aot/aot_run.cpp
@@ -0,0 +1,68 @@
#include <iostream>
#include <fmt/core.h>
#include <mllm/mllm.hpp>
#include <string>
#include "mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp"
#include "configuration_llama3.hpp"
#include "mllm/models/llama/tokenization_tiny_llama.hpp"
#include "mllm/models/qwen3/tokenization_qwen3.hpp"

using mllm::Argparse;
using namespace mllm::qnn::aot; // NOLINT

MLLM_MAIN({
auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
auto& model_path = Argparse::add<std::string>("-m|--model").help("Model path").def("llama_qnn.mllm");
auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer").help("Tokenizer path").def("tokenizer.json");
auto& config_path = Argparse::add<std::string>("-c|--config").help("Config path").required(true);
auto& ar_len = Argparse::add<int>("--ar_len").help("Autoregressive length (chunk size)").def(128);
auto& seq_len = Argparse::add<int>("--seq_len").help("Input sequence length").def(800);
auto& gen_len = Argparse::add<int>("--gen_len").help("Generate token length").def(32);

Argparse::parse(argc, argv);

if (help.isSet()) {
Argparse::printHelp();
return 0;
}

mllm::initQnnBackend(model_path.get());

auto llama_cfg = mllm::models::llama3::Llama3Config(config_path.get());

RunnerConfig config;
config.num_layers = llama_cfg.num_hidden_layers;
config.num_heads = llama_cfg.num_attention_heads;
config.head_dim = llama_cfg.head_dim;
config.vocab_size = llama_cfg.vocab_size;
config.context_len = 1024;
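// NOTE: the context length is fixed at 1024 here and is assumed to match the
// CL value used when the AOT graphs were compiled (see compile.cpp).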
config.ar_len = ar_len.get();

// Note: Using the TinyLlama tokenizer as a placeholder.
// For production use, you should implement a Llama3Tokenizer or use
// the appropriate tokenizer for your model.
auto tokenizer = mllm::models::llama::TinyLlamaTokenizer(tokenizer_path.get());

auto input_tensor = tokenizer.convertMessage({{
.role = "user",
.content = "hello",
}});

input_tensor["sequence"] = mllm::Tensor::arange(0, seq_len.get(), 1, mllm::kInt64, mllm::kCPU).view({1, -1});

// Debug: print the placeholder sequence tensor for inspection.
mllm::print(input_tensor["sequence"].shape());
mllm::print(input_tensor["sequence"]);
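// A minimal sketch (not part of this example as written): instead of the arange
// placeholder above, one could feed the tokenized prompt directly, assuming
// convertMessage() produces a "sequence" tensor of token ids, e.g.
//
//   auto prompt = tokenizer.convertMessage({{.role = "user", .content = "hello"}});
//   input_tensor["sequence"] = prompt["sequence"];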

Runner runner(config, &tokenizer);
if (!runner.load()) {
std::cerr << "Failed to load model\n";
return 1;
}

// Generate gen_len tokens, streaming each decoded token to stdout through the callback.
runner.generate(
input_tensor["sequence"], gen_len.get(), [](const std::string& token) { std::cout << token << std::flush; }, true);
std::cout << "\n";

return 0;
});
159 changes: 159 additions & 0 deletions examples/llama_qnn_aot/compile.cpp
@@ -0,0 +1,159 @@
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#include <unordered_map>
#include <mllm/mllm.hpp>
#include <mllm/compile/PassManager.hpp>
#include <mllm/backends/qnn/aot/QnnWrappersAPI.hpp>
#include <mllm/backends/qnn/aot/passes/AOTPipeline.hpp>
#include <mllm/backends/qnn/aot/QnnTargetMachineParser.hpp>

#include "modeling_llama_qnn_aot.hpp"

using mllm::Argparse;

MLLM_MAIN({
auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
auto& model_path = Argparse::add<std::string>("-m|--model_path").help("Model file path.");
auto& model_cfg_path = Argparse::add<std::string>("-c|--config").help("Model config file path.");
auto& qnn_aot_cfg_files = Argparse::add<std::string>("-aot_cfg|--aot_config").help("AOT Config file path.");

Argparse::parse(argc, argv);

int N = 32;
int CL = 1024;
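// N is the per-step sequence (chunk) length traced for the first graph below;
// CL is the total context length. A second graph with N = 1 is traced afterwards
// for single-token decoding.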

if (help.isSet()) {
Argparse::printHelp();
return 0;
}

if (!qnn_aot_cfg_files.isSet()) {
// Print usage before exiting so the error is actionable.
Argparse::printHelp();
MLLM_ERROR_EXIT(mllm::ExitCode::kCoreError, "No input aot config file path provided");
return -1;
}

auto model_cfg = mllm::models::llama3::Llama3Config(model_cfg_path.get());
auto model = mllm::models::llama3::LlamaForCausalLM(model_cfg);
auto params = mllm::load(model_path.get(), mllm::ModelFileVersion::kV2);
// Add params for causal mask
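// These constants provide the per-tensor asymmetric UInt16 quantization parameters
// (scale / zero_point) that are attached to the causal-mask inputs when tracing below.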
{
params->push("causal_mask.scale", mllm::Tensor::constant(0.001 / 65535.f, mllm::kFloat32));
params->push("causal_mask.zero_point", mllm::Tensor::constant(65535, mllm::kInt32));
params->push("constant_zero.scale", mllm::Tensor::constant(0.001 / 65535.f, mllm::kFloat32));
params->push("constant_zero.zero_point", mllm::Tensor::constant(65535, mllm::kInt32));
}
model.load(params);

// Create Qnn AOT Model
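// NOTE: the QNN SDK (QAIRT) library path below is hard-coded for this example;
// adjust it to match the local installation.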
auto qnn_aot_env = mllm::qnn::aot::QnnAOTEnv("/opt/qcom/aistack/qairt/2.41.0.251128/lib/x86_64-linux-clang/",
mllm::qnn::aot::parseQcomTargetMachineFromJSONFile(qnn_aot_cfg_files.get()));

// Trace the graph for sequence (chunk) length N = 32.

{
// Sequence: [B, N]
// past_key_i: [B, H, D, CL-N] for each layer i
// past_value_i: [B, H, CL-N, D] for each layer i
// causal_mask: [B, 1, N, CL]
auto sequence = mllm::Tensor::zeros({1, N}, mllm::kInt32);
auto causal_mask = mllm::Tensor::zeros({1, 1, N, CL}, mllm::kUInt16);

// NOTE: force the causal mask dtype to per-tensor asymmetric UInt16
// NOTE: attach the scale and zero point to the causal mask
{
causal_mask = causal_mask.__unsafeSetDType(mllm::kUInt16PerTensorAsy);
causal_mask.attach("scale", params->pull("causal_mask.scale").impl(), true);
causal_mask.attach("zero_point", params->pull("causal_mask.zero_point").impl(), true);
}

// Create KV cache inputs for all layers
std::unordered_map<std::string, mllm::Tensor> trace_inputs;
trace_inputs["sequence"] = sequence;
trace_inputs["causal_mask"] = causal_mask;
for (int i = 0; i < model_cfg.num_hidden_layers; ++i) {
auto past_key_name = "past_key_" + std::to_string(i);
auto past_value_name = "past_value_" + std::to_string(i);
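// Create UInt8 per-tensor-symmetric KV-cache placeholders for this layer and attach
// the layer's k/v fake-quant scale and zero-point pulled from the checkpoint params.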

// clang-format off
trace_inputs[past_key_name] = mllm::Tensor::empty({
1,
model_cfg.num_key_value_heads,
model_cfg.head_dim,
CL - N,
}, mllm::kUInt8PerTensorSym);
trace_inputs[past_value_name] = mllm::Tensor::empty({1, model_cfg.num_key_value_heads, CL - N, model_cfg.head_dim}, mllm::kUInt8PerTensorSym);

trace_inputs[past_key_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_key_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);

trace_inputs[past_value_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_value_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);
// clang-format on
}

auto ir = model.trace(trace_inputs, {});

mllm::ir::PassManager pm(ir["model"]);
pm.reg(mllm::qnn::aot::createQnnAOTLoweringPipeline(&qnn_aot_env, qnn_aot_cfg_files.get(), params));
pm.run();

mllm::redirect("llama_qnn_aot_32.mir", [&]() { mllm::print(ir["model"]); });
}

// Trace the graph for sequence length N = 1 (single-token decode).
{
N = 1;

// Sequence: [B, N]
// past_key_i: [B, H, D, CL-N] for each layer i
// past_value_i: [B, H, CL-N, D] for each layer i
// causal_mask: [B, 1, N, CL]
auto sequence = mllm::Tensor::zeros({1, N}, mllm::kInt32);
auto causal_mask = mllm::Tensor::zeros({1, 1, N, CL}, mllm::kUInt16);

// NOTE: force the causal mask dtype to per-tensor asymmetric UInt16
// NOTE: attach the scale and zero point to the causal mask
{
causal_mask = causal_mask.__unsafeSetDType(mllm::kUInt16PerTensorAsy);
causal_mask.attach("scale", params->pull("causal_mask.scale").impl(), true);
causal_mask.attach("zero_point", params->pull("causal_mask.zero_point").impl(), true);
}

// Create KV cache inputs for all layers
std::unordered_map<std::string, mllm::Tensor> trace_inputs;
trace_inputs["sequence"] = sequence;
trace_inputs["causal_mask"] = causal_mask;
for (int i = 0; i < model_cfg.num_hidden_layers; ++i) {
auto past_key_name = "past_key_" + std::to_string(i);
auto past_value_name = "past_value_" + std::to_string(i);

// clang-format off
trace_inputs[past_key_name] = mllm::Tensor::empty({
1,
model_cfg.num_key_value_heads,
model_cfg.head_dim,
CL - N,
}, mllm::kUInt8PerTensorSym);
trace_inputs[past_value_name] = mllm::Tensor::empty({1, model_cfg.num_key_value_heads, CL - N, model_cfg.head_dim}, mllm::kUInt8PerTensorSym);

trace_inputs[past_key_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_key_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);

trace_inputs[past_value_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_value_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);
// clang-format on
}

auto ir = model.trace(trace_inputs, {});

mllm::ir::PassManager pm(ir["model"]);
pm.reg(mllm::qnn::aot::createQnnAOTLoweringPipeline(&qnn_aot_env, qnn_aot_cfg_files.get(), params));
pm.run();

mllm::redirect("llama_qnn_aot_1.mir", [&]() { mllm::print(ir["model"]); });
}

// Save the compiled QNN context containing both traced graphs to llama-lpbq.bin.
qnn_aot_env.saveContext("context.0", "llama-lpbq.bin");
});