127 changes: 16 additions & 111 deletions examples/qwen3_qnn_aot/compile.cpp
@@ -1,13 +1,13 @@
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#include <unordered_map>
#include <mllm/mllm.hpp>
#include <mllm/compile/PassManager.hpp>
#include <mllm/backends/qnn/aot/QnnWrappersAPI.hpp>
#include <mllm/backends/qnn/aot/passes/AOTPipeline.hpp>
#include <mllm/backends/qnn/aot/QnnTargetMachineParser.hpp>

#include "compile_common.hpp"
#include "modeling_qwen_qnn_aot.hpp"

using mllm::Argparse;
@@ -20,11 +20,11 @@ MLLM_MAIN({
auto& qnn_env_path = Argparse::add<std::string>("-qnn_env|--qnn_env_path")
.def("/opt/qcom/aistack/qairt/2.41.0.251128/lib/x86_64-linux-clang/")
.help("QNN AOT Environment path.");
auto& output_context_path = Argparse::add<std::string>("-o|--output_context_name").help("Output QNN context path.");

Argparse::parse(argc, argv);

int N = 32;
int CL = 1024;
constexpr int kContextLength = 1024;

if (help.isSet()) {
Argparse::printHelp();
@@ -36,128 +36,33 @@ MLLM_MAIN({
Argparse::printHelp();
return -1;
}
if (!output_context_path.isSet()) {
MLLM_ERROR_EXIT(mllm::ExitCode::kCoreError, "No output context path provided");
Argparse::printHelp();
return -1;
}

auto model_cfg = mllm::models::qwen3::Qwen3Config(model_cfg_path.get());
auto model = mllm::models::qwen3::Qwen3ForCausalLM(model_cfg);
auto params = mllm::load(model_path.get(), mllm::ModelFileVersion::kV2);
// Add params for causal mask
{
params->push("causal_mask.scale", mllm::Tensor::constant(0.001 / 65535.f, mllm::kFloat32));
params->push("causal_mask.zero_point", mllm::Tensor::constant(65535, mllm::kInt32));
params->push("constant_zero.scale", mllm::Tensor::constant(0.001 / 65535.f, mllm::kFloat32));
params->push("constant_zero.zero_point", mllm::Tensor::constant(65535, mllm::kInt32));
}
qwen3_qnn_aot::addCausalMaskParams(params);
model.load(params);

// Create Qnn AOT Model
auto qnn_aot_env = mllm::qnn::aot::QnnAOTEnv(qnn_env_path.get(),
mllm::qnn::aot::parseQcomTargetMachineFromJSONFile(qnn_aot_cfg_files.get()));

// Model length 32.

{
// Sequence: [B, N]
// past_key_i: [B, H, D, CL-N] for each layer i
// past_value_i: [B, H, CL-N, D] for each layer i
// causal_mask: [B, 1, N, CL]
auto sequence = mllm::Tensor::zeros({1, N}, mllm::kInt32);
auto causal_mask = mllm::Tensor::zeros({1, 1, N, CL}, mllm::kUInt16);

// NOTE: force set causal mask to UInt16Asy
// NOTE: Attach scale and zero point to causal mask
{
causal_mask = causal_mask.__unsafeSetDType(mllm::kUInt16PerTensorAsy);
causal_mask.attach("scale", params->pull("causal_mask.scale").impl(), true);
causal_mask.attach("zero_point", params->pull("causal_mask.zero_point").impl(), true);
}

// Create KV cache inputs for all layers
std::unordered_map<std::string, mllm::Tensor> trace_inputs;
trace_inputs["sequence"] = sequence;
trace_inputs["causal_mask"] = causal_mask;

for (int i = 0; i < model_cfg.num_hidden_layers; ++i) {
auto past_key_name = "past_key_" + std::to_string(i);
auto past_value_name = "past_value_" + std::to_string(i);

// clang-format off
trace_inputs[past_key_name] = mllm::Tensor::empty({
1,
model_cfg.num_key_value_heads,
model_cfg.head_dim,
CL - N,
}, mllm::kUInt8PerTensorSym);
trace_inputs[past_value_name] = mllm::Tensor::empty({1, model_cfg.num_key_value_heads, CL - N, model_cfg.head_dim}, mllm::kUInt8PerTensorSym);

trace_inputs[past_key_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_key_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);

trace_inputs[past_value_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_value_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);
// clang-format on
}

auto trace_and_dump = [&](int seq_len, const std::string& mir_path) {
auto trace_inputs = qwen3_qnn_aot::makeTraceInputs(seq_len, kContextLength, model_cfg, params);
auto ir = model.trace(trace_inputs, {});

mllm::ir::PassManager pm(ir["model"]);
pm.reg(mllm::qnn::aot::createQnnAOTLoweringPipeline(&qnn_aot_env, qnn_aot_cfg_files.get(), params));
pm.run();
mllm::redirect(mir_path, [&]() { mllm::print(ir["model"]); });
};

mllm::redirect("qwen3_qnn_aot_32.mir", [&]() { mllm::print(ir["model"]); });
}

// Model length 1.
{
N = 1;

// Sequence: [B, N]
// past_key_i: [B, H, D, CL-N] for each layer i
// past_value_i: [B, H, CL-N, D] for each layer i
// causal_mask: [B, 1, N, CL]
auto sequence = mllm::Tensor::zeros({1, N}, mllm::kInt32);
auto causal_mask = mllm::Tensor::zeros({1, 1, N, CL}, mllm::kUInt16);

// NOTE: force set causal mask to UInt16Asy
// NOTE: Attach scale and zero point to causal mask
{
causal_mask = causal_mask.__unsafeSetDType(mllm::kUInt16PerTensorAsy);
causal_mask.attach("scale", params->pull("causal_mask.scale").impl(), true);
causal_mask.attach("zero_point", params->pull("causal_mask.zero_point").impl(), true);
}

// Create KV cache inputs for all layers
std::unordered_map<std::string, mllm::Tensor> trace_inputs;
trace_inputs["sequence"] = sequence;
trace_inputs["causal_mask"] = causal_mask;
for (int i = 0; i < model_cfg.num_hidden_layers; ++i) {
auto past_key_name = "past_key_" + std::to_string(i);
auto past_value_name = "past_value_" + std::to_string(i);

// clang-format off
trace_inputs[past_key_name] = mllm::Tensor::empty({
1,
model_cfg.num_key_value_heads,
model_cfg.head_dim,
CL - N,
}, mllm::kUInt8PerTensorSym);
trace_inputs[past_value_name] = mllm::Tensor::empty({1, model_cfg.num_key_value_heads, CL - N, model_cfg.head_dim}, mllm::kUInt8PerTensorSym);

trace_inputs[past_key_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_key_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);

trace_inputs[past_value_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_value_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);
// clang-format on
}

auto ir = model.trace(trace_inputs, {});

mllm::ir::PassManager pm(ir["model"]);
pm.reg(mllm::qnn::aot::createQnnAOTLoweringPipeline(&qnn_aot_env, qnn_aot_cfg_files.get(), params));
pm.run();

mllm::redirect("qwen3_qnn_aot_1.mir", [&]() { mllm::print(ir["model"]); });
}
trace_and_dump(32, "qwen3_qnn_aot_32.mir");
trace_and_dump(1, "qwen3_qnn_aot_1.mir");

qnn_aot_env.saveContext("context.0", "qwen3-1.7B-lpbq.bin");
qnn_aot_env.saveContext("context.0", output_context_path.get());

⚠️ Potential issue | 🟠 Major

Propagate saveContext failures to the CLI.

QnnAOTEnv::saveContext only logs and returns on missing contexts or file-open failures, so this command can still exit successfully with no context file when -o points at an invalid destination. Please verify the file was written, or change saveContext to return a status, before returning here. The identical call in examples/qwen3_qnn_aot/compile_sha.cpp needs the same guard.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@examples/qwen3_qnn_aot/compile.cpp` at line 67, qnn_aot_env.saveContext
currently only logs failures and doesn't propagate errors, so update the compile
flow to detect failure and return a non-zero exit code: either modify
QnnAOTEnv::saveContext to return a bool/status and check that result after
calling qnn_aot_env.saveContext("context.0", output_context_path.get()), or,
after the call, verify the file was actually written (e.g., attempt to open the
output path) and, if that fails, log an error and exit non-zero. Apply the same
guard to the identical call in examples/qwen3_qnn_aot/compile_sha.cpp so the
CLI fails when the context file cannot be created.
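
A minimal guard in that spirit, sketched under assumptions: saveContext keeps its current void signature, std::filesystem (C++17) is available, and the error message is illustrative rather than the project's actual fix. It simply checks that the output file exists and is non-empty before the command returns:

#include <filesystem>

qnn_aot_env.saveContext("context.0", output_context_path.get());

// Sketch: fail the CLI when the context file was not actually written.
// (Assumes saveContext itself still only logs on failure.)
std::error_code ec;
const auto written = std::filesystem::file_size(output_context_path.get(), ec);
if (ec || written == 0) {
  MLLM_ERROR_EXIT(mllm::ExitCode::kCoreError, "Failed to write QNN context file");
}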

});
76 changes: 76 additions & 0 deletions examples/qwen3_qnn_aot/compile_common.hpp
@@ -0,0 +1,76 @@
#pragma once

#include <string>
#include <unordered_map>

#include <mllm/mllm.hpp>
#include <mllm/models/qwen3/configuration_qwen3.hpp>

namespace qwen3_qnn_aot {

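// Pushes the synthetic scale/zero-point constants that the causal mask and
// the constant-zero tensor reference during QNN AOT tracing.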
template <typename ParamsT>
inline void addCausalMaskParams(const ParamsT& params) {
params->push("causal_mask.scale", mllm::Tensor::constant(0.001 / 65535.f, mllm::kFloat32));
params->push("causal_mask.zero_point", mllm::Tensor::constant(65535, mllm::kInt32));
params->push("constant_zero.scale", mllm::Tensor::constant(0.001 / 65535.f, mllm::kFloat32));
params->push("constant_zero.zero_point", mllm::Tensor::constant(65535, mllm::kInt32));
}

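// Builds the trace inputs for a given sequence length N and context length CL:
//   sequence:     [B, N]          int32 token ids
//   causal_mask:  [B, 1, N, CL]   uint16, per-tensor asymmetric
//   past_key_i:   [B, H, D, CL-N] uint8, per-tensor symmetric (one per layer)
//   past_value_i: [B, H, CL-N, D] uint8, per-tensor symmetric (one per layer)
// Scale/zero-point tensors from the loaded params are attached to each input.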
template <typename ParamsT>
inline std::unordered_map<std::string, mllm::Tensor> makeTraceInputs(int seq_len,
int context_len,
const mllm::models::qwen3::Qwen3Config& model_cfg,
const ParamsT& params) {
auto sequence = mllm::Tensor::zeros({1, seq_len}, mllm::kInt32);
auto causal_mask = mllm::Tensor::zeros({1, 1, seq_len, context_len}, mllm::kUInt16);
causal_mask = causal_mask.__unsafeSetDType(mllm::kUInt16PerTensorAsy);
causal_mask.attach("scale", params->pull("causal_mask.scale").impl(), true);
causal_mask.attach("zero_point", params->pull("causal_mask.zero_point").impl(), true);

std::unordered_map<std::string, mllm::Tensor> trace_inputs;
trace_inputs["sequence"] = sequence;
trace_inputs["causal_mask"] = causal_mask;

for (int i = 0; i < model_cfg.num_hidden_layers; ++i) {
auto past_key_name = "past_key_" + std::to_string(i);
auto past_value_name = "past_value_" + std::to_string(i);

trace_inputs[past_key_name] = mllm::Tensor::empty({
1,
model_cfg.num_key_value_heads,
model_cfg.head_dim,
context_len - seq_len,
}, mllm::kUInt8PerTensorSym);
trace_inputs[past_value_name] = mllm::Tensor::empty({
1,
model_cfg.num_key_value_heads,
context_len - seq_len,
model_cfg.head_dim,
}, mllm::kUInt8PerTensorSym);

trace_inputs[past_key_name].attach("scale",
params->pull("model.layers." + std::to_string(i)
+ ".self_attn.k_cast_to_int8_qdq.fake_quant.scale")
.impl(),
true);
trace_inputs[past_key_name].attach("zero_point",
params->pull("model.layers." + std::to_string(i)
+ ".self_attn.k_cast_to_int8_qdq.fake_quant.zero_point")
.impl(),
true);
trace_inputs[past_value_name].attach("scale",
params->pull("model.layers." + std::to_string(i)
+ ".self_attn.v_cast_to_int8_qdq.fake_quant.scale")
.impl(),
true);
trace_inputs[past_value_name].attach("zero_point",
params->pull("model.layers." + std::to_string(i)
+ ".self_attn.v_cast_to_int8_qdq.fake_quant.zero_point")
.impl(),
true);
}

return trace_inputs;
}

} // namespace qwen3_qnn_aot