127 changes: 16 additions & 111 deletions examples/qwen3_qnn_aot/compile.cpp
@@ -1,13 +1,13 @@
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#include <unordered_map>
#include <mllm/mllm.hpp>
#include <mllm/compile/PassManager.hpp>
#include <mllm/backends/qnn/aot/QnnWrappersAPI.hpp>
#include <mllm/backends/qnn/aot/passes/AOTPipeline.hpp>
#include <mllm/backends/qnn/aot/QnnTargetMachineParser.hpp>

#include "compile_common.hpp"
#include "modeling_qwen_qnn_aot.hpp"

using mllm::Argparse;
@@ -20,11 +20,11 @@ MLLM_MAIN({
auto& qnn_env_path = Argparse::add<std::string>("-qnn_env|--qnn_env_path")
.def("/opt/qcom/aistack/qairt/2.41.0.251128/lib/x86_64-linux-clang/")
.help("QNN AOT Environment path.");
auto& output_context_path = Argparse::add<std::string>("-o|--output_context_name").help("Output QNN context path.");

Argparse::parse(argc, argv);

int N = 32;
int CL = 1024;
constexpr int kContextLength = 1024;

if (help.isSet()) {
Argparse::printHelp();
@@ -36,128 +36,33 @@ MLLM_MAIN({
Argparse::printHelp();
return -1;
}
if (!output_context_path.isSet()) {
MLLM_ERROR_EXIT(mllm::ExitCode::kCoreError, "No output context path provided");
Argparse::printHelp();
return -1;
}

auto model_cfg = mllm::models::qwen3::Qwen3Config(model_cfg_path.get());
auto model = mllm::models::qwen3::Qwen3ForCausalLM(model_cfg);
auto params = mllm::load(model_path.get(), mllm::ModelFileVersion::kV2);
// Add params for causal mask
{
params->push("causal_mask.scale", mllm::Tensor::constant(0.001 / 65535.f, mllm::kFloat32));
params->push("causal_mask.zero_point", mllm::Tensor::constant(65535, mllm::kInt32));
params->push("constant_zero.scale", mllm::Tensor::constant(0.001 / 65535.f, mllm::kFloat32));
params->push("constant_zero.zero_point", mllm::Tensor::constant(65535, mllm::kInt32));
}
qwen3_qnn_aot::addCausalMaskParams(params);
model.load(params);

// Create Qnn AOT Model
auto qnn_aot_env = mllm::qnn::aot::QnnAOTEnv(qnn_env_path.get(),
mllm::qnn::aot::parseQcomTargetMachineFromJSONFile(qnn_aot_cfg_files.get()));

// Model length 32.

{
// Sequence: [B, N]
// past_key_i: [B, H, D, CL-N] for each layer i
// past_value_i: [B, H, CL-N, D] for each layer i
// causal_mask: [B, 1, N, CL]
auto sequence = mllm::Tensor::zeros({1, N}, mllm::kInt32);
auto causal_mask = mllm::Tensor::zeros({1, 1, N, CL}, mllm::kUInt16);

// NOTE: force set causal mask to UInt16Asy
// NOTE: Attach scale and zero point to causal mask
{
causal_mask = causal_mask.__unsafeSetDType(mllm::kUInt16PerTensorAsy);
causal_mask.attach("scale", params->pull("causal_mask.scale").impl(), true);
causal_mask.attach("zero_point", params->pull("causal_mask.zero_point").impl(), true);
}

// Create KV cache inputs for all layers
std::unordered_map<std::string, mllm::Tensor> trace_inputs;
trace_inputs["sequence"] = sequence;
trace_inputs["causal_mask"] = causal_mask;

for (int i = 0; i < model_cfg.num_hidden_layers; ++i) {
auto past_key_name = "past_key_" + std::to_string(i);
auto past_value_name = "past_value_" + std::to_string(i);

// clang-format off
trace_inputs[past_key_name] = mllm::Tensor::empty({
1,
model_cfg.num_key_value_heads,
model_cfg.head_dim,
CL - N,
}, mllm::kUInt8PerTensorSym);
trace_inputs[past_value_name] = mllm::Tensor::empty({1, model_cfg.num_key_value_heads, CL - N, model_cfg.head_dim}, mllm::kUInt8PerTensorSym);

trace_inputs[past_key_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_key_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);

trace_inputs[past_value_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_value_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);
// clang-format on
}

auto trace_and_dump = [&](int seq_len, const std::string& mir_path) {
auto trace_inputs = qwen3_qnn_aot::makeTraceInputs(seq_len, kContextLength, model_cfg, params);
auto ir = model.trace(trace_inputs, {});

mllm::ir::PassManager pm(ir["model"]);
pm.reg(mllm::qnn::aot::createQnnAOTLoweringPipeline(&qnn_aot_env, qnn_aot_cfg_files.get(), params));
pm.run();
mllm::redirect(mir_path, [&]() { mllm::print(ir["model"]); });
};

mllm::redirect("qwen3_qnn_aot_32.mir", [&]() { mllm::print(ir["model"]); });
}

// Model length 1.
{
N = 1;

// Sequence: [B, N]
// past_key_i: [B, H, D, CL-N] for each layer i
// past_value_i: [B, H, CL-N, D] for each layer i
// causal_mask: [B, 1, N, CL]
auto sequence = mllm::Tensor::zeros({1, N}, mllm::kInt32);
auto causal_mask = mllm::Tensor::zeros({1, 1, N, CL}, mllm::kUInt16);

// NOTE: force set causal mask to UInt16Asy
// NOTE: Attach scale and zero point to causal mask
{
causal_mask = causal_mask.__unsafeSetDType(mllm::kUInt16PerTensorAsy);
causal_mask.attach("scale", params->pull("causal_mask.scale").impl(), true);
causal_mask.attach("zero_point", params->pull("causal_mask.zero_point").impl(), true);
}

// Create KV cache inputs for all layers
std::unordered_map<std::string, mllm::Tensor> trace_inputs;
trace_inputs["sequence"] = sequence;
trace_inputs["causal_mask"] = causal_mask;
for (int i = 0; i < model_cfg.num_hidden_layers; ++i) {
auto past_key_name = "past_key_" + std::to_string(i);
auto past_value_name = "past_value_" + std::to_string(i);

// clang-format off
trace_inputs[past_key_name] = mllm::Tensor::empty({
1,
model_cfg.num_key_value_heads,
model_cfg.head_dim,
CL - N,
}, mllm::kUInt8PerTensorSym);
trace_inputs[past_value_name] = mllm::Tensor::empty({1, model_cfg.num_key_value_heads, CL - N, model_cfg.head_dim}, mllm::kUInt8PerTensorSym);

trace_inputs[past_key_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_key_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);

trace_inputs[past_value_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_value_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);
// clang-format on
}

auto ir = model.trace(trace_inputs, {});

mllm::ir::PassManager pm(ir["model"]);
pm.reg(mllm::qnn::aot::createQnnAOTLoweringPipeline(&qnn_aot_env, qnn_aot_cfg_files.get(), params));
pm.run();

mllm::redirect("qwen3_qnn_aot_1.mir", [&]() { mllm::print(ir["model"]); });
}
trace_and_dump(32, "qwen3_qnn_aot_32.mir");
trace_and_dump(1, "qwen3_qnn_aot_1.mir");

qnn_aot_env.saveContext("context.0", "qwen3-1.7B-lpbq.bin");
qnn_aot_env.saveContext("context.0", output_context_path.get());

⚠️ Potential issue | 🟠 Major

Propagate saveContext failures to the CLI.

QnnAOTEnv::saveContext only logs and returns on missing contexts or file-open failures, so this command can still exit successfully with no context file when -o points at an invalid destination. Please verify the file was written, or change saveContext to return a status, before returning here. The identical call in examples/qwen3_qnn_aot/compile_sha.cpp needs the same guard.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@examples/qwen3_qnn_aot/compile.cpp` at line 67, qnn_aot_env.saveContext
currently only logs failures and doesn't propagate errors, so update the compile
flow to detect failure and return a non-zero exit code: either modify
QnnAOTEnv::saveContext to return a bool/status and check that result after
calling qnn_aot_env.saveContext("context.0", output_context_path.get()), or,
after the call, verify the file was actually written (e.g., attempt to open the
output path) and, if that fails, log an error and exit non-zero. Apply the same
guard to the identical call in examples/qwen3_qnn_aot/compile_sha.cpp so the
CLI fails when the context file cannot be created.
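
A minimal guard in that spirit, sketched under assumptions: saveContext keeps its current void signature, std::filesystem (C++17) is available, and the error message is illustrative rather than the project's actual fix. It simply checks that the output file exists and is non-empty before the command returns:

#include <filesystem>

qnn_aot_env.saveContext("context.0", output_context_path.get());

// Sketch: fail the CLI when the context file was not actually written.
// (Assumes saveContext itself still only logs on failure.)
std::error_code ec;
const auto written = std::filesystem::file_size(output_context_path.get(), ec);
if (ec || written == 0) {
  MLLM_ERROR_EXIT(mllm::ExitCode::kCoreError, "Failed to write QNN context file");
}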

});
76 changes: 76 additions & 0 deletions examples/qwen3_qnn_aot/compile_common.hpp
@@ -0,0 +1,76 @@
#pragma once

#include <string>
#include <unordered_map>

#include <mllm/mllm.hpp>
#include <mllm/models/qwen3/configuration_qwen3.hpp>

namespace qwen3_qnn_aot {

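// Pushes the synthetic scale/zero-point constants that the causal mask and
// the constant-zero tensor reference during QNN AOT tracing.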
template <typename ParamsT>
inline void addCausalMaskParams(const ParamsT& params) {
params->push("causal_mask.scale", mllm::Tensor::constant(0.001 / 65535.f, mllm::kFloat32));
params->push("causal_mask.zero_point", mllm::Tensor::constant(65535, mllm::kInt32));
params->push("constant_zero.scale", mllm::Tensor::constant(0.001 / 65535.f, mllm::kFloat32));
params->push("constant_zero.zero_point", mllm::Tensor::constant(65535, mllm::kInt32));
}

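// Builds the trace inputs for a given sequence length N and context length CL:
//   sequence:     [B, N]          int32 token ids
//   causal_mask:  [B, 1, N, CL]   uint16, per-tensor asymmetric
//   past_key_i:   [B, H, D, CL-N] uint8, per-tensor symmetric (one per layer)
//   past_value_i: [B, H, CL-N, D] uint8, per-tensor symmetric (one per layer)
// Scale/zero-point tensors from the loaded params are attached to each input.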
template <typename ParamsT>
inline std::unordered_map<std::string, mllm::Tensor> makeTraceInputs(int seq_len,
int context_len,
const mllm::models::qwen3::Qwen3Config& model_cfg,
const ParamsT& params) {
auto sequence = mllm::Tensor::zeros({1, seq_len}, mllm::kInt32);
auto causal_mask = mllm::Tensor::zeros({1, 1, seq_len, context_len}, mllm::kUInt16);
causal_mask = causal_mask.__unsafeSetDType(mllm::kUInt16PerTensorAsy);
causal_mask.attach("scale", params->pull("causal_mask.scale").impl(), true);
causal_mask.attach("zero_point", params->pull("causal_mask.zero_point").impl(), true);

std::unordered_map<std::string, mllm::Tensor> trace_inputs;
trace_inputs["sequence"] = sequence;
trace_inputs["causal_mask"] = causal_mask;

for (int i = 0; i < model_cfg.num_hidden_layers; ++i) {
auto past_key_name = "past_key_" + std::to_string(i);
auto past_value_name = "past_value_" + std::to_string(i);

trace_inputs[past_key_name] = mllm::Tensor::empty({
1,
model_cfg.num_key_value_heads,
model_cfg.head_dim,
context_len - seq_len,
}, mllm::kUInt8PerTensorSym);
trace_inputs[past_value_name] = mllm::Tensor::empty({
1,
model_cfg.num_key_value_heads,
context_len - seq_len,
model_cfg.head_dim,
}, mllm::kUInt8PerTensorSym);

trace_inputs[past_key_name].attach("scale",
params->pull("model.layers." + std::to_string(i)
+ ".self_attn.k_cast_to_int8_qdq.fake_quant.scale")
.impl(),
true);
trace_inputs[past_key_name].attach("zero_point",
params->pull("model.layers." + std::to_string(i)
+ ".self_attn.k_cast_to_int8_qdq.fake_quant.zero_point")
.impl(),
true);
trace_inputs[past_value_name].attach("scale",
params->pull("model.layers." + std::to_string(i)
+ ".self_attn.v_cast_to_int8_qdq.fake_quant.scale")
.impl(),
true);
trace_inputs[past_value_name].attach("zero_point",
params->pull("model.layers." + std::to_string(i)
+ ".self_attn.v_cast_to_int8_qdq.fake_quant.zero_point")
.impl(),
true);
}

return trace_inputs;
}

} // namespace qwen3_qnn_aot