2 changes: 2 additions & 0 deletions examples/CMakeLists.txt
@@ -19,4 +19,6 @@ endif()

if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE OR MLLM_BUILD_QNN_BACKEND)
add_subdirectory(qwen3_qnn_aot)
add_subdirectory(qwen2_qnn_aot)
add_subdirectory(llama_qnn_aot)
endif()
14 changes: 14 additions & 0 deletions examples/llama_qnn_aot/CMakeLists.txt
@@ -0,0 +1,14 @@
# AOT targets run on x86
if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
add_executable(mllm-llama-aot-c compile.cpp)
target_link_libraries(mllm-llama-aot-c PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
target_include_directories(mllm-llama-aot-c PRIVATE ${MLLM_INCLUDE_DIR})

add_executable(mllm-llama-aot-c-sha compile_sha.cpp)
target_link_libraries(mllm-llama-aot-c-sha PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
target_include_directories(mllm-llama-aot-c-sha PRIVATE ${MLLM_INCLUDE_DIR})
endif()

add_executable(mllm-llama-aot-runner aot_run.cpp)
target_link_libraries(mllm-llama-aot-runner PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
target_include_directories(mllm-llama-aot-runner PRIVATE ${MLLM_INCLUDE_DIR})
68 changes: 68 additions & 0 deletions examples/llama_qnn_aot/aot_run.cpp
@@ -0,0 +1,68 @@
#include <iostream>
#include <fmt/core.h>
#include <mllm/mllm.hpp>
#include <string>
#include "mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp"
#include "configuration_llama3.hpp"
#include "mllm/models/llama/tokenization_tiny_llama.hpp"
#include "mllm/models/qwen3/tokenization_qwen3.hpp"

using mllm::Argparse;
using namespace mllm::qnn::aot; // NOLINT

MLLM_MAIN({
auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
auto& model_path = Argparse::add<std::string>("-m|--model").help("Model path").def("llama_qnn.mllm");
auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer").help("Tokenizer path").def("tokenizer.json");
auto& config_path = Argparse::add<std::string>("-c|--config").help("Config path").required(true);
auto& ar_len = Argparse::add<int>("--ar_len").help("Autoregressive length (chunk size)").def(128);
auto& seq_len = Argparse::add<int>("--seq_len").help("Input sequence length").def(800);
auto& gen_len = Argparse::add<int>("--gen_len").help("Generate token length").def(32);

Argparse::parse(argc, argv);

if (help.isSet()) {
Argparse::printHelp();
return 0;
}

mllm::initQnnBackend(model_path.get());

auto llama_cfg = mllm::models::llama3::Llama3Config(config_path.get());

RunnerConfig config;
config.num_layers = llama_cfg.num_hidden_layers;
config.num_heads = llama_cfg.num_attention_heads;
config.head_dim = llama_cfg.head_dim;
config.vocab_size = llama_cfg.vocab_size;
config.context_len = 1024;
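// NOTE: the context length is fixed at 1024 here and is assumed to match the
// CL value used when the AOT graphs were compiled (see compile.cpp).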
config.ar_len = ar_len.get();

// Note: Using the TinyLlama tokenizer as a placeholder.
// For production use, you should implement a Llama3Tokenizer or use
// the appropriate tokenizer for your model.
auto tokenizer = mllm::models::llama::TinyLlamaTokenizer(tokenizer_path.get());

auto input_tensor = tokenizer.convertMessage({{
.role = "user",
.content = "hello",
}});

input_tensor["sequence"] = mllm::Tensor::arange(0, seq_len.get(), 1, mllm::kInt64, mllm::kCPU).view({1, -1});

// Debug: print the placeholder sequence tensor for inspection.
mllm::print(input_tensor["sequence"].shape());
mllm::print(input_tensor["sequence"]);
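// A minimal sketch (not part of this example as written): instead of the arange
// placeholder above, one could feed the tokenized prompt directly, assuming
// convertMessage() produces a "sequence" tensor of token ids, e.g.
//
//   auto prompt = tokenizer.convertMessage({{.role = "user", .content = "hello"}});
//   input_tensor["sequence"] = prompt["sequence"];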

Runner runner(config, &tokenizer);
if (!runner.load()) {
std::cerr << "Failed to load model\n";
return 1;
}

// Generate gen_len tokens, streaming each decoded token to stdout through the callback.
runner.generate(
input_tensor["sequence"], gen_len.get(), [](const std::string& token) { std::cout << token << std::flush; }, true);
std::cout << "\n";

return 0;
});
159 changes: 159 additions & 0 deletions examples/llama_qnn_aot/compile.cpp
@@ -0,0 +1,159 @@
// Copyright (c) MLLM Team.
// Licensed under the MIT License.

#include <unordered_map>
#include <mllm/mllm.hpp>
#include <mllm/compile/PassManager.hpp>
#include <mllm/backends/qnn/aot/QnnWrappersAPI.hpp>
#include <mllm/backends/qnn/aot/passes/AOTPipeline.hpp>
#include <mllm/backends/qnn/aot/QnnTargetMachineParser.hpp>

#include "modeling_llama_qnn_aot.hpp"

using mllm::Argparse;

MLLM_MAIN({
auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
auto& model_path = Argparse::add<std::string>("-m|--model_path").help("Model file path.");
auto& model_cfg_path = Argparse::add<std::string>("-c|--config").help("Model config file path.");
auto& qnn_aot_cfg_files = Argparse::add<std::string>("-aot_cfg|--aot_config").help("AOT Config file path.");

Argparse::parse(argc, argv);

int N = 32;
int CL = 1024;
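// N is the per-step sequence (chunk) length traced for the first graph below;
// CL is the total context length. A second graph with N = 1 is traced afterwards
// for single-token decoding.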

if (help.isSet()) {
Argparse::printHelp();
return 0;
}

if (!qnn_aot_cfg_files.isSet()) {
// Print usage before exiting so the error is actionable.
Argparse::printHelp();
MLLM_ERROR_EXIT(mllm::ExitCode::kCoreError, "No input aot config file path provided");
return -1;
}

auto model_cfg = mllm::models::llama3::Llama3Config(model_cfg_path.get());
auto model = mllm::models::llama3::LlamaForCausalLM(model_cfg);
auto params = mllm::load(model_path.get(), mllm::ModelFileVersion::kV2);
// Add params for causal mask
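// These constants provide the per-tensor asymmetric UInt16 quantization parameters
// (scale / zero_point) that are attached to the causal-mask inputs when tracing below.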
{
params->push("causal_mask.scale", mllm::Tensor::constant(0.001 / 65535.f, mllm::kFloat32));
params->push("causal_mask.zero_point", mllm::Tensor::constant(65535, mllm::kInt32));
params->push("constant_zero.scale", mllm::Tensor::constant(0.001 / 65535.f, mllm::kFloat32));
params->push("constant_zero.zero_point", mllm::Tensor::constant(65535, mllm::kInt32));
}
model.load(params);

// Create Qnn AOT Model
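// NOTE: the QNN SDK (QAIRT) library path below is hard-coded for this example;
// adjust it to match the local installation.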
auto qnn_aot_env = mllm::qnn::aot::QnnAOTEnv("/opt/qcom/aistack/qairt/2.41.0.251128/lib/x86_64-linux-clang/",
mllm::qnn::aot::parseQcomTargetMachineFromJSONFile(qnn_aot_cfg_files.get()));

// Trace the graph for sequence (chunk) length N = 32.

{
// Sequence: [B, N]
// past_key_i: [B, H, D, CL-N] for each layer i
// past_value_i: [B, H, CL-N, D] for each layer i
// causal_mask: [B, 1, N, CL]
auto sequence = mllm::Tensor::zeros({1, N}, mllm::kInt32);
auto causal_mask = mllm::Tensor::zeros({1, 1, N, CL}, mllm::kUInt16);

// NOTE: force the causal mask dtype to per-tensor asymmetric UInt16
// NOTE: attach the scale and zero point to the causal mask
{
causal_mask = causal_mask.__unsafeSetDType(mllm::kUInt16PerTensorAsy);
causal_mask.attach("scale", params->pull("causal_mask.scale").impl(), true);
causal_mask.attach("zero_point", params->pull("causal_mask.zero_point").impl(), true);
}

// Create KV cache inputs for all layers
std::unordered_map<std::string, mllm::Tensor> trace_inputs;
trace_inputs["sequence"] = sequence;
trace_inputs["causal_mask"] = causal_mask;
for (int i = 0; i < model_cfg.num_hidden_layers; ++i) {
auto past_key_name = "past_key_" + std::to_string(i);
auto past_value_name = "past_value_" + std::to_string(i);
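// Create UInt8 per-tensor-symmetric KV-cache placeholders for this layer and attach
// the layer's k/v fake-quant scale and zero-point pulled from the checkpoint params.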

// clang-format off
trace_inputs[past_key_name] = mllm::Tensor::empty({
1,
model_cfg.num_key_value_heads,
model_cfg.head_dim,
CL - N,
}, mllm::kUInt8PerTensorSym);
trace_inputs[past_value_name] = mllm::Tensor::empty({1, model_cfg.num_key_value_heads, CL - N, model_cfg.head_dim}, mllm::kUInt8PerTensorSym);

trace_inputs[past_key_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_key_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);

trace_inputs[past_value_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_value_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);
// clang-format on
}

auto ir = model.trace(trace_inputs, {});

mllm::ir::PassManager pm(ir["model"]);
pm.reg(mllm::qnn::aot::createQnnAOTLoweringPipeline(&qnn_aot_env, qnn_aot_cfg_files.get(), params));
pm.run();

mllm::redirect("llama_qnn_aot_32.mir", [&]() { mllm::print(ir["model"]); });
}

// Trace the graph for sequence length N = 1 (single-token decode).
{
N = 1;

// Sequence: [B, N]
// past_key_i: [B, H, D, CL-N] for each layer i
// past_value_i: [B, H, CL-N, D] for each layer i
// causal_mask: [B, 1, N, CL]
auto sequence = mllm::Tensor::zeros({1, N}, mllm::kInt32);
auto causal_mask = mllm::Tensor::zeros({1, 1, N, CL}, mllm::kUInt16);

// NOTE: force the causal mask dtype to per-tensor asymmetric UInt16
// NOTE: attach the scale and zero point to the causal mask
{
causal_mask = causal_mask.__unsafeSetDType(mllm::kUInt16PerTensorAsy);
causal_mask.attach("scale", params->pull("causal_mask.scale").impl(), true);
causal_mask.attach("zero_point", params->pull("causal_mask.zero_point").impl(), true);
}

// Create KV cache inputs for all layers
std::unordered_map<std::string, mllm::Tensor> trace_inputs;
trace_inputs["sequence"] = sequence;
trace_inputs["causal_mask"] = causal_mask;
for (int i = 0; i < model_cfg.num_hidden_layers; ++i) {
auto past_key_name = "past_key_" + std::to_string(i);
auto past_value_name = "past_value_" + std::to_string(i);

// clang-format off
trace_inputs[past_key_name] = mllm::Tensor::empty({
1,
model_cfg.num_key_value_heads,
model_cfg.head_dim,
CL - N,
}, mllm::kUInt8PerTensorSym);
trace_inputs[past_value_name] = mllm::Tensor::empty({1, model_cfg.num_key_value_heads, CL - N, model_cfg.head_dim}, mllm::kUInt8PerTensorSym);

trace_inputs[past_key_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_key_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);

trace_inputs[past_value_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.scale").impl(), true);
trace_inputs[past_value_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.zero_point").impl(), true);
// clang-format on
}

auto ir = model.trace(trace_inputs, {});

mllm::ir::PassManager pm(ir["model"]);
pm.reg(mllm::qnn::aot::createQnnAOTLoweringPipeline(&qnn_aot_env, qnn_aot_cfg_files.get(), params));
pm.run();

mllm::redirect("llama_qnn_aot_1.mir", [&]() { mllm::print(ir["model"]); });
}

// Save the compiled QNN context containing both traced graphs to llama-lpbq.bin.
qnn_aot_env.saveContext("context.0", "llama-lpbq.bin");
});