diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index f963dc9e2..180c3cbe6 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -17,6 +17,6 @@ if(MLLM_TRACY_ENABLE)
   add_subdirectory(tracy_example)
 endif()
 
-if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
+if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE OR MLLM_BUILD_QNN_BACKEND)
   add_subdirectory(qwen3_qnn_aot)
 endif()
diff --git a/examples/qwen3_qnn_aot/CMakeLists.txt b/examples/qwen3_qnn_aot/CMakeLists.txt
index efc9f2db8..18041bdcb 100644
--- a/examples/qwen3_qnn_aot/CMakeLists.txt
+++ b/examples/qwen3_qnn_aot/CMakeLists.txt
@@ -1,3 +1,10 @@
-add_executable(mllm-qwen3-aot-c compile.cpp)
-target_link_libraries(mllm-qwen3-aot-c PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
-target_include_directories(mllm-qwen3-aot-c PRIVATE ${MLLM_INCLUDE_DIR})
+# AOT compile targets run on x86 only
+if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
+  add_executable(mllm-qwen3-aot-c compile.cpp)
+  target_link_libraries(mllm-qwen3-aot-c PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
+  target_include_directories(mllm-qwen3-aot-c PRIVATE ${MLLM_INCLUDE_DIR})
+endif()
+
+add_executable(mllm-qwen3-aot-runner aot_run.cpp)
+target_link_libraries(mllm-qwen3-aot-runner PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
+target_include_directories(mllm-qwen3-aot-runner PRIVATE ${MLLM_INCLUDE_DIR})
\ No newline at end of file
diff --git a/examples/qwen3_qnn_aot/aot_run.cpp b/examples/qwen3_qnn_aot/aot_run.cpp
new file mode 100644
index 000000000..56203bc14
--- /dev/null
+++ b/examples/qwen3_qnn_aot/aot_run.cpp
@@ -0,0 +1,64 @@
+#include <cstdint>
+#include <iostream>
+#include <string>
+#include <vector>
+#include "mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp"
+#include "mllm/models/qwen3/configuration_qwen3.hpp"
+#include "mllm/models/qwen3/tokenization_qwen3.hpp"
+
+using mllm::Argparse;
+using namespace mllm::qnn::aot;  // NOLINT
+
+MLLM_MAIN({
+  auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
+  auto& model_path = Argparse::add<std::string>("-m|--model").help("Model path").def("qwen3_qnn.mllm");
+  auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer").help("Tokenizer path").def("tokenizer.json");
+  auto& config_path = Argparse::add<std::string>("-c|--config").help("Config path").required(true);
+  auto& temperature = Argparse::add<float>("--temperature").help("Temperature").def(0.8f);
+  auto& ar_len = Argparse::add<int>("--ar_len").help("Autoregressive length (chunk size)").def(128);
+
+  Argparse::parse(argc, argv);
+
+  if (help.isSet()) {
+    Argparse::printHelp();
+    return 0;
+  }
+
+  mllm::initQnnBackend(model_path.get());
+
+  auto qwen3_cfg = mllm::models::qwen3::Qwen3Config(config_path.get());
+
+  RunnerConfig config;
+  config.model_path = model_path.get();
+  config.temperature = temperature.get();
+  config.num_layers = qwen3_cfg.num_hidden_layers;
+  config.num_heads = qwen3_cfg.num_attention_heads;
+  config.head_dim = qwen3_cfg.head_dim;
+  config.vocab_size = qwen3_cfg.vocab_size;
+  config.context_len = 1024;
+  config.ar_len = ar_len.get();
+
+  auto tokenizer = mllm::models::qwen3::Qwen3Tokenizer(tokenizer_path.get());
+
+  std::string prompt_text;
+  fmt::print("💬 Prompt text (or 'exit/quit'): ");
+  std::getline(std::cin, prompt_text);
+
+  auto input_tensor = tokenizer.convertMessage({.prompt = prompt_text});
+
+  Runner runner(config, &tokenizer);
+  if (!runner.load()) {
+    std::cerr << "Failed to load model\n";
+    return 1;
+  }
+
+  std::vector<uint64_t> prompt_tokens;
+  auto sequence = input_tensor["sequence"];
+  int64_t* ptr = sequence.ptr<int64_t>();
+  for (int i = 0; i < sequence.shape()[1]; ++i) { prompt_tokens.push_back((uint64_t)ptr[i]); }
+
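+  // Stream decoded tokens to stdout as they are produced; generation is
+  // bounded by config.context_len and stops early on an EOS token.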
+  runner.generate(prompt_tokens, config.context_len, [](const std::string& token) { std::cout << token << std::flush; });
+  std::cout << "\n";
+
+  return 0;
+});
\ No newline at end of file
diff --git a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
index ce9936e7b..9eed37267 100644
--- a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
+++ b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
@@ -242,7 +242,7 @@ class Qwen3Attention final : public nn::Module {
                            "k_rope_add_0_output_qdq");
 
     // De-quantization and quantization again
-    key_states = key_states.to(kFloat16);
+    key_states = key_states.to(kFloat32);
     key_states = key_states.to(kUInt8PerTensorSym);
     key_states = ptq::QDQ_KV(this, key_states, "k_cast_to_int8_qdq");
 
@@ -251,7 +251,7 @@ class Qwen3Attention final : public nn::Module {
 
     // Handle KV Cache
     value_states = ptq::QDQ(this, value_states, "v_cast_to_int16_qdq");
-    value_states = value_states.to(kFloat16);
+    value_states = value_states.to(kFloat32);
     value_states = value_states.to(kUInt8PerTensorSym);
     value_states = ptq::QDQ_KV(this, value_states, "v_cast_to_int8_qdq");
 
diff --git a/mllm/backends/qnn/CMakeLists.txt b/mllm/backends/qnn/CMakeLists.txt
index 0e4203e08..0ad833792 100644
--- a/mllm/backends/qnn/CMakeLists.txt
+++ b/mllm/backends/qnn/CMakeLists.txt
@@ -21,6 +21,12 @@ if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
   list(APPEND MLLM_QNN_SRC ${MLLM_QUALCOMM_AOT_SRC})
 endif()
 
+file(GLOB_RECURSE MLLM_QUALCOMM_AOT_RT_SRC
+  ${CMAKE_CURRENT_LIST_DIR}/aot_rt/*.hpp
+  ${CMAKE_CURRENT_LIST_DIR}/aot_rt/*.cpp
+)
+list(APPEND MLLM_QNN_SRC ${MLLM_QUALCOMM_AOT_RT_SRC})
+
 add_library(
   MllmQNNBackend
   SHARED
diff --git a/mllm/backends/qnn/QNNBackend.cpp b/mllm/backends/qnn/QNNBackend.cpp
index 5fe81efe4..abcdb6519 100644
--- a/mllm/backends/qnn/QNNBackend.cpp
+++ b/mllm/backends/qnn/QNNBackend.cpp
@@ -55,24 +55,6 @@ QNNBackend::QNNBackend() : Backend(kQNN, createQNNAllocator()) {
     MLLM_INFO("QNN backend supports early termination");
   }
 
-  bool contextStatus = false;
-  // check if the qnn_context.bin file exists
-  if (!std::filesystem::exists("qnn_context.bin")) {
-    contextStatus = runtime_->createContext(context_, nullptr);
-  } else {
-    contextStatus = runtime_->retrieveContext(context_, qnnModels_, nullptr);
-
-    // fill qnnModelIndexMap_ info according to qnnModels_
-    for (size_t i = 0; i < qnnModels_.size(); i++) {
-      auto graphName = qnnModels_[i]->getQnnGraphName();
-      qnnModelIndexMap_.insert(std::make_pair(graphName, i));
-    }
-  }
-  if (!contextStatus) { MLLM_ERROR_EXIT(1, "Failed to create QNN context"); }
-
-  // init QNN Allocator
-  static_pointer_cast<QNNAllocator>(allocator_)->setQNNPointer(runtime_->qnnInterface, context_);
-
   // set performance parameters for better performance on HTP
   perf_ = QNNPerf::create(&runtime_->qnnInterface);
   perf_->setPowerConfigBurst();
@@ -348,10 +330,10 @@ bool QNNRuntime::createContext(Qnn_ContextHandle_t& context, QnnContext_Config_t
   return true;
 }
 
-bool QNNRuntime::retrieveContext(Qnn_ContextHandle_t& context, std::vector<std::shared_ptr<QNNModel>>& qnnModels,
-                                 QnnContext_Config_t** contextConfig) {
+bool QNNRuntime::retrieveContext(const std::string& contextBinaryPath, Qnn_ContextHandle_t& context,
+                                 std::vector<std::shared_ptr<QNNModel>>& qnnModels, QnnContext_Config_t** contextConfig) {
   // Read the binary from qnn_context.bin and get the size in byte
-  std::ifstream file(QNN_Context_File, std::ios::binary | std::ios::ate);
+  std::ifstream file(contextBinaryPath, std::ios::binary | std::ios::ate);
   std::streamsize size = file.tellg();
   file.seekg(0, std::ios::beg);
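+  // Callers are expected to check that the file exists (initQnnBackend does);
+  // a missing file would otherwise surface here as tellg() == -1.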
 
@@ -436,6 +418,25 @@ bool QNNRuntime::retrieveContext(const std::string& contextBinaryPath, Qnn_Conte
   return true;
 }
 
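+// Context lifecycle is now explicit: createContext() starts an empty QNN
+// context for a fresh build, while loadContext() restores graphs from a
+// serialized context binary and rebuilds the graph-name -> model index map.
+// initQnnBackend() picks one of the two based on whether the binary exists.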
Copying " - "min(dst, src), but this may truncate data.", - graphName, i, dst_bytes, src_bytes); - } - - if (dst_bytes > 0) { - void* dst_ptr = wrapper_tensor.ptr(); - if (!dst_ptr) { - wrapper_tensor.alloc(); - dst_ptr = wrapper_tensor.ptr(); - } - - const void* src_ptr = runtime_input.ptr(); - size_t bytes_to_copy = std::min(dst_bytes, src_bytes); - if (!src_ptr) { - MLLM_ERROR("Runtime input tensor {} for graph '{}' has null data pointer", i, graphName); - return; - } - if (dst_ptr && src_ptr && dst_ptr != src_ptr) { - // Copy source data to destination buffer - // This ensures that the graph input wrapper has the correct data for execution - if (bytes_to_copy > 0) { std::memcpy(dst_ptr, src_ptr, bytes_to_copy); } - - // If source is smaller than destination, zero out the remaining bytes - // This is important for decode phase where input tensors may be smaller than prefill - // For example, decode phase may use [1, 1] input while wrapper expects [1, 128] - // Note: In current implementation with full [1, 128] tensor, this should not trigger - // but it's kept as a safety measure for future optimizations - if (src_bytes < dst_bytes) { - size_t remaining_bytes = dst_bytes - src_bytes; - std::memset(static_cast(dst_ptr) + bytes_to_copy, 0, remaining_bytes); - // Only log if zero-padding actually occurs (unexpected case) - MLLM_WARN("[QNN graphExecute] Graph '{}' input tensor {}: zero-padded {} bytes (src={} bytes, dst={} bytes)", - graphName, i, remaining_bytes, src_bytes, dst_bytes); - } - } - } + // input wrapper is empty, set wrapper's dataContainer(mllm::Tensor) + if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_input); } // Allocate and register the wrapper tensor with QNN allocator // QNNAllocator will handle registered memory descriptor when needed @@ -617,74 +572,18 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector& if (ProfilingLevel::OFF != profilingLevel_) { extractBackendProfilingInfo(runtime_->profileHandle); } - // Debug: Print last output shape from QNN actual return order (before reordering) - // Uncomment below for debugging output order issues - // if (!qnn_output_tensors.empty()) { - // const auto& last_output = qnn_output_tensors.back(); - // const auto& output_wrappers = model->getGraphOutputTensorWrappers(); - // const auto& last_wrapper = output_wrappers.back(); - // MLLM_INFO("[QNN Actual Return Order] Last output tensor '{}' shape: {}", - // last_wrapper->getName(), last_output.shape()); - // } - // Reorder outputs according to MLLM expected order const auto& expectedOrder = model->getExpectedOutputOrder(); // Resize outputs to match QNN output count first outputs.resize(qnn_output_tensors.size()); // Ensure outputs has enough space for all QNN outputs if (!expectedOrder.empty() && expectedOrder.size() == qnn_output_tensors.size()) { - // Debug: Log output order information - // Uncomment below for debugging output order issues - // MLLM_INFO("QNNBackend::graphExecute: Checking output order for graph '{}'", graphName); - // MLLM_INFO(" MLLM Expected Output Order ({} outputs):", expectedOrder.size()); - // for (size_t i = 0; i < expectedOrder.size(); i++) { - // MLLM_INFO(" [{}] {}", i, expectedOrder[i]); - // } - // MLLM_INFO(" QNN Output Order ({} outputs):", model->getGraphOutputTensorWrappers().size()); - // for (size_t i = 0; i < model->getGraphOutputTensorWrappers().size(); i++) { - // auto wrapper = model->getGraphOutputTensorWrappers()[i]; - // MLLM_INFO(" [{}] {}", i, wrapper->getName()); - // } - - // Check if 
reordering is needed - // bool needs_reordering = false; - // std::vector> mismatches; - // for (size_t i = 0; i < expectedOrder.size(); i++) { - // const std::string& expected_name = expectedOrder[i]; - // int qnn_index = model->getQnnOutputIndex(expected_name); - // if (qnn_index >= 0 && qnn_index < static_cast(qnn_output_tensors.size())) { - // if (static_cast(i) != qnn_index) { - // needs_reordering = true; - // mismatches.emplace_back(i, qnn_index); - // } - // } - // } - - // Debug: Verification messages - // Uncomment below for debugging output order issues - // if (needs_reordering) { - // MLLM_INFO(" [VERIFICATION] QNN output order DIFFERS from MLLM expected order - REORDERING REQUIRED"); - // for (const auto& [mllm_idx, qnn_idx] : mismatches) { - // MLLM_INFO(" Mismatch: MLLM[{}] expects '{}' but it's at QNN[{}]", - // mllm_idx, expectedOrder[mllm_idx], qnn_idx); - // } - // } else { - // MLLM_INFO(" [VERIFICATION] QNN output order MATCHES MLLM expected order - no reordering needed"); - // } - // Reorder outputs according to expected order for (size_t i = 0; i < expectedOrder.size(); i++) { const std::string& expected_name = expectedOrder[i]; int qnn_index = model->getQnnOutputIndex(expected_name); if (qnn_index >= 0 && qnn_index < static_cast(qnn_output_tensors.size())) { outputs[i] = qnn_output_tensors[qnn_index]; - // Debug: Mapping information - // Uncomment below for debugging output order issues - // if (static_cast(i) != qnn_index) { - // MLLM_INFO(" Mapping: MLLM[{}] = QNN[{}] (tensor: {}) [REORDERED]", i, qnn_index, expected_name); - // } else { - // MLLM_INFO(" Mapping: MLLM[{}] = QNN[{}] (tensor: {}) [SAME]", i, qnn_index, expected_name); - // } } else { MLLM_ERROR("QNNBackend::graphExecute: Failed to find QNN output index for tensor '{}' in graph '{}'", expected_name, graphName); diff --git a/mllm/backends/qnn/QNNBackend.hpp b/mllm/backends/qnn/QNNBackend.hpp index 95f212549..408b45117 100644 --- a/mllm/backends/qnn/QNNBackend.hpp +++ b/mllm/backends/qnn/QNNBackend.hpp @@ -50,8 +50,8 @@ class QNNRuntime { } bool createContext(Qnn_ContextHandle_t& context, QnnContext_Config_t** contextConfig = nullptr); - bool retrieveContext(Qnn_ContextHandle_t& context, std::vector>& qnnModels, - QnnContext_Config_t** contextConfig = nullptr); + bool retrieveContext(const std::string& contextBinaryPath, Qnn_ContextHandle_t& context, + std::vector>& qnnModels, QnnContext_Config_t** contextConfig = nullptr); private: QNN_INTERFACE_VER_TYPE qnnInterface; @@ -87,6 +87,9 @@ class QNNBackend final : public Backend { public: QNNBackend(); + bool loadContext(const std::string& contextPath); + bool createContext(); + bool isWeightOnDevice() override { return false; } // QNN Graph build interfaces diff --git a/mllm/backends/qnn/QNNUtils.cpp b/mllm/backends/qnn/QNNUtils.cpp index 6e2862dd4..271b67200 100644 --- a/mllm/backends/qnn/QNNUtils.cpp +++ b/mllm/backends/qnn/QNNUtils.cpp @@ -483,10 +483,7 @@ std::shared_ptr QNNTensorWrapper::createStaticTensor(const std } void QNNTensorWrapper::alloc() { - if (isAlloc_) { - MLLM_WARN("Tensor {} has already been allocated.", name_); - return; - } + if (isAlloc_) { MLLM_WARN("Tensor {} has already been allocated.", name_); } MLLM_RT_ASSERT(dataContainer_.device() == kQNN); // if storage is not allocated, allocate it diff --git a/mllm/backends/qnn/QNNUtils.hpp b/mllm/backends/qnn/QNNUtils.hpp index 3feed39ea..e74f27f4a 100644 --- a/mllm/backends/qnn/QNNUtils.hpp +++ b/mllm/backends/qnn/QNNUtils.hpp @@ -205,6 +205,13 @@ class QNNTensorWrapper { Tensor& 
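+  // __setDataContainer binds the runtime-provided tensor directly as this
+  // wrapper's backing storage, replacing the per-call memcpy that
+  // graphExecute used to perform for every graph input.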
+
   // Helper to set complex quantization params and manage memory
   void setScaleOffsetQuantization(const std::vector<Qnn_ScaleOffset_t>& scaleOffsets, int32_t axis);
   void setBlockwiseQuantization(const Qnn_BlockwiseExpansion_t& blockwise, const std::vector<Qnn_ScaleOffset_t>& scaleOffsets);
diff --git a/mllm/backends/qnn/Register.cpp b/mllm/backends/qnn/Register.cpp
index a36df64ba..158294f35 100644
--- a/mllm/backends/qnn/Register.cpp
+++ b/mllm/backends/qnn/Register.cpp
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include <memory>
+#include <filesystem>
 #include "mllm/core/BaseOp.hpp"
 #include "mllm/core/DeviceTypes.hpp"
 #include "mllm/engine/Context.hpp"
@@ -13,12 +14,17 @@
 namespace mllm {
 
 // export initQnnBackend function to initialize QNN backend
-void initQnnBackend() {
+void initQnnBackend(const std::string& context_path) {
   MLLM_RT_ASSERT(isQnnAvailable());
 
   auto& ctx = Context::instance();
 
   // 1. Register backend
   auto backend = std::make_shared<QNNBackend>();
+  if (std::filesystem::exists(context_path)) {
+    if (!backend->loadContext(context_path)) { MLLM_ERROR_EXIT(1, "Failed to load QNN context from {}", context_path); }
+  } else {
+    if (!backend->createContext()) { MLLM_ERROR_EXIT(1, "Failed to create QNN context"); }
+  }
   ctx.registerBackend(backend);
 
   // 2. Initialize memory manager
diff --git a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp
index 93709336e..829a47f2d 100644
--- a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp
+++ b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp
@@ -1,6 +1,7 @@
 // Copyright (c) MLLM Team.
 // Licensed under the MIT License.
 #include
+#include <fstream>
 
 #include
 
@@ -480,7 +481,37 @@ std::shared_ptr QnnAOTEnv::createContext(const std::string&
 }
 
 void QnnAOTEnv::saveContext(const std::string& name, const std::string& path) {
-  // TODO
+  if (contexts_.find(name) == contexts_.end()) {
+    MLLM_ERROR("QnnAOTEnv::saveContext Context {} not found", name);
+    return;
+  }
+  auto context = contexts_[name];
+
+  uint64_t binarySize = 0;
+  uint64_t writtenSize = 0;
+
+  auto status = qnn_htp_func_symbols_.qnn_interface_.contextGetBinarySize(context->qnn_ctx_handle_, &binarySize);
+  MLLM_RT_ASSERT_EQ(status, QNN_SUCCESS);
+
+  std::vector<uint8_t> binaryBuffer(binarySize);
+
+  status = qnn_htp_func_symbols_.qnn_interface_.contextGetBinary(
+      context->qnn_ctx_handle_, reinterpret_cast<void*>(binaryBuffer.data()), binarySize, &writtenSize);
+  MLLM_RT_ASSERT_EQ(status, QNN_SUCCESS);
+
+  if (binarySize < writtenSize) {
+    MLLM_ERROR("QNN context binary size mismatch: expected {} bytes, but wrote {} bytes.", binarySize, writtenSize);
+  }
+
+  std::ofstream file(path, std::ios::binary);
+  if (!file.is_open()) {
+    MLLM_ERROR("Failed to open file {} for writing QNN context.", path);
+    return;
+  }
+  file.write(reinterpret_cast<const char*>(binaryBuffer.data()), writtenSize);
+  file.close();
+
+  MLLM_INFO("QNN context {} saved to {} ({} bytes written)", name, path, writtenSize);
 }
 
 void QnnAOTEnv::destroyContext(const std::string& name) {
diff --git a/mllm/backends/qnn/aot/visitor/RMSNorm.cpp b/mllm/backends/qnn/aot/visitor/RMSNorm.cpp
index f27ff77ba..27f72e2e2 100644
--- a/mllm/backends/qnn/aot/visitor/RMSNorm.cpp
+++ b/mllm/backends/qnn/aot/visitor/RMSNorm.cpp
@@ -1,10 +1,13 @@
 // Copyright (c) MLLM Team.
 // Licensed under the MIT License.
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/Tensor.hpp"
 #include "mllm/utils/Common.hpp"
 #include "mllm/core/aops/RMSNormOp.hpp"
 #include "mllm/compile/ir/linalg/Op.hpp"
 #include "mllm/compile/ir/builtin/Attribute.hpp"
+#include "mllm/compile/ir/linalg/Attribute.hpp"
 #include "mllm/backends/qnn/aot/QnnWrappersAPI.hpp"
 #include "mllm/backends/qnn/aot/visitor/RMSNorm.hpp"
 #include "mllm/backends/qnn/aot/passes/AOTCompileContext.hpp"
@@ -40,6 +43,16 @@ bool QnnAOTRMSNormPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op)
   auto weight = writer.getContext()->lookupSymbolTable(a->getName() + ".weight")->outputs().front()->cast_<ir::tensor::TensorValue>();
 
+  // Fake bias: the QNN RmsNorm op expects a bias input in practice, even though the nn module docs treat it as optional
+  auto bias_tensor = mllm::Tensor::zeros(weight->tensor_.shape(), weight->tensor_.dtype());
+  auto bias_node = ir::tensor::TensorValue::build(writer.getContext().get(), bias_tensor);
+  bias_node->tensor_.setName(a->getName() + "_runtime_bias");
+
+  // fake bias quant recipe
+  auto quant_spec = mllm::ir::linalg::QuantizationSpecSymPerTensor::create(0, 0, kInt32, kFloat32, Tensor::ones({1}));
+  auto quant_attr = mllm::ir::linalg::LinalgIRQuantizatonSpecAttr::build(writer.getContext().get(), quant_spec);
+  bias_node->setAttr("quant_recipe", quant_attr);
+ +#include "mllm/backends/qnn/aot_rt/KVCacheManager.hpp" +#include +#include +#include "mllm/utils/Log.hpp" + +namespace mllm::qnn::aot { + +template +KVCacheManager::KVCacheManager(KVCacheConfig config) : config_(config) { + k_cache_.resize(config_.num_layers); + v_cache_.resize(config_.num_layers); + + // Calculate cache size + size_t cache_in_bytes = config_.num_layers * config_.num_heads * config_.head_dim * config_.max_cache_len * sizeof(T); + size_t cache_out_bytes = config_.num_layers * config_.num_heads * config_.head_dim * config_.max_ar_len * sizeof(T); + total_cache_size_ = 2 * (cache_in_bytes + cache_out_bytes); +} + +template +void KVCacheManager::initCache(mllm::Allocator* allocator, int32_t ar_len) { + cur_ar_len_ = ar_len; + const size_t max_in_cache_block_in_bytes = config_.max_cache_len * sizeof(T); + const size_t max_out_cache_block_in_bytes = config_.max_ar_len * sizeof(T); + + const size_t cache_in_bytes = config_.num_heads * config_.head_dim * max_in_cache_block_in_bytes; + const size_t cache_out_bytes = config_.num_heads * config_.head_dim * max_out_cache_block_in_bytes; + + // Directly use Storage created by QNNAllocator + // TODO: QNN shared buffer pool(custom mem) support + for (int layer = 0; layer < config_.num_layers; ++layer) { + // Allocate buffer for key cache and value cache + auto k_storage_in = std::make_shared(); + k_storage_in->size_ = cache_in_bytes; + allocator->alloc(k_storage_in); + memset(k_storage_in->ptr_, 0, cache_in_bytes); + + auto k_storage_out = std::make_shared(); + k_storage_out->size_ = cache_out_bytes; + allocator->alloc(k_storage_out); + memset(k_storage_out->ptr_, 0, cache_out_bytes); + + auto v_storage_in = std::make_shared(); + v_storage_in->size_ = cache_in_bytes; + allocator->alloc(v_storage_in); + memset(v_storage_in->ptr_, 0, cache_in_bytes); + + auto v_storage_out = std::make_shared(); + v_storage_out->size_ = cache_out_bytes; + allocator->alloc(v_storage_out); + memset(v_storage_out->ptr_, 0, cache_out_bytes); + + k_cache_[layer].buffer_storage = k_storage_in; + k_cache_[layer].output_buffer_storage = k_storage_out; + k_cache_[layer].buffer = reinterpret_cast(k_storage_in->ptr_); + k_cache_[layer].output_buffer = reinterpret_cast(k_storage_out->ptr_); + + v_cache_[layer].buffer_storage = v_storage_in; + v_cache_[layer].output_buffer_storage = v_storage_out; + v_cache_[layer].buffer = reinterpret_cast(v_storage_in->ptr_); + v_cache_[layer].output_buffer = reinterpret_cast(v_storage_out->ptr_); + } +} + +template +void KVCacheManager::initAttentionMask(uint16_t* attention_mask, const std::vector& attention_map, int32_t ar_len, + int32_t n_past) { + if (attention_map.size() > ar_len) { + MLLM_ERROR("The size of attention_map ({}) doesn't match with ar_len ({})", attention_map.size(), ar_len); + exit(1); + } + + uint16_t neg_val = 0; + uint16_t pos_val = 65535; + // Clear the attention mask + std::fill_n(attention_mask, ar_len * config_.context_len, neg_val); + + // SMART_MASK requires special handling of attention mask + uint16_t* past_ptr = attention_mask; + uint16_t* new_ptr = attention_mask + (config_.context_len - ar_len); + // All inputs will necessarily attend to n_past and itself + for (int i = 0; i < ar_len; i++) { + // Iterate across ar_len + if (attention_map[i] < 0) { + // If negative, attend to only past tokens + std::fill_n(past_ptr, n_past, pos_val); + } else { + // If positive, copy attention map from (relative to 0th input) parent + // Parent token index + const int32_t pidx = attention_map[i]; + uint16_t* 
+template <typename T>
+void KVCacheManager<T>::initAttentionMask(uint16_t* attention_mask, const std::vector<int32_t>& attention_map,
+                                          int32_t ar_len, int32_t n_past) {
+  if (attention_map.size() > ar_len) {
+    MLLM_ERROR("The size of attention_map ({}) doesn't match ar_len ({})", attention_map.size(), ar_len);
+    exit(1);
+  }
+
+  uint16_t neg_val = 0;
+  uint16_t pos_val = 65535;
+  // Clear the attention mask
+  std::fill_n(attention_mask, ar_len * config_.context_len, neg_val);
+
+  // SMART_MASK requires special handling of attention mask
+  uint16_t* past_ptr = attention_mask;
+  uint16_t* new_ptr = attention_mask + (config_.context_len - ar_len);
+  // All inputs will necessarily attend to n_past and itself
+  for (int i = 0; i < ar_len; i++) {
+    // Iterate across ar_len
+    if (attention_map[i] < 0) {
+      // If negative, attend to only past tokens
+      std::fill_n(past_ptr, n_past, pos_val);
+    } else {
+      // If positive, copy attention map from (relative to 0th input) parent
+      // Parent token index
+      const int32_t pidx = attention_map[i];
+      uint16_t* parent_ptr = attention_mask + pidx * config_.context_len;
+      std::memcpy(past_ptr, parent_ptr, config_.context_len * sizeof(uint16_t));
+    }
+    // Attend to itself
+    new_ptr[i] = pos_val;
+    past_ptr += config_.context_len;
+    new_ptr += config_.context_len;
+  }
+}
+
+template <typename T>
+void KVCacheManager<T>::initAttentionMask(uint16_t* attention_mask, const std::vector<int32_t>& attention_map,
+                                          int32_t ar_len, int32_t n_past, int32_t sliding_window,
+                                          const std::vector<int32_t>& position_offset) {
+  if (attention_map.size() > ar_len) {
+    MLLM_ERROR("The size of attention_map ({}) doesn't match ar_len ({})", attention_map.size(), ar_len);
+    exit(1);
+  }
+
+  uint16_t neg_val = 0;
+  uint16_t pos_val = 65535;
+  // Clear the attention mask
+  std::fill_n(attention_mask, ar_len * config_.context_len, neg_val);
+
+  // SMART_MASK requires special handling of attention mask
+  uint16_t* past_ptr = attention_mask;
+  uint16_t* new_ptr = attention_mask + (config_.context_len - ar_len);
+  // All inputs will necessarily attend to n_past and itself
+  for (int i = 0; i < ar_len; i++) {
+    // Iterate across ar_len
+    if (attention_map[i] < 0) {
+      // If negative, attend to only past tokens
+      std::fill_n(past_ptr, n_past, pos_val);
+    } else {
+      // If positive, copy attention map from (relative to 0th input) parent
+      // Parent token index
+      const int32_t pidx = attention_map[i];
+      uint16_t* parent_ptr = attention_mask + pidx * config_.context_len;
+      std::memcpy(past_ptr, parent_ptr, config_.context_len * sizeof(uint16_t));
+    }
+    // Attend to itself
+    new_ptr[i] = pos_val;
+
+    // mask by limitation of sliding_window
+    int32_t available_context_len =
+        position_offset.empty() ? sliding_window - (i + 1) - n_past : sliding_window - (position_offset[i] + 1) - n_past;
+    if (n_past > available_context_len) { std::fill_n(past_ptr, n_past - available_context_len, neg_val); }
+
+    past_ptr += config_.context_len;
+    new_ptr += config_.context_len;
+  }
+}
+
+template <typename T>
+void KVCacheManager<T>::updateAttentionMask(uint16_t* attention_mask, int32_t ar_len, int32_t n_past, int32_t n_update) {
+  uint16_t pos_val = 65535;
+  uint16_t* cur_ptr = attention_mask;
+  cur_ptr += n_past;
+
+  for (int i = 0; i < ar_len; i++) {
+    std::fill_n(cur_ptr, n_update, pos_val);
+    cur_ptr += config_.context_len;
+  }
+}
+
+template <typename T>
+void KVCacheManager<T>::updateAttentionMask(uint16_t* attention_mask, int32_t ar_len, int32_t n_past, int32_t n_update,
+                                            int32_t sliding_window, const std::vector<int32_t>& position_offset) {
+  uint16_t pos_val = 65535;
+  uint16_t neg_val = 0;
+  uint16_t* cur_ptr = attention_mask;
+  cur_ptr += n_past;
+
+  for (int i = 0; i < ar_len; i++) {
+    std::fill_n(cur_ptr, n_update, pos_val);
+    int32_t available_cache_len =
+        position_offset.empty() ? sliding_window - (i + 1) : sliding_window - (position_offset[i] + 1);
+    if (n_past + n_update > available_cache_len) {
+      std::fill_n(cur_ptr - n_past, n_past + n_update - available_cache_len, neg_val);
+    }
+    cur_ptr += config_.context_len;
+  }
+}
+
+template <typename T>
+void KVCacheManager<T>::rearrangeCache(int32_t ar_len_dst) {
+  // Don't need to rearrange if cur_ar_len_ is equal to target ar_len
+  if (cur_ar_len_ == ar_len_dst) return;
+  for (int layer = 0; layer < config_.num_layers; ++layer) {
+    rearrangeKey(k_cache_[layer], ar_len_dst);
+    rearrangeValue(v_cache_[layer], ar_len_dst);
+  }
+  // rearrange done.
+  cur_ar_len_ = ar_len_dst;
+}
+
+template <typename T>
+void KVCacheManager<T>::rearrangeKey(KVCache<T>& k_cache, int32_t ar_len_dst) {
+  const int32_t src_cache_num = (cur_ar_len_ == config_.context_len) ? config_.context_len : config_.context_len - cur_ar_len_;
+  const int32_t dst_cache_num = config_.context_len - ar_len_dst;
+  T* k_cache_in_read_ptr = k_cache.buffer;
+  T* k_cache_in_write_ptr = k_cache.buffer;
+
+  if (src_cache_num > dst_cache_num) {
+    // copy from first dimension
+    for (int i = 0; i < config_.head_dim * config_.num_heads; i++) {
+      std::memmove(k_cache_in_write_ptr, k_cache_in_read_ptr, dst_cache_num * sizeof(T));
+      k_cache_in_read_ptr += src_cache_num;
+      k_cache_in_write_ptr += dst_cache_num;
+    }
+  } else {
+    k_cache_in_read_ptr += (config_.head_dim * config_.num_heads - 1) * src_cache_num;
+    k_cache_in_write_ptr += (config_.head_dim * config_.num_heads - 1) * dst_cache_num;
+    // copy from last dimension
+    for (int i = 0; i < config_.head_dim * config_.num_heads; i++) {
+      std::memmove(k_cache_in_write_ptr, k_cache_in_read_ptr, src_cache_num * sizeof(T));
+      k_cache_in_read_ptr -= src_cache_num;
+      k_cache_in_write_ptr -= dst_cache_num;
+    }
+  }
+}
+
+template <typename T>
+void KVCacheManager<T>::rearrangeValue(KVCache<T>& v_cache, int32_t ar_len_dst) {
+  const int32_t src_cache_num = (cur_ar_len_ == config_.context_len) ? config_.context_len : config_.context_len - cur_ar_len_;
+  const int32_t dst_cache_num = config_.context_len - ar_len_dst;
+  T* v_cache_in_read_ptr = v_cache.buffer;
+  T* v_cache_in_write_ptr = v_cache.buffer;
+  if (src_cache_num > dst_cache_num) {
+    // copy from first dimension
+    for (int i = 0; i < config_.num_heads; i++) {
+      std::memmove(v_cache_in_write_ptr, v_cache_in_read_ptr, dst_cache_num * config_.head_dim * sizeof(T));
+      v_cache_in_read_ptr += src_cache_num * config_.head_dim;
+      v_cache_in_write_ptr += dst_cache_num * config_.head_dim;
+    }
+  } else {
+    v_cache_in_read_ptr += config_.head_dim * (config_.num_heads - 1) * src_cache_num;
+    v_cache_in_write_ptr += config_.head_dim * (config_.num_heads - 1) * dst_cache_num;
+    // copy from last dimension
+    for (int i = 0; i < config_.num_heads; i++) {
+      std::memmove(v_cache_in_write_ptr, v_cache_in_read_ptr, src_cache_num * config_.head_dim * sizeof(T));
+      v_cache_in_read_ptr -= src_cache_num * config_.head_dim;
+      v_cache_in_write_ptr -= dst_cache_num * config_.head_dim;
+    }
+  }
+}
+
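+// rearrangeCache repacks each layer in place when the AR length changes
+// (e.g. prefill ar_len -> decode ar_len = 1): the graphs read the cache as a
+// dense buffer with a per-row span of context_len - ar_len. Shrinking rows
+// walks forward; growing rows walks backward with memmove so no row is
+// overwritten before it has been read.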
+template <typename T>
+void KVCacheManager<T>::updateCache(int32_t ar_len, int32_t n_past, int32_t n_update, const std::vector<bool>& selected) {
+  if (cur_ar_len_ != ar_len) {
+    MLLM_ERROR("Current AR length ({}) is not matched with target AR length ({}). Please rearrange cache first.", cur_ar_len_,
+               ar_len);
+    exit(1);
+  }
+  for (int layer = 0; layer < config_.num_layers; ++layer) {
+    updateKey(k_cache_[layer], n_past, n_update, selected);
+    updateValue(v_cache_[layer], n_past, n_update, selected);
+  }
+}
+
+template <typename T>
+void KVCacheManager<T>::updateKey(KVCache<T>& k_cache, int32_t n_past, int32_t n_update, const std::vector<bool>& selected) {
+  T* write_ptr = k_cache.buffer;
+  T* read_ptr = k_cache.output_buffer;
+  const int32_t copy_size = n_update * sizeof(T);
+  const int32_t iter_size = (cur_ar_len_ == config_.context_len) ? config_.context_len : config_.context_len - cur_ar_len_;
+  const int32_t out_size = cur_ar_len_;
+  const int32_t past_size = n_past;
+  const int32_t n_iter = config_.head_dim * config_.num_heads;
+
+  write_ptr += past_size;
+  if (selected.empty()) {
+    for (int i = 0; i < n_iter; ++i) {
+      std::memcpy(write_ptr, read_ptr, copy_size);
+      write_ptr += iter_size;
+      read_ptr += out_size;
+    }
+  } else {
+    std::vector<int32_t> true_indices(n_update);
+    for (int i = 0, j = 0; i < selected.size() && j < n_update; ++i) {
+      if (selected[i]) { true_indices[j++] = i; }
+    }
+    for (int i = 0; i < n_iter; ++i) {
+      for (int j = 0; j < n_update; ++j) { write_ptr[j] = read_ptr[true_indices[j]]; }
+      write_ptr += iter_size;
+      read_ptr += out_size;
+    }
+  }
+}
+
+template <typename T>
+void KVCacheManager<T>::updateValue(KVCache<T>& v_cache, int32_t n_past, int32_t n_update, const std::vector<bool>& selected) {
+  T* write_ptr = v_cache.buffer;
+  T* read_ptr = v_cache.output_buffer;
+  const int32_t copy_size = n_update * config_.head_dim * sizeof(T);
+  const int32_t past_size = n_past * config_.head_dim;
+  const int32_t n_iter = config_.num_heads;
+  const int32_t iter_size = (cur_ar_len_ == config_.context_len) ? config_.context_len * config_.head_dim
+                                                                 : (config_.context_len - cur_ar_len_) * config_.head_dim;
+  const int32_t out_size = cur_ar_len_ * config_.head_dim;
+
+  write_ptr += past_size;
+
+  if (selected.empty()) {
+    for (int i = 0; i < n_iter; i++) {
+      std::memcpy(write_ptr, read_ptr, copy_size);
+      write_ptr += iter_size;
+      read_ptr += out_size;
+    }
+  } else {
+    for (int i = 0; i < n_iter; i++) {
+      auto wp = write_ptr;
+      auto rp = read_ptr;
+      int32_t update_cnt = 0;
+      for (auto sel : selected) {
+        if (sel) {
+          std::memcpy(wp, rp, config_.head_dim * sizeof(T));
+          wp += config_.head_dim;
+          update_cnt++;
+        }
+        rp += config_.head_dim;
+        if (update_cnt == n_update) break;
+      }
+      write_ptr += iter_size;
+      read_ptr += out_size;
+    }
+  }
+}
+
+// Explicit instantiations
+template class KVCacheManager<uint8_t>;
+template class KVCacheManager<uint16_t>;
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/KVCacheManager.hpp b/mllm/backends/qnn/aot_rt/KVCacheManager.hpp
new file mode 100644
index 000000000..fb85ff9ac
--- /dev/null
+++ b/mllm/backends/qnn/aot_rt/KVCacheManager.hpp
@@ -0,0 +1,69 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+#include "mllm/core/Storage.hpp"
+#include "mllm/backends/base/Allocator.hpp"
+
+namespace mllm::qnn::aot {
+
+template <typename T>
+struct KVCache {
+  std::shared_ptr<mllm::Storage> buffer_storage;
+  std::shared_ptr<mllm::Storage> output_buffer_storage;
+  T* buffer;
+  T* output_buffer;
+};
+
+struct KVCacheConfig {
+  int32_t context_len;
+  int64_t head_dim;
+  int32_t max_ar_len;
+  int32_t max_cache_len;
+  int64_t num_heads;
+  int64_t num_layers;
+};
+
+template <typename T>
+class KVCacheManager {
+ public:
+  explicit KVCacheManager(KVCacheConfig config);
+  ~KVCacheManager() = default;
+
+  void initCache(mllm::Allocator* allocator, int32_t ar_len);
+  void rearrangeCache(int32_t ar_len_dst);
+
+  void initAttentionMask(uint16_t* attention_mask, const std::vector<int32_t>& attention_map, int32_t ar_len, int32_t n_past);
+
+  void initAttentionMask(uint16_t* attention_mask, const std::vector<int32_t>& attention_map, int32_t ar_len, int32_t n_past,
+                         int32_t sliding_window, const std::vector<int32_t>& position_offset = {});
+
+  void updateAttentionMask(uint16_t* attention_mask, int32_t ar_len, int32_t n_past, int32_t n_update);
+
+  void updateAttentionMask(uint16_t* attention_mask, int32_t ar_len, int32_t n_past, int32_t n_update, int32_t sliding_window,
+                           const std::vector<int32_t>& position_offset = {});
+
+  void updateCache(int32_t ar_len, int32_t n_past, int32_t n_update, const std::vector<bool>& selected);
+
+  const std::vector<KVCache<T>>& getKCache() const { return k_cache_; }
+  const std::vector<KVCache<T>>& getVCache() const { return v_cache_; }
+  [[nodiscard]] size_t getTotalCacheSizeInBytes() const { return total_cache_size_; }
+
+ private:
+  void rearrangeKey(KVCache<T>& k_cache, int32_t ar_len_dst);
+  void rearrangeValue(KVCache<T>& v_cache, int32_t ar_len_dst);
+  void updateKey(KVCache<T>& k_cache, int32_t n_past, int32_t n_update, const std::vector<bool>& selected);
+  void updateValue(KVCache<T>& v_cache, int32_t n_past, int32_t n_update, const std::vector<bool>& selected);
+
+  KVCacheConfig config_;
+  size_t total_cache_size_ = 0;
+  int32_t cur_ar_len_ = 0;
+  std::vector<KVCache<T>> k_cache_;
+  std::vector<KVCache<T>> v_cache_;
+};
+
+}  // namespace mllm::qnn::aot
\ No newline at end of file
diff --git a/mllm/backends/qnn/aot_rt/PromptProcessor.cpp b/mllm/backends/qnn/aot_rt/PromptProcessor.cpp
new file mode 100644
index 000000000..b13c66a0d
--- /dev/null
+++ b/mllm/backends/qnn/aot_rt/PromptProcessor.cpp
@@ -0,0 +1,180 @@
+
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/qnn/aot_rt/PromptProcessor.hpp"
+#include "mllm/core/DataTypes.hpp"
+#include "mllm/core/SlicePrimitives.hpp"
+#include "mllm/utils/Log.hpp"
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <numeric>
+#include <string>
+#include <vector>
+
+namespace mllm::qnn::aot {
+
+template <typename T>
+PromptProcessor<T>::PromptProcessor(KVCacheManager<T>* kv_manager, Config config)
+    : kv_manager_(kv_manager), config_(std::move(config)) {
+  std::string graph_name = "model.0.s" + std::to_string(config_.ar_len);
+  module_ = std::make_unique<QnnAOTModule>(config_.model_path, graph_name);
+  module_->to(kQNN);
+}
+
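+// The AOT bundle names each compiled graph "model.0.s<ar_len>": the prompt
+// processor binds the chunked prefill graph, while TokenGenerator binds the
+// single-token graph "model.0.s1".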
+template <typename T>
+void PromptProcessor<T>::init_io() {
+  input_tensors_.reserve(3 + 2 * config_.num_layers);
+
+  // 1. Input IDs
+  auto input_ids = Tensor::empty({1, config_.ar_len}, kInt32, kQNN).alloc();
+  input_ids.setName("input_ids");
+  input_tensors_.push_back(input_ids);
+
+  // // 2. Sliding Window Attention Mask
+  // input_tensors_.push_back(Tensor::empty({1, 1, config_.ar_len, config_.context_len}, kUInt16, kQNN).alloc());
+
+  // 3. Position IDs
+  auto pos_ids = Tensor::empty({config_.ar_len}, kInt32, kQNN).alloc();
+  pos_ids.setName("position_ids");
+  input_tensors_.push_back(pos_ids);
+
+  // 4. Attention Mask
+  auto attn_mask = Tensor::empty({1, 1, config_.ar_len, config_.context_len}, kUInt16, kQNN).alloc();
+  attn_mask.setName("attention_mask");
+  input_tensors_.push_back(attn_mask);
+
+  // 5. KV Caches
+  const auto& k_caches = kv_manager_->getKCache();
+  const auto& v_caches = kv_manager_->getVCache();
+  for (int l = 0; l < config_.num_layers; ++l) {
+    // K
+    auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.head_dim, config_.context_len - config_.ar_len},
+                                  config_.kv_dtype, kQNN);
+    k_tensor.impl()->storage()->ptr_ = k_caches[l].buffer;
+    k_tensor.impl()->storage()->mem_type_ = kManual;
+    k_tensor.setName("past_key_" + std::to_string(l));
+    input_tensors_.push_back(k_tensor);
+
+    // V
+    auto v_tensor = Tensor::empty({1, (int)config_.num_heads, config_.context_len - config_.ar_len, config_.head_dim},
+                                  config_.kv_dtype, kQNN);
+    v_tensor.impl()->storage()->ptr_ = v_caches[l].buffer;
+    v_tensor.impl()->storage()->mem_type_ = kManual;
+    v_tensor.setName("past_value_" + std::to_string(l));
+    input_tensors_.push_back(v_tensor);
+  }
+
+  // Output Tensors
+  output_tensors_.reserve(1 + 2 * config_.num_layers);
+
+  // 1. Logits
+  auto logits = Tensor::empty({1, 1, config_.ar_len, config_.vocab_size}, kUInt16, kQNN).alloc();
+  logits.setName("logits");
+  output_tensors_.push_back(logits);
+
+  // 2. KV Caches
+  for (int l = 0; l < config_.num_layers; ++l) {
+    // K Output
+    auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.head_dim, config_.ar_len}, config_.kv_dtype, kQNN);
+    k_tensor.impl()->storage()->ptr_ = k_caches[l].output_buffer;
+    k_tensor.impl()->storage()->mem_type_ = kManual;
+    k_tensor.setName("present_key_" + std::to_string(l));
+    output_tensors_.push_back(k_tensor);
+
+    // V Output
+    auto v_tensor = Tensor::empty({1, (int)config_.num_heads, config_.ar_len, config_.head_dim}, config_.kv_dtype, kQNN);
+    v_tensor.impl()->storage()->ptr_ = v_caches[l].output_buffer;
+    v_tensor.impl()->storage()->mem_type_ = kManual;
+    v_tensor.setName("present_value_" + std::to_string(l));
+    output_tensors_.push_back(v_tensor);
+  }
+}
+
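+// The past/present KV tensors use mem_type_ = kManual and point directly at
+// the KVCacheManager buffers, so the graph reads and writes the cache in
+// place and updateCache() only moves the freshly produced rows.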
+template <typename T>
+void PromptProcessor<T>::prepare_io(const std::vector<int64_t>& prompt_tokens, int64_t prompt_pos, int64_t start_pos) {
+  int64_t num_tokens = prompt_tokens.size();
+  int64_t chunk_size = std::min((int64_t)config_.ar_len, num_tokens - prompt_pos);
+
+  // 1. Input IDs
+  int32_t* input_ids_ptr = input_tensors_[0].ptr<int32_t>();
+  for (int i = 0; i < config_.ar_len; ++i) {
+    if (i < chunk_size) {
+      input_ids_ptr[i] = (int32_t)prompt_tokens[prompt_pos + i];
+    } else {
+      input_ids_ptr[i] = 0;  // Padding
+    }
+  }
+
+  // 2. Position IDs
+  int32_t* pos_ids_ptr = input_tensors_[1].ptr<int32_t>();
+  for (int i = 0; i < config_.ar_len; ++i) { pos_ids_ptr[i] = (int32_t)(start_pos + i); }
+
+  // 3. Attention Mask
+  // We need to re-calculate attention mask based on start_pos
+  std::vector<uint16_t> attn_mask_data(config_.ar_len * config_.context_len);
+  std::vector<int32_t> attention_map(config_.ar_len);
+  for (int i = 0; i < config_.ar_len; ++i) {
+    if (i == 0) {
+      attention_map[i] = -1;
+    } else {
+      attention_map[i] = i - 1;
+    }
+  }
+
+  kv_manager_->initAttentionMask(attn_mask_data.data(), attention_map, config_.ar_len, start_pos);
+
+  uint16_t* attn_mask_ptr = input_tensors_[2].ptr<uint16_t>();
+  for (size_t k = 0; k < attn_mask_data.size(); ++k) { attn_mask_ptr[k] = (uint16_t)attn_mask_data[k]; }
+}
+
+template <typename T>
+int64_t PromptProcessor<T>::prefill(const std::vector<int64_t>& prompt_tokens, int64_t start_pos) {
+  MLLM_INFO("perform prefill");
+
+  int64_t num_tokens = prompt_tokens.size();
+  int64_t current_pos = start_pos;
+  int64_t processed_tokens = 0;
+
+  // Ensure KV cache is arranged for ar_len
+  kv_manager_->rearrangeCache(config_.ar_len);
+
+  std::vector<int32_t> attention_map(config_.ar_len);
+  std::iota(attention_map.begin(), attention_map.end(), -1);
+  kv_manager_->initAttentionMask(input_tensors_[2].ptr<uint16_t>(),  // TODO: use a member rather than an index
+                                 attention_map, config_.ar_len, start_pos);
+
+  module_->setOutputTensors(output_tensors_);
+
+  while (processed_tokens < num_tokens) {
+    int64_t chunk_size = std::min((int64_t)config_.ar_len, num_tokens - processed_tokens);
+
+    prepare_io(prompt_tokens, processed_tokens, current_pos);
+
+    // Run forward
+    output_tensors_ = (*module_)(input_tensors_);
+
+    int32_t n_update = (int32_t)chunk_size;
+
+    kv_manager_->updateCache(config_.ar_len, current_pos, n_update, {});
+
+    kv_manager_->updateAttentionMask(input_tensors_[2].ptr<uint16_t>(), config_.ar_len, current_pos, n_update,
+                                     config_.sliding_window);
+
+    processed_tokens += chunk_size;
+    current_pos += chunk_size;
+  }
+
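+  // The final chunk is zero-padded to ar_len, so the logits row of the last
+  // real token is (num_tokens - 1) % ar_len; adding ar_len before the modulo
+  // does not change the residue.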
+  auto logits = output_tensors_[0][{kAll, (num_tokens + config_.ar_len - 1) % config_.ar_len, kAll}];
+
+  auto cur_token = module_->sampleGreedy(logits);
+
+  return cur_token;
+}
+
+// Explicit instantiations
+template class PromptProcessor<uint8_t>;
+template class PromptProcessor<uint16_t>;
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/PromptProcessor.hpp b/mllm/backends/qnn/aot_rt/PromptProcessor.hpp
new file mode 100644
index 000000000..c867f0f0c
--- /dev/null
+++ b/mllm/backends/qnn/aot_rt/PromptProcessor.hpp
@@ -0,0 +1,53 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/backends/qnn/aot_rt/QnnAOTModule.hpp"
+#include "mllm/backends/qnn/aot_rt/KVCacheManager.hpp"
+#include "mllm/core/Tensor.hpp"
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace mllm::qnn::aot {
+
+template <typename T>
+class PromptProcessor {
+ public:
+  struct Config {
+    std::string model_path;
+    int32_t context_len;
+    int64_t num_heads;
+    int64_t num_layers;
+    int32_t ar_len;
+    int32_t vocab_size;
+    int32_t head_dim;
+    bool use_int64_token;
+    int sliding_window;
+    DataTypes kv_dtype = kUInt8;
+  };
+
+  PromptProcessor(KVCacheManager<T>* kv_manager, Config config);
+
+  /**
+   * Prefill an LLM Module with the given text input.
+   * @param prompt_tokens The text prompt tokens to the LLM Module.
+   * @param start_pos The starting position in KV cache.
+   * @return The next token (or logits).
+   */
+  int64_t prefill(const std::vector<int64_t>& prompt_tokens, int64_t start_pos = 0);
+
+  void init_io();
+  void prepare_io(const std::vector<int64_t>& prompt_tokens, int64_t prompt_pos, int64_t start_pos);
+
+ private:
+  std::unique_ptr<QnnAOTModule> module_;
+  KVCacheManager<T>* kv_manager_;
+  Config config_;
+
+  std::vector<Tensor> input_tensors_;
+  std::vector<Tensor> output_tensors_;
+};
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/QnnAOTModule.cpp b/mllm/backends/qnn/aot_rt/QnnAOTModule.cpp
new file mode 100644
index 000000000..f1cf6eb1d
--- /dev/null
+++ b/mllm/backends/qnn/aot_rt/QnnAOTModule.cpp
@@ -0,0 +1,17 @@
+#include "mllm/backends/qnn/aot_rt/QnnAOTModule.hpp"
+#include "mllm/nn/Module.hpp"
+#include "mllm/utils/Log.hpp"
+#include "mllm/engine/Context.hpp"
+#include "mllm/backends/qnn/QNNBackend.hpp"
+
+namespace mllm::qnn::aot {
+
+QnnAOTModule::QnnAOTModule(const std::string& model_path, const std::string& graph_name)
+    : mllm::nn::Module(graph_name), model_path_(model_path), graph_name_(graph_name) {}
+
+std::vector<Tensor> QnnAOTModule::forward(const std::vector<Tensor>& inputs, const std::vector<AnyValue>& args) {
+  return output_tensors_;
+}
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp b/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp
new file mode 100644
index 000000000..0cfa464c5
--- /dev/null
+++ b/mllm/backends/qnn/aot_rt/QnnAOTModule.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "mllm/models/ARGeneration.hpp"
+#include "mllm/nn/Module.hpp"
+#include "mllm/utils/Common.hpp"
+
+#include <string>
+#include <vector>
+
+namespace mllm::qnn::aot {
+
+class QnnAOTModule : public mllm::nn::Module, public models::ARGeneration {
+ public:
+  QnnAOTModule(const std::string& model_path, const std::string& graph_name);
+
+  std::vector<Tensor> forward(const std::vector<Tensor>& inputs, const std::vector<AnyValue>& args) override;
+
+  models::ARGenerationOutputPast forward(const models::ARGenerationOutputPast& input,
+                                         const models::ARGenerationArgs& args) override {
+    NYI("ARGeneration forward is not implemented for QnnAOTModule");
+    return {};
+  };
+
+  void setOutputTensors(const std::vector<Tensor>& output_tensors) { output_tensors_ = output_tensors; }
+
+ private:
+  std::string model_path_;
+  std::string graph_name_;
+
+  std::vector<Tensor> output_tensors_;
+
+  std::string backend_path_;
+};
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp
index e69de29bb..6f0bcfd57 100644
--- a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp
+++ b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.cpp
@@ -0,0 +1,104 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#include "mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp"
+#include <memory>
+#include <unordered_set>
+#include "mllm/core/DeviceTypes.hpp"
+#include "mllm/preprocessor/tokenizers/Unicode.hpp"
+#include "mllm/utils/Log.hpp"
+
+namespace mllm::qnn::aot {
+Runner::Runner(const RunnerConfig& config, mllm::preprocessor::AutoTokenizer* tokenizer)
+    : config_(config), tokenizer_(tokenizer) {}
+
+bool Runner::load() {
+  // init KV cache manager
+  KVCacheConfig kv_config;
+  kv_config.context_len = config_.context_len;
+  kv_config.head_dim = config_.head_dim;
+
+  int32_t prompt_processor_ar_len = config_.ar_len;
+  int32_t token_generator_ar_len = 1;
+
+  if (prompt_processor_ar_len == config_.context_len) {
+    kv_config.max_cache_len = config_.context_len;
+  } else {
+    kv_config.max_cache_len = config_.context_len - std::min(token_generator_ar_len, prompt_processor_ar_len);
+  }
+  kv_config.max_ar_len = std::max(token_generator_ar_len, prompt_processor_ar_len);
+
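+  // The live cache only ever needs context_len - min(ar_len) entries per row,
+  // since a graph with AR length n reads at most context_len - n past
+  // positions; max_ar_len sizes the scratch output buffers shared by the
+  // prefill and decode graphs.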
+  kv_config.num_heads = config_.num_heads;
+  kv_config.num_layers = config_.num_layers;
+
+  kv_manager_ = std::make_unique<KVCacheManager<uint8_t>>(kv_config);
+
+  auto backend = mllm::Context::instance().getBackend(mllm::kQNN);
+  if (!backend) {
+    MLLM_ERROR("QNN Backend not found");
+    return false;
+  }
+
+  // init prompt processor(prefill)
+  PromptProcessor<uint8_t>::Config prefill_config;
+  prefill_config.model_path = config_.model_path;
+  prefill_config.context_len = config_.context_len;
+  prefill_config.num_heads = config_.num_heads;
+  prefill_config.num_layers = config_.num_layers;
+  prefill_config.ar_len = config_.ar_len;
+  prefill_config.vocab_size = config_.vocab_size;
+  prefill_config.head_dim = config_.head_dim;
+  prefill_config.use_int64_token = false;
+  prefill_config.sliding_window = config_.context_len;  // no sliding window for now
+
+  prompt_processor_ = std::make_unique<PromptProcessor<uint8_t>>(kv_manager_.get(), prefill_config);
+
+  // init token generator(decode)
+  TokenGenerator<uint8_t>::Config decode_config;
+  decode_config.model_path = config_.model_path;
+  decode_config.context_len = config_.context_len;
+  decode_config.num_heads = config_.num_heads;
+  decode_config.num_layers = config_.num_layers;
+  decode_config.vocab_size = config_.vocab_size;
+  decode_config.head_dim = config_.head_dim;
+  decode_config.use_int64_token = false;
+  decode_config.sliding_window = config_.context_len;
+
+  // TODO: EOS IDs should come from the tokenizer/config instead of being hard-coded (Qwen3 values)
+  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
+  eos_ids->insert(151643);
+  eos_ids->insert(151645);
+
+  token_generator_ =
+      std::make_unique<TokenGenerator<uint8_t>>(tokenizer_, kv_manager_.get(), std::move(eos_ids), decode_config);
+
+  kv_manager_->initCache(backend->allocator().get(), config_.ar_len);
+  prompt_processor_->init_io();
+  token_generator_->init_io();
+
+  return true;
+}
+
+void Runner::generate(std::vector<uint64_t>& prompt_tokens, int32_t seq_len,
+                      const std::function<void(const std::string&)>& token_callback) {
+  int64_t start_pos = 0;
+
+  std::vector<int64_t> prompt_tokens_i64;
+  prompt_tokens_i64.reserve(prompt_tokens.size());
+  for (auto t : prompt_tokens) prompt_tokens_i64.push_back((int64_t)t);
+
+  int64_t next_token = prompt_processor_->prefill(prompt_tokens_i64, start_pos);
+
+  prompt_tokens.push_back((uint64_t)next_token);
+  if (token_callback) {
+    std::wstring wstr = tokenizer_->detokenize(next_token);
+    std::string str = mllm::preprocessor::wideString2Utf8String(wstr);
+    token_callback(str);
+  }
+
+  int64_t cur_pos = prompt_tokens.size();
+
+  token_generator_->generate(prompt_tokens, cur_pos, seq_len, token_callback, false);
+}
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp
index e69de29bb..dc41ad09f 100644
--- a/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp
+++ b/mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp
@@ -0,0 +1,45 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "mllm/backends/qnn/aot_rt/QnnAOTModule.hpp"
+#include "mllm/backends/qnn/aot_rt/KVCacheManager.hpp"
+#include "mllm/backends/qnn/aot_rt/PromptProcessor.hpp"
+#include "mllm/backends/qnn/aot_rt/TokenGenerator.hpp"
+#include "mllm/preprocessor/tokenizers/AutoTokenizer.hpp"
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace mllm::qnn::aot {
+
+struct RunnerConfig {
+  std::string model_path;
+  float temperature = 0.8f;
+  int num_layers = 28;
+  int num_heads = 12;
+  int head_dim = 128;
+  int vocab_size = 151936;
+  int context_len = 4096;
+  int ar_len = 128;  // Chunk size for prefill
+};
+
+class Runner {
+ public:
+  explicit Runner(const RunnerConfig& config, mllm::preprocessor::AutoTokenizer* tokenizer);
+  ~Runner() = default;
+
+  bool load();
+  void generate(std::vector<uint64_t>& prompt_tokens, int32_t seq_len,
+                const std::function<void(const std::string&)>& token_callback);
+
+ private:
+  RunnerConfig config_;
+  mllm::preprocessor::AutoTokenizer* tokenizer_;
+
+  std::unique_ptr<KVCacheManager<uint8_t>> kv_manager_;
+  std::unique_ptr<PromptProcessor<uint8_t>> prompt_processor_;
+  std::unique_ptr<TokenGenerator<uint8_t>> token_generator_;
+};
+
+}  // namespace mllm::qnn::aot
\ No newline at end of file
diff --git a/mllm/backends/qnn/aot_rt/TokenGenerator.cpp b/mllm/backends/qnn/aot_rt/TokenGenerator.cpp
new file mode 100644
index 000000000..98986ee41
--- /dev/null
+++ b/mllm/backends/qnn/aot_rt/TokenGenerator.cpp
@@ -0,0 +1,156 @@
+#include "mllm/backends/qnn/aot_rt/TokenGenerator.hpp"
+#include "mllm/preprocessor/tokenizers/Unicode.hpp"
+#include <cstdint>
+#include <string>
+
+namespace mllm::qnn::aot {
+
+template <typename T>
+TokenGenerator<T>::TokenGenerator(mllm::preprocessor::AutoTokenizer* tokenizer, KVCacheManager<T>* kv_manager,
+                                  std::unique_ptr<std::unordered_set<uint64_t>>&& eos_ids, Config config)
+    : tokenizer_(tokenizer), kv_manager_(kv_manager), eos_ids_(std::move(eos_ids)), config_(std::move(config)) {
+  std::string graph_name = "model.0.s1";
+  module_ = std::make_unique<QnnAOTModule>(config_.model_path, graph_name);
+  module_->to(kQNN);
+}
+
+template <typename T>
+void TokenGenerator<T>::init_io() {
+  input_tensors_.reserve(4 + 2 * config_.num_layers);
+
+  // 1. Input IDs
+  auto input_ids = Tensor::empty({1, 1, 1, 1}, kInt32, kQNN).alloc();
+  input_ids.setName("input_ids");
+  input_tensors_.push_back(input_ids);
+
+  // // 2. Sliding Window Attention Mask
+  // auto sliding_window_mask = Tensor::empty({1, 1, 1, config_.context_len}, kUInt16, kQNN).alloc();
+  // sliding_window_mask.setName("sliding_window_attention_mask");
+  // input_tensors_.push_back(sliding_window_mask);
+
+  // 3. Attention Mask
+  auto attn_mask = Tensor::empty({1, 1, 1, config_.context_len}, kUInt16, kQNN).alloc();
+  attn_mask.setName("attention_mask");
+  input_tensors_.push_back(attn_mask);
+
+  // 4. Position IDs
+  auto pos_ids = Tensor::empty({1, 1, 1, 1}, kInt32, kQNN).alloc();
+  pos_ids.setName("position_ids");
+  input_tensors_.push_back(pos_ids);
+
+  // 5. KV Caches
+  const auto& k_caches = kv_manager_->getKCache();
+  const auto& v_caches = kv_manager_->getVCache();
+  for (int l = 0; l < config_.num_layers; ++l) {
+    // K
+    auto k_tensor =
+        Tensor::empty({1, (int)config_.num_heads, config_.head_dim, config_.context_len - 1}, config_.kv_dtype, kQNN);
+    k_tensor.impl()->storage()->ptr_ = k_caches[l].buffer;
+    k_tensor.impl()->storage()->mem_type_ = kManual;
+    k_tensor.setName("past_key_" + std::to_string(l));
+    input_tensors_.push_back(k_tensor);
+
+    // V
+    auto v_tensor =
+        Tensor::empty({1, (int)config_.num_heads, config_.context_len - 1, config_.head_dim}, config_.kv_dtype, kQNN);
+    v_tensor.impl()->storage()->ptr_ = v_caches[l].buffer;
+    v_tensor.impl()->storage()->mem_type_ = kManual;
+    v_tensor.setName("past_value_" + std::to_string(l));
+    input_tensors_.push_back(v_tensor);
+  }
+
+  // Output Tensors
+  output_tensors_.reserve(1 + 2 * config_.num_layers);
+
+  // 1. Logits
+  auto logits = Tensor::empty({1, 1, 1, config_.vocab_size}, kUInt16, kQNN).alloc();
+  logits.setName("logits");
+  output_tensors_.push_back(logits);
+
+  // 2. KV Caches
+  for (int l = 0; l < config_.num_layers; ++l) {
+    // K Output
+    auto k_tensor = Tensor::empty({1, (int)config_.num_heads, config_.head_dim, 1}, config_.kv_dtype, kQNN);
+    k_tensor.impl()->storage()->ptr_ = k_caches[l].output_buffer;
+    k_tensor.impl()->storage()->mem_type_ = kManual;
+    k_tensor.setName("present_key_" + std::to_string(l));
+    output_tensors_.push_back(k_tensor);
+
+    // V Output
+    auto v_tensor = Tensor::empty({1, (int)config_.num_heads, 1, config_.head_dim}, config_.kv_dtype, kQNN);
+    v_tensor.impl()->storage()->ptr_ = v_caches[l].output_buffer;
+    v_tensor.impl()->storage()->mem_type_ = kManual;
+    v_tensor.setName("present_value_" + std::to_string(l));
+    output_tensors_.push_back(v_tensor);
+  }
+}
+
+template <typename T>
+const std::vector<Tensor>& TokenGenerator<T>::get_all_logits() {
+  return token_all_logits_;
+}
+
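+// prepare_io only rewrites the scalar input id and position id and opens one
+// more column of the attention mask; the KV tensors alias the cache buffers,
+// so no per-step copies are required.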
+template <typename T>
+void TokenGenerator<T>::prepare_io(uint64_t cur_token, int64_t start_pos) {
+  // 1. Input IDs
+  int32_t* input_ids_ptr = input_tensors_[0].ptr<int32_t>();
+  input_ids_ptr[0] = (int32_t)cur_token;
+
+  // 2. Position IDs
+  int32_t* pos_ids_ptr = input_tensors_[2].ptr<int32_t>();
+  pos_ids_ptr[0] = (int32_t)start_pos;
+
+  // 3. Attention Mask
+  // Update attention mask for the current position
+  kv_manager_->updateAttentionMask(input_tensors_[1].ptr<uint16_t>(), 1, start_pos, 1, config_.sliding_window);
+}
+
+template <typename T>
+int64_t TokenGenerator<T>::generate(std::vector<uint64_t>& tokens, int64_t start_pos, int32_t seq_len,
+                                    const std::function<void(const std::string&)>& token_callback, bool dump_logits) {
+  int64_t current_pos = start_pos;
+  uint64_t next_token = tokens.back();
+  int64_t generated_count = 0;
+
+  // Ensure KV cache is arranged for decode (1 token)
+  kv_manager_->rearrangeCache(1);
+
+  module_->setOutputTensors(output_tensors_);
+
+  for (int i = 0; i < seq_len; ++i) {
+    if (current_pos >= config_.context_len) { break; }
+
+    prepare_io(next_token, current_pos);
+
+    output_tensors_ = (*module_)(input_tensors_);
+
+    // Update KV Cache
+    int32_t n_update = 1;
+    kv_manager_->updateCache(1, current_pos, n_update, {});
+
+    // Get logits
+    auto logits_tensor = output_tensors_[0];
+
+    // Sample
+    auto cur_token = module_->sampleGreedy(logits_tensor);
+
+    next_token = cur_token;
+    tokens.push_back(next_token);
+    current_pos++;
+    generated_count++;
+
+    if (token_callback) {
+      std::wstring wstr = tokenizer_->detokenize(next_token);
+      std::string str = mllm::preprocessor::wideString2Utf8String(wstr);
+      token_callback(str);
+    }
+
+    if (eos_ids_ && eos_ids_->count(next_token)) { break; }
+  }
+
+  return generated_count;
+}
+
+// Explicit instantiations
+template class TokenGenerator<uint8_t>;
+template class TokenGenerator<uint16_t>;
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/TokenGenerator.hpp b/mllm/backends/qnn/aot_rt/TokenGenerator.hpp
new file mode 100644
index 000000000..5c23da325
--- /dev/null
+++ b/mllm/backends/qnn/aot_rt/TokenGenerator.hpp
@@ -0,0 +1,56 @@
+#pragma once
+
+#include "mllm/backends/qnn/aot_rt/QnnAOTModule.hpp"
+#include "mllm/backends/qnn/aot_rt/KVCacheManager.hpp"
+#include "mllm/preprocessor/tokenizers/AutoTokenizer.hpp"
+#include "mllm/core/Tensor.hpp"
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+namespace mllm::qnn::aot {
+
+template <typename T>
+class TokenGenerator {
+ public:
+  struct Config {
+    std::string model_path;
+    int32_t context_len;
+    int64_t num_heads;
+    int64_t num_layers;
+    int32_t vocab_size;
+    int32_t head_dim;
+    bool use_int64_token;
+    int sliding_window;
+    DataTypes kv_dtype = kUInt8;
+  };
+
+  TokenGenerator(mllm::preprocessor::AutoTokenizer* tokenizer, KVCacheManager<T>* kv_manager,
+                 std::unique_ptr<std::unordered_set<uint64_t>>&& eos_ids, Config config);
+
+  virtual ~TokenGenerator() = default;
+
+  void init_io();
+
+  virtual const std::vector<Tensor>& get_all_logits();
+
+  virtual int64_t generate(std::vector<uint64_t>& tokens, int64_t start_pos, int32_t seq_len,
+                           const std::function<void(const std::string&)>& token_callback, bool dump_logits);
+
+ protected:
+  mllm::preprocessor::AutoTokenizer* tokenizer_;
+  std::unique_ptr<QnnAOTModule> module_;
+  KVCacheManager<T>* kv_manager_;
+  std::unique_ptr<std::unordered_set<uint64_t>> eos_ids_;
+  Config config_;
+
+  std::vector<Tensor> input_tensors_;
+  std::vector<Tensor> output_tensors_;
+  std::vector<Tensor> token_all_logits_;
+
+  void prepare_io(uint64_t cur_token, int64_t start_pos);
+};
+
+}  // namespace mllm::qnn::aot
diff --git a/mllm/backends/qnn/aot_rt/utils/MaskGen.cpp b/mllm/backends/qnn/aot_rt/utils/MaskGen.cpp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/mllm/backends/qnn/aot_rt/utils/MaskGen.hpp b/mllm/backends/qnn/aot_rt/utils/MaskGen.hpp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/mllm/backends/qnn/aot_rt/utils/PositionIdGen.cpp b/mllm/backends/qnn/aot_rt/utils/PositionIdGen.cpp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/mllm/backends/qnn/aot_rt/utils/PositionIdGen.hpp b/mllm/backends/qnn/aot_rt/utils/PositionIdGen.hpp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/mllm/backends/qnn/aot_rt/utils/RoPEGen.cpp b/mllm/backends/qnn/aot_rt/utils/RoPEGen.cpp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/mllm/backends/qnn/aot_rt/utils/RoPEGen.hpp b/mllm/backends/qnn/aot_rt/utils/RoPEGen.hpp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/mllm/mllm.hpp b/mllm/mllm.hpp
index 4a07f0ee7..27ea0abe0 100644
--- a/mllm/mllm.hpp
+++ b/mllm/mllm.hpp
@@ -197,7 +197,7 @@ extern void initAscendBackend();
 
 bool isQnnAvailable();
 
-extern void initQnnBackend();
+extern void initQnnBackend(const std::string& context_path = "qnn_context.bin");
 
 void cleanThisThread();