2 changes: 1 addition & 1 deletion examples/CMakeLists.txt
@@ -17,6 +17,6 @@ if(MLLM_TRACY_ENABLE)
add_subdirectory(tracy_example)
endif()

if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE OR MLLM_BUILD_QNN_BACKEND)
add_subdirectory(qwen3_qnn_aot)
endif()
13 changes: 10 additions & 3 deletions examples/qwen3_qnn_aot/CMakeLists.txt
@@ -1,3 +1,10 @@
add_executable(mllm-qwen3-aot-c compile.cpp)
target_link_libraries(mllm-qwen3-aot-c PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
target_include_directories(mllm-qwen3-aot-c PRIVATE ${MLLM_INCLUDE_DIR})
# AOT targets run on x86
if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
add_executable(mllm-qwen3-aot-c compile.cpp)
target_link_libraries(mllm-qwen3-aot-c PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
target_include_directories(mllm-qwen3-aot-c PRIVATE ${MLLM_INCLUDE_DIR})
endif()

add_executable(mllm-qwen3-aot-runner aot_run.cpp)
target_link_libraries(mllm-qwen3-aot-runner PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
target_include_directories(mllm-qwen3-aot-runner PRIVATE ${MLLM_INCLUDE_DIR})
64 changes: 64 additions & 0 deletions examples/qwen3_qnn_aot/aot_run.cpp
@@ -0,0 +1,64 @@
#include <iostream>
#include <fmt/core.h>
#include <mllm/mllm.hpp>
#include <string>
#include "mllm/backends/qnn/aot_rt/QnnAOTRuntime.hpp"
#include "mllm/models/qwen3/configuration_qwen3.hpp"
#include "mllm/models/qwen3/tokenization_qwen3.hpp"

using mllm::Argparse;
using namespace mllm::qnn::aot; // NOLINT

MLLM_MAIN({
auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
auto& model_path = Argparse::add<std::string>("-m|--model").help("Model path").def("qwen3_qnn.mllm");
auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer").help("Tokenizer path").def("tokenizer.json");
auto& config_path = Argparse::add<std::string>("-c|--config").help("Config path").required(true);
auto& temperature = Argparse::add<float>("--temperature").help("Temperature").def(0.8f);
auto& ar_len = Argparse::add<int>("--ar_len").help("Autoregressive length (chunk size)").def(128);

Argparse::parse(argc, argv);

mllm::initQnnBackend(model_path.get());

if (help.isSet()) {
Argparse::printHelp();
return 0;
}

auto qwen3_cfg = mllm::models::qwen3::Qwen3Config(config_path.get());

RunnerConfig config;
config.model_path = model_path.get();
config.temperature = temperature.get();
config.num_layers = qwen3_cfg.num_hidden_layers;
config.num_heads = qwen3_cfg.num_attention_heads;
config.head_dim = qwen3_cfg.head_dim;
config.vocab_size = qwen3_cfg.vocab_size;
config.context_len = 1024;
config.ar_len = ar_len.get();

auto tokenizer = mllm::models::qwen3::Qwen3Tokenizer(tokenizer_path.get());

std::string prompt_text;
fmt::print("💬 Prompt text (or 'exit/quit'): ");
std::getline(std::cin, prompt_text);

auto input_tensor = tokenizer.convertMessage({.prompt = prompt_text});

Runner runner(config, &tokenizer);
if (!runner.load()) {
std::cerr << "Failed to load model\n";
return 1;
}

std::vector<uint64_t> prompt_tokens;
auto sequence = input_tensor["sequence"];
int64_t* ptr = sequence.ptr<int64_t>();
for (int i = 0; i < sequence.shape()[1]; ++i) { prompt_tokens.push_back((uint64_t)ptr[i]); }

runner.generate(prompt_tokens, config.context_len, [](const std::string& token) { std::cout << token << std::flush; });
std::cout << "\n";

return 0;
});
4 changes: 2 additions & 2 deletions examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp
@@ -242,7 +242,7 @@ class Qwen3Attention final : public nn::Module {
"k_rope_add_0_output_qdq");

// De-quantization and quantization again
key_states = key_states.to(kFloat16);
key_states = key_states.to(kFloat32);
key_states = key_states.to(kUInt8PerTensorSym);
key_states = ptq::QDQ_KV(this, key_states, "k_cast_to_int8_qdq");

@@ -251,7 +251,7 @@

// Handle KV Cache
value_states = ptq::QDQ(this, value_states, "v_cast_to_int16_qdq");
value_states = value_states.to(kFloat16);
value_states = value_states.to(kFloat32);
value_states = value_states.to(kUInt8PerTensorSym);
value_states = ptq::QDQ_KV(this, value_states, "v_cast_to_int8_qdq");

6 changes: 6 additions & 0 deletions mllm/backends/qnn/CMakeLists.txt
@@ -21,6 +21,12 @@ if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
list(APPEND MLLM_QNN_SRC ${MLLM_QUALCOMM_AOT_SRC})
endif()

file(GLOB_RECURSE MLLM_QUALCOMM_AOT_RT_SRC
${CMAKE_CURRENT_LIST_DIR}/aot_rt/*.hpp
${CMAKE_CURRENT_LIST_DIR}/aot_rt/*.cpp
)
list(APPEND MLLM_QNN_SRC ${MLLM_QUALCOMM_AOT_RT_SRC})

add_library(
MllmQNNBackend
SHARED
149 changes: 24 additions & 125 deletions mllm/backends/qnn/QNNBackend.cpp
@@ -55,24 +55,6 @@ QNNBackend::QNNBackend() : Backend(kQNN, createQNNAllocator()) {
MLLM_INFO("QNN backend supports early termination");
}

bool contextStatus = false;
// check if the qnn_context.bin file exists
if (!std::filesystem::exists("qnn_context.bin")) {
contextStatus = runtime_->createContext(context_, nullptr);
} else {
contextStatus = runtime_->retrieveContext(context_, qnnModels_, nullptr);

// fill qnnModelIndexMap_ info according to qnnModels_
for (size_t i = 0; i < qnnModels_.size(); i++) {
auto graphName = qnnModels_[i]->getQnnGraphName();
qnnModelIndexMap_.insert(std::make_pair(graphName, i));
}
}
if (!contextStatus) { MLLM_ERROR_EXIT(1, "Failed to create QNN context"); }

// init QNN Allocator
static_pointer_cast<QNNAllocator>(allocator_)->setQNNPointer(runtime_->qnnInterface, context_);

// set performance parameters for better performance on HTP
perf_ = QNNPerf::create(&runtime_->qnnInterface);
perf_->setPowerConfigBurst();
@@ -348,10 +330,10 @@ bool QNNRuntime::createContext(Qnn_ContextHandle_t& context, QnnContext_Config_t
return true;
}

bool QNNRuntime::retrieveContext(Qnn_ContextHandle_t& context, std::vector<std::shared_ptr<QNNModel>>& qnnModels,
QnnContext_Config_t** contextConfig) {
bool QNNRuntime::retrieveContext(const std::string& contextBinaryPath, Qnn_ContextHandle_t& context,
std::vector<std::shared_ptr<QNNModel>>& qnnModels, QnnContext_Config_t** contextConfig) {
// Read the binary from qnn_context.bin and get the size in byte
std::ifstream file(QNN_Context_File, std::ios::binary | std::ios::ate);
std::ifstream file(contextBinaryPath, std::ios::binary | std::ios::ate);
std::streamsize size = file.tellg();
file.seekg(0, std::ios::beg);

@@ -436,6 +418,25 @@ bool QNNRuntime::retrieveContext(Qnn_ContextHandle_t& context, std::vector<std::
return true;
}

bool QNNBackend::createContext() {
if (!runtime_->createContext(context_, nullptr)) { return false; }
// init QNN Allocator
static_pointer_cast<QNNAllocator>(allocator_)->setQNNPointer(runtime_->qnnInterface, context_);
return true;
}

bool QNNBackend::loadContext(const std::string& contextPath) {
if (!runtime_->retrieveContext(contextPath, context_, qnnModels_, nullptr)) { return false; }
// fill qnnModelIndexMap_ info according to qnnModels_
for (size_t i = 0; i < qnnModels_.size(); i++) {
auto graphName = qnnModels_[i]->getQnnGraphName();
qnnModelIndexMap_.insert(std::make_pair(graphName, i));
}
// init QNN Allocator
static_pointer_cast<QNNAllocator>(allocator_)->setQNNPointer(runtime_->qnnInterface, context_);
return true;
}

std::shared_ptr<QNNModel> QNNBackend::createQnnGraph(const std::string& graphName) {
// If the graph already exists, return the existing model
if (qnnModelIndexMap_.find(graphName) != qnnModelIndexMap_.end()) {
@@ -535,8 +536,6 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&
return;
}

// Prepare QNN input tensors by copying data from runtime inputs to graph input wrappers
// This handles the case where input tensor sizes may differ between prefill and decode phases
std::vector<Qnn_Tensor_t> qnn_inputs;
std::vector<Qnn_Tensor_t> qnn_outputs;
for (int i = 0; i < model->getGraphInputTensorWrappers().size(); i++) {
@@ -550,52 +549,8 @@
return;
}

if (wrapper_tensor.isNil()) {
MLLM_ERROR("Graph input wrapper {} for graph '{}' has no backing tensor", i, graphName);
return;
}

// Check for size mismatches (can occur in decode phase where inputs may be smaller)
size_t dst_bytes = wrapper_tensor.bytes();
size_t src_bytes = runtime_input.bytes();
if (dst_bytes != src_bytes) {
MLLM_WARN("Graph '{}' input tensor {} byte-size mismatch: wrapper={} bytes, runtime input={} bytes. Copying "
"min(dst, src), but this may truncate data.",
graphName, i, dst_bytes, src_bytes);
}

if (dst_bytes > 0) {
void* dst_ptr = wrapper_tensor.ptr<void>();
if (!dst_ptr) {
wrapper_tensor.alloc();
dst_ptr = wrapper_tensor.ptr<void>();
}

const void* src_ptr = runtime_input.ptr<void>();
size_t bytes_to_copy = std::min(dst_bytes, src_bytes);
if (!src_ptr) {
MLLM_ERROR("Runtime input tensor {} for graph '{}' has null data pointer", i, graphName);
return;
}
if (dst_ptr && src_ptr && dst_ptr != src_ptr) {
// Copy source data to destination buffer
// This ensures that the graph input wrapper has the correct data for execution
if (bytes_to_copy > 0) { std::memcpy(dst_ptr, src_ptr, bytes_to_copy); }

// If source is smaller than destination, zero out the remaining bytes
// This is important for decode phase where input tensors may be smaller than prefill
// For example, decode phase may use [1, 1] input while wrapper expects [1, 128]
// Note: In current implementation with full [1, 128] tensor, this should not trigger
// but it's kept as a safety measure for future optimizations
if (src_bytes < dst_bytes) {
size_t remaining_bytes = dst_bytes - src_bytes;
std::memset(static_cast<char*>(dst_ptr) + bytes_to_copy, 0, remaining_bytes);
// Only log if zero-padding actually occurs (unexpected case)
MLLM_WARN("[QNN graphExecute] Graph '{}' input tensor {}: zero-padded {} bytes (src={} bytes, dst={} bytes)",
graphName, i, remaining_bytes, src_bytes, dst_bytes);
}
}
}
// input wrapper is empty, set wrapper's dataContainer(mllm::Tensor)
if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_input); }
Comment on lines +552 to +553

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Search for usage patterns of __setDataContainer and isAlloc
rg -n -C3 '__setDataContainer|isAlloc' --type=cpp

Repository: UbiquitousLearning/mllm

Length of output: 3427


🏁 Script executed:

#!/bin/bash
# Get broader context around line 552-553 in QNNBackend.cpp
sed -n '520,580p' mllm/backends/qnn/QNNBackend.cpp

echo "---"
# Search for wrapper creation/reuse patterns
rg -n 'wrapper\s*=' mllm/backends/qnn/QNNBackend.cpp | head -20

Repository: UbiquitousLearning/mllm

Length of output: 2855


🏁 Script executed:

#!/bin/bash
# Search for wrapper lifecycle - how are input wrappers stored and reused?
rg -n 'getGraphInputTensorWrappers' mllm/backends/qnn/ --type=cpp

echo "---"
# Search for reset/cleanup methods in QNNTensorWrapper
rg -n 'reset|clear|cleanup' mllm/backends/qnn/QNNUtils.hpp --type=hpp

Repository: UbiquitousLearning/mllm

Length of output: 827


🏁 Script executed:

#!/bin/bash
# Search for reset/cleanup methods in QNNTensorWrapper without type specification
rg -n 'reset|clear|cleanup' mllm/backends/qnn/QNNUtils.hpp

echo "---"
# Look at the full QNNTensorWrapper class definition
sed -n '180,230p' mllm/backends/qnn/QNNUtils.hpp

Repository: UbiquitousLearning/mllm

Length of output: 2348


🏁 Script executed:

#!/bin/bash
# Check how wrappers are created and stored in the model
rg -n 'inputTensorWrappers_' mllm/backends/qnn/QNNModel.cpp -A2 -B2 | head -40

echo "---"
# Check if there's any mechanism to reset or recreate wrappers between executions
rg -n 'graphExecute' mllm/backends/qnn/QNNBackend.cpp | head -5

Repository: UbiquitousLearning/mllm

Length of output: 1164


🏁 Script executed:

#!/bin/bash
# Understand the alloc() method and whether it updates data
sed -n '484,496p' mllm/backends/qnn/QNNUtils.cpp

echo "---"
# Check the create method to understand initial wrapper state
rg -n 'QNNTensorWrapper::create' mllm/backends/qnn/QNNUtils.cpp -A15 | head -40

Repository: UbiquitousLearning/mllm

Length of output: 2599


🏁 Script executed:

#!/bin/bash
# Check the lifecycle of the wrapper between multiple executions
# Does graphExecute clear or reset wrappers?
rg -n 'clear|reset|new ' mllm/backends/qnn/QNNBackend.cpp | grep -i wrapper

echo "---"
# Check how the input wrapper is actually used after alloc() in graphExecute
sed -n '553,572p' mllm/backends/qnn/QNNBackend.cpp

Repository: UbiquitousLearning/mllm

Length of output: 1138


🏁 Script executed:

#!/bin/bash
# Final verification: Check if there's any mechanism to reset isAlloc_ or update data between executions
rg -n 'isAlloc_\s*=' mllm/backends/qnn/ --type=cpp

echo "---"
# Check comment or documentation about wrapper reuse
rg -n 'reuse|lifecycle|between execution' mllm/backends/qnn/QNNUtils.hpp

echo "---"
# Look for any destructor or cleanup in QNNModel
rg -n '~QNNModel|reset.*wrapper' mllm/backends/qnn/QNNModel.hpp

Repository: UbiquitousLearning/mllm

Length of output: 366


🏁 Script executed:

#!/bin/bash
# Final critical check: verify what happens on second execution
# Look at the actual behavior flow
echo "=== First Execution Flow ==="
echo "1. !wrapper->isAlloc() is TRUE (initial state: isAlloc_ = false)"
echo "2. Calls __setDataContainer(runtime_input) - sets dataContainer_ to runtime_input"
echo "3. Calls wrapper->alloc() - sets isAlloc_ = true"
echo ""
echo "=== Second Execution Flow ==="
echo "1. !wrapper->isAlloc() is FALSE (already allocated)"
echo "2. SKIPS __setDataContainer() - wrapper->dataContainer_ NOT updated"
echo "3. Calls wrapper->alloc() again - but dataContainer_ still has old data"
echo ""

# Verify the key insight: once isAlloc_ is true, it never becomes false
rg -n 'isAlloc_\s*=\s*false' mllm/backends/qnn/

Repository: UbiquitousLearning/mllm

Length of output: 575


Fix stale data bug when wrappers are reused across multiple graph executions.

The current logic skips data updates when wrapper->isAlloc() is true, but once a wrapper is allocated (first execution), isAlloc_ is never reset. On subsequent graphExecute calls with the same wrapper objects, the condition if (!wrapper->isAlloc()) evaluates to false, preventing __setDataContainer() from updating the wrapper's data. This causes stale input data from the previous execution to be reused.

To fix this, consider one of the following:

  1. Creating fresh wrappers for each execution, or
  2. Resetting/updating the wrapper's data unconditionally before allocation (e.g., check if input data changed rather than allocation status), or
  3. Adding a reset mechanism to clear isAlloc_ and dataContainer_ between executions.
🤖 Prompt for AI Agents
In `@mllm/backends/qnn/QNNBackend.cpp` around lines 552-553, the wrapper reuse
causes stale inputs because wrapper->isAlloc() prevents updating dataContainer_.
Change the flow so the wrapper's data container is refreshed on each graphExecute
call: either always call wrapper->__setDataContainer(runtime_input) before
allocation/usage (i.e., do not gate it on wrapper->isAlloc()), or add and invoke a
reset method that clears isAlloc_ and dataContainer_ on reuse (e.g., a
resetWrapper/resetAllocation helper that clears dataContainer_, sets isAlloc_ =
false, and is called at the start of graphExecute). Ensure __setDataContainer(),
isAlloc_, and dataContainer_ are the symbols updated so allocation uses the
current runtime_input.
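
A minimal sketch of option 2 above (refreshing the wrapper's data container unconditionally). The member names dataContainer_ and isAlloc_, and Tensor::isNil(), come from the diff in this PR; the __refreshDataContainer helper is hypothetical and not part of this change:

```cpp
// Hypothetical helper on QNNTensorWrapper (QNNUtils.hpp), next to __setDataContainer():
void __refreshDataContainer(const Tensor& tensor) {
  // Unlike __setDataContainer(), do not assert that dataContainer_ is nil:
  // overwrite whatever the previous execution left behind.
  dataContainer_ = tensor;
  isAlloc_ = !tensor.isNil();  // re-derive allocation state from the new tensor
}

// In QNNBackend::graphExecute(), replace the gated update
//   if (!wrapper->isAlloc()) { wrapper->__setDataContainer(runtime_input); }
// with an unconditional refresh so every execution sees the current input:
wrapper->__refreshDataContainer(runtime_input);
```

Option 3 (an explicit reset at the start of graphExecute) would behave the same way; the key point is that the data update must not be gated on isAlloc().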


// Allocate and register the wrapper tensor with QNN allocator
// QNNAllocator will handle registered memory descriptor when needed
@@ -617,74 +572,18 @@ void QNNBackend::graphExecute(const std::string& graphName, std::vector<Tensor>&

if (ProfilingLevel::OFF != profilingLevel_) { extractBackendProfilingInfo(runtime_->profileHandle); }

// Debug: Print last output shape from QNN actual return order (before reordering)
// Uncomment below for debugging output order issues
// if (!qnn_output_tensors.empty()) {
// const auto& last_output = qnn_output_tensors.back();
// const auto& output_wrappers = model->getGraphOutputTensorWrappers();
// const auto& last_wrapper = output_wrappers.back();
// MLLM_INFO("[QNN Actual Return Order] Last output tensor '{}' shape: {}",
// last_wrapper->getName(), last_output.shape());
// }

// Reorder outputs according to MLLM expected order
const auto& expectedOrder = model->getExpectedOutputOrder();

// Resize outputs to match QNN output count first
outputs.resize(qnn_output_tensors.size()); // Ensure outputs has enough space for all QNN outputs
if (!expectedOrder.empty() && expectedOrder.size() == qnn_output_tensors.size()) {
// Debug: Log output order information
// Uncomment below for debugging output order issues
// MLLM_INFO("QNNBackend::graphExecute: Checking output order for graph '{}'", graphName);
// MLLM_INFO(" MLLM Expected Output Order ({} outputs):", expectedOrder.size());
// for (size_t i = 0; i < expectedOrder.size(); i++) {
// MLLM_INFO(" [{}] {}", i, expectedOrder[i]);
// }
// MLLM_INFO(" QNN Output Order ({} outputs):", model->getGraphOutputTensorWrappers().size());
// for (size_t i = 0; i < model->getGraphOutputTensorWrappers().size(); i++) {
// auto wrapper = model->getGraphOutputTensorWrappers()[i];
// MLLM_INFO(" [{}] {}", i, wrapper->getName());
// }

// Check if reordering is needed
// bool needs_reordering = false;
// std::vector<std::pair<size_t, int>> mismatches;
// for (size_t i = 0; i < expectedOrder.size(); i++) {
// const std::string& expected_name = expectedOrder[i];
// int qnn_index = model->getQnnOutputIndex(expected_name);
// if (qnn_index >= 0 && qnn_index < static_cast<int>(qnn_output_tensors.size())) {
// if (static_cast<int>(i) != qnn_index) {
// needs_reordering = true;
// mismatches.emplace_back(i, qnn_index);
// }
// }
// }

// Debug: Verification messages
// Uncomment below for debugging output order issues
// if (needs_reordering) {
// MLLM_INFO(" [VERIFICATION] QNN output order DIFFERS from MLLM expected order - REORDERING REQUIRED");
// for (const auto& [mllm_idx, qnn_idx] : mismatches) {
// MLLM_INFO(" Mismatch: MLLM[{}] expects '{}' but it's at QNN[{}]",
// mllm_idx, expectedOrder[mllm_idx], qnn_idx);
// }
// } else {
// MLLM_INFO(" [VERIFICATION] QNN output order MATCHES MLLM expected order - no reordering needed");
// }

// Reorder outputs according to expected order
for (size_t i = 0; i < expectedOrder.size(); i++) {
const std::string& expected_name = expectedOrder[i];
int qnn_index = model->getQnnOutputIndex(expected_name);
if (qnn_index >= 0 && qnn_index < static_cast<int>(qnn_output_tensors.size())) {
outputs[i] = qnn_output_tensors[qnn_index];
// Debug: Mapping information
// Uncomment below for debugging output order issues
// if (static_cast<int>(i) != qnn_index) {
// MLLM_INFO(" Mapping: MLLM[{}] = QNN[{}] (tensor: {}) [REORDERED]", i, qnn_index, expected_name);
// } else {
// MLLM_INFO(" Mapping: MLLM[{}] = QNN[{}] (tensor: {}) [SAME]", i, qnn_index, expected_name);
// }
} else {
MLLM_ERROR("QNNBackend::graphExecute: Failed to find QNN output index for tensor '{}' in graph '{}'", expected_name,
graphName);
7 changes: 5 additions & 2 deletions mllm/backends/qnn/QNNBackend.hpp
@@ -50,8 +50,8 @@ class QNNRuntime {
}

bool createContext(Qnn_ContextHandle_t& context, QnnContext_Config_t** contextConfig = nullptr);
bool retrieveContext(Qnn_ContextHandle_t& context, std::vector<std::shared_ptr<QNNModel>>& qnnModels,
QnnContext_Config_t** contextConfig = nullptr);
bool retrieveContext(const std::string& contextBinaryPath, Qnn_ContextHandle_t& context,
std::vector<std::shared_ptr<QNNModel>>& qnnModels, QnnContext_Config_t** contextConfig = nullptr);
Comment on lines +53 to +54

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# First, find and read the header file to confirm the signature change
find . -name "QNNBackend.hpp" -type f

Repository: UbiquitousLearning/mllm

Length of output: 101


🏁 Script executed:

# Search for all usages of retrieveContext in the codebase
rg -n "retrieveContext" --type=cpp --type=h -C 3

Repository: UbiquitousLearning/mllm

Length of output: 1879


🏁 Script executed:

# Search more comprehensively for all retrieveContext usages, including in tests and other files
rg -n "retrieveContext" --type=cpp --type=h --type=cc

Repository: UbiquitousLearning/mllm

Length of output: 93


🏁 Script executed:

# Read the header file to check for docstrings and API documentation
cat -n mllm/backends/qnn/QNNBackend.hpp | head -80

Repository: UbiquitousLearning/mllm

Length of output: 3618


🏁 Script executed:

# Also search in Python files in case there are bindings
rg -n "retrieveContext" --type=py

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Search more comprehensively across all source files
rg -n "retrieveContext" --type=cpp

Repository: UbiquitousLearning/mllm

Length of output: 464


🏁 Script executed:

# Check if there are any other file types that might use this API
find . -type f \( -name "*.cpp" -o -name "*.h" -o -name "*.hpp" \) | wc -l

Repository: UbiquitousLearning/mllm

Length of output: 71


🏁 Script executed:

# Look at the complete context around the method declaration to check for docstrings
sed -n '40,60p' mllm/backends/qnn/QNNBackend.hpp

Repository: UbiquitousLearning/mllm

Length of output: 874


🏁 Script executed:

# Check if createContext has docstrings (line 52)
sed -n '45,56p' mllm/backends/qnn/QNNBackend.hpp

Repository: UbiquitousLearning/mllm

Length of output: 686


🏁 Script executed:

# Check the complete implementation to understand the context better
sed -n '333,360p' mllm/backends/qnn/QNNBackend.cpp

Repository: UbiquitousLearning/mllm

Length of output: 1343


🏁 Script executed:

# Verify no other files in the project might be calling this method
find . -type f \( -name "*.cpp" -o -name "*.h" -o -name "*.hpp" \) -exec grep -l "QNNRuntime\|QNNBackend" {} \; | head -20

Repository: UbiquitousLearning/mllm

Length of output: 665


🏁 Script executed:

# Check if there are any examples or test files that might use the old API
find . -type f -name "*test*" -o -name "*example*" | grep -i qnn

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Double-check for any remaining usages of the old signature or other related calls
rg -n "retrieveContext\|loadContext\|createContext" mllm/backends/qnn/QNNBackend.cpp -A 2

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Check if the coding guideline about docstrings is being violated
# Look at similar public methods in other backends to see the pattern
find . -type f -name "*.hpp" -path "*/backends/*" -exec grep -l "bool.*(" {} \; | head -5 | xargs head -100

Repository: UbiquitousLearning/mllm

Length of output: 9600


🏁 Script executed:

# Verify the specific lines mentioned in the review (53-54) and surrounding context for any documentation
sed -n '48,58p' mllm/backends/qnn/QNNBackend.hpp

Repository: UbiquitousLearning/mllm

Length of output: 662


🏁 Script executed:

# Verify no trailing whitespace or other formatting issues in the header file
sed -n '52,55p' mllm/backends/qnn/QNNBackend.hpp | cat -A

Repository: UbiquitousLearning/mllm

Length of output: 383


🏁 Script executed:

# Check if there are any docstring patterns used elsewhere in QNN backend files
rg -n "///|//!" mllm/backends/qnn/QNNBackend.hpp | head -20

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Check Base Backend header for documentation patterns
rg -n "bool.*\(.*\);" mllm/backends/base/Backend.hpp -B 2 | head -30

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Check the entire QNNRuntime class declaration to see if there are any docstrings at all
sed -n '41,100p' mllm/backends/qnn/QNNBackend.hpp

Repository: UbiquitousLearning/mllm

Length of output: 2621


🏁 Script executed:

# Look at the implementation to see if docstrings are in the .cpp file instead
sed -n '325,340p' mllm/backends/qnn/QNNBackend.cpp

Repository: UbiquitousLearning/mllm

Length of output: 795


🏁 Script executed:

# Check how many methods in QNNRuntime lack documentation
grep -n "bool\|void" mllm/backends/qnn/QNNBackend.hpp | grep -A 1 "QNNRuntime" -m 20

Repository: UbiquitousLearning/mllm

Length of output: 49


🏁 Script executed:

# Check if these methods are newly added or modified (look at git context if available)
git log --oneline -1 mllm/backends/qnn/QNNBackend.hpp 2>/dev/null || echo "Git history not available"

Repository: UbiquitousLearning/mllm

Length of output: 125


🏁 Script executed:

# Verify the coding guideline compliance by checking if similar public methods in this file have docstrings
sed -n '85,130p' mllm/backends/qnn/QNNBackend.hpp

Repository: UbiquitousLearning/mllm

Length of output: 2112


Add docstrings to public API methods.

The retrieveContext and createContext methods (lines 52-54) lack docstrings. Per coding guidelines, public APIs must have clear docstrings explaining purpose, parameters, returns, and errors. The signature change itself is correct and the caller at QNNBackend.cpp:429 has been properly updated.

🤖 Prompt for AI Agents
In `@mllm/backends/qnn/QNNBackend.hpp` around lines 53-54, add clear docstrings
for the public methods retrieveContext and createContext: for each method,
describe its purpose, list and explain every parameter (e.g., contextBinaryPath,
context, qnnModels, and contextConfig for retrieveContext, and the relevant
parameters for createContext), state the return-value semantics (what true/false
indicates), and document possible errors or failure conditions and side effects
(e.g., ownership of pointers, when contextConfig may be modified, thread-safety
expectations). Place these comments immediately above the function declarations
so they become part of the public API docs.
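
As a possible starting point, a sketch only: the semantics are inferred from the implementations in QNNBackend.cpp, and details such as the config-forwarding behavior are assumptions rather than verified QNN SDK documentation.

```cpp
/// Creates a new, empty QNN context on the initialized backend.
/// @param context        Receives the Qnn_ContextHandle_t on success.
/// @param contextConfig  Optional context configuration forwarded to the QNN API; nullptr for defaults.
/// @return true on success, false if the QNN API fails to create the context.
bool createContext(Qnn_ContextHandle_t& context, QnnContext_Config_t** contextConfig = nullptr);

/// Restores a QNN context and its graphs from a serialized context binary.
/// @param contextBinaryPath  Path to the context binary produced by an earlier AOT compile.
/// @param context            Receives the deserialized Qnn_ContextHandle_t on success.
/// @param qnnModels          Populated with one QNNModel per graph recovered from the binary.
/// @param contextConfig      Optional context configuration forwarded to the QNN API; nullptr for defaults.
/// @return true on success, false if the binary cannot be read or deserialization fails.
bool retrieveContext(const std::string& contextBinaryPath, Qnn_ContextHandle_t& context,
                     std::vector<std::shared_ptr<QNNModel>>& qnnModels,
                     QnnContext_Config_t** contextConfig = nullptr);
```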


private:
QNN_INTERFACE_VER_TYPE qnnInterface;
@@ -87,6 +87,9 @@ class QNNBackend final : public Backend {
public:
QNNBackend();

bool loadContext(const std::string& contextPath);
bool createContext();

bool isWeightOnDevice() override { return false; }

// QNN Graph build interfaces
5 changes: 1 addition & 4 deletions mllm/backends/qnn/QNNUtils.cpp
@@ -483,10 +483,7 @@ std::shared_ptr<QNNTensorWrapper> QNNTensorWrapper::createStaticTensor(const std
}

void QNNTensorWrapper::alloc() {
if (isAlloc_) {
MLLM_WARN("Tensor {} has already been allocated.", name_);
return;
}
if (isAlloc_) { MLLM_WARN("Tensor {} has already been allocated.", name_); }
MLLM_RT_ASSERT(dataContainer_.device() == kQNN);

// if storage is not allocated, allocate it
7 changes: 7 additions & 0 deletions mllm/backends/qnn/QNNUtils.hpp
@@ -205,6 +205,13 @@ class QNNTensorWrapper {
Tensor& getDataContainer() { return dataContainer_; }
const std::vector<uint32_t>* getDimension() { return &dimensions_; }

bool isAlloc() { return isAlloc_; }
void __setDataContainer(const Tensor& tensor) {
MLLM_RT_ASSERT(dataContainer_.isNil())
dataContainer_ = tensor;
if (!tensor.isNil()) { isAlloc_ = true; }
}

// Helper to set complex quantization params and manage memory
void setScaleOffsetQuantization(const std::vector<Qnn_ScaleOffset_t>& scaleOffsets, int32_t axis);
void setBlockwiseQuantization(const Qnn_BlockwiseExpansion_t& blockwise, const std::vector<Qnn_ScaleOffset_t>& scaleOffsets);
8 changes: 7 additions & 1 deletion mllm/backends/qnn/Register.cpp
@@ -2,6 +2,7 @@
// Licensed under the MIT License.

#include <memory>
#include <filesystem>
#include "mllm/core/BaseOp.hpp"
#include "mllm/core/DeviceTypes.hpp"
#include "mllm/engine/Context.hpp"
@@ -13,12 +14,17 @@
namespace mllm {

// export initQnnBackend function to initialize QNN backend
void initQnnBackend() {
void initQnnBackend(const std::string& context_path) {
MLLM_RT_ASSERT(isQnnAvailable());
auto& ctx = Context::instance();

// 1. Register backend
auto backend = std::make_shared<qnn::QNNBackend>();
if (std::filesystem::exists(context_path)) {
if (!backend->loadContext(context_path)) { MLLM_ERROR_EXIT(1, "Failed to load QNN context from {}", context_path); }
} else {
if (!backend->createContext()) { MLLM_ERROR_EXIT(1, "Failed to create QNN context"); }
}
ctx.registerBackend(backend);

// 2. Initialize memory manager