3 changes: 1 addition & 2 deletions .gitignore
@@ -36,5 +36,4 @@ examples/demo_deepseek.cpp
src/models/deepseek/*
examples/demo.cpp

src/backends/qnn/sdk/*
*.mllm
src/backends/qnn/sdk/*
7 changes: 6 additions & 1 deletion CMakeLists.txt
@@ -35,7 +35,12 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
endif ()

if (ARM)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin-arm)
if(QNN)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin-arm-qnn)
else()
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin-arm)
endif()

add_compile_definitions(__ARM_FEATURE_DOTPROD)
# Check whether the compiler is GCC or Clang
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
1 change: 1 addition & 0 deletions README.md
@@ -128,6 +128,7 @@ mllm is a lightweight, fast, and easy-to-use (multimodal) on-device LLM inferenc
```bash
git clone https://github.com/UbiquitousLearning/mllm
cd mllm
git submodule update --init --recursive
```

### Check prerequisites
10 changes: 4 additions & 6 deletions examples/CMakeLists.txt
@@ -81,9 +81,6 @@ func_llm_add_executable(demo_phonelm)
func_llm_add_executable(demo_llama3)
func_llm_add_executable(demo_minicpm_moe_mbm)
func_llm_add_executable(demo_minicpm_moe_mbp)



func_vlm_add_executable(demo_llava)
func_vlm_add_executable(demo_fuyu)
func_vlm_add_executable(demo_vit)
@@ -99,11 +96,12 @@ func_vlm_add_executable(demo_showui)

if(QNN)
func_llm_add_executable(demo_qwen_npu)
func_llm_add_executable(main_qwen_npu)
# func_llm_add_executable(main_qwen_npu)
func_llm_add_executable(demo_phonelm_npu)
func_llm_add_executable(main_phonelm_npu)
# func_llm_add_executable(main_phonelm_npu)
func_llm_add_executable(demo_qwen2.5_npu)
func_llm_add_executable(demo_qwen_pipeline)
# func_llm_add_executable(demo_qwen_pipeline)
func_vlm_add_executable(demo_qwen2_vl_npu)
endif()


10 changes: 10 additions & 0 deletions examples/demo_phonelm_npu.cpp
@@ -1,4 +1,5 @@
#include "Module.hpp"
#include "QNNBackend.hpp"
#include "Types.hpp"
#include <memory>
#include "backends/cpu/CPUBackend.hpp"
@@ -13,8 +14,10 @@ int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");

cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/phonelm-1.5b-instruct-int8.mllm");
cmdParser.add<string>("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-instruct-q4_0_4_4.mllm");

cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.add<int>("chunk", 'c', "chunk size", false, 64);
@@ -28,6 +31,8 @@ int main(int argc, char **argv) {
int chunk_size = cmdParser.get<int>("chunk");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

Module::initBackend(MLLM_QNN);

auto tokenizer = SmolLMTokenizer(vocab_path, merge_path);
PhoneLMConfig config(tokens_limit, "1.5B");
auto model = PhoneLMForCausalLM_NPU(config, chunk_size);
@@ -57,8 +62,13 @@ int main(int argc, char **argv) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
// turn on the multi-chunk prefilling
Module::isMultiChunkPrefilling = true;

// warmup END
std::cout << "Warmup finished." << std::endl;
if (!std::filesystem::exists("qnn_context.bin")) {
static_cast<QNNBackend *>(Backend::global_backends[MLLM_QNN])->saveQNNContext();
}


vector<string> in_strs = {
"Give me a short introduction to large language model.",
8 changes: 3 additions & 5 deletions examples/demo_qwen.cpp
@@ -39,9 +39,7 @@ int main(int argc, char **argv) {
model.load(model_path);

vector<string> in_strs = {
"Hello, who are you?",
"What can you do?",
"Please introduce Beijing University of Posts and Telecommunications.",
" Give me a short introduction to large language model.",
};
for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
@@ -50,8 +48,8 @@ int main(int argc, char **argv) {
std::cout << "[A] " << std::flush;

LlmTextGeneratorOpts opt{
.max_new_tokens = 100,
.do_sample = true,
.max_new_tokens = 1,
.do_sample = false,
.temperature = 0.3F,
.top_k = 50,
.top_p = 0.F,
30 changes: 19 additions & 11 deletions examples/demo_qwen2.5_npu.cpp
@@ -1,3 +1,5 @@
#include "QNNBackend.hpp"
#include "Types.hpp"
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
@@ -12,8 +14,8 @@ int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2.5_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2.5_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/Qwen2.5-1.5B-Instruct.mllm");
cmdParser.add<string>("billion", 'b', "[0.5B | 1.8B | 1.5B]", false, "1.8B");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-2-int8-int32bias-0kproj-test.mllm");
cmdParser.add<string>("billion", 'b', "[0.5B | 1.8B | 1.5B]", false, "1.5B");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);
@@ -26,19 +28,22 @@ int main(int argc, char **argv) {
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = QWenTokenizer(vocab_path, merge_path);
QWenConfig config(tokens_limit, "1.5B", RoPEType::HFHUBROPE);
auto model = QWenForCausalLM_NPU(config, 64);
QWenConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE);
auto model = v2::QWenForCausalLM_NPU(config, 32);

Module::initBackend(MLLM_QNN);

model.load(model_path);
auto decoding_model = QWenForCausalLM(config);
decoding_model.load("../models/qwen-2.5-1.5b-instruct-q4_0_4_4.mllm");
decoding_model.load("../models/qwen-2.5-1.5b-instruct-q4_k.mllm");

vector<string> in_strs = {
" Give me a short introduction to large language model.",
};

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size);
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 32, config.vocab_size);
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;

@@ -48,9 +53,7 @@ int main(int argc, char **argv) {
LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,

.is_padding = true,
.seq_before_padding = real_seq_length,
};
@@ -67,7 +70,7 @@ int main(int argc, char **argv) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
.max_new_tokens = 50,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
@@ -79,6 +82,7 @@ int main(int argc, char **argv) {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
@@ -96,5 +100,9 @@ int main(int argc, char **argv) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
std::cout << "\n";

if (!std::filesystem::exists("qnn_context.bin")) {
static_cast<QNNBackend *>(Backend::global_backends[MLLM_QNN])->saveQNNContext();
}
}
}
}
124 changes: 124 additions & 0 deletions examples/demo_qwen2.5_pipeline.cpp
@@ -0,0 +1,124 @@
#include "Backend.hpp"
#include "Trace.hpp"
#include "Types.hpp"
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
#include "models/qwen/modeling_qwen_npu.hpp"
#include "models/qwen/modeling_qwen.hpp"
#include "models/qwen/tokenization_qwen.hpp"
#include "processor/PostProcess.hpp"
#include "Parallel.hpp"

using namespace mllm;

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2.5_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2.5_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-2-int8-test.mllm");
cmdParser.add<string>("billion", 'b', "[0.5B | 1.8B | 1.5B]", false, "1.5B");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string merge_path = cmdParser.get<string>("merge");
string model_path = cmdParser.get<string>("model");
string model_billion = cmdParser.get<string>("billion");
int tokens_limit = cmdParser.get<int>("limits");
const int chunk_size = 128;
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

Module::initBackend(MLLM_QNN);

auto tokenizer = QWenTokenizer(vocab_path, merge_path);
QWenConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE);
auto model = v2::QWenForCausalLM_NPU(config, chunk_size);
model.load(model_path);
auto decoding_model = QWenForCausalLM(config);
decoding_model.load("../models/qwen-2.5-1.5b-instruct-q4_0_4_4.mllm");

string trace_string = " ";
auto [_, input_tensor] = tokenizer.tokenizePaddingByChunk(trace_string, chunk_size, config.vocab_size);
Tracer::trace(&model, {input_tensor});
std::cout << "Trace and Warmup finished" << std::endl;

vector<string> in_strs = {
// " Give me a short introduction to large language model.",
"\"Large Language Models (LLMs) are advanced artificial intelligence systems designed to understand and generate human-like text. These models are trained on vast amounts of data, enabling them to perform a wide range of tasks, from answering questions and summarizing text to generating creative content and engaging in conversational dialogue. LLMs like GPT-3 and GPT-4, developed by OpenAI, have set new benchmarks in natural language processing by leveraging deep learning architectures, particularly transformer models, which excel at capturing context and relationships within text. The scalability and versatility of LLMs make them invaluable tools for applications in education, customer service, content creation, and more. However, their deployment also raises ethical considerations, including issues of bias, misinformation, and the potential for misuse. As the field continues to evolve, ongoing research and responsible deployment strategies are essential to harnessing the full potential of these powerful AI systems while mitigating their risks.\"\nGenerate a title based on the above text."};

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);

// set total seq length for HeadLinear execute, which can not get the real seq length from Opts
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setTotalSequenceLength(real_seq_length);
// set chunk size for the HeadLinear execute, which can not get the chunk size from Opts
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setChunkSize(chunk_size);

std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;
std::cout << "real_seq_length: " << real_seq_length << std::endl;

LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.is_padding = true,
.seq_before_padding = real_seq_length,
.chunk_size = chunk_size,
};

// tensor vectors to save the chunked tensors of the QNN prefilling input
bool isSwitched = false;

ChunkPipeline pipeline(real_seq_length, chunk_size);
auto prefill_result = pipeline.run(input_tensor, opt, tokenizer, model, isSwitched);

Module::isMultiChunkPrefilling = true;
Module::isFirstChunk = false;

static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setCurSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(AUTOREGRESSIVE);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = false,
};
isSwitched = false;

Tensor decoding_input;
decoding_input.setBackend(Backend::global_backends[MLLM_CPU]);
decoding_input.setTtype(INPUT_TENSOR);
decoding_input.reshape(1, 1, 1, 1);
decoding_input.setName("input0");
decoding_input.alloc();
decoding_input.setDataAt(0, 0, 0, 0, prefill_result->dataAt<float>(0, 0, 0, 0));
decoding_model.generate(decoding_input, decoding_opt, [&](unsigned int out_token) -> bool {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
auto [isOk, print_string] = tokenizer.postprocess(out_string);
if (isOk) {
std::cout << print_string << std::flush;
} else {
return false;
}
return true;
});

// turn on switching, set sequence length and execution type
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setCurSequenceLength(0);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
std::cout << "\n";
}
}
6 changes: 3 additions & 3 deletions examples/demo_qwen2_vl.cpp
@@ -2,6 +2,7 @@
#include "cmdline.h"
#include "models/qwen2_vl/configuration_qwen2_vl.hpp"
#include "models/qwen2_vl/modeling_qwen2_vl.hpp"
// #include "models/qwen2_vl/vtp/modeling_qwen2_vl.hpp"
#include "models/qwen2_vl/processing_qwen2_vl.hpp"
#include "processor/PostProcess.hpp"

@@ -11,7 +12,7 @@ int main(int argc, char **argv) {
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2vl_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2vl_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-2-vl-2b-instruct-q4_k.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 2000);
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 800);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);

@@ -25,8 +26,7 @@ int main(int argc, char **argv) {
ParamLoader param_loader(model_path);
auto processor = Qwen2VLProcessor(vocab_path, merge_path);
Qwen2VLConfig config(tokens_limit, "1.5b");
auto model_config = Qwen2VLConfig(config);
auto model = Qwen2VLModel(model_config);
auto model = Qwen2VLModel(config);
model.load(model_path);

vector<string> in_imgs = {