3 changes: 1 addition & 2 deletions .gitignore
@@ -36,5 +36,4 @@ examples/demo_deepseek.cpp
src/models/deepseek/*
examples/demo.cpp

src/backends/qnn/sdk/*
*.mllm
src/backends/qnn/sdk/*
7 changes: 6 additions & 1 deletion CMakeLists.txt
@@ -35,7 +35,12 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
endif ()

if (ARM)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin-arm)
if(QNN)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin-arm-qnn)
else()
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin-arm)
endif()

add_compile_definitions(__ARM_FEATURE_DOTPROD)
# Check whether the compiler is GCC or Clang
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
1 change: 1 addition & 0 deletions README.md
@@ -128,6 +128,7 @@ mllm is a lightweight, fast, and easy-to-use (multimodal) on-device LLM inferenc
```bash
git clone https://github.com/UbiquitousLearning/mllm
cd mllm
git submodule update --init --recursive
```

### Check prerequisites
10 changes: 4 additions & 6 deletions examples/CMakeLists.txt
@@ -81,9 +81,6 @@ func_llm_add_executable(demo_phonelm)
func_llm_add_executable(demo_llama3)
func_llm_add_executable(demo_minicpm_moe_mbm)
func_llm_add_executable(demo_minicpm_moe_mbp)



func_vlm_add_executable(demo_llava)
func_vlm_add_executable(demo_fuyu)
func_vlm_add_executable(demo_vit)
@@ -99,11 +96,12 @@ func_vlm_add_executable(demo_showui)

if(QNN)
func_llm_add_executable(demo_qwen_npu)
func_llm_add_executable(main_qwen_npu)
# func_llm_add_executable(main_qwen_npu)
func_llm_add_executable(demo_phonelm_npu)
func_llm_add_executable(main_phonelm_npu)
# func_llm_add_executable(main_phonelm_npu)
func_llm_add_executable(demo_qwen2.5_npu)
func_llm_add_executable(demo_qwen_pipeline)
# func_llm_add_executable(demo_qwen_pipeline)
func_vlm_add_executable(demo_qwen2_vl_npu)
endif()


10 changes: 10 additions & 0 deletions examples/demo_phonelm_npu.cpp
@@ -1,4 +1,5 @@
#include "Module.hpp"
#include "QNNBackend.hpp"
#include "Types.hpp"
#include <memory>
#include "backends/cpu/CPUBackend.hpp"
@@ -13,8 +14,10 @@ int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");

cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/phonelm-1.5b-instruct-int8.mllm");
cmdParser.add<string>("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-instruct-q4_0_4_4.mllm");

cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.add<int>("chunk", 'c', "chunk size", false, 64);
@@ -28,6 +31,8 @@ int main(int argc, char **argv) {
int chunk_size = cmdParser.get<int>("chunk");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

Module::initBackend(MLLM_QNN);

auto tokenizer = SmolLMTokenizer(vocab_path, merge_path);
PhoneLMConfig config(tokens_limit, "1.5B");
auto model = PhoneLMForCausalLM_NPU(config, chunk_size);
@@ -57,8 +62,13 @@ int main(int argc, char **argv) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
// turn on the multi-chunk prefilling
Module::isMultiChunkPrefilling = true;

// warmup END
std::cout << "Warmup finished." << std::endl;
if (!std::filesystem::exists("qnn_context.bin")) {
static_cast<QNNBackend *>(Backend::global_backends[MLLM_QNN])->saveQNNContext();
}


vector<string> in_strs = {
"Give me a short introduction to large language model.",
8 changes: 3 additions & 5 deletions examples/demo_qwen.cpp
@@ -39,9 +39,7 @@ int main(int argc, char **argv) {
model.load(model_path);

vector<string> in_strs = {
"Hello, who are you?",
"What can you do?",
"Please introduce Beijing University of Posts and Telecommunications.",
" Give me a short introduction to large language model.",
};
for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
@@ -50,8 +48,8 @@ int main(int argc, char **argv) {
std::cout << "[A] " << std::flush;

LlmTextGeneratorOpts opt{
.max_new_tokens = 100,
.do_sample = true,
.max_new_tokens = 1,
.do_sample = false,
.temperature = 0.3F,
.top_k = 50,
.top_p = 0.F,
30 changes: 19 additions & 11 deletions examples/demo_qwen2.5_npu.cpp
@@ -1,3 +1,5 @@
#include "QNNBackend.hpp"
#include "Types.hpp"
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
@@ -12,8 +14,8 @@ int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2.5_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2.5_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/Qwen2.5-1.5B-Instruct.mllm");
cmdParser.add<string>("billion", 'b', "[0.5B | 1.8B | 1.5B]", false, "1.8B");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-2-int8-int32bias-0kproj-test.mllm");
cmdParser.add<string>("billion", 'b', "[0.5B | 1.8B | 1.5B]", false, "1.5B");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);
@@ -26,19 +28,22 @@ int main(int argc, char **argv) {
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = QWenTokenizer(vocab_path, merge_path);
QWenConfig config(tokens_limit, "1.5B", RoPEType::HFHUBROPE);
auto model = QWenForCausalLM_NPU(config, 64);
QWenConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE);
auto model = v2::QWenForCausalLM_NPU(config, 32);

Module::initBackend(MLLM_QNN);

model.load(model_path);
auto decoding_model = QWenForCausalLM(config);
decoding_model.load("../models/qwen-2.5-1.5b-instruct-q4_0_4_4.mllm");
decoding_model.load("../models/qwen-2.5-1.5b-instruct-q4_k.mllm");

vector<string> in_strs = {
" Give me a short introduction to large language model.",
};

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size);
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 32, config.vocab_size);
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;

@@ -48,9 +53,7 @@ int main(int argc, char **argv) {
LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,

.is_padding = true,
.seq_before_padding = real_seq_length,
};
@@ -67,7 +70,7 @@ int main(int argc, char **argv) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
.max_new_tokens = 50,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
@@ -79,6 +82,7 @@ int main(int argc, char **argv) {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
@@ -96,5 +100,9 @@ int main(int argc, char **argv) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
std::cout << "\n";

if (!std::filesystem::exists("qnn_context.bin")) {
static_cast<QNNBackend *>(Backend::global_backends[MLLM_QNN])->saveQNNContext();
}
}
}
}
124 changes: 124 additions & 0 deletions examples/demo_qwen2.5_pipeline.cpp
@@ -0,0 +1,124 @@
#include "Backend.hpp"
#include "Trace.hpp"
#include "Types.hpp"
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
#include "models/qwen/modeling_qwen_npu.hpp"
#include "models/qwen/modeling_qwen.hpp"
#include "models/qwen/tokenization_qwen.hpp"
#include "processor/PostProcess.hpp"
#include "Parallel.hpp"

using namespace mllm;

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2.5_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2.5_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-2-int8-test.mllm");
cmdParser.add<string>("billion", 'b', "[0.5B | 1.8B | 1.5B]", false, "1.5B");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string merge_path = cmdParser.get<string>("merge");
string model_path = cmdParser.get<string>("model");
string model_billion = cmdParser.get<string>("billion");
int tokens_limit = cmdParser.get<int>("limits");
const int chunk_size = 128;
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

Module::initBackend(MLLM_QNN);

auto tokenizer = QWenTokenizer(vocab_path, merge_path);
QWenConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE);
auto model = v2::QWenForCausalLM_NPU(config, chunk_size);
model.load(model_path);
auto decoding_model = QWenForCausalLM(config);
decoding_model.load("../models/qwen-2.5-1.5b-instruct-q4_0_4_4.mllm");

string trace_string = " ";
auto [_, input_tensor] = tokenizer.tokenizePaddingByChunk(trace_string, chunk_size, config.vocab_size);
Tracer::trace(&model, {input_tensor});
std::cout << "Trace and Warmup finished" << std::endl;

vector<string> in_strs = {
// " Give me a short introduction to large language model.",
"\"Large Language Models (LLMs) are advanced artificial intelligence systems designed to understand and generate human-like text. These models are trained on vast amounts of data, enabling them to perform a wide range of tasks, from answering questions and summarizing text to generating creative content and engaging in conversational dialogue. LLMs like GPT-3 and GPT-4, developed by OpenAI, have set new benchmarks in natural language processing by leveraging deep learning architectures, particularly transformer models, which excel at capturing context and relationships within text. The scalability and versatility of LLMs make them invaluable tools for applications in education, customer service, content creation, and more. However, their deployment also raises ethical considerations, including issues of bias, misinformation, and the potential for misuse. As the field continues to evolve, ongoing research and responsible deployment strategies are essential to harnessing the full potential of these powerful AI systems while mitigating their risks.\"\nGenerate a title based on the above text."};

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);

// set total seq length for HeadLinear execute, which can not get the real seq length from Opts
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setTotalSequenceLength(real_seq_length);
// set chunk size for the HeadLinear execute, which can not get the chunk size from Opts
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setChunkSize(chunk_size);

std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;
std::cout << "real_seq_length: " << real_seq_length << std::endl;

LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.is_padding = true,
.seq_before_padding = real_seq_length,
.chunk_size = chunk_size,
};

// tensor vectors to save the chunked tensors of the QNN prefilling input
bool isSwitched = false;

ChunkPipeline pipeline(real_seq_length, chunk_size);
auto prefill_result = pipeline.run(input_tensor, opt, tokenizer, model, isSwitched);

Module::isMultiChunkPrefilling = true;
Module::isFirstChunk = false;

static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setCurSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(AUTOREGRESSIVE);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = false,
};
isSwitched = false;

Tensor decoding_input;
decoding_input.setBackend(Backend::global_backends[MLLM_CPU]);
decoding_input.setTtype(INPUT_TENSOR);
decoding_input.reshape(1, 1, 1, 1);
decoding_input.setName("input0");
decoding_input.alloc();
decoding_input.setDataAt(0, 0, 0, 0, prefill_result->dataAt<float>(0, 0, 0, 0));
decoding_model.generate(decoding_input, decoding_opt, [&](unsigned int out_token) -> bool {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
auto [isOk, print_string] = tokenizer.postprocess(out_string);
if (isOk) {
std::cout << print_string << std::flush;
} else {
return false;
}
return true;
});

// turn on switching, set sequence length and execution type
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setCurSequenceLength(0);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
std::cout << "\n";
}
}
6 changes: 3 additions & 3 deletions examples/demo_qwen2_vl.cpp
@@ -2,6 +2,7 @@
#include "cmdline.h"
#include "models/qwen2_vl/configuration_qwen2_vl.hpp"
#include "models/qwen2_vl/modeling_qwen2_vl.hpp"
// #include "models/qwen2_vl/vtp/modeling_qwen2_vl.hpp"
#include "models/qwen2_vl/processing_qwen2_vl.hpp"
#include "processor/PostProcess.hpp"

@@ -11,7 +12,7 @@ int main(int argc, char **argv) {
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2vl_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2vl_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-2-vl-2b-instruct-q4_k.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 2000);
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 800);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);

@@ -25,8 +26,7 @@ int main(int argc, char **argv) {
ParamLoader param_loader(model_path);
auto processor = Qwen2VLProcessor(vocab_path, merge_path);
Qwen2VLConfig config(tokens_limit, "1.5b");
auto model_config = Qwen2VLConfig(config);
auto model = Qwen2VLModel(model_config);
auto model = Qwen2VLModel(config);
model.load(model_path);

vector<string> in_imgs = {