diff --git a/.clang-tidy.ignore b/.clang-tidy.ignore
index 2b27ee02b..94d682437 100644
--- a/.clang-tidy.ignore
+++ b/.clang-tidy.ignore
@@ -1 +1 @@
-src/quantizer/gguf.hpp
\ No newline at end of file
+mllm/quantizer/gguf.hpp
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 97847b5ed..a65454c60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
.vscode/
.idea/
.cache/
+.DS_Store
build*/
build/
bin/
@@ -26,15 +27,20 @@ models/*
/.devcontainer/
/.vscode/
workflow.py
-src/backends/qnn/qualcomm_ai_engine_direct_220/*
-src/backends/qnn/HexagonSDK/*
+mllm/backends/qnn/qualcomm_ai_engine_direct_220/*
+mllm/backends/qnn/HexagonSDK/*
tmp/
py-build-out/
mllm.egg-info/
-examples/demo_deepseek.cpp
-src/models/deepseek/*
-examples/demo.cpp
+mllm/backends/qnn/sdk*
-src/backends/qnn/sdk/*
-*.mllm
+
+
+.DS_Store
+examples/test.cpp
+examples/demo_bailing_moe2*
+mllm/models/ling2
+scripts/tmp.sh
+tools/convertor/gptq_converter.py
+*.patch
diff --git a/.gitmodules b/.gitmodules
index b52617297..78351cb62 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,9 +4,12 @@
[submodule "third_party/pybind11"]
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
-[submodule "src/backends/xnnpack/third_party/XNNPACK"]
- path = src/backends/xnnpack/third_party/XNNPACK
+[submodule "mllm/backends/xnnpack/third_party/XNNPACK"]
+ path = mllm/backends/xnnpack/third_party/XNNPACK
url = https://github.com/google/XNNPACK.git
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest.git
+[submodule "mllm/backends/cpu/third_party/kleidiai"]
+ path = mllm/backends/cpu/third_party/kleidiai
+ url = https://github.com/ARM-software/kleidiai
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c842fefd4..ae0fd8f13 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.10)
-project(mllm)
+project(mllm CXX C ASM)
# 添加编译选项来禁用所有警告
# if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
@@ -11,6 +11,29 @@ project(mllm)
cmake_policy(SET CMP0074 NEW)
set(CMAKE_CXX_STANDARD 17)
+
+
+# Add an AddressSanitizer option
+option(USE_ASAN "Enable AddressSanitizer for memory leak detection" OFF)
+
+if(USE_ASAN)
+ message(STATUS "Enabling AddressSanitizer")
+    # Make sure debug symbols are included
+ if(NOT MSVC)
+ add_compile_options(-g)
+ endif()
+
+    # Set the ASan compile options
+ if(MSVC)
+ add_compile_options(/fsanitize=address)
+        add_link_options(/fsanitize=address)
+ else()
+ add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
+ add_link_options(-fsanitize=address)
+ endif()
+endif()
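+# Example usage (illustrative): configure with `cmake -DUSE_ASAN=ON ..` to build all
+# targets with AddressSanitizer instrumentation.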
+
+
option(ARM "build on ARM" OFF)
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
@@ -31,17 +54,24 @@ add_compile_options(-Wno-gnu-string-literal-operator-template)
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
message(STATUS "ARM detected")
set(ARM ON)
- set(ANDROID_PLATFORM android-28)
+ # set(ANDROID_PLATFORM android-28)
endif ()
if (ARM)
- set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin-arm)
+    if(QNN)
+        set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin-arm-qnn)
+    elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL "Darwin" AND NOT CMAKE_CROSSCOMPILING)
+        set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin)
+    else()
+        set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin-arm)
+    endif()
+
add_compile_definitions(__ARM_FEATURE_DOTPROD)
# 检查是否使用的是 GCC 或 Clang 编译器
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
# 默认使用 armv8.2-a+dotprod,除非用户自定义了 CMAKE_CXX_FLAGS
if(NOT DEFINED CMAKE_CXX_FLAGS OR CMAKE_CXX_FLAGS STREQUAL "")
- set(CMAKE_CXX_FLAGS "-march=armv8.6-a+dotprod+i8mm")
+ set(CMAKE_CXX_FLAGS "-march=armv8.2-a+fp16+fp16fml+dotprod+i8mm")
endif()
endif()
else ()
@@ -88,6 +118,32 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
cmake_policy(SET CMP0135 NEW)
endif ()
+option(OPENCL "Enable OpenCL Backend" ON)
+if(OPENCL)
+ if(ANDROID)
+        # With a modern NDK (r23+), find_library / find_package is no longer needed:
+        # the NDK toolchain automatically resolves system libraries such as OpenCL at link time.
+        # We only need to enable the relevant macro definitions and name the library when linking targets.
+ message(STATUS "OpenCL backend for Android enabled. Linking will be handled by the NDK toolchain.")
+ add_definitions(-DUSE_OPENCL)
+ add_definitions(-DMLLM_TARGET_ANDROID)
+ include_directories(${CMAKE_SOURCE_DIR}/mllm/backends/opencl/third_party/OpenCL-Headers)
+
+ else()
+        # For non-Android platforms, keep the original discovery logic
+ find_package(OpenCL)
+ if(NOT OpenCL_FOUND)
+ message(STATUS "OpenCl backend not found.")
+ set(OPENCL OFF CACHE BOOL "Enable OpenCL Backend" FORCE)
+ else()
+ message(STATUS "OpenCL backend enabled.")
+ add_definitions(-DUSE_OPENCL)
+ endif()
+ endif()
+endif()
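+# The OpenCL backend defaults to ON; pass -DOPENCL=OFF at configure time to build without it.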
+
+
+
# for XNNPACK, avoid invovle googltest twice.
set(GOOGLETEST_SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/third_party/googletest)
add_subdirectory(third_party/googletest EXCLUDE_FROM_ALL)
@@ -107,11 +163,11 @@ if (ARM AND NOT APK)
set(MLLM_OPENMP_STATIC ON)
endif ()
# turn off openmp when build on mac or for mac
-if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Darwin" AND NOT CMAKE_CROSSCOMPILING)
- message(STATUS "mac detected, turn off openmp")
- set(MLLM_OPENMP OFF)
- set(MLLM_OPENMP_STATIC OFF)
-endif ()
+# if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Darwin" AND NOT CMAKE_CROSSCOMPILING)
+# message(STATUS "mac detected, turn off openmp")
+# set(MLLM_OPENMP OFF)
+# set(MLLM_OPENMP_STATIC OFF)
+# endif ()
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
message(STATUS "x86_64 detected")
@@ -127,18 +183,19 @@ elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATC
endif()
endif()
-aux_source_directory(${PROJECT_SOURCE_DIR}/src DIR_SRC)
+aux_source_directory(${PROJECT_SOURCE_DIR}/mllm DIR_SRC)
-aux_source_directory(${PROJECT_SOURCE_DIR}/src/express DIR_SRC_EXP)
+aux_source_directory(${PROJECT_SOURCE_DIR}/mllm/express DIR_SRC_EXP)
+# Directory for legacy code (Graph, Net, Executor), which is not used in the current version
+aux_source_directory(${PROJECT_SOURCE_DIR}/mllm/legacy DIR_SRC_LEGACY)
-aux_source_directory(${PROJECT_SOURCE_DIR}/src/processor DIR_SRC_PROCESSOE)
-aux_source_directory(${PROJECT_SOURCE_DIR}/src/memory DIR_SRC_MEM_MANAGER)
+aux_source_directory(${PROJECT_SOURCE_DIR}/mllm/processor DIR_SRC_PROCESSOE)
+aux_source_directory(${PROJECT_SOURCE_DIR}/mllm/memory DIR_SRC_MEM_MANAGER)
aux_source_directory(${PROJECT_SOURCE_DIR}/examples EMP_SRC)
aux_source_directory(${PROJECT_SOURCE_DIR}/test TEST_SRC)
aux_source_directory(${PROJECT_SOURCE_DIR}/third_party/wenet_audio DIR_THIRDPARTY_AUDIO)
-include_directories(${PROJECT_SOURCE_DIR}/src)
-include_directories(${PROJECT_SOURCE_DIR}/include)
+include_directories(${PROJECT_SOURCE_DIR}/mllm)
include_directories(${PROJECT_SOURCE_DIR}/third_party)
include_directories(${PROJECT_SOURCE_DIR}/third_party/fmt/include)
@@ -146,20 +203,21 @@ include_directories(${PROJECT_SOURCE_DIR}/third_party/fmt/include)
# You can remove those lines if you just want to build mllm instead dev on it.
include_directories(${PROJECT_SOURCE_DIR}/third_party/pybind11/include)
-add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/cpu)
+add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mllm/backends/cpu)
+
+
+if(OPENCL)
+ add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mllm/backends/opencl)
+endif()
+# =========
if(QNN) # QNN lib
include_directories(
# $ENV{QNN_SDK_ROOT}/include/QNN # QNN SDK include
- ${PROJECT_SOURCE_DIR}/src/backends/qnn/sdk/include/QNN # QNN SDK include
- ${CMAKE_CURRENT_LIST_DIR}/src/backends/qnn
- ${CMAKE_CURRENT_LIST_DIR}/src/backends/qnn/Log
- ${CMAKE_CURRENT_LIST_DIR}/src/backends/qnn/PAL/include
- ${CMAKE_CURRENT_LIST_DIR}/src/backends/qnn/Model
- ${CMAKE_CURRENT_LIST_DIR}/src/backends/qnn/Utils
- ${CMAKE_CURRENT_LIST_DIR}/src/backends/qnn/WrapperUtils
+ ${PROJECT_SOURCE_DIR}/mllm/backends/qnn/sdk/include/QNN # QNN SDK include
+ ${CMAKE_CURRENT_LIST_DIR}/mllm/backends/qnn
)
- add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/qnn)
+ add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mllm/backends/qnn)
endif()
option(MLLM_BUILD_XNNPACK_BACKEND "Build mllm's XNNPACK backend" OFF)
@@ -175,41 +233,88 @@ if(MLLM_BUILD_XNNPACK_BACKEND)
set(XNNPACK_BUILD_TESTS OFF)
set(XNNPACK_BUILD_BENCHMARKS OFF)
add_definitions(-DMLLM_BUILD_XNNPACK_BACKEND=1)
- add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/xnnpack)
+ add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mllm/backends/xnnpack)
endif()
# add tokenizers
file(GLOB_RECURSE SRC_TOKENIZERS
- ${PROJECT_SOURCE_DIR}/src/tokenizers/*.cpp
- ${PROJECT_SOURCE_DIR}/src/tokenizers/*.hpp
+ ${PROJECT_SOURCE_DIR}/mllm/tokenizers/*.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/tokenizers/*.hpp
)
# if compile to x86_64
if(QUANT)
- include_directories(${PROJECT_SOURCE_DIR}/src/quantizer)
+ include_directories(${PROJECT_SOURCE_DIR}/tools/quantizer)
file(GLOB_RECURSE MLLM_QUANT
- ${PROJECT_SOURCE_DIR}/src/backends/cpu/compute/GEMM_AArch64.hpp
- ${PROJECT_SOURCE_DIR}/src/backends/cpu/compute/GEMM_AArch64.cpp
- ${PROJECT_SOURCE_DIR}/src/backends/cpu/quantize/*.hpp
- ${PROJECT_SOURCE_DIR}/src/backends/cpu/quantize/*.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/backends/cpu/third_party/ggml/GemmPack.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/backends/cpu/compute/GemmKleidiai.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/backends/cpu/third_party/ggml/QuantizeQ8.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/backends/cpu/third_party/ggml/QuantizeQ4.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/backends/cpu/third_party/ggml/QuantizeQ6.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/backends/cpu/third_party/ggml/QuantizeQ3.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/backends/cpu/third_party/ggml/QuantizeQ2.cpp
)
- file(GLOB_RECURSE MLLM_QUANTIZER
- ${CMAKE_CURRENT_LIST_DIR}/src/quantizer/*.cpp
- ${CMAKE_CURRENT_LIST_DIR}/src/quantizer/*.hpp)
- list(REMOVE_ITEM MLLM_QUANTIZER ${CMAKE_CURRENT_LIST_DIR}/src/quantizer/main.cpp)
+ if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
+        # Configure the kleidiai library path
+ set(KLEIDIAI_SOURCE_DIR ${PROJECT_SOURCE_DIR}/mllm/backends/cpu/third_party/kleidiai)
+ if(NOT EXISTS ${KLEIDIAI_SOURCE_DIR})
+            message(FATAL_ERROR "kleidiai library not found! Please place it in 'mllm/backends/cpu/third_party/kleidiai'.")
+ endif()
+        # Append all kleidiai source files to MLLM_QUANT
+ list(APPEND MLLM_QUANT
+            # QSI4_C32P (to FP32) module sources
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod_asm.S
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm.c
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm_asm.S
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0.c
+
+            # [new] QSI4_CXP (to FP16) module sources
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp1x8_qsi4cxp4x8_1x4_neon_dotprod.c
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp1x8_qsi4cxp4x8_1x4_neon_dotprod_asm.S
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp4x8_qsi4cxp4x8_16x4_neon_i8mm.c
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp4x8_qsi4cxp4x8_16x4_neon_i8mm_asm.S
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f16_neon.c
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0.c
+
+            # FP16 (f16*f16) module sources
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.c
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c
+
+            # FP32 (f32*f32) module sources
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_asm.S
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c
+ )
+ include_directories(
+ ${KLEIDIAI_SOURCE_DIR}
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/pack
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p
+ ${KLEIDIAI_SOURCE_DIR}/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p
+ )
+ endif()
+ file(GLOB_RECURSE MLLM_QUANTIZER
+ ${PROJECT_SOURCE_DIR}/tools/quantizer/*.cpp
+ ${PROJECT_SOURCE_DIR}/tools/quantizer/*.hpp)
+ list(REMOVE_ITEM MLLM_QUANTIZER ${PROJECT_SOURCE_DIR}/tools/quantizer/main_quantize.cpp)
add_executable(
quantize
- ${PROJECT_SOURCE_DIR}/src/quantizer/main.cpp
+ ${PROJECT_SOURCE_DIR}/tools/quantizer/main_quantize.cpp
${MLLM_QUANT}
${MLLM_QUANTIZER}
-
- # ${DIR_SRC}
- ${PROJECT_SOURCE_DIR}/src/ParamLoader.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/ParamLoader.cpp
)
- target_link_libraries(quantize fmt::fmt-header-only)
+ if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
+ message(STATUS "AArch64/arm64 architecture detected. Applying FP16 compile options to the target.")
+ target_compile_options(quantize PRIVATE "-march=armv8.2-a+fp16")
+ endif()
+ target_link_libraries(quantize fmt::fmt-header-only)
if(FROM_GGUF)
add_executable(
from_gguf
@@ -219,7 +324,7 @@ if(QUANT)
${MLLM_QUANTIZER}
# ${DIR_SRC}
- ${PROJECT_SOURCE_DIR}/src/ParamLoader.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/ParamLoader.cpp
)
target_link_libraries(from_gguf fmt::fmt-header-only)
endif()
@@ -229,61 +334,41 @@ if(TEST)
add_subdirectory(test)
endif()
-# add_executable(demo examples/demo.cpp)
-# target_link_libraries(demo fmt::fmt-header-only)
add_subdirectory(examples)
if(APK)
- add_library(mllm_lib STATIC ${DIR_SRC_CPU} ${DIR_SRC_EXP} ${DIR_SRC} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_PROCESSOE}
+ add_library(mllm_lib STATIC ${DIR_SRC_EXP} ${DIR_SRC} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_PROCESSOE} ${DIR_SRC_LEGACY}
${DIR_THIRDPARTY_AUDIO}
- src/tokenizers/Tokenizer.cpp
+ mllm/tokenizers/Tokenizer.cpp
tools/jni/LibHelper.cpp
-
- # src/tokenizers/Tokenizer.hpp
- # src/tokenizers/Unigram/Unigram.hpp
- src/tokenizers/Unigram/Unigram.cpp
-
- # src/tokenizers/Unigram/trie.hpp
- src/tokenizers/BPE/Bpe.cpp
-
- # src/tokenizers/BPE/Bpe.hpp
- src/tokenizers/Unicode.cpp
- src/tokenizers/UnicodeData.cpp
- src/tokenizers/BPE/Bpe.cpp
- src/tokenizers/WordPiece/WordPiece.cpp
-
- # models/bert/configuration_bert.hpp
- # models/bert/modeling_bert.hpp
- # models/bert/tokenization_bert.hpp
- # models/fuyu/configuration_fuyu.hpp
- # models/fuyu/modeling_fuyu.hpp
- # models/fuyu/processing_fuyu.hpp
- # models/phonelm/configuration_phonelm.hpp
- # models/phonelm/modeling_phonelm.hpp
- # models/qwen/configuration_qwen.hpp
- # models/qwen/modeling_qwen.hpp
- # models/qwen/tokenization_qwen.hpp
- # models/smollm/tokenization_smollm.hpp
- # tokenizers/Unigram/Unigram.hpp
+ mllm/tokenizers/Unigram/Unigram.cpp
+ mllm/tokenizers/BPE/Bpe.cpp
+ mllm/tokenizers/Unicode.cpp
+ mllm/tokenizers/UnicodeData.cpp
+ mllm/tokenizers/BPE/Bpe.cpp
+ mllm/tokenizers/WordPiece/WordPiece.cpp
)
- target_link_libraries(mllm_lib MLLM_CPU)
+ target_link_libraries(mllm_lib mllm_cpu)
+ if (OPENCL)
+ target_link_libraries(mllm_lib mllm_opencl)
+ endif()
if(QNN)
- target_link_libraries(mllm_lib MLLM_QNN)
+ target_link_libraries(mllm_lib mllm_qnn)
endif()
endif()
if(MLLM_ENABLE_PYTHON)
- target_compile_options(MLLM_CPU PRIVATE -fPIC)
+ target_compile_options(mllm_cpu PRIVATE -fPIC)
find_package(Python3 COMPONENTS Interpreter Development)
include_directories(${Python3_INCLUDE_DIRS})
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/pybind11)
set(_py_dep_libs
- MLLM_CPU
+ mllm_cpu
- # MLLM_QNN
+ # mllm_qnn
# ${CMAKE_DL_LIBS}
)
diff --git a/README.md b/README.md
index 88c44bc70..544387659 100644
--- a/README.md
+++ b/README.md
@@ -28,16 +28,15 @@ fast and lightweight multimodal LLM inference engine for mobile and e
mllm is a lightweight, fast, and easy-to-use (multimodal) on-device LLM inference engine for mobile devices (mainly supporting CPU/NPU), initiated by the research groups led by [Mengwei Xu](https://xumengwei.github.io/) (BUPT) and [Xuanzhe Liu](http://www.liuxuanzhe.com/) (PKU).
## Recent update
+
+- [2025 July 30] Add rotation quantization for QNN backend models and support Qwen2-VL 2B
- [2025 August 28] 🔥🔥🔥 Support for MLLM V1 is ending soon. Before its retirement, V1 will integrate the following features: GPT-OSS and NPU QWEN2-VL. MLLM will then transition to V2, which can be viewed on the V2 branch.
V2 will include brand-new capabilities:
- A more Pythonic model authoring approach with eager execution
- Compilation support and MLLM IR for easier NPU integration
- Support for parallel execution of multiple models
- A more refined engineering implementation
-- [2024 November 21] Support new model: Phi 3 Vision https://github.com/UbiquitousLearning/mllm/pull/186
-- [2024 August 30] Support new model: MiniCPM 2B https://github.com/UbiquitousLearning/mllm/pull/132
-- [2024 August 15] Support new model: Phi 3 mini https://github.com/UbiquitousLearning/mllm/pull/119
-- [2024 Aug 10] Supporting Qualcomm NPU: https://github.com/UbiquitousLearning/mllm/pull/112 | [try it out](https://github.com/UbiquitousLearning/mllm/tree/main/src/backends/qnn) | [paper](https://arxiv.org/pdf/2407.05858v1)
+
### Contents
@@ -97,9 +96,9 @@ V2 will include brand-new capabilities:
| [LLaVA 7B](https://github.com/haotian-liu/LLaVA) | [✔️](https://huggingface.co/mllmTeam/llava-1.5-7b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/llava-1.5-7b-mllm/tree/main) | |
| [Gemma 2B](https://github.com/google/gemma_pytorch) | [✔️](https://huggingface.co/mllmTeam/gemma-2b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/gemma-2b-mllm/tree/main) | |
| [Gemma 2 2B](https://github.com/google/gemma_pytorch) | [✔️](https://huggingface.co/mllmTeam/gemma-2-2b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/gemma-2-2b-mllm/tree/main) | |
-| [Qwen 1.5 0.5B](https://github.com/QwenLM/Qwen) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) | |
+| [Qwen 1.5 0.5B](https://github.com/QwenLM/Qwen) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-0.5b-mllm/tree/main) | ✔️ |
| [Qwen 1.5 1.8B](https://github.com/QwenLM/Qwen) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm) | [✔️](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm) |
-| [Qwen 2.5 1.5B](https://github.com/QwenLM/Qwen2.5) | [✔️](https://huggingface.co/mllmTeam/qwen-2.5-1.5b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-2.5-1.5b-mllm/tree/main) | |
+| [Qwen 2.5 1.5B](https://github.com/QwenLM/Qwen2.5) | [✔️](https://huggingface.co/mllmTeam/qwen-2.5-1.5b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-2.5-1.5b-mllm/tree/main) | ✔️ |
| [Qwen 3 0.6B](https://github.com/QwenLM/Qwen3) | [✔️](https://huggingface.co/mllmTeam/qwen-3-0.6b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-3-0.6b-mllm/tree/main) | |
| [Mistral 7B](https://github.com/mistralai/mistral-src) | [✔️](https://huggingface.co/mllmTeam/mistral-7b-instruct-v0.2-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/mistral-7b-instruct-v0.2-mllm/tree/main) | |
| [Yi 6B](https://huggingface.co/01-ai/Yi-1.5-6B) | [✔️](https://huggingface.co/mllmTeam/yi-1.5-6b-chat-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/yi-1.5-6b-chat-mllm/tree/main) | |
@@ -116,15 +115,15 @@ V2 will include brand-new capabilities:
### Multimodal models
-| Model | CPU<br>FP32 | CPU<br>INT4 |
-|-----------------------------------------------------------------------------|------|-----|
+| Model | CPU<br>FP32 | CPU<br>INT4 | Hexagon NPU<br>INT8 |
+|-----------------------------------------------------------------------------|------|-----|----------------------------|
| [Fuyu 8B](https://www.adept.ai/blog/fuyu-8b) | [✔️](https://huggingface.co/mllmTeam/fuyu-8b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/fuyu-8b-mllm/tree/main) |
| [Vision Transformer](https://github.com/google-research/vision_transformer) | [✔️](https://huggingface.co/mllmTeam/vit-base-patch16-224-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/vit-base-patch16-224-mllm/tree/main) |
| [CLIP](https://github.com/openai/CLIP) | [✔️](https://huggingface.co/mllmTeam/clip-vit-base-patch32-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/clip-vit-base-patch32-mllm/tree/main) |
| [ImageBind](https://github.com/facebookresearch/ImageBind) (3 modalities) | [✔️](https://huggingface.co/mllmTeam/imagebind_huge-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/imagebind_huge-mllm/tree/main) |
| [LLaVA 7B](https://github.com/haotian-liu/LLaVA) | [✔️](https://huggingface.co/mllmTeam/llava-1.5-7b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/llava-1.5-7b-mllm/tree/main) |
| [Phi-3-Vision](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | [✔️](https://huggingface.co/mllmTeam/phi-3-vision-instruct-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/phi-3-vision-instruct-mllm/tree/main) |
-| [Qwen2-VL 2B](https://github.com/QwenLM/Qwen2-VL) | [✔️](https://huggingface.co/mllmTeam/qwen-2-vl-2b-instruct--mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-2-vl-2b-instruct--mllm/tree/main) |
+| [Qwen2-VL 2B](https://github.com/QwenLM/Qwen2-VL) | [✔️](https://huggingface.co/mllmTeam/qwen-2-vl-2b-instruct--mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/qwen-2-vl-2b-instruct--mllm/tree/main) | ✔️ |
## Quick Start
@@ -134,6 +133,9 @@ V2 will include brand-new capabilities:
```bash
git clone https://github.com/UbiquitousLearning/mllm
cd mllm
+git submodule update --init --recursive \
+ third_party/googletest \
+ mllm/backends/cpu/third_party/kleidiai
```
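+
+If you also plan to build the optional XNNPACK backend (`MLLM_BUILD_XNNPACK_BACKEND=ON`), initialize the `mllm/backends/xnnpack/third_party/XNNPACK` submodule as well.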
### Check prerequisites
@@ -151,7 +153,12 @@ Building mllm requires following tools:
*`NOTE:` The QNN backend is preliminary version which can do end-to-end inference. It is still under active development for better performance and more supported models.*
-We support running Qwen-1.5-1.8B-Chat using [Qualcomm QNN](https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk) to get Hexagon NPU acceleration on devices with Snapdragon 8 Gen3. The details of QNN environment set up and design is [here](./src/backends/qnn/README.md). The prefilling stage is performered by QNN & CPU, and the inference stage is performed by CPU.
+We support running several Qwen-family models, including Qwen2-VL, using [Qualcomm QNN](https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk) to get Hexagon NPU acceleration on devices with Snapdragon 8 Gen3. The details of the QNN environment setup and design are [here](./mllm/backends/qnn/README.md). The prefilling stage is performed by QNN & CPU, and the inference stage is performed by CPU.
+
+Specifically, we support the following models (models with similar architectures are also supported):
+- Qwen 1.5 1.8B (demo_qwen_npu, demo_qwen_pipeline)
+- Qwen 2.5 1.5B (demo_qwen_npu, demo_qwen_pipeline)
+- Qwen 2 VL (demo_qwen2_vl_npu and demo_qwen2_vl_npuvit)
Build the target with QNN backend.
@@ -160,7 +167,7 @@ cd ../script
./build_qnn_android.sh
```
-Download the model from [here](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm/blob/main/), or using the following instructions
+Download the model from [here](https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm/blob/main/), or use the following instructions to download it. You can also export PyTorch models for the QNN backend with int8 weight quantization and apply rotation quantization. Details can be found in the backend-specific [README](./mllm/backends/qnn/README.md).
```bash
mkdir ../models && cd ../models
@@ -169,19 +176,19 @@ wget https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm/resolve/main/qwen-1
wget https://huggingface.co/mllmTeam/qwen-1.5-1.8b-chat-mllm/resolve/main/qwen-1.5-1.8b-chat-q4k.mllm?download=true -O qwen-1.5-1.8b-chat-q4k.mllm
```
-Run on an android phone with at least 16GB of memory.
+Currently, the QNN backend uses models with W8A8 or W8A16 quantization. (This is determined by the Quantize & Dequantize ops in the modeling class; refer to `mllm/models/qwen/modeling_qwen_npu_v2.hpp` for more details.)
+
+Run on an Android phone with at least 16GB of memory, since building the QNN graphs on device consumes a lot of memory. After the QNN graphs have been built and saved to `qnn_context.bin`, runtime memory usage drops back to the expected level. `demo_qwen_pipeline.cpp` demonstrates pipeline-parallel execution of QNN models, which yields nearly a 1.5x speedup over the original execution.
```bash
cd ../script
-./run_qwen_npu.sh
+./run_qwen_qnn.sh
```
-There are two arguments in the executable. `-s` is for the sequence length of prefilling, the default value is 64 in the demo we provided. `-c` for type of QNN prefilling options, when it is set to 1, the input will be splited into many chunks of sequence 32 and be executed in a pipeline. When it is set to 0, the input will be executed in one chunk.
-
Result are as followed:
```
-> ./main_qwen_npu -s 64 -c 1
+> ./demo_qwen_npu
[Q] <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
@@ -362,7 +369,7 @@ You can convert vocabulary to mllm vocabulary as followed.
```bash
cd tools/convertor
-python vocab.py --input_file=tokenizer.json --output_file=vocab.mllm --type=Unigram
+python vocab.py --input_file=tokenizer.json --output_file=vocab.mllm --type=BPE
```
### Quantize models
diff --git a/assets/rotation.png b/assets/rotation.png
new file mode 100644
index 000000000..43f9f2960
Binary files /dev/null and b/assets/rotation.png differ
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 198877170..0136f525d 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,60 +1,84 @@
-macro(func_link_libaries target)
- target_link_libraries(${target} PUBLIC MLLM_CPU fmt::fmt-header-only)
+set(COMMON_SRC
+ ${DIR_SRC_CPU}
+ ${DIR_SRC_MEM_MANAGER}
+ ${DIR_SRC_EXP}
+ ${DIR_SRC}
+ ${PROJECT_SOURCE_DIR}/mllm/tokenizers/Tokenizer.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/tokenizers/BPE/Bpe.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/tokenizers/WordPiece/WordPiece.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/tokenizers/Tiktoken/tiktoken.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/tokenizers/Unicode.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/tokenizers/UnicodeData.cpp
+ ${PROJECT_SOURCE_DIR}/mllm/processor/PreProcess.cpp
+)
+
+set(VLM_SPECIFIC_SRC
+ ${PROJECT_SOURCE_DIR}/mllm/tokenizers/Unigram/Unigram.cpp
+ ${DIR_SRC_PROCESSOE}
+ ${DIR_THIRDPARTY_AUDIO}
+)
+
+macro(func_set_compile_opts_defs target)
if (MLLM_OPENMP)
target_compile_options(${target} PRIVATE -fopenmp)
- if (ARM)
+ endif()
+ if (QNN)
+ target_compile_definitions(${target} PRIVATE USE_QNN)
+ endif()
+endmacro()
+
+macro(func_link_libs target)
+ target_link_libraries(${target} PUBLIC mllm_cpu fmt)
+ if (MLLM_OPENMP)
+ if (ARM AND NOT (CMAKE_HOST_SYSTEM_NAME STREQUAL "Darwin" AND NOT CMAKE_CROSSCOMPILING))
+            # Non-Mac ARM: link OpenMP statically
target_link_libraries(${target} PUBLIC -fopenmp -static-openmp)
- else ()
+ else()
+            # Other platforms (including Mac): link OpenMP dynamically
target_link_libraries(${target} PUBLIC -fopenmp)
- endif ()
- endif ()
+ endif()
+ endif()
+ if(OPENCL)
+ target_link_libraries(${target} PUBLIC mllm_opencl ${CMAKE_DL_LIBS})
+ endif()
if (QNN)
- target_compile_definitions(${target} PRIVATE USE_QNN)
- target_link_libraries(${target} PUBLIC MLLM_QNN ${CMAKE_DL_LIBS})
+ target_link_libraries(${target} PUBLIC mllm_qnn ${CMAKE_DL_LIBS})
endif()
if (MLLM_BUILD_XNNPACK_BACKEND)
- target_link_libraries(${target} PRIVATE MllmXnnpackBackend)
+ target_link_libraries(${target} PRIVATE mllm_xnnpack)
endif()
endmacro()
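+# The shared example sources are built once into two static helper libraries: mllm_llm for
+# text-only demos, and mllm_vlm, which layers the multimodal processors on top of mllm_llm.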
+add_library(mllm_llm STATIC ${COMMON_SRC})
+func_set_compile_opts_defs(mllm_llm)
+
+add_library(mllm_vlm STATIC ${VLM_SPECIFIC_SRC})
+target_link_libraries(mllm_vlm PUBLIC mllm_llm)
+func_set_compile_opts_defs(mllm_vlm)
+
macro(func_llm_add_executable target)
- add_executable(${target}
- ${PROJECT_SOURCE_DIR}/examples/${target}.cpp
- ${DIR_SRC_CPU}
- ${DIR_SRC_MEM_MANAGER}
- ${DIR_SRC_EXP}
- ${DIR_SRC}
- ${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
- ${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
- ${PROJECT_SOURCE_DIR}/src/tokenizers/WordPiece/WordPiece.cpp
- ${PROJECT_SOURCE_DIR}/src/tokenizers/Tiktoken/tiktoken.cpp
- ${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
- ${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
- ${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
- )
- func_link_libaries(${target})
+ if(EXISTS "${PROJECT_SOURCE_DIR}/examples/${target}.cpp")
+ add_executable(${target} ${PROJECT_SOURCE_DIR}/examples/${target}.cpp)
+ target_link_libraries(${target} PUBLIC mllm_llm)
+ func_set_compile_opts_defs(${target})
+ func_link_libs(${target})
+ else()
+ message(WARNING "Skip ${target}: ${PROJECT_SOURCE_DIR}/examples/${target}.cpp not found")
+ endif()
endmacro()
macro(func_vlm_add_executable target)
- add_executable(${target}
- ${PROJECT_SOURCE_DIR}/examples/${target}.cpp
- ${DIR_SRC_CPU}
- ${DIR_SRC_MEM_MANAGER}
- ${DIR_SRC_EXP}
- ${DIR_SRC}
- ${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
- ${PROJECT_SOURCE_DIR}/src/tokenizers/Unigram/Unigram.cpp
- ${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
- ${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
- ${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
- ${PROJECT_SOURCE_DIR}/src/tokenizers/WordPiece/WordPiece.cpp
- ${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
- ${DIR_SRC_PROCESSOE}
- ${DIR_THIRDPARTY_AUDIO}
- )
- func_link_libaries(${target})
+ if(EXISTS "${PROJECT_SOURCE_DIR}/examples/${target}.cpp")
+ add_executable(${target} ${PROJECT_SOURCE_DIR}/examples/${target}.cpp)
+ target_link_libraries(${target} PUBLIC mllm_vlm)
+ func_set_compile_opts_defs(${target})
+ func_link_libs(${target})
+ else()
+ message(WARNING "Skip ${target}: ${PROJECT_SOURCE_DIR}/examples/${target}.cpp not found")
+ endif()
endmacro()
+func_llm_add_executable(test)
func_llm_add_executable(mllm_benchmark)
func_llm_add_executable(demo_llama)
func_llm_add_executable(demo_tinyllama)
@@ -81,8 +105,14 @@ func_llm_add_executable(demo_phonelm)
func_llm_add_executable(demo_llama3)
func_llm_add_executable(demo_minicpm_moe_mbm)
func_llm_add_executable(demo_qwen_sd)
+func_llm_add_executable(demo_qwen_batch)
func_llm_add_executable(demo_minicpm_moe_mbp)
-
+func_llm_add_executable(demo_bailing_moe)
+func_llm_add_executable(demo_bailing_moe2)
+func_llm_add_executable(demo_bailing_moe_mbp)
+func_llm_add_executable(demo_bailing_moe2_mbp)
+func_llm_add_executable(demo_smallthinker)
+func_llm_add_executable(demo_smallthinker_mbp)
func_vlm_add_executable(demo_llava)
func_vlm_add_executable(demo_fuyu)
@@ -92,65 +122,19 @@ func_vlm_add_executable(demo_imagebind)
func_vlm_add_executable(demo_imagebind_1mod)
func_vlm_add_executable(demo_phi3v)
func_vlm_add_executable(demo_qwen2_vl)
+func_vlm_add_executable(demo_qwen2.5_vl)
func_vlm_add_executable(demo_showui)
-# func_vlm_add_executable(demo)
-
-# QNN demo
+func_vlm_add_executable(demo_qwen2_vl_vtp)
+func_vlm_add_executable(demo_showui_vtp)
if(QNN)
+ # func_llm_add_executable(demo_phonelm_npu)
func_llm_add_executable(demo_qwen_npu)
- func_llm_add_executable(main_qwen_npu)
- func_llm_add_executable(demo_phonelm_npu)
- func_llm_add_executable(main_phonelm_npu)
- func_llm_add_executable(demo_qwen2.5_npu)
- func_llm_add_executable(demo_qwen_pipeline)
+ # func_llm_add_executable(demo_qwen_npu_pipeline)
+ func_vlm_add_executable(demo_qwen2_vl_npu)
endif()
if(MLLM_BUILD_XNNPACK_BACKEND)
func_llm_add_executable(demo_qwen_xp)
-endif()
-
-
-# old main
-# macro(func_o_vlm_add_executable target)
-# add_executable(${target}
-# ${PROJECT_SOURCE_DIR}/examples/${target}.cpp
-# ${DIR_SRC_CPU}
-# ${DIR_SRC_MEM_MANAGER}
-# ${DIR_SRC_EXP}
-# ${DIR_SRC}
-# ${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
-# ${PROJECT_SOURCE_DIR}/src/tokenizers/Unigram/Unigram.cpp
-# ${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
-# ${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
-# ${PROJECT_SOURCE_DIR}/src/processor/ClipPreProcess.cpp
-# ${PROJECT_SOURCE_DIR}/src/processor/FuyuPreProcess.cpp
-# )
-# func_link_libaries(${target})
-# endmacro()
-# macro(func_o_avlm_add_executable target)
-# add_executable(${target}
-# ${PROJECT_SOURCE_DIR}/examples/${target}.cpp
-# ${DIR_SRC_CPU}
-# ${DIR_SRC_MEM_MANAGER}
-# ${DIR_SRC_EXP}
-# ${DIR_SRC}
-# ${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
-# ${PROJECT_SOURCE_DIR}/src/tokenizers/Unigram/Unigram.cpp
-# ${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
-# ${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
-# ${PROJECT_SOURCE_DIR}/src/processor/ClipPreProcess.cpp
-# ${DIR_SRC_PROCESSOE}
-# ${DIR_THIRDPARTY_AUDIO}
-# )
-# func_link_libaries(${target})
-# endmacro()
-# func_llm_add_executable(main_llama)
-# func_llm_add_executable(main_alpaca)
-# func_llm_add_executable(main_tinyllama)
-# func_o_vlm_add_executable(main_llava)
-# func_o_vlm_add_executable(main_fuyu)
-# func_o_vlm_add_executable(main_vit)
-# func_o_vlm_add_executable(main_clip)
-# func_o_avlm_add_executable(main_imagebind)
+endif()
\ No newline at end of file
diff --git a/examples/demo_bailing_moe.cpp b/examples/demo_bailing_moe.cpp
new file mode 100644
index 000000000..75fb23120
--- /dev/null
+++ b/examples/demo_bailing_moe.cpp
@@ -0,0 +1,93 @@
+/**
+ * @file demo_bailing_moe.cpp
+ * @brief A demo for using Bailing MoE model.
+ * @author Rongjie Yi
+ * @date 2025-07-01
+ *
+ */
+#include "Types.hpp"
+#include "cmdline.h"
+#include "models/ling/configuration_bailing_moe.hpp"
+#include "models/ling/modeling_bailing_moe.hpp"
+#include "models/ling/tokenization_bailing.hpp"
+
+using namespace mllm;
+
+int main(int argc, char **argv) {
+ std::iostream::sync_with_stdio(false);
+
+ cmdline::parser cmdParser;
+ cmdParser.add("device", 'd', "mllm backend [0:`cpu` | 1:`opencl`]", false, 0);
+ cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/ling_vocab.mllm");
+ cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/ling_merges.txt");
+ string default_model_path = "../models/ling-lite-1.5-q4_0.mllm";
+#if defined(ARM)
+ default_model_path = "../models/ling-lite-1.5-kai_q4_0.mllm";
+#endif
+ cmdParser.add("model", 'm', "specify mllm model path", false, default_model_path);
+ cmdParser.add("limits", 'l', "max KV cache size", false, 500);
+ cmdParser.add("thread", 't', "num of threads", false, 4);
+ cmdParser.add("gen", 'g', "max new tokens", false, -1);
+ cmdParser.parse_check(argc, argv);
+
+    string vocab_path = cmdParser.get<string>("vocab");
+    string merge_path = cmdParser.get<string>("merge");
+    string model_path = cmdParser.get<string>("model");
+    int tokens_limit = cmdParser.get<int>("limits");
+    int max_new_tokens = cmdParser.get<int>("gen");
+    CPUBackend::cpu_threads = cmdParser.get<int>("thread");
+    BackendType device = (BackendType)cmdParser.get<int>("device");
+    assert((device == MLLM_CPU || device == MLLM_OPENCL) && "device not supported!");
+
+ auto tokenizer = BaiLingTokenizer(vocab_path, merge_path);
+ BailingMoeConfig config(tokens_limit);
+#ifdef USE_OPENCL
+ if (device == MLLM_OPENCL) {
+ config.dtype = MLLM_TYPE_F16;
+ config.attn_implementation = "eager";
+ }
+#endif
+ // config.attn_implementation = "sage_attention";
+ auto model = BailingMoeForCausalLM(config);
+#ifdef USE_OPENCL
+ model = model.to(device);
+#endif
+ model.load(model_path);
+
+    vector<string> in_strs = {
+ "怎样计算1+2+...+100的和?",
+ "Who are you?",
+ "Give me a short introduction to large language model.",
+ "背诵天下第一骈文",
+ "夕焼けとコオロギが一斉に飛び、秋水は共に天一色になる。上面句子翻译成中文",
+ "你写一首七言绝句。",
+ "背诵一下水调歌头。",
+ "清晨的阳光透过薄纱窗帘,懒洋洋地洒在木地板上,空气中飘散着咖啡豆研磨后特有的醇厚香气。窗外传来几声清脆的鸟鸣,伴随着远处隐约的车流声,构成这座都市尚未完全苏醒的独特交响。书桌上摊开着昨夜未读完的书,书页边缘已微微卷起。厨房里,水壶正发出细密的声响,预示着一天的热饮即将就绪。昨日的计划表贴在冰箱门上,几个重要的待办事项用红笔醒目地圈出。公园里晨练的人们身影绰绰,有节奏的脚步声和太极音乐交织。一只橘猫敏捷地跃上围墙,在晨光中伸展着腰肢,神态悠闲得仿佛它是这片领地的主人。街角的面包店刚拉开铁门,新鲜出炉的面包香气迫不及待地涌向街头。公交站台上,等待的乘客低头刷着手机屏幕,神情各异。云朵缓慢地在湛蓝的天空中移动,时间似乎被拉长了片刻。生活就在这些微小的、平凡的细节里徐徐展开,既不惊天动地,却也充满细碎的温暖和实在的步履。新的一天开始了。\n请在以上文本中找出描述“气味”的句子(复制出来),然后判断叙述者对“橘猫”的态度是正面还是负面,最后请用三个成语概括文中描绘的早晨氛围。",
+ "项羽已杀卿子冠军,威震楚国,名闻诸侯。乃遣当阳君、蒲将军将卒二万渡河,救巨鹿。战少利,陈馀复请兵。项羽乃悉引兵渡河,皆沉船,破釜甑,烧庐舍,持三日粮,以示士卒必死,无一还心。于是至则围王离,与秦军遇,九战,绝其甬道,大破之,杀苏角,虏王离。涉间不降楚,自烧杀。当是时,楚兵冠诸侯。诸侯军救巨鹿下者十余壁,莫敢纵兵。及楚击秦,诸将皆从壁上观。楚战士无不一以当十,楚兵呼声动天,诸侯军无不人人惴恐。于是已破秦军,项羽召见诸侯将,入辕门,无不膝行而前,莫敢仰视。项羽由是始为诸侯上将军,诸侯皆属焉。 问题:结合项羽在巨鹿之战中的战术决策与心理威慑手段,分析其如何实现『楚战士无不一以当十』的战斗效应,并论述这种军事心理学实践对诸侯将领『膝行而前,莫敢仰视』行为模式的生成机制。",
+ };
+ for (int i = 0; i < in_strs.size(); ++i) {
+ auto input_str = tokenizer.apply_chat_template(in_strs[i]);
+ auto input_tensor = tokenizer.tokenize(input_str);
+ // std::cout << "[tks]" << input_tensor.sequence() << " tokens" << std::endl;
+ std::cout << "[Q] " << in_strs[i] << std::endl;
+ std::cout << "[A] " << std::flush;
+
+ LlmTextGeneratorOpts opt{
+            .max_new_tokens = max_new_tokens > 0 ? max_new_tokens : static_cast<int>(tokens_limit - input_tensor.sequence()),
+ .do_sample = false,
+ .temperature = 0.3F,
+ .top_k = 50,
+ .top_p = 0.F,
+ };
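+        // Stream generation: the callback receives each new token id, prints the decoded text,
+        // and returns false to stop once the tokenizer's postprocess reports end-of-sequence.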
+ model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
+ auto out_string = tokenizer.detokenize({out_token});
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { return false; }
+ std::cout << output_string << std::flush;
+ return true;
+ });
+ std::cout << "\n";
+ model.clear_kvcache();
+ model.profiling();
+ }
+}
diff --git a/examples/demo_bailing_moe_mbp.cpp b/examples/demo_bailing_moe_mbp.cpp
new file mode 100644
index 000000000..c0071b45a
--- /dev/null
+++ b/examples/demo_bailing_moe_mbp.cpp
@@ -0,0 +1,98 @@
+/**
+ * @file demo_bailing_moe_mbp.cpp
+ * @brief A demo for running the Bailing MoE model (MBP variant).
+ * @author Rongjie Yi
+ * @date 2025-07-01
+ *
+ */
+#include "Module.hpp"
+#include "cmdline.h"
+#include "models/ling/configuration_bailing_moe.hpp"
+#include "models/ling/mbp/modeling_bailing_moe_mbp.hpp"
+// #include "models/ling/mbp/modeling_bailing_moe_mbp_e.hpp"
+// #include "models/ling/mbp/modeling_bailing_moe_mbppip.hpp"
+#include "models/ling/tokenization_bailing.hpp"
+#include
+
+using namespace mllm;
+
+int main(int argc, char **argv) {
+ std::iostream::sync_with_stdio(false);
+ Module::alloc_mmap = false;
+
+ cmdline::parser cmdParser;
+ cmdParser.add("device", 'd', "mllm backend [0:`cpu` | 1:`opencl`]", false, 0);
+ cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/ling_vocab.mllm");
+ cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/ling_merges.txt");
+ string default_model_path = "../models/ling-lite-1.5-q4_0.mllm";
+#if defined(ARM)
+ default_model_path = "../models/ling-lite-1.5-kai_q4_0.mllm";
+#endif
+ cmdParser.add("model", 'm', "specify mllm model path", false, default_model_path);
+ cmdParser.add("limits", 'l', "max KV cache size", false, 500);
+ cmdParser.add("thread", 't', "num of threads", false, 4);
+ cmdParser.add("gen", 'g', "max new tokens", false, -1);
+ cmdParser.parse_check(argc, argv);
+
+    string vocab_path = cmdParser.get<string>("vocab");
+    string merge_path = cmdParser.get<string>("merge");
+    string model_path = cmdParser.get<string>("model");
+    int tokens_limit = cmdParser.get<int>("limits");
+    int max_new_tokens = cmdParser.get<int>("gen");
+    CPUBackend::cpu_threads = cmdParser.get<int>("thread");
+    BackendType device = (BackendType)cmdParser.get<int>("device");
+    assert((device == MLLM_CPU || device == MLLM_OPENCL) && "device not supported!");
+
+ auto tokenizer = BaiLingTokenizer(vocab_path, merge_path);
+ BailingMoeConfig config(tokens_limit);
+#ifdef USE_OPENCL
+ if (device == MLLM_OPENCL) {
+ config.dtype = MLLM_TYPE_F16;
+ config.attn_implementation = "eager";
+ }
+#endif
+ // config.attn_implementation = "sage_attention";
+ auto model = BailingMoeForCausalLM(config);
+#ifdef USE_OPENCL
+ model = model.to(device);
+#endif
+ model.load(model_path);
+
+    vector<string> in_strs = {
+ "怎样计算1+2+...+100的和?",
+ "Who are you?",
+ "Give me a short introduction to large language model.",
+ "夕焼けとコオロギが一斉に飛び、秋水は共に天一色になる。上面句子翻译成中文",
+ "你写一首七言绝句。",
+ "背诵一下水调歌头。",
+ "清晨的阳光透过薄纱窗帘,懒洋洋地洒在木地板上,空气中飘散着咖啡豆研磨后特有的醇厚香气。窗外传来几声清脆的鸟鸣,伴随着远处隐约的车流声,构成这座都市尚未完全苏醒的独特交响。书桌上摊开着昨夜未读完的书,书页边缘已微微卷起。厨房里,水壶正发出细密的声响,预示着一天的热饮即将就绪。昨日的计划表贴在冰箱门上,几个重要的待办事项用红笔醒目地圈出。公园里晨练的人们身影绰绰,有节奏的脚步声和太极音乐交织。一只橘猫敏捷地跃上围墙,在晨光中伸展着腰肢,神态悠闲得仿佛它是这片领地的主人。街角的面包店刚拉开铁门,新鲜出炉的面包香气迫不及待地涌向街头。公交站台上,等待的乘客低头刷着手机屏幕,神情各异。云朵缓慢地在湛蓝的天空中移动,时间似乎被拉长了片刻。生活就在这些微小的、平凡的细节里徐徐展开,既不惊天动地,却也充满细碎的温暖和实在的步履。新的一天开始了。\n请在以上文本中找出描述“气味”的句子(复制出来),然后判断叙述者对“橘猫”的态度是正面还是负面,最后请用三个成语概括文中描绘的早晨氛围。",
+ "项羽已杀卿子冠军,威震楚国,名闻诸侯。乃遣当阳君、蒲将军将卒二万渡河,救巨鹿。战少利,陈馀复请兵。项羽乃悉引兵渡河,皆沉船,破釜甑,烧庐舍,持三日粮,以示士卒必死,无一还心。于是至则围王离,与秦军遇,九战,绝其甬道,大破之,杀苏角,虏王离。涉间不降楚,自烧杀。当是时,楚兵冠诸侯。诸侯军救巨鹿下者十余壁,莫敢纵兵。及楚击秦,诸将皆从壁上观。楚战士无不一以当十,楚兵呼声动天,诸侯军无不人人惴恐。于是已破秦军,项羽召见诸侯将,入辕门,无不膝行而前,莫敢仰视。项羽由是始为诸侯上将军,诸侯皆属焉。 问题:结合项羽在巨鹿之战中的战术决策与心理威慑手段,分析其如何实现『楚战士无不一以当十』的战斗效应,并论述这种军事心理学实践对诸侯将领『膝行而前,莫敢仰视』行为模式的生成机制。",
+ };
+
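+    // Set up the per-layer / per-expert state used by the MBP execution path before generation.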
+ ling_mbp_init(config.num_hidden_layers, config.num_experts);
+ for (int i = 0; i < in_strs.size(); ++i) {
+ auto input_str = tokenizer.apply_chat_template(in_strs[i]);
+ auto input_tensor = tokenizer.tokenize(input_str);
+ std::cout << "[Q] " << in_strs[i] << std::endl;
+ std::cout << "[A] " << std::flush;
+
+ LlmTextGeneratorOpts opt{
+            .max_new_tokens = max_new_tokens > 0 ? max_new_tokens : static_cast<int>(tokens_limit - input_tensor.sequence()),
+ .do_sample = false,
+ .temperature = 0.3F,
+ .top_k = 50,
+ .top_p = 0.F,
+ };
+ model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
+ auto out_string = tokenizer.detokenize({out_token});
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { return false; }
+ std::cout << output_string << std::flush;
+ return true;
+ });
+ std::cout << "\n";
+ model.clear_kvcache();
+ model.profiling();
+ // prinMBPtimes();
+ }
+}
diff --git a/examples/demo_ds_qwen2.cpp b/examples/demo_ds_qwen2.cpp
index 36ef9ee6d..afa588790 100644
--- a/examples/demo_ds_qwen2.cpp
+++ b/examples/demo_ds_qwen2.cpp
@@ -23,7 +23,7 @@ int main(int argc, char **argv) {
cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/ds_qwen2_merges.txt");
cmdParser.add("model", 'm', "specify mllm model path", false, "../models/ds-qwen-2-1.5b-q4_k.mllm");
cmdParser.add("billion", 'b', "only support ds-1.5B right now", false, "ds-1.5B");
- cmdParser.add("limits", 'l', "max KV cache size", false, 400);
+ cmdParser.add("limits", 'l', "max KV cache size", false, 1040);
cmdParser.add("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);
@@ -47,7 +47,7 @@ int main(int argc, char **argv) {
std::cout << "[A] " << std::flush;
LlmTextGeneratorOpts opt{
- .max_new_tokens = 300,
+ .max_new_tokens = 1000,
.do_sample = true,
.temperature = 0.3F,
.top_k = 50,
diff --git a/examples/demo_llama3.cpp b/examples/demo_llama3.cpp
index 005f6c777..c7957e43e 100644
--- a/examples/demo_llama3.cpp
+++ b/examples/demo_llama3.cpp
@@ -13,8 +13,8 @@ using namespace mllm;
int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama3_tokenizer.model");
- cmdParser.add("model", 'm', "specify mllm model path", false, "../models/llama-3.2-1b-instruct_q4_k.mllm");
- cmdParser.add("billion", 'b', "[1B | 3B |]", false, "1B");
+ cmdParser.add("model", 'm', "specify mllm model path", false, "../models/llama-3.2-1b-instruct-kai_q4_0.mllm");
+ cmdParser.add("billion", 'b', "[1B | 3B |]", false, "1B-lm");
cmdParser.add("limits", 'l', "max KV cache size", false, 400);
cmdParser.add("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);
diff --git a/examples/demo_minicpm_moe_mbm.cpp b/examples/demo_minicpm_moe_mbm.cpp
index 6cd876429..44773c5f2 100644
--- a/examples/demo_minicpm_moe_mbm.cpp
+++ b/examples/demo_minicpm_moe_mbm.cpp
@@ -8,6 +8,7 @@
using namespace mllm;
int main(int argc, char **argv) {
+ Module::alloc_mmap = false;
cmdline::parser cmdParser;
cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/minicpm_vocab.mllm");
cmdParser.add("model", 'm', "specify mllm model path", false, "../models/minicpm-moe-8x2b-q4_k.mllm");
diff --git a/examples/demo_minicpm_moe_mbp.cpp b/examples/demo_minicpm_moe_mbp.cpp
index be85a8d8c..e2da3876c 100644
--- a/examples/demo_minicpm_moe_mbp.cpp
+++ b/examples/demo_minicpm_moe_mbp.cpp
@@ -8,6 +8,7 @@
using namespace mllm;
int main(int argc, char **argv) {
+ Module::alloc_mmap = false;
cmdline::parser cmdParser;
cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/minicpm_vocab.mllm");
cmdParser.add("model", 'm', "specify mllm model path", false, "../models/minicpm-moe-8x2b-q4_k.mllm");
diff --git a/examples/demo_phonelm_npu.cpp b/examples/demo_phonelm_npu.cpp
index 7d269eb94..494d5d79d 100644
--- a/examples/demo_phonelm_npu.cpp
+++ b/examples/demo_phonelm_npu.cpp
@@ -1,4 +1,8 @@
+
+
+#include "Context.hpp"
#include "Module.hpp"
+#include "QNNBackend.hpp"
#include "Types.hpp"
#include
#include "backends/cpu/CPUBackend.hpp"
@@ -13,8 +17,10 @@ int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");
+
cmdParser.add("model", 'm', "specify mllm model path", false, "../models/phonelm-1.5b-instruct-int8.mllm");
cmdParser.add("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-instruct-q4_0_4_4.mllm");
+
cmdParser.add("limits", 'l', "max KV cache size", false, 400);
cmdParser.add("thread", 't', "num of threads", false, 4);
cmdParser.add("chunk", 'c', "chunk size", false, 64);
@@ -28,6 +34,8 @@ int main(int argc, char **argv) {
int chunk_size = cmdParser.get("chunk");
CPUBackend::cpu_threads = cmdParser.get("thread");
+ Module::initBackend(MLLM_QNN);
+
auto tokenizer = SmolLMTokenizer(vocab_path, merge_path);
PhoneLMConfig config(tokens_limit, "1.5B");
auto model = PhoneLMForCausalLM_NPU(config, chunk_size);
@@ -51,14 +59,16 @@ int main(int argc, char **argv) {
if (!not_end) { return false; }
return true;
});
- Module::isFirstChunk = false;
- static_cast(Backend::global_backends[MLLM_CPU])->setCurSequenceLength(0);
- static_cast(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
- static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching();
- // turn on the multi-chunk prefilling
- Module::isMultiChunkPrefilling = true;
+ Context::Instance().inference_state().setQnnGraphFrozen(true);
+ Context::Instance().inference_state().setCurSequenceLength(0);
+ Context::Instance().inference_state().setExecutionType(PROMPT);
+ Context::Instance().inference_state().toggleSwitching();
+
// warmup END
std::cout << "Warmup finished." << std::endl;
+ if (!std::filesystem::exists("qnn_context.bin")) {
+        static_cast<QNNBackend *>(Backend::global_backends[MLLM_QNN].get())->saveQNNContext();
+ }
vector in_strs = {
"Give me a short introduction to large language model.",
@@ -69,86 +79,86 @@ int main(int argc, char **argv) {
"Please introduce Beijing University of Posts and Telecommunications.",
"\"Large Language Models (LLMs) are advanced artificial intelligence systems designed to understand and generate human-like text. These models are trained on vast amounts of data, enabling them to perform a wide range of tasks, from answering questions and summarizing text to generating creative content and engaging in conversational dialogue. LLMs like GPT-3 and GPT-4, developed by OpenAI, have set new benchmarks in natural language processing by leveraging deep learning architectures, particularly transformer models, which excel at capturing context and relationships within text. The scalability and versatility of LLMs make them invaluable tools for applications in education, customer service, content creation, and more. However, their deployment also raises ethical considerations, including issues of bias, misinformation, and the potential for misuse. As the field continues to evolve, ongoing research and responsible deployment strategies are essential to harnessing the full potential of these powerful AI systems while mitigating their risks.\"\nGenerate a title based on the above text."};
- for (int i = 0; i < in_strs.size(); ++i) {
- auto input_str = tokenizer.apply_chat_template(in_strs[i]);
- auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);
- const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length;
- const int chunk_num = seq_length_padding / chunk_size;
- bool isSwitched = false;
- // std::cout << "real seq length: " << real_seq_length << " padding to: " << seq_length_padding << " chunk num: " << chunk_num << std::endl;
- std::cout << "[Q] " << in_strs[i] << std::endl;
- std::cout << "[A] " << std::flush;
-
- // set total seq length for HeadLinear execute, which can not get the real seq length from Opts
- static_cast(Backend::global_backends[MLLM_CPU])->setTotalSequenceLength(real_seq_length);
- // set chunk size for the HeadLinear execute, which can not get the chunk size from Opts
- static_cast(Backend::global_backends[MLLM_CPU])->setChunkSize(chunk_size);
-
- // tensor vectors to save the chunked tensors of the QNN prefilling input
- vector chunked_tensors(chunk_num);
- LlmTextGeneratorOpts opt{
- .max_new_tokens = 1,
- .do_sample = false,
- .is_padding = true,
- .seq_before_padding = real_seq_length,
- .chunk_size = chunk_size,
- };
-
- for (int chunk_id = 0; chunk_id < chunk_num; ++chunk_id) {
- chunked_tensors[chunk_id].setBackend(Backend::global_backends[MLLM_CPU]);
- chunked_tensors[chunk_id].setTtype(INPUT_TENSOR);
- chunked_tensors[chunk_id].reshape(1, 1, chunk_size, 1);
- chunked_tensors[chunk_id].setName("input-chunk-" + to_string(chunk_id));
- chunked_tensors[chunk_id].shallowCopyFrom(&input_tensor, false, {0, 0, chunk_id * chunk_size, 0});
-
- model.generate(chunked_tensors[chunk_id], opt, [&](unsigned int out_token) -> bool {
- // if (i != 0 && !isSwitched && chunk_id == 0) {
- if (!isSwitched && chunk_id == 0) {
- // turn off switching at the first chunk of following inputs
- static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching();
- isSwitched = true;
- }
- auto out_string = tokenizer.detokenize({out_token});
- auto [not_end, output_string] = tokenizer.postprocess(out_string);
- if (!not_end) { return false; }
- if (chunk_id == chunk_num - 1) { // print the output of the last chunk
- std::cout << output_string << std::flush;
- }
- return true;
- });
- Module::isFirstChunk = false;
- }
-
- // turn on switching, set sequence length and execution type
- static_cast(Backend::global_backends[MLLM_CPU])->setCurSequenceLength(real_seq_length);
- static_cast(Backend::global_backends[MLLM_CPU])->setExecutionType(AUTOREGRESSIVE);
- static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching();
-
- LlmTextGeneratorOpts decoding_opt{
- .max_new_tokens = 100,
- .do_sample = false,
- .temperature = 0.3f,
- .top_k = 50,
- .top_p = 0.f,
- .is_padding = false,
- };
- isSwitched = false;
- decoding_model.generate(chunked_tensors.back(), decoding_opt, [&](unsigned int out_token) -> bool {
- if (!isSwitched) { // turn off switching
- static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching();
- isSwitched = true;
- }
- auto out_string = tokenizer.detokenize({out_token});
- auto [not_end, output_string] = tokenizer.postprocess(out_string);
- if (!not_end) { return false; }
- std::cout << output_string << std::flush;
- return true;
- });
-
- // turn on switching, set sequence length and execution type
- static_cast(Backend::global_backends[MLLM_CPU])->setCurSequenceLength(0);
- static_cast(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
- static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching();
- std::cout << "\n";
- }
-}
\ No newline at end of file
+ // for (int i = 0; i < in_strs.size(); ++i) {
+ // auto input_str = tokenizer.apply_chat_template(in_strs[i]);
+ // auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);
+ // const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length;
+ // const int chunk_num = seq_length_padding / chunk_size;
+ // bool isSwitched = false;
+ // // std::cout << "real seq length: " << real_seq_length << " padding to: " << seq_length_padding << " chunk num: " << chunk_num << std::endl;
+ // std::cout << "[Q] " << in_strs[i] << std::endl;
+ // std::cout << "[A] " << std::flush;
+
+ // // set total seq length for HeadLinear execute, which can not get the real seq length from Opts
+ // Context::Instance().inference_state().setTotalSequenceLength(real_seq_length);
+ // // set chunk size for the HeadLinear execute, which can not get the chunk size from Opts
+ // Context::Instance().inference_state().setChunkSize(chunk_size);
+
+ // // tensor vectors to save the chunked tensors of the QNN prefilling input
+ // vector chunked_tensors(chunk_num);
+ // LlmTextGeneratorOpts opt{
+ // .max_new_tokens = 1,
+ // .do_sample = false,
+ // .is_padding = true,
+ // .seq_before_padding = real_seq_length,
+ // .chunk_size = chunk_size,
+ // };
+
+ // for (int chunk_id = 0; chunk_id < chunk_num; ++chunk_id) {
+ // chunked_tensors[chunk_id].setBackend(Backend::global_backends[MLLM_CPU].get());
+ // chunked_tensors[chunk_id].setTtype(INPUT_TENSOR);
+ // chunked_tensors[chunk_id].reshape(1, 1, chunk_size, 1);
+ // chunked_tensors[chunk_id].setName("input-chunk-" + to_string(chunk_id));
+ // chunked_tensors[chunk_id].shallowCopyFrom(input_tensor, false, {0, 0, chunk_id * chunk_size, 0});
+
+ // model.generate(chunked_tensors[chunk_id], opt, [&](unsigned int out_token) -> bool {
+ // // if (i != 0 && !isSwitched && chunk_id == 0) {
+ // if (!isSwitched && chunk_id == 0) {
+ // // turn off switching at the first chunk of following inputs
+ // Context::Instance().inference_state().toggleSwitching();
+ // isSwitched = true;
+ // }
+ // auto out_string = tokenizer.detokenize({out_token});
+ // auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ // if (!not_end) { return false; }
+ // if (chunk_id == chunk_num - 1) { // print the output of the last chunk
+ // std::cout << output_string << std::flush;
+ // }
+ // return true;
+ // });
+ // Context::Instance().inference_state().setQnnGraphFrozen(true);
+ // }
+
+ // // turn on switching, set sequence length and execution type
+ // Context::Instance().inference_state().setCurSequenceLength(real_seq_length);
+ // Context::Instance().inference_state().setExecutionType(AUTOREGRESSIVE);
+ // Context::Instance().inference_state().toggleSwitching();
+
+ // LlmTextGeneratorOpts decoding_opt{
+ // .max_new_tokens = 100,
+ // .do_sample = false,
+ // .temperature = 0.3f,
+ // .top_k = 50,
+ // .top_p = 0.f,
+ // .is_padding = false,
+ // };
+ // isSwitched = false;
+ // decoding_model.generate(chunked_tensors.back(), decoding_opt, [&](unsigned int out_token) -> bool {
+ // if (!isSwitched) { // turn off switching
+ // Context::Instance().inference_state().toggleSwitching();
+ // isSwitched = true;
+ // }
+ // auto out_string = tokenizer.detokenize({out_token});
+ // auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ // if (!not_end) { return false; }
+ // std::cout << output_string << std::flush;
+ // return true;
+ // });
+
+ // // turn on switching, set sequence length and execution type
+ // Context::Instance().inference_state().setCurSequenceLength(0);
+ // Context::Instance().inference_state().setExecutionType(PROMPT);
+ // Context::Instance().inference_state().toggleSwitching();
+ // std::cout << "\n";
+ // }
+}
diff --git a/examples/demo_qwen.cpp b/examples/demo_qwen.cpp
index 1c70d52ce..282705bdd 100644
--- a/examples/demo_qwen.cpp
+++ b/examples/demo_qwen.cpp
@@ -7,6 +7,7 @@
* @copyright Copyright (c) 2024
*
*/
+#include "DataType.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
#include "models/qwen/modeling_qwen.hpp"
@@ -18,11 +19,18 @@ int main(int argc, char **argv) {
std::iostream::sync_with_stdio(false);
cmdline::parser cmdParser;
+ cmdParser.add("device", 'd', "mllm backend [0:`cpu` | 1:`opencl`]", false, 0);
cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2.5_vocab.mllm");
cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2.5_merges.txt");
- cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen-2.5-3b-instruct-q4_0_4x4.mllm");
- cmdParser.add("billion", 'b', "[0.5B | 1.8B | 1.5B | 3B |]", false, "3B");
- cmdParser.add("limits", 'l', "max KV cache size", false, 400);
+ string default_model_path = "../models/qwen-2.5-1.5b-instruct-q4_0_4_4.mllm";
+ string default_model_billion = "1.5b";
+#if defined(ARM)
+ default_model_path = "../models/qwen-2.5-1.5b-instruct-kai_q4_0_lm.mllm";
+ default_model_billion = "1.5b-lm";
+#endif
+ cmdParser.add("model", 'm', "specify mllm model path", false, default_model_path);
+ cmdParser.add("billion", 'b', "[0.5B | 1.8B | 1.5B | 3B |]", false, default_model_billion);
+ cmdParser.add("limits", 'l', "max KV cache size", false, 550);
cmdParser.add("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);
@@ -32,16 +40,29 @@ int main(int argc, char **argv) {
string model_billion = cmdParser.get<string>("billion");
int tokens_limit = cmdParser.get<int>("limits");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");
+ BackendType device = (BackendType)cmdParser.get<int>("device");
+ assert((device == MLLM_CPU || device == MLLM_OPENCL) && "device not supported!");
auto tokenizer = QWenTokenizer(vocab_path, merge_path);
- QWenConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE);
+ QWenConfig config(tokens_limit, model_billion);
+#ifdef USE_OPENCL
+ if (device == MLLM_OPENCL) {
+ config.dtype = MLLM_TYPE_F16;
+ // config.attn_implementation = "eager";
+ }
+#endif
+ // config.attn_implementation = "sage_attention";
auto model = QWenForCausalLM(config);
+#ifdef USE_OPENCL
+ model = model.to(device);
+#endif
model.load(model_path);
vector<string> in_strs = {
- "Hello, who are you?",
- "What can you do?",
- "Please introduce Beijing University of Posts and Telecommunications.",
+ "Give me a short introduction to large language model.",
+ "介绍一下你自己。",
+ "清晨的阳光透过薄纱窗帘,懒洋洋地洒在木地板上,空气中飘散着咖啡豆研磨后特有的醇厚香气。窗外传来几声清脆的鸟鸣,伴随着远处隐约的车流声,构成这座都市尚未完全苏醒的独特交响。书桌上摊开着昨夜未读完的书,书页边缘已微微卷起。厨房里,水壶正发出细密的声响,预示着一天的热饮即将就绪。昨日的计划表贴在冰箱门上,几个重要的待办事项用红笔醒目地圈出。公园里晨练的人们身影绰绰,有节奏的脚步声和太极音乐交织。一只橘猫敏捷地跃上围墙,在晨光中伸展着腰肢,神态悠闲得仿佛它是这片领地的主人。街角的面包店刚拉开铁门,新鲜出炉的面包香气迫不及待地涌向街头。公交站台上,等待的乘客低头刷着手机屏幕,神情各异。云朵缓慢地在湛蓝的天空中移动,时间似乎被拉长了片刻。生活就在这些微小的、平凡的细节里徐徐展开,既不惊天动地,却也充满细碎的温暖和实在的步履。新的一天开始了。\n请在以上文本中找出描述“气味”的句子(复制出来),然后判断叙述者对“橘猫”的态度是正面还是负面,最后请用三个成语概括文中描绘的早晨氛围。",
+ "项羽已杀卿子冠军,威震楚国,名闻诸侯。乃遣当阳君、蒲将军将卒二万渡河,救巨鹿。战少利,陈馀复请兵。项羽乃悉引兵渡河,皆沉船,破釜甑,烧庐舍,持三日粮,以示士卒必死,无一还心。于是至则围王离,与秦军遇,九战,绝其甬道,大破之,杀苏角,虏王离。涉间不降楚,自烧杀。当是时,楚兵冠诸侯。诸侯军救巨鹿下者十余壁,莫敢纵兵。及楚击秦,诸将皆从壁上观。楚战士无不一以当十,楚兵呼声动天,诸侯军无不人人惴恐。于是已破秦军,项羽召见诸侯将,入辕门,无不膝行而前,莫敢仰视。项羽由是始为诸侯上将军,诸侯皆属焉。 问题:结合项羽在巨鹿之战中的战术决策与心理威慑手段,分析其如何实现『楚战士无不一以当十』的战斗效应,并论述这种军事心理学实践对诸侯将领『膝行而前,莫敢仰视』行为模式的生成机制。",
};
for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
@@ -50,8 +71,8 @@ int main(int argc, char **argv) {
std::cout << "[A] " << std::flush;
LlmTextGeneratorOpts opt{
- .max_new_tokens = 100,
- .do_sample = true,
+ .max_new_tokens = 200,
+ .do_sample = false,
.temperature = 0.3F,
.top_k = 50,
.top_p = 0.F,
diff --git a/examples/demo_qwen2.5_npu.cpp b/examples/demo_qwen2.5_npu.cpp
deleted file mode 100644
index 761a34926..000000000
--- a/examples/demo_qwen2.5_npu.cpp
+++ /dev/null
@@ -1,100 +0,0 @@
-#include "backends/cpu/CPUBackend.hpp"
-#include "cmdline.h"
-#include "models/qwen/configuration_qwen.hpp"
-#include "models/qwen/modeling_qwen_npu.hpp"
-#include "models/qwen/modeling_qwen.hpp"
-#include "models/qwen/tokenization_qwen.hpp"
-#include "processor/PostProcess.hpp"
-
-using namespace mllm;
-
-int main(int argc, char **argv) {
- cmdline::parser cmdParser;
- cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2.5_vocab.mllm");
- cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2.5_merges.txt");
- cmdParser.add("model", 'm', "specify mllm model path", false, "../models/Qwen2.5-1.5B-Instruct.mllm");
- cmdParser.add("billion", 'b', "[0.5B | 1.8B | 1.5B]", false, "1.8B");
- cmdParser.add("limits", 'l', "max KV cache size", false, 400);
- cmdParser.add("thread", 't', "num of threads", false, 4);
- cmdParser.parse_check(argc, argv);
-
- string vocab_path = cmdParser.get<string>("vocab");
- string merge_path = cmdParser.get<string>("merge");
- string model_path = cmdParser.get<string>("model");
- string model_billion = cmdParser.get<string>("billion");
- int tokens_limit = cmdParser.get<int>("limits");
- CPUBackend::cpu_threads = cmdParser.get<int>("thread");
-
- auto tokenizer = QWenTokenizer(vocab_path, merge_path);
- QWenConfig config(tokens_limit, "1.5B", RoPEType::HFHUBROPE);
- auto model = QWenForCausalLM_NPU(config, 64);
- model.load(model_path);
- auto decoding_model = QWenForCausalLM(config);
- decoding_model.load("../models/qwen-2.5-1.5b-instruct-q4_0_4_4.mllm");
-
- vector<string> in_strs = {
- " Give me a short introduction to large language model.",
- };
-
- for (int i = 0; i < in_strs.size(); ++i) {
- auto input_str = tokenizer.apply_chat_template(in_strs[i]);
- auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size);
- std::cout << "[Q] " << in_strs[i] << std::endl;
- std::cout << "[A] " << std::flush;
-
- // set total seq length for HeadLinear execute, which can not get the real seq length from Opts
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setTotalSequenceLength(real_seq_length);
-
- LlmTextGeneratorOpts opt{
- .max_new_tokens = 1,
- .do_sample = false,
- .temperature = 0.3f,
- .top_k = 50,
- .top_p = 0.f,
- .is_padding = true,
- .seq_before_padding = real_seq_length,
- };
- model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
- auto out_string = tokenizer.detokenize({out_token});
- auto [not_end, output_string] = tokenizer.postprocess(out_string);
- if (!not_end) { return false; }
- std::cout << output_string << std::flush;
- return true;
- });
-
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setCurSequenceLength(real_seq_length);
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(AUTOREGRESSIVE);
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
-
- LlmTextGeneratorOpts decoding_opt{
- .max_new_tokens = 100,
- .do_sample = false,
- .temperature = 0.3f,
- .top_k = 50,
- .top_p = 0.f,
- .is_padding = false,
- };
- bool isSwitched = false;
- decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
- // call only once of switchDecodeTag
- if (!isSwitched) {
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
- isSwitched = true;
- }
- auto out_string = tokenizer.detokenize({out_token});
- auto [isOk, print_string] = tokenizer.postprocess(out_string);
- if (isOk) {
- std::cout << print_string << std::flush;
- } else {
- return false;
- }
- return true;
- });
-
- // turn on switching, set sequence length and execution type
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setCurSequenceLength(0);
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
- std::cout << "\n";
- }
-}
\ No newline at end of file
diff --git a/examples/demo_qwen2.5_vl.cpp b/examples/demo_qwen2.5_vl.cpp
new file mode 100644
index 000000000..c0fabb89e
--- /dev/null
+++ b/examples/demo_qwen2.5_vl.cpp
@@ -0,0 +1,71 @@
+#include <iostream>
+#include "cmdline.h"
+#include "models/qwen2_5_vl/configuration_qwen2_5_vl.hpp"
+#include "models/qwen2_5_vl/modeling_qwen2_5_vl.hpp"
+// #include "models/qwen2_vl/vtp/modeling_qwen2_vl.hpp"
+#include "models/qwen2_vl/processing_qwen2_vl.hpp"
+#include "processor/PostProcess.hpp"
+
+using namespace mllm;
+int main(int argc, char **argv) {
+ cmdline::parser cmdParser;
+ cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2vl_vocab.mllm");
+ cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2vl_merges.txt");
+#ifdef ARM
+ cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen-2.5-vl-3b-instruct-kai_q4_0_f.mllm");
+#else
+ cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen-2-vl-2b-instruct-q4_k.mllm");
+#endif
+ cmdParser.add("billion", 'b', "[3B | 7B |]", false, "3B");
+ cmdParser.add("limits", 'l', "max KV cache size", false, 800);
+ cmdParser.add("thread", 't', "num of threads", false, 4);
+ cmdParser.parse_check(argc, argv);
+
+ string vocab_path = cmdParser.get<string>("vocab");
+ string merge_path = cmdParser.get<string>("merge");
+ string model_path = cmdParser.get<string>("model");
+ string model_billion = cmdParser.get<string>("billion") == "3B" ? "3b" : cmdParser.get<string>("billion");
+ int tokens_limit = cmdParser.get<int>("limits");
+ int thread_num = cmdParser.get<int>("thread");
+ CPUBackend::cpu_threads = cmdParser.get<int>("thread");
+
+ ParamLoader param_loader(model_path);
+ auto processor = Qwen2VLProcessor(vocab_path, merge_path);
+ Qwen2VLConfig config(tokens_limit, model_billion);
+ auto model = Qwen2VLModel(config);
+ model.load(model_path);
+
+ vector<string> in_imgs = {
+ // "../assets/bus.png",
+ "../assets/two_cats.jpg",
+ // "../assets/bird_image.jpg",
+ };
+ vector<string> in_strs = {
+ "<|vision_start|><|image_pad|><|vision_end|>Describe this image.",
+ };
+
+ for (int i = 0; i < in_strs.size(); ++i) {
+ auto in_str = in_strs[i];
+ in_str = processor.tokenizer->apply_chat_template(in_str);
+ auto input_tensor = processor.process(in_str, in_imgs[i]);
+ std::cout << "[Q] " << in_strs[i] << std::endl;
+ std::cout << "[A] " << std::flush;
+
+ for (int step = 0; step < 100; step++) {
+ model.get_position_ids(input_tensor);
+ auto result = model(input_tensor);
+ auto outputs = processor.detokenize(result[0]);
+ auto out_string = outputs.first;
+ auto out_token = outputs.second;
+ auto [not_end, output_string] = processor.tokenizer->postprocess(out_string);
+ if (!not_end) { break; }
+ std::cout << output_string << std::flush;
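+            // chatPostProcessing feeds out_token back into input_tensor[0] (keeping the
+            // image tensors) so the next iteration decodes autoregressively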
+ chatPostProcessing(out_token, input_tensor[0], {&input_tensor[1], &input_tensor[2]});
+ }
+ printf("\n");
+ model.clear_kvcache();
+ model.profiling();
+ }
+
+ return 0;
+}
\ No newline at end of file
diff --git a/examples/demo_qwen2_vl.cpp b/examples/demo_qwen2_vl.cpp
index 3a23c982a..8a238e887 100644
--- a/examples/demo_qwen2_vl.cpp
+++ b/examples/demo_qwen2_vl.cpp
@@ -2,6 +2,7 @@
#include "cmdline.h"
#include "models/qwen2_vl/configuration_qwen2_vl.hpp"
#include "models/qwen2_vl/modeling_qwen2_vl.hpp"
+// #include "models/qwen2_vl/vtp/modeling_qwen2_vl.hpp"
#include "models/qwen2_vl/processing_qwen2_vl.hpp"
#include "processor/PostProcess.hpp"
@@ -10,27 +11,35 @@ int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2vl_vocab.mllm");
cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2vl_merges.txt");
+#ifdef ARM
+ cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen-2-vl-2b-instruct-kai_q4_0.mllm");
+#else
cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen-2-vl-2b-instruct-q4_k.mllm");
- cmdParser.add("limits", 'l', "max KV cache size", false, 2000);
+#endif
+ cmdParser.add("billion", 'b', "[2B | 7B |]", false, "2B");
+ cmdParser.add("limits", 'l', "max KV cache size", false, 800);
cmdParser.add("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);
string vocab_path = cmdParser.get<string>("vocab");
string merge_path = cmdParser.get<string>("merge");
string model_path = cmdParser.get<string>("model");
+ string model_billion = cmdParser.get<string>("billion") == "2B" ? "1.5b" : cmdParser.get<string>("billion");
int tokens_limit = cmdParser.get<int>("limits");
int thread_num = cmdParser.get<int>("thread");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");
ParamLoader param_loader(model_path);
auto processor = Qwen2VLProcessor(vocab_path, merge_path);
- Qwen2VLConfig config(tokens_limit, "1.5b");
- auto model_config = Qwen2VLConfig(config);
- auto model = Qwen2VLModel(model_config);
+ Qwen2VLConfig config(tokens_limit, model_billion);
+ auto model = Qwen2VLModel(config);
model.load(model_path);
vector<string> in_imgs = {
- "../assets/bus.png"};
+ // "../assets/bus.png",
+ "../assets/two_cats.jpg",
+ // "../assets/bird_image.jpg",
+ };
vector<string> in_strs = {
"<|vision_start|><|image_pad|><|vision_end|>Describe this image.",
};
@@ -53,6 +62,8 @@ int main(int argc, char **argv) {
chatPostProcessing(out_token, input_tensor[0], {&input_tensor[1], &input_tensor[2]});
}
printf("\n");
+ model.clear_kvcache();
+ model.profiling();
}
return 0;
diff --git a/examples/demo_qwen2_vl_npu.cpp b/examples/demo_qwen2_vl_npu.cpp
new file mode 100644
index 000000000..ebca5fcab
--- /dev/null
+++ b/examples/demo_qwen2_vl_npu.cpp
@@ -0,0 +1,181 @@
+#include "Context.hpp"
+#include "QNNBackend.hpp"
+#include <cstring>
+#include <filesystem>
+#include "Types.hpp"
+#include "cmdline.h"
+#include "memory/MemInspect.hpp"
+#include "models/qwen2_vl/configuration_qwen2_vl.hpp"
+#include "models/qwen2_vl/modeling_qwen2_vl_npuvit.hpp"
+#include "models/qwen2_vl/modeling_qwen2_vl_npu.hpp"
+#include "models/qwen2_vl/processing_qwen2_vl.hpp"
+#include "processor/PostProcess.hpp"
+
+using namespace mllm;
+int main(int argc, char **argv) {
+ cmdline::parser cmdParser;
+ cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2vl_vocab.mllm");
+ cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2vl_merges.txt");
+ cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen2_vl_vit_lm_rota_noshadow.mllm");
+ cmdParser.add("limits", 'l', "max KV cache size", false, 1000);
+ cmdParser.add("thread", 't', "num of threads", false, 4);
+ cmdParser.parse_check(argc, argv);
+
+ string vocab_path = cmdParser.get<string>("vocab");
+ string merge_path = cmdParser.get<string>("merge");
+ string model_path = cmdParser.get<string>("model");
+ const string cpu_model_path = "../models/Qwen2-VL-2B-Instruct_vit_lm_rotated-Q40.mllm";
+ int tokens_limit = cmdParser.get<int>("limits");
+ int thread_num = cmdParser.get<int>("thread");
+ CPUBackend::cpu_threads = cmdParser.get<int>("thread");
+
+ // TODO: add a function to calculate the chunk size
+ const int chunk_size = 128;
+
+ Module::initBackend(MLLM_QNN);
+
+ Context::Instance().inference_state().setCPUViT(false);
+
+ ParamLoader param_loader(model_path);
+ auto processor = Qwen2VLProcessor(vocab_path, merge_path);
+ Qwen2VLNPUConfig npu_config(tokens_limit, "1.5b-vl-rotated");
+
+ // npu vit embedding
+ auto prefill_embedding = npu::Qwen2VL_ImagePatchAndEmbedding(npu_config);
+ prefill_embedding.load(model_path);
+
+ // npu llm
+ auto prefill_body = Qwen2VL_PrefillBody(npu_config, chunk_size, npu_config.shadow_layers);
+ prefill_body.load(model_path);
+
+ // cpu model
+ auto cpu_model_config = Qwen2VLConfig(tokens_limit, "1.5b");
+ cpu_model_config.attn_implementation = "eager_notrans";
+ auto decoding_model = Qwen2VL_Decoding_Model(cpu_model_config);
+ decoding_model.load(cpu_model_path);
+
+ vector<string> in_imgs = {
+ "../assets/bus.png"};
+ vector<string> in_strs = {
+ "<|vision_start|><|image_pad|><|vision_end|>Imagine you are describing this image to someone who cannot see it. Explain everything you observe, including the background, subjects, their expressions, and any activities they appear to be doing.",
+ };
+
+ auto &in_str = in_strs[0];
+ in_str = processor.tokenizer->apply_chat_template(in_str);
+ auto input_tensors = processor.process(in_str, in_imgs[0]);
+
+ const int real_seq_length = input_tensors[0].sequence();
+ std::cout << "real seq length: " << real_seq_length << std::endl;
+
+ const int num_iter = (real_seq_length + chunk_size - 1) / chunk_size;
+ std::cout << "num_iter: " << num_iter << std::endl;
+ // pad the position_ids to the total chunk length (e.g. 256 * 2) for CPUMultimodalRoPEPipeline
+ prefill_embedding.get_position_ids(input_tensors, chunk_size * num_iter);
+
+ // 1. QNN vit embedding
+ // NOTE: the ViT runs first here to stay compatible with older qnn_context.bin files.
+ // In QNNBackend, graphs must be executed in the order they appear in the context.
+ // TODO: better QNNBackend graph indexing and management
+ auto vit_start = mllm_time_ms();
+ auto merged_embd = prefill_embedding(input_tensors);
+ auto vit_end = mllm_time_ms();
+
+ auto merged_embd_warmup_tensor = Tensor(0, MLLM_QNN);
+ merged_embd_warmup_tensor.reshape(1, 1, chunk_size, 1536);
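+    // note: 1536 is the text hidden size of the 2B ("1.5b") Qwen2-VL variant configured above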
+ merged_embd_warmup_tensor.setTtype(INPUT_TENSOR);
+ merged_embd_warmup_tensor.alloc();
+
+ input_tensors.back().setTtype(INPUT_TENSOR);
+ vector<Tensor> prefill_input = {merged_embd_warmup_tensor, input_tensors.back()};
+
+ auto llm_start = mllm_time_ms();
+ prefill_body(prefill_input);
+ auto llm_end = mllm_time_ms();
+ std::cout << "after warm up" << std::endl;
+
+ if (!std::filesystem::exists("qnn_context.bin")) {
+ static_cast<QNNBackend *>(Backend::global_backends[MLLM_QNN].get())->saveQNNContext();
+ }
+
+ Context::Instance().inference_state().setQnnGraphFrozen(true);
+ Context::Instance().inference_state().setCurSequenceLength(0);
+ Context::Instance().inference_state().setExecutionType(PROMPT);
+ Context::Instance().inference_state().toggleSwitching();
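+    // reset to the PROMPT state after the warm-up pass; the frozen QNN graphs are
+    // reused for the real prefill below instead of being rebuilt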
+
+ // set the total seq length for HeadLinear execution, which cannot get the real seq length from Opts
+ Context::Instance().inference_state().setTotalSequenceLength(real_seq_length);
+ // set the chunk size for HeadLinear execution, which cannot get the chunk size from Opts
+ Context::Instance().inference_state().setChunkSize(chunk_size);
+
+ std::cout << "[Q] " << in_strs[0] << std::endl;
+ std::cout << "[A] " << std::flush;
+
+ for (auto &t : input_tensors) {
+ t.setTtype(INPUT_TENSOR);
+ }
+
+ // 2. QNN LLM Prefill
+ unsigned int out_token = 0;
+ auto start_time = mllm_time_ms();
+ int64_t prefill_time;
+ for (auto i = 0; i < num_iter; ++i) {
+        // copy chunk i of merged_embd[0] into the fixed-size warmup tensor backing prefill_input[0];
+        // only the final chunk can be partial, so clamp the row count rather than special-casing i == 0
+        auto source = merged_embd[0].ptrAt<float>(0, 0, chunk_size * i, 0);
+        auto dest = prefill_input[0].hostPtr<float>();
+        const int copy_rows = std::min(chunk_size, merged_embd[0].sequence() - chunk_size * i);
+        memcpy(dest, source, copy_rows * merged_embd[0].dimension() * sizeof(float));
+
+ auto result = prefill_body(prefill_input);
+
+ if (i == 0) { // turn off switching to avoid RoPE h_cnt_ reset to curSequenceLength in next chunk
+ Context::Instance().inference_state().toggleSwitching();
+ }
+
+ if (i == num_iter - 1) {
+ auto end_time = mllm_time_ms();
+ prefill_time = end_time - start_time;
+ auto outputs = processor.detokenize(result[0], real_seq_length % chunk_size);
+ auto out_string = outputs.first;
+ out_token = outputs.second;
+ auto [not_end, output_string] = processor.tokenizer->postprocess(out_string);
+ std::cout << output_string << std::flush;
+ }
+ }
+
+ chatPostProcessing(out_token, input_tensors[0], {&input_tensors[1], &input_tensors[2]});
+
+ Context::Instance().inference_state().setCurSequenceLength(real_seq_length);
+ Context::Instance().inference_state().setExecutionType(AUTOREGRESSIVE);
+ Context::Instance().inference_state().toggleSwitching();
+
+ // 3. CPU LLM Decoding
+ for (auto &t : input_tensors) { // set to INPUT_TENSOR to let decoding module update act
+ t.setTtype(INPUT_TENSOR);
+ }
+
+ const int last_position_id = input_tensors[3].dataAt<float>(0, 0, 0, real_seq_length - 1);
+ for (int step = 0; step < 100; step++) {
+ // use the last real (non-padding) position id when decoding
+ prefill_embedding.get_position_ids(input_tensors, 0, last_position_id + 1 + step);
+
+ auto result = decoding_model(input_tensors);
+ auto outputs = processor.detokenize(result[0]);
+ auto out_string = outputs.first;
+ auto out_token = outputs.second;
+ auto [not_end, output_string] = processor.tokenizer->postprocess(out_string);
+ if (!not_end) { break; }
+ std::cout << output_string << std::flush;
+ chatPostProcessing(out_token, input_tensors[0], {&input_tensors[1], &input_tensors[2]});
+
+ if (step == 0) Context::Instance().inference_state().toggleSwitching();
+ }
+
+ std::cout << std::endl;
+ std::cout << "vit embedding time: " << vit_end - vit_start << " ms" << std::endl;
+ std::cout << "Prefill:" << prefill_time << " ms" << std::endl;
+ return 0;
+}
\ No newline at end of file
diff --git a/examples/demo_qwen2_vl_vtp.cpp b/examples/demo_qwen2_vl_vtp.cpp
new file mode 100644
index 000000000..40bd82ade
--- /dev/null
+++ b/examples/demo_qwen2_vl_vtp.cpp
@@ -0,0 +1,72 @@
+#include <iostream>
+#include "cmdline.h"
+#include "models/qwen2_vl/configuration_qwen2_vl.hpp"
+#include "models/qwen2_vl/vtp/modeling_qwen2_vl.hpp"
+#include "models/qwen2_vl/vtp/processing_qwen2_vl.hpp"
+#include "processor/PostProcess.hpp"
+
+using namespace mllm;
+int main(int argc, char **argv) {
+ cmdline::parser cmdParser;
+ cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2vl_vocab.mllm");
+ cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2vl_merges.txt");
+ cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen-2-vl-2b-instruct-kai_q4_0.mllm");
+ cmdParser.add("billion", 'b', "[2B | 7B |]", false, "2B");
+ cmdParser.add("limits", 'l', "max KV cache size", false, 800);
+ cmdParser.add("thread", 't', "num of threads", false, 4);
+ cmdParser.add("premerge", 'g', "enable pre-ViT image token merging", false, false);
+ cmdParser.add("pruning", 'p', "enable pruning", false, false);
+ cmdParser.parse_check(argc, argv);
+
+ string vocab_path = cmdParser.get<string>("vocab");
+ string merge_path = cmdParser.get<string>("merge");
+ string model_path = cmdParser.get<string>("model");
+ string model_billion = cmdParser.get<string>("billion") == "2B" ? "1.5b" : cmdParser.get<string>("billion");
+ int tokens_limit = cmdParser.get<int>("limits");
+ int thread_num = cmdParser.get<int>("thread");
+ CPUBackend::cpu_threads = cmdParser.get<int>("thread");
+ use_pre_vit_merge = cmdParser.exist("premerge");
+ bool use_pruning = cmdParser.exist("pruning");
+ if (!use_pruning) {
+ WHERE_TOKEN_PRUNING.pruning_place_cfg = {};
+ }
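+    // assumption: an empty pruning_place_cfg disables visual token pruning at every layer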
+
+ ParamLoader param_loader(model_path);
+ auto processor = Qwen2VLProcessor(vocab_path, merge_path);
+ Qwen2VLConfig config(tokens_limit, model_billion);
+ auto model = Qwen2VLModel(config);
+ model.load(model_path);
+
+ vector<string> in_imgs = {
+ // "../assets/bus.png",
+ "../assets/two_cats.jpg",
+ // "../assets/bird_image.jpg",
+ };
+ vector<string> in_strs = {
+ "<|vision_start|><|image_pad|><|vision_end|>Describe this image.",
+ };
+
+ for (int i = 0; i < in_strs.size(); ++i) {
+ auto in_str = in_strs[i];
+ in_str = processor.tokenizer->apply_chat_template(in_str);
+ auto input_tensor = processor.process(in_str, in_imgs[i]);
+ std::cout << "[Q] " << in_strs[i] << std::endl;
+ std::cout << "[A] " << std::flush;
+ for (int step = 0; step < 100; step++) {
+ model.get_position_ids(input_tensor);
+ auto result = model(input_tensor);
+ auto outputs = processor.detokenize(result[0]);
+ auto out_string = outputs.first;
+ auto out_token = outputs.second;
+ auto [not_end, output_string] = processor.tokenizer->postprocess(out_string);
+ if (!not_end) { break; }
+ std::cout << output_string << std::flush;
+ chatPostProcessing(out_token, input_tensor[0], {&input_tensor[1], &input_tensor[2]});
+ }
+ printf("\n");
+ model.clear_kvcache();
+ model.profiling();
+ }
+
+ return 0;
+}
\ No newline at end of file
diff --git a/examples/demo_qwen3.cpp b/examples/demo_qwen3.cpp
index 6a93e6d36..009dceb00 100644
--- a/examples/demo_qwen3.cpp
+++ b/examples/demo_qwen3.cpp
@@ -20,8 +20,8 @@ int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen_vocab.mllm");
cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen_merges.txt");
- cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen-3-0.6b-q4_k.mllm");
- cmdParser.add("billion", 'b', "[0.6B | 4B |]", false, "0.6B");
+ cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen-3-0.6b-kai_q4_0.mllm");
+ cmdParser.add("billion", 'b', "[0.6B | 4B |]", false, "0.6b-lm");
cmdParser.add("limits", 'l', "max KV cache size", false, 800);
cmdParser.add("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);
@@ -67,5 +67,7 @@ int main(int argc, char **argv) {
return true;
});
std::cout << "\n";
+ model.clear_kvcache();
+ model.profiling();
}
}
diff --git a/examples/demo_qwen_batch.cpp b/examples/demo_qwen_batch.cpp
new file mode 100644
index 000000000..4c25f3de9
--- /dev/null
+++ b/examples/demo_qwen_batch.cpp
@@ -0,0 +1,76 @@
+/**
+ * @file demo_qwen_batch.cpp
+ * @author Chenghua Wang (chenghua.wang.edu@gmail.com)
+ * @version 0.1
+ * @date 2024-05-01
+ *
+ * @copyright Copyright (c) 2024
+ *
+ */
+#include "cmdline.h"
+#include "models/qwen/configuration_qwen.hpp"
+#include "models/qwen/modeling_qwen.hpp"
+#include "models/qwen/tokenization_qwen.hpp"
+#include <string>
+#include <vector>
+
+using namespace mllm;
+
+int main(int argc, char **argv) {
+ std::iostream::sync_with_stdio(false);
+
+ cmdline::parser cmdParser;
+ cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2.5_vocab.mllm");
+ cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2.5_merges.txt");
+#ifdef ARM
+ cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen-2.5-1.5b-instruct-kai_q4_0_lm.mllm");
+#else
+ cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen-2.5-1.5b-instruct-q4_0_4_4.mllm");
+#endif
+ cmdParser.add("billion", 'b', "[0.5B | 1.8B | 1.5B | 3B |]", false, "1.5b-lm");
+ cmdParser.add("limits", 'l', "max KV cache size", false, 400);
+ cmdParser.add("thread", 't', "num of threads", false, 4);
+ cmdParser.parse_check(argc, argv);
+
+ string vocab_path = cmdParser.get<string>("vocab");
+ string merge_path = cmdParser.get<string>("merge");
+ string model_path = cmdParser.get<string>("model");
+ string model_billion = cmdParser.get<string>("billion");
+ int tokens_limit = cmdParser.get<int>("limits");
+ CPUBackend::cpu_threads = cmdParser.get<int>("thread");
+
+ auto tokenizer = QWenTokenizer(vocab_path, merge_path);
+ QWenConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE);
+ // config.attn_implementation = "sage_attention"; // use the Sage Attention implementation
+ auto model = QWenForCausalLM(config);
+ model.load(model_path);
+
+ vector<string> in_strs = {
+ "Give me a short introduction to large language model.",
+ "介绍一下你自己。",
+ "什么是北京市的旧称?",
+ };
+ vector<string> input_strs;
+ for (int i = 0; i < in_strs.size(); ++i) {
+ std::cout << "[Q" << i << "] " << in_strs[i] << std::endl;
+ auto input_str = tokenizer.apply_chat_template(in_strs[i]);
+ input_strs.push_back(input_str);
+ }
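+    // batched path: tokenize() packs all prompts into one padded input tensor, and
+    // generate() below returns one output token sequence per prompt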
+ auto input_tensor = tokenizer.tokenize(input_strs);
+
+ LlmTextGeneratorOpts opt{
+ .max_new_tokens = 200,
+ .do_sample = false,
+ .temperature = 0.3F,
+ .top_k = 50,
+ .top_p = 0.F,
+ };
+ auto output_tokens = model.generate(input_tensor, opt, tokenizer.eos_id_);
+ for (int i = 0; i < output_tokens.size(); ++i) {
+ auto out_token = output_tokens[i];
+ auto out_string = tokenizer.detokenize(out_token);
+ std::cout << "[A" << i << "] " << out_string << std::endl;
+ }
+ model.clear_kvcache();
+ model.profiling();
+}
diff --git a/examples/demo_qwen_npu.cpp b/examples/demo_qwen_npu.cpp
index 9e230f01c..9187f1301 100644
--- a/examples/demo_qwen_npu.cpp
+++ b/examples/demo_qwen_npu.cpp
@@ -1,8 +1,11 @@
+#include "Context.hpp"
+#include "QNNBackend.hpp"
+#include "Types.hpp"
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
-#include "models/qwen/modeling_qwen_npu.hpp"
#include "models/qwen/modeling_qwen.hpp"
+#include "models/qwen/modeling_qwen_npu_v2.hpp"
#include "models/qwen/tokenization_qwen.hpp"
#include "processor/PostProcess.hpp"
@@ -10,123 +13,82 @@ using namespace mllm;
int main(int argc, char **argv) {
cmdline::parser cmdParser;
- cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen_vocab.mllm");
- cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen_merges.txt");
- cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-1.8b-chat-int8.mllm");
- cmdParser.add("billion", 'b', "[0.5B | 1.8B]", false, "1.8B");
+ cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2.5_vocab.mllm");
+ cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2.5_merges.txt");
+ cmdParser.add("qnn-model", 'm', "specify mllm model path", false, "../models/Qwen2.5-1.5B-Instruct_rotated-noshadow.mllm");
+ cmdParser.add("decoding-model", '\0', "specify mllm model path", false, "../models/Qwen2.5-1.5B-Instruct_rotated-Q40.mllm");
+ cmdParser.add("billion", 'b', "[0.5B | 1.8B | 1.5B | [1.5B, 1.8B]-rotated]", false, "1.5B-rotated");
cmdParser.add("limits", 'l', "max KV cache size", false, 400);
cmdParser.add("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);
string vocab_path = cmdParser.get<string>("vocab");
string merge_path = cmdParser.get<string>("merge");
- string model_path = cmdParser.get<string>("model");
+ string model_path = cmdParser.get<string>("qnn-model");
+ string decoding_model_path = cmdParser.get<string>("decoding-model");
string model_billion = cmdParser.get<string>("billion");
int tokens_limit = cmdParser.get<int>("limits");
- const int chunk_size = 128;
CPUBackend::cpu_threads = cmdParser.get<int>("thread");
+ Module::initBackend(MLLM_QNN);
+
auto tokenizer = QWenTokenizer(vocab_path, merge_path);
- QWenConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE);
- auto model = QWenForCausalLM_NPU(config, chunk_size);
+ QWenNPUConfig config(tokens_limit, "1.5b-rotated", RoPEType::HFHUBROPE);
+ auto model = v2::QWenForCausalLM_NPU(config, 256);
+ config.attn_implementation = "eager_notrans";
model.load(model_path);
auto decoding_model = QWenForCausalLM(config);
- decoding_model.load("../models/qwen-1.5-1.8b-chat-q4k.mllm");
-
- // warmup START
- std::string input_str = " ";
- auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);
- LlmTextGeneratorOpts opt{
- .max_new_tokens = 1,
- .do_sample = false,
- .is_padding = true,
- .seq_before_padding = real_seq_length,
- .chunk_size = chunk_size,
- };
- model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
- auto out_string = tokenizer.detokenize({out_token});
- auto [not_end, output_string] = tokenizer.postprocess(out_string);
- if (!not_end) { return false; }
- return true;
- });
- Module::isFirstChunk = false;
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setCurSequenceLength(0);
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
- // turn on the multi-chunk prefilling
- Module::isMultiChunkPrefilling = true;
- // warmup END
- std::cout << "Warmup finished." << std::endl;
+ decoding_model.load(decoding_model_path);
vector<string> in_strs = {
// " Give me a short introduction to large language model.",
- "\"Large Language Models (LLMs) are advanced artificial intelligence systems designed to understand and generate human-like text. These models are trained on vast amounts of data, enabling them to perform a wide range of tasks, from answering questions and summarizing text to generating creative content and engaging in conversational dialogue. LLMs like GPT-3 and GPT-4, developed by OpenAI, have set new benchmarks in natural language processing by leveraging deep learning architectures, particularly transformer models, which excel at capturing context and relationships within text. The scalability and versatility of LLMs make them invaluable tools for applications in education, customer service, content creation, and more. However, their deployment also raises ethical considerations, including issues of bias, misinformation, and the potential for misuse. As the field continues to evolve, ongoing research and responsible deployment strategies are essential to harnessing the full potential of these powerful AI systems while mitigating their risks.\"\nGenerate a title based on the above text."};
+ "\"Large Language Models (LLMs) are advanced artificial intelligence systems designed to understand and generate human-like text. These models are trained on vast amounts of data, enabling them to perform a wide range of tasks, from answering questions and summarizing text to generating creative content and engaging in conversational dialogue. LLMs like GPT-3 and GPT-4, developed by OpenAI, have set new benchmarks in natural language processing by leveraging deep learning architectures, particularly transformer models, which excel at capturing context and relationships within text. The scalability and versatility of LLMs make them invaluable tools for applications in education, customer service, content creation, and more. However, their deployment also raises ethical considerations, including issues of bias, misinformation, and the potential for misuse. As the field continues to evolve, ongoing research and responsible deployment strategies are essential to harnessing the full potential of these powerful AI systems while mitigating their risks.\"\nGenerate a title based on the above text.",
+ // " Hello, Who are you?"
+ };
for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
- auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);
- const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length;
- const int chunk_num = seq_length_padding / chunk_size;
-
+ auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 256, config.vocab_size);
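+        // padding to a fixed 256 tokens keeps the input shape equal to the chunk length
+        // that the QNN prefill model above was built with (QWenForCausalLM_NPU(config, 256))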
+ // real_seq_length = 256;
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;
+ std::cout << "real_seq_length: " << real_seq_length << std::endl;
// set the total seq length for HeadLinear execution, which cannot get the real seq length from Opts
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setTotalSequenceLength(real_seq_length);
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setChunkSize(chunk_size);
+ Context::Instance().inference_state().setTotalSequenceLength(real_seq_length);
LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.is_padding = true,
.seq_before_padding = real_seq_length,
- .chunk_size = chunk_size,
};
+ model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
+ auto out_string = tokenizer.detokenize({out_token});
+ auto [not_end, output_string] = tokenizer.postprocess(out_string);
+ if (!not_end) { return false; }
+ std::cout << output_string << std::flush;
+ return true;
+ });
- // tensor vectors to save the chunked tensors of the QNN prefilling input
- bool isSwitched = false;
- vector<Tensor> chunked_tensors(chunk_num);
- for (int chunk_id = 0; chunk_id < chunk_num; ++chunk_id) {
- chunked_tensors[chunk_id].setBackend(Backend::global_backends[MLLM_CPU]);
- chunked_tensors[chunk_id].setTtype(INPUT_TENSOR);
- chunked_tensors[chunk_id].reshape(1, 1, chunk_size, 1);
- chunked_tensors[chunk_id].setName("input-chunk-" + to_string(chunk_id));
- chunked_tensors[chunk_id].shallowCopyFrom(&input_tensor, false, {0, 0, chunk_id * chunk_size, 0});
-
- model.generate(chunked_tensors[chunk_id], opt, [&](unsigned int out_token) -> bool {
- if (!isSwitched && chunk_id == 0 && static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->isStageSwitching()) {
- // turn off switching at the first chunk of following inputs
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
- isSwitched = true;
- }
- auto out_string = tokenizer.detokenize({out_token});
- auto [not_end, output_string] = tokenizer.postprocess(out_string);
- if (!not_end) { return false; }
- if (chunk_id == chunk_num - 1) { // print the output of the last chunk
- std::cout << output_string << std::flush;
- }
- return true;
- });
- Module::isFirstChunk = false;
- }
-
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setCurSequenceLength(real_seq_length);
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(AUTOREGRESSIVE);
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
+ Context::Instance().inference_state().setCurSequenceLength(real_seq_length);
+ Context::Instance().inference_state().setExecutionType(AUTOREGRESSIVE);
+ Context::Instance().inference_state().toggleSwitching();
LlmTextGeneratorOpts decoding_opt{
- .max_new_tokens = 100,
+ .max_new_tokens = 50,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = false,
};
- isSwitched = false;
- decoding_model.generate(chunked_tensors.back(), decoding_opt, [&](unsigned int out_token) -> bool {
+ bool isSwitched = false;
+ decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// call only once of switchDecodeTag
if (!isSwitched) {
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
+ Context::Instance().inference_state().toggleSwitching();
+
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
@@ -140,9 +102,14 @@ int main(int argc, char **argv) {
});
// turn on switching, set sequence length and execution type
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setCurSequenceLength(0);
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
- static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
+ Context::Instance().inference_state().setCurSequenceLength(0);
+ Context::Instance().inference_state().setExecutionType(PROMPT);
+ Context::Instance().inference_state().toggleSwitching();
std::cout << "\n";
+
+ if (!std::filesystem::exists("qnn_context.bin")) {
+ static_cast<QNNBackend *>(Backend::global_backends[MLLM_QNN].get())->saveQNNContext();
+ }
}
-}
\ No newline at end of file
+}
diff --git a/examples/demo_qwen_pipeline.cpp b/examples/demo_qwen_npu_pipeline.cpp
similarity index 53%
rename from examples/demo_qwen_pipeline.cpp
rename to examples/demo_qwen_npu_pipeline.cpp
index f2f8bb8d0..343db554a 100644
--- a/examples/demo_qwen_pipeline.cpp
+++ b/examples/demo_qwen_npu_pipeline.cpp
@@ -1,10 +1,12 @@
+#include "Context.hpp"
#include "Backend.hpp"
+#include "QNNBackend.hpp"
#include "Trace.hpp"
#include "Types.hpp"
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
-#include "models/qwen/modeling_qwen_npu.hpp"
+#include "models/qwen/modeling_qwen_npu_v2.hpp"
#include "models/qwen/modeling_qwen.hpp"
#include "models/qwen/tokenization_qwen.hpp"
#include "processor/PostProcess.hpp"
@@ -14,46 +16,60 @@ using namespace mllm;
int main(int argc, char **argv) {
cmdline::parser cmdParser;
- cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen_vocab.mllm");
- cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen_merges.txt");
- cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-1.8b-chat-int8.mllm");
- cmdParser.add("billion", 'b', "[0.5B | 1.8B]", false, "1.8B");
+ cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2.5_vocab.mllm");
+ // "../vocab/qwen_vocab.mllm"
+ cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2.5_merges.txt");
+ // "../vocab/qwen_merges.txt"
+ cmdParser.add