Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions examples/minicpm_o/CMakeLists.txt
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

代码中没有cnpy和z的使用,需要移除这些额外依赖。并且避免使用$ENV{HOME}作为依赖目录 (Translation: cnpy and z are not used anywhere in the code, so these extra dependencies should be removed. Also avoid using $ENV{HOME} as a dependency directory.)

Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
add_executable(mllm-minicpm-o main.cpp)
target_link_libraries(mllm-minicpm-o PRIVATE MllmRT MllmCPUBackend)
target_include_directories(mllm-minicpm-o PRIVATE ${MLLM_INCLUDE_DIR})
cmake_minimum_required(VERSION 3.10)

add_executable(main_minicpm_o main.cpp)
target_link_libraries(main_minicpm_o PRIVATE MllmRT MllmCPUBackend)
target_include_directories(main_minicpm_o PRIVATE ${MLLM_INCLUDE_DIR})
37 changes: 37 additions & 0 deletions examples/minicpm_o/config_minicpm_o.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"vision_hidden_size": 1152,
"vision_intermediate_size": 4304,
"vision_num_hidden_layers": 27,
"vision_num_attention_heads": 16,
"vision_num_channels": 3,
"vision_image_size": 980,
"vision_patch_size": 14,

"hidden_size": 3584,
"intermediate_size": 18944,
"num_attention_heads": 28,
"num_key_value_heads": 4,
"num_hidden_layers": 28,
"max_position_embeddings": 32768,
"rms_norm_eps": 1e-06,
"vocab_size": 151700,

"query_num": 64,

"audio_hidden_size": 1024,
"audio_num_hidden_layers": 24,
"audio_num_attention_heads": 16,
"audio_max_position_embeddings": 1500,
"audio_chunk_length": 1.0,
"audio_pool_step": 2,

"tts_llm_dim": 3584,

"max_cache_length": 8192,
"eos_token_id": 151645,
"bos_token_id": 151643,
"rope_theta": 1000000.0,
"tie_word_embeddings": true,

"linear_impl_type": "default"
}
112 changes: 97 additions & 15 deletions examples/minicpm_o/main.cpp
Original file line number Diff line number Diff line change
@@ -1,16 +1,98 @@

#include <iostream>
#include <fmt/core.h>
#include <cstdint>
#include <cstdlib>
#include <mllm/mllm.hpp>
#include <mllm/core/DataTypes.hpp>
#include <mllm/core/ParameterFile.hpp>
#include <mllm/core/Tensor.hpp>
#include <mllm/models/minicpm_o2_6/configuration_chattts.hpp>
#include <mllm/models/minicpm_o2_6/modeling_chattts.hpp>
#include <mllm/utils/Common.hpp>
#include <fstream>

using namespace mllm; // NOLINT

MLLM_MAIN({NYI("leave empty")});
#include "mllm/mllm.hpp"
#include "mllm/models/minicpm_o2_6/configuration_minicpmo.hpp"
#include "mllm/models/minicpm_o2_6/modeling_minicpmo.hpp"
#include "mllm/models/minicpm_o2_6/modeling_resampler.hpp"
#include "mllm/models/minicpm_o2_6/modeling_siglip.hpp"
#include "mllm/models/minicpm_o2_6/tokenization_minicpmo.hpp"
#include "mllm/models/minicpm_o2_6/image_preprocessor_minicpmo.hpp"
#include "mllm/utils/AnyValue.hpp"
#include "mllm/preprocessor/visual/Image.hpp"

using mllm::Argparse;

// Interactive CLI entry point for MiniCPM-o 2.6 (vision + LLM pipeline).
//
// Build & run (macOS Apple Silicon):
//   python task.py tasks/build_osx_apple_silicon.yaml
//   cd build-osx/bin
//   ./main_minicpm_o -m ../../models/minicpm-o-2_6.mllm -mv v1 \
//       -t ../../tokenizer/MiniCPM-o-2_6/tokenizer.json \
//       -c ../../examples/minicpm_o/config_minicpm_o.json
// (model.mllm and tokenizer.json must be obtained first.)
MLLM_MAIN({
  mllm::Logger::level() = mllm::LogLevel::kError;
  // mllm::setPrintMaxElementsPerDim(1000); // For debugging large tensors

  auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
  auto& model_path = Argparse::add<std::string>("-m|--model_path").help("Model path").required(true);
  auto& model_version = Argparse::add<std::string>("-mv|--model_version").help("Model version").required(true);
  auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer_path").help("Tokenizer directory").required(true);
  auto& config_path = Argparse::add<std::string>("-c|--config_path").help("Config path").required(true);

  Argparse::parse(argc, argv);

  // Handle --help immediately after parsing, before any other argument is
  // read and before tracing starts. Previously this check ran after
  // model_version.get() and perf::start(), so `-h` could not short-circuit
  // cleanly.
  if (help.isSet()) {
    Argparse::printHelp();
    mllm::shutdownContext();
    return 0;
  }

#ifdef MLLM_PERFETTO_ENABLE
  mllm::perf::start();
#endif

  // Map the CLI version string onto the model file format enum.
  mllm::ModelFileVersion file_version = mllm::ModelFileVersion::kV1;
  if (model_version.get() == "v2") {
    file_version = mllm::ModelFileVersion::kV2;
  } else if (model_version.get() != "v1") {
    // Previously an unknown version silently fell back to v1; warn the user.
    fmt::print("Unknown model version '{}', falling back to v1\n", model_version.get());
  }

  {
    auto minicpmo_cfg = mllm::models::minicpmo::MiniCPMOConfig(config_path.get());
    auto minicpmo_tokenizer = mllm::models::minicpmo::MiniCPMOTokenizer(tokenizer_path.get());
    auto minicpmo = mllm::models::minicpmo::MiniCPMOForCausalLM(minicpmo_cfg);

    // Load the weights shared by the LLM, vision tower and resampler.
    auto param = mllm::load(model_path.get(), file_version);
    minicpmo.llm_.llm.load(param);
    minicpmo.vpm_.load(param);
    minicpmo.resampler_.load(param);
    // Audio / TTS projections are not wired up yet:
    // minicpmo.audio_proj_.load(param);
    // minicpmo.tts_proj_.load(param);

    fmt::print("\n{:*^60}\n", " MiniCPM-o Interactive CLI ");
    fmt::print("Enter 'exit' or 'quit' to end the session\n");

    // TODO(review): image path and prompt are hard-coded; expose them as CLI
    // arguments once the pipeline is stable.
    std::string image_path = "path/to/your/image.jpg";
    std::string prompt_text = "描述图片中物体";
    mllm::models::minicpmo::MiniCPMOMessage message;
    message.prompt = prompt_text;
    message.img_file_path = image_path;

    fmt::print("Processing...\n");
    auto inputs = minicpmo_tokenizer.convertMessage(message);

    fmt::print("\nResponse: ");

    int token_count = 0;
    for (auto& step : minicpmo.chat(inputs)) {
      auto token_str = minicpmo_tokenizer.detokenize(step.cur_token_id);
      std::wcout << token_str << std::flush;

      token_count++;
      if (token_count >= 50) break;  // Limit output for debugging
    }

    fmt::print("\n{}\n", std::string(60, '-'));

#ifdef MLLM_PERFETTO_ENABLE
    mllm::perf::stop();
    mllm::perf::saveReport("minicpmo.perf");
#endif

    mllm::memoryReport();
    mllm::shutdownContext();
    return 0;
  }
})
1 change: 1 addition & 0 deletions mllm/backends/cpu/kernels/Kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "mllm/backends/cpu/kernels/arm/softmax.hpp" // IWYU pragma: export
#include "mllm/backends/cpu/kernels/arm/rmsnorm.hpp" // IWYU pragma: export
#include "mllm/backends/cpu/kernels/arm/gelu.hpp" // IWYU pragma: export
#include "mllm/backends/cpu/kernels/arm/conv2d.hpp" // IWYU pragma: export
#include "mllm/backends/cpu/kernels/arm/conv3d.hpp" // IWYU pragma: export
#include "mllm/backends/cpu/kernels/arm/linear/kai.hpp" // IWYU pragma: export
#include "mllm/backends/cpu/kernels/arm/relu.hpp" // IWYU pragma: export
Expand Down
99 changes: 99 additions & 0 deletions mllm/models/minicpm_o2_6/configuration_minicpmo.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// Copyright (c) MLLM Team.
// Licensed under the MIT License.
#pragma once

#include "mllm/core/aops/LinearOp.hpp"
#include "mllm/engine/ConfigFile.hpp"

namespace mllm::models::minicpmo {

// Configuration for the MiniCPM-o 2.6 multimodal model.
//
// Aggregates the vision (SigLIP), LLM (Qwen2-based), resampler, audio
// (Whisper-based) and TTS sub-configurations. The defaults correspond to the
// MiniCPM-o-2_6 release; constructing from a JSON file overrides them (see
// examples/minicpm_o/config_minicpm_o.json for the expected schema).
struct MiniCPMOConfig : protected ConfigFile {
  MiniCPMOConfig() = default;

  // Loads every field from the JSON config at `file_path`. All keys listed
  // below are required to be present in the file.
  explicit MiniCPMOConfig(const std::string& file_path) : ConfigFile(file_path) {
    // Vision Config
    vision_hidden_size = data()["vision_hidden_size"];
    vision_intermediate_size = data()["vision_intermediate_size"];
    vision_num_hidden_layers = data()["vision_num_hidden_layers"];
    vision_num_attention_heads = data()["vision_num_attention_heads"];
    vision_num_channels = data()["vision_num_channels"];
    vision_image_size = data()["vision_image_size"];
    vision_patch_size = data()["vision_patch_size"];

    // LLM Config (Qwen2 based)
    hidden_size = data()["hidden_size"];
    intermediate_size = data()["intermediate_size"];
    num_attention_heads = data()["num_attention_heads"];
    num_key_value_heads = data()["num_key_value_heads"];
    num_hidden_layers = data()["num_hidden_layers"];
    max_position_embeddings = data()["max_position_embeddings"];
    rms_norm_eps = data()["rms_norm_eps"];
    vocab_size = data()["vocab_size"];

    // Resampler Config
    query_num = data()["query_num"];

    // Audio Config (Whisper based)
    audio_hidden_size = data()["audio_hidden_size"];
    audio_num_hidden_layers = data()["audio_num_hidden_layers"];
    audio_num_attention_heads = data()["audio_num_attention_heads"];
    audio_max_position_embeddings = data()["audio_max_position_embeddings"];
    audio_chunk_length = data()["audio_chunk_length"];
    audio_pool_step = data()["audio_pool_step"];

    // TTS Config
    tts_llm_dim = data()["tts_llm_dim"];

    // Common Config
    max_cache_length = data()["max_cache_length"];
    eos_token_id = data()["eos_token_id"];
    // Fix: bos_token_id was declared (and present in the shipped JSON) but
    // never loaded, so file overrides were silently ignored.
    bos_token_id = data()["bos_token_id"];
    rope_theta = data()["rope_theta"];
    tie_word_embeddings = data()["tie_word_embeddings"];

    linear_impl_type = aops::str2LinearImplTypes(data()["linear_impl_type"]);
  }

  // Vision Config (SigLIP)
  int32_t vision_hidden_size = 1152;
  int32_t vision_intermediate_size = 4304;
  int32_t vision_num_hidden_layers = 27;
  int32_t vision_num_attention_heads = 16;
  int32_t vision_num_channels = 3;
  int32_t vision_image_size = 980;
  int32_t vision_patch_size = 14;

  // LLM Config (Qwen2.5-7B)
  int32_t hidden_size = 3584;
  int32_t intermediate_size = 18944;
  int32_t num_attention_heads = 28;
  int32_t num_key_value_heads = 4;
  int32_t num_hidden_layers = 28;
  int32_t max_position_embeddings = 32768;
  float rms_norm_eps = 1e-06;
  int32_t vocab_size = 151700;

  // Resampler Config
  int32_t query_num = 64;

  // Audio Config (Whisper)
  int32_t audio_hidden_size = 1024;
  int32_t audio_num_hidden_layers = 24;
  int32_t audio_num_attention_heads = 16;
  int32_t audio_max_position_embeddings = 1500;
  float audio_chunk_length = 1.0;
  int32_t audio_pool_step = 2;

  // TTS Config (adjust as the actual TTS implementation evolves)
  int32_t tts_llm_dim = 3584;

  // Common Config
  int32_t max_cache_length = 8192;
  int64_t eos_token_id = 151645;
  int64_t bos_token_id = 151643;
  float rope_theta = 1000000.0;
  bool tie_word_embeddings = false;

  aops::LinearImplTypes linear_impl_type = aops::LinearImplTypes::kDefault;
};

} // namespace mllm::models::minicpmo
Loading
Loading