From 24a07ab6e6dd339941da0b8334227262bc2fab8f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 29 Mar 2025 01:30:16 +0100 Subject: [PATCH 01/10] tts : implement mimi decoder --- .gitignore | 1 + common/common.cpp | 28 + common/common.h | 22 + examples/tts/CMakeLists.txt | 6 + examples/tts/README-mimi.md | 50 ++ examples/tts/convert_mimi_to_gguf.py | 191 +++++++ examples/tts/mimi.cpp | 770 +++++++++++++++++++++++++++ 7 files changed, 1068 insertions(+) create mode 100644 examples/tts/README-mimi.md create mode 100644 examples/tts/convert_mimi_to_gguf.py create mode 100644 examples/tts/mimi.cpp diff --git a/.gitignore b/.gitignore index 2c67ad7f7c6..41fe1f31271 100644 --- a/.gitignore +++ b/.gitignore @@ -107,6 +107,7 @@ examples/server/*.gz.hpp !examples/*/*/*.kts !examples/sycl/*.bat !examples/sycl/*.sh +/*.wav # Server Web UI temporary files node_modules diff --git a/common/common.cpp b/common/common.cpp index 18ffb4e738a..30870980a14 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2055,3 +2055,31 @@ common_grammar_trigger common_grammar_trigger::from_json(const json & in) { } return out; } + +// +// Audio utils +// + +bool save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { + std::ofstream file(fname, std::ios::binary); + if (!file) { + LOG_ERR("%s: Failed to open file '%s' for writing.\n", __func__, fname.c_str()); + return false; + } + + wav_header header; + header.sample_rate = sample_rate; + header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); + header.block_align = header.num_channels * (header.bits_per_sample / 8); + header.data_size = data.size() * (header.bits_per_sample / 8); + header.chunk_size = 36 + header.data_size; + + file.write(reinterpret_cast(&header), sizeof(header)); + + for (const auto & sample : data) { + int16_t pcm_sample = static_cast(std::clamp(sample * 32767.0, -32768.0, 32767.0)); + file.write(reinterpret_cast(&pcm_sample), sizeof(pcm_sample)); + 
} + + return file.good(); +} diff --git a/common/common.h b/common/common.h index 1c0f1997749..0c676931492 100644 --- a/common/common.h +++ b/common/common.h @@ -683,3 +683,25 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count"; const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; } + +// +// Audio utils +// + +struct wav_header { + char riff[4] = {'R', 'I', 'F', 'F'}; + uint32_t chunk_size; + char wave[4] = {'W', 'A', 'V', 'E'}; + char fmt[4] = {'f', 'm', 't', ' '}; + uint32_t fmt_chunk_size = 16; + uint16_t audio_format = 1; // PCM + uint16_t num_channels = 1; // Mono + uint32_t sample_rate; + uint32_t byte_rate; + uint16_t block_align; + uint16_t bits_per_sample = 16; + char data[4] = {'d', 'a', 't', 'a'}; + uint32_t data_size; +}; + +bool save_wav16(const std::string & fname, const std::vector & data, int sample_rate); diff --git a/examples/tts/CMakeLists.txt b/examples/tts/CMakeLists.txt index c72bd814c3b..f76d834b18f 100644 --- a/examples/tts/CMakeLists.txt +++ b/examples/tts/CMakeLists.txt @@ -3,3 +3,9 @@ add_executable(${TARGET} tts.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) + +set(TARGET llama-mimi) +add_executable(${TARGET} mimi.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/tts/README-mimi.md b/examples/tts/README-mimi.md new file mode 100644 index 00000000000..b46f5f77b95 --- /dev/null +++ b/examples/tts/README-mimi.md @@ -0,0 +1,50 @@ +# llama.cpp/example/mimi + +This demonstrates running [Kyutai's Mimi](https://huggingface.co/kyutai/mimi) model via GGML. 
+ +## Quickstart + +Convert model to GGUF (no need to download, the script will automatically download the `safetensors` file) + +```sh +python examples/tts/convert_mimi_to_gguf.py + +# output file: kyutai-mimi.gguf + +# optionally, use q8_0 quantization for faster speed +python examples/tts/convert_mimi_to_gguf.py --outtype q8_0 +``` + +Then compile, run it: + +```sh +cmake --build build -j --target llama-mimi + +./build/bin/llama-mimi kyutai-mimi.gguf codes.txt + +# output: output.wav + +# alternatively, use "dummy1" to get a "hey hello there" sample output file +./build/bin/llama-mimi kyutai-mimi.gguf dummy1 +``` + +Example of code file (one code per line): + +``` +1263 +1597 +1596 +1477 +1540 +1720 +1433 +118 +1066 +1968 +1096 +232 +418 +566 +1653 +2010 +``` diff --git a/examples/tts/convert_mimi_to_gguf.py b/examples/tts/convert_mimi_to_gguf.py new file mode 100644 index 00000000000..5b44ef62103 --- /dev/null +++ b/examples/tts/convert_mimi_to_gguf.py @@ -0,0 +1,191 @@ +import gguf +import argparse +import logging +import torch +from typing import Union +from pathlib import Path +from torch import Tensor +from transformers import MimiModel + +logger = logging.getLogger("mimi") + + +class MimiModelConverter: + mimi_model: MimiModel + gguf_writer: gguf.GGUFWriter + fname_out: Path + ftype: gguf.LlamaFileType + + def __init__(self, + pretrained_model_name_or_path: Union[Path, str], + fname_out: Path, + ftype: gguf.LlamaFileType, + is_big_endian: bool,): + self.mimi_model = MimiModel.from_pretrained(pretrained_model_name_or_path) + self.fname_out = fname_out + self.ftype = ftype + endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + self.gguf_writer = gguf.GGUFWriter( + path=None, + arch="if you see this, you are using the wrong file", + endianess=endianess) + + assert self.mimi_model.config.architectures[0] == "MimiModel" + + # load tensors + for name, data_torch in self.mimi_model.state_dict().items(): + # convert any unsupported data 
types to float32 + old_dtype = data_torch.dtype + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + self.add_tensor(name, data_torch, old_dtype) + + def add_tensor(self, name: str, data_torch: Tensor, old_dtype: torch.dtype): + is_1d = len(data_torch.shape) == 1 + is_bias = ".bias" in name + can_quantize = not is_1d and not is_bias + data_qtype = gguf.GGMLQuantizationType.F32 + + n_head = self.mimi_model.config.num_attention_heads + n_kv_head = self.mimi_model.config.num_key_value_heads + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = self.undo_permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = self.undo_permute(data_torch, n_head, n_kv_head) + + # process codebook + if ".codebook.initialized" in name: + # "initialized" tensor + state_dict = self.mimi_model.state_dict() + embed_sum = state_dict[name.replace(".initialized", ".embed_sum")] + cluster_usage = state_dict[name.replace(".initialized", ".cluster_usage")] + # see modeling_mimi.py --> MimiEuclideanCodebook + data_torch = embed_sum / cluster_usage.clamp(min=self.mimi_model.config.norm_eps)[:, None] + name = name.replace(".initialized", "") + + # ignore processed tensors + if ".cluster_usage" in name or ".embed_sum" in name: + return + + # transpose some tensors + if ".conv.bias" in name: + data_torch = data_torch.view((1, data_torch.shape[0])) + data_torch = data_torch.transpose(0, 1) + + # change view 3d to 2d + if "quantizer" in name and "_proj." 
in name: + assert data_torch.shape[2] == 1 + data_torch = data_torch.view((data_torch.shape[0], data_torch.shape[1])) + + # shorten name, otherwise it will be too long for ggml to read + name = name.replace("_residual_vector_quantizer", "_rvq") + + if can_quantize: + if self.ftype == gguf.LlamaFileType.ALL_F32: + data_qtype = gguf.GGMLQuantizationType.F32 + elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: + data_qtype = gguf.GGMLQuantizationType.F16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data_qtype = gguf.GGMLQuantizationType.BF16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + data_qtype = gguf.GGMLQuantizationType.Q8_0 + else: + raise ValueError(f"Unsupported file type: {self.ftype}") + + # Conv kernels are always F16 + if ".conv.weight" in name: + data_qtype = gguf.GGMLQuantizationType.F16 + + data = data_torch.numpy() + + try: + data = gguf.quants.quantize(data, data_qtype) + except Exception as e: + logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16") + data_qtype = gguf.GGMLQuantizationType.F16 + data = gguf.quants.quantize(data, data_qtype) + + # reverse shape to make it similar to the internal ggml dimension order + shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}" + logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + + self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype) + + def write(self): + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) + self.gguf_writer.close() + + @staticmethod + def undo_permute(weights: Tensor, n_head: int, n_head_kv: int): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + +def parse_args() -> argparse.Namespace: + parser = 
argparse.ArgumentParser( + description="Convert Mimi safetensors model to GGUF",) + parser.add_argument( + "--outfile", type=Path, default="kyutai-mimi.gguf", + help="path to write to", + ) + parser.add_argument( + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16", + help="output format", + ) + parser.add_argument( + "--bigendian", action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "model", type=Path, + help="directory or model ID containing model file (if model ID is specified, download from Hugging Face hub)", + nargs="?", + default="kyutai/mimi", + ) + parser.add_argument( + "--verbose", action="store_true", + help="increase output verbosity", + ) + + args = parser.parse_args() + if args.model is None: + parser.error("the following arguments are required: model") + return args + + +def main() -> None: + args = parse_args() + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + dir_model = args.model + + ftype_map: dict[str, gguf.LlamaFileType] = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + "bf16": gguf.LlamaFileType.MOSTLY_BF16, + "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + } + + logger.info(f"Loading model: {dir_model}") + + with torch.inference_mode(): + converter = MimiModelConverter( + pretrained_model_name_or_path=dir_model, + fname_out=args.outfile, + ftype=ftype_map[args.outtype], + is_big_endian=args.bigendian, + ) + converter.write() + + +if __name__ == '__main__': + main() + diff --git a/examples/tts/mimi.cpp b/examples/tts/mimi.cpp new file mode 100644 index 00000000000..2c5833faa27 --- /dev/null +++ b/examples/tts/mimi.cpp @@ -0,0 +1,770 @@ +#include "ggml.h" +#include "ggml-cpp.h" +#include "ggml-cpu.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "gguf.h" + +#include "common.h" + +#include +#include +#include +#include +#include +#include + +/** + * 
Implementation of Kyutai's Mimi model using GGML. + * Based on this research: https://github.com/ngxson/ggml-easy/blob/master/demo/kyutai-mimi.cpp + * + * NOTE: only decoder is working for now. + * + * Background: + * - The audio codes can be generated using any Mimi-based model, for example: Moshi, Hibiki, Sesame, etc + * - Audio codes must be in the order: (1 semantic component, 31 acoustic components) repeated N times + * + * How it works? + * 1. Audio code passed to RVQ (mimi_residual_vector_quantizer) to get the latent code + * 2. The latent code is passed to a mimi_conv_transpose_1d (depthwise) to upscale + * 3. The upscaled code is passed to transformer, it converts N frames to N frames + * 4. The output embeddings is then passed to SEANet (mimi_encoder_decoder) to get the final waveform + * 5. Waveform is written to a file + */ + +// copied from https://huggingface.co/kyutai/mimi/blob/main/config.json +struct mimi_config_t { + bool causal = true; + int max_position_embeddings = 8000; + int num_hidden_layers = 8; + int n_embd = 512; + int n_ffn = 2048; + int n_head = 8; + int n_head_kv = 8; + int n_rot = 64; + float norm_eps = 1e-5; + float rope_theta = 10000.0f; + int sliding_window = 250; + std::array upsampling_ratio = {8, 6, 5, 4}; + std::array downsampling_ratio = {4, 5, 6, 8}; // reverse of upsampling_ratio + // vector quantizer + float frame_rate = 12.5; + int audio_channels = 1; + int codebook_size = 2048; + int codebook_dim = 256; + int n_semantic_components = 1; + int n_acoustic_components = 31; + // decode + float trim_right_ratio = 1.0f; +} mimi_config; + +// Adapted from https://github.com/ngxson/ggml-easy/blob/master/ggml-easy.h +struct mimi_ggml_ctx { + gguf_context * ctx_gguf = nullptr; + ggml_context * ctx_data = nullptr; + ggml_context * ctx_gf = nullptr; + + // CPU-only for now, as many kernels are missing and we actually get less performance with GPU + ggml_backend_t backend = nullptr; + ggml_backend_buffer_t buf = nullptr; + 
ggml_backend_sched_ptr sched; + + ggml_cgraph * gf = nullptr; + std::vector buf_compute_meta; + int max_nodes = 16 * 1024; + + std::unordered_map tensors; + + mimi_ggml_ctx() { + backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + auto buft = ggml_backend_get_default_buffer_type(backend); + sched.reset( + ggml_backend_sched_new(&backend, &buft, 1, max_nodes, false) + ); + buf_compute_meta.resize(max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); + } + + void load_gguf(const char * fname) { + ggml_context * meta = nullptr; + + gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &meta, + }; + + ctx_gguf = gguf_init_from_file(fname, params); + + // load tensors + const int n_tensors = gguf_get_n_tensors(ctx_gguf); + + std::vector read_buf; + ggml_init_params ggml_params = { + /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ctx_data = ggml_init(ggml_params); + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + ggml_free(meta); + throw std::runtime_error("cannot open model file for loading tensors"); + } + + // add tensors to context + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf, i); + ggml_tensor * t = ggml_get_tensor(meta, name); + ggml_tensor * cur = ggml_dup_tensor(ctx_data, t); + ggml_set_name(cur, name); + tensors.insert({name, cur}); + } + + // alloc memory and offload data + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_data, buft); + ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf, i); + ggml_tensor * cur = ggml_get_tensor(ctx_data, name); + const size_t offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i); + // printf("%s: Loading tensor \"%s\"\n", __func__, 
name); + fin.seekg(offset, std::ios::beg); + if (!fin) { + ggml_free(meta); + throw std::runtime_error(string_format("failed to seek for tensor: %s", name)); + } + int num_bytes = ggml_nbytes(cur); + if (ggml_backend_buft_is_host(buft)) { + // for the CPU and Metal backend, we can read directly into the tensor + fin.read(reinterpret_cast(cur->data), num_bytes); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(num_bytes); + fin.read(reinterpret_cast(read_buf.data()), num_bytes); + ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); + } + } + printf("%s: Loaded %d tensors from %s\n", __func__, n_tensors, fname); + fin.close(); + + ggml_free(meta); + } + + /** + * Build a cgraph using the given builder function. + * + * The built cgraph will be stored in `ctx.gf` + */ + void build_graph(std::function builder_fn) { + ggml_free(ctx_gf); + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ctx_gf = ggml_init(params); + ggml_backend_sched_reset(sched.get()); + gf = ggml_new_graph_custom(ctx_gf, max_nodes, false); + + builder_fn(ctx_gf, gf); + ggml_backend_sched_alloc_graph(sched.get(), gf); + } + + ggml_status compute() { + ggml_status status = ggml_backend_sched_graph_compute(sched.get(), gf); + return status; + } + + void set_tensor_data(const std::string & name, const void * data) { + ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); + if (!t) { + throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); + } + ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); + } + + std::pair> get_tensor_data(const std::string & name) { + ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); + if (!t) { + throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); + } + std::vector data(ggml_nbytes(t)); + ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); + return 
std::make_pair(t, data); + } + + ggml_tensor * get_weight(const char *fmt, ...) { + std::vector str(128); + va_list va; + va_start(va, fmt); + vsnprintf(str.data(), 128, fmt, va); + va_end(va); + auto it = tensors.find(str.data()); + if (it == tensors.end()) { + throw std::runtime_error(string_format("weight tensor not found: %s", str.data())); + } + return it->second; + } + + ~mimi_ggml_ctx() { + ggml_free(ctx_data); + gguf_free(ctx_gguf); + ggml_backend_buffer_free(buf); + } +}; + +/////////////////////////////////////////////////////////////////////////// +// extension to ggml.h +// TODO: add these ops to the library (ofc with a more optimized kernel) + + +// mode: (0) constant, (1) reflect, (2) replicate, (3) circular +// value is only used in "constant" +// only "constant" with 0.0f and "replicate" are implemented here +static ggml_tensor * ggml_pad_ext(ggml_context * ctx0, ggml_tensor * x, int mode, + int64_t pad_left, int64_t pad_right, float value = 0.0f) { + GGML_ASSERT(value == 0.0f); // we can technically use ggml_arange, but for simplication we only support 0.0f + GGML_ASSERT(mode == 0 || mode == 2); + if (pad_left > 0) { + ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_left, x->ne[1]); + if (mode == 0) { + tmp = ggml_scale(ctx0, tmp, value); + } else if (mode == 2) { + ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], 0); // get first column + tmp = ggml_repeat(ctx0, elem, tmp); + } + x = ggml_concat(ctx0, tmp, x, 0); + } + if (pad_right > 0) { + ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_right, x->ne[1]); + if (mode == 0) { + tmp = ggml_scale(ctx0, tmp, value); + } else if (mode == 2) { + int64_t last = x->ne[0] - 1; + ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], last * ggml_element_size(x)); // get last column + tmp = ggml_repeat(ctx0, elem, tmp); + } + x = ggml_concat(ctx0, x, tmp, 0); + } + return x; +} + + + + +/////////////////////////////////////////////////////////////////////////// 
+// MimiConv and MimiConvTranspose + +static int64_t div_ceil(int64_t a, int64_t b) { + return a / b + (a % b ? 1 : 0); +} + +static ggml_tensor * mimi_conv_1d(ggml_context * ctx0, ggml_tensor * x, + ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool pad_zero = true) { + int64_t kernel_size = (kernel->ne[0] - 1) * dilation + 1; + int64_t p_total = kernel_size - stride; // padding total + int64_t p_half = p_total / 2; + + int64_t n_frames = div_ceil(x->ne[0] - kernel_size + p_total, stride); + int64_t ideal_len = n_frames * stride + kernel_size - p_total; + int64_t p_extra = ideal_len - x->ne[0]; + + int64_t p_right = (mimi_config.causal ? 0 : p_half) + p_extra; + int64_t p_left = p_total - (mimi_config.causal ? 0 : p_half); + + x = ggml_pad_ext(ctx0, x, pad_zero ? 0 : 2, p_left, p_right); + + x = ggml_conv_1d(ctx0, kernel, x, stride, 0, dilation); + if (bias) { + x = ggml_add(ctx0, x, bias); + } + ggml_set_name(x, "mimi_conv_1d"); + return x; +} + +static ggml_tensor * mimi_conv_transpose_1d(ggml_context * ctx0, ggml_tensor * x, + ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool depthwise) { + GGML_ASSERT(x->ne[1] == kernel->ne[2]); + int64_t n_rows = x->ne[1]; + int64_t kernel_size = kernel->ne[0]; + int64_t p_total = kernel_size - stride; // padding total + + int64_t p_right = mimi_config.causal + ? 
(float)p_total / mimi_config.trim_right_ratio + : p_total / 2; + int64_t p_left = p_total - p_right; + + ggml_tensor * out = nullptr; + + if (depthwise) { + for (int64_t ir = 0; ir < n_rows; ir++) { + ggml_tensor * row = ggml_view_1d(ctx0, x, + x->ne[0], ir*x->ne[0]*ggml_element_size(x)); + ggml_tensor * krn = ggml_view_1d(ctx0, kernel, + kernel->ne[0], ir*kernel->ne[0]*ggml_element_size(kernel)); + row = ggml_conv_transpose_1d(ctx0, krn, row, stride, 0, dilation); + // unpad (remove p_right and p_left columns) + row = ggml_view_1d(ctx0, row, row->ne[0] - p_total, p_left*ggml_element_size(row)); + + // TODO: concat can be slow, we should use ggml_view_1d/ggml_cpy to avoid realloc + out = out ? ggml_concat(ctx0, out, row, 1) : row; + } + + } else { + out = ggml_conv_transpose_1d(ctx0, kernel, x, stride, 0, dilation); + // unpad + out = ggml_view_2d(ctx0, out, + out->ne[0] - p_total, out->ne[1], + out->nb[1], p_left*ggml_element_size(out)); + } + + if (bias) { + out = ggml_add(ctx0, out, bias); + } + + return out; +} + + + +/////////////////////////////////////////////////////////////////////////// + +// based on MimiEncoder +// SEANet encoder as used by Mimi. 
+struct mimi_encoder_decoder { + mimi_ggml_ctx & ctx; + struct layer { + bool is_elu = false; + bool is_resnet = false; + bool is_transposed_conv = false; + ggml_tensor * conv_0_w; + ggml_tensor * conv_0_b; + ggml_tensor * conv_1_w; + ggml_tensor * conv_1_b; + int stride = 1; + }; + std::vector layers; + + std::array repeated_pattern = {1, 4, 7, 10}; + + mimi_encoder_decoder(mimi_ggml_ctx & ctx): ctx(ctx) { + layers.push_back({ + .conv_0_w = ctx.get_weight("decoder.layers.0.conv.weight"), + .conv_0_b = ctx.get_weight("decoder.layers.0.conv.bias"), + }); + for (int i = 0; i < (int)repeated_pattern.size(); ++i) { + int i_start = repeated_pattern[i]; + // upsampling layers + layers.push_back({ + .is_elu = true, // layer (i_start) + }); + layers.push_back({ + .conv_0_w = ctx.get_weight("decoder.layers.%d.conv.weight", i_start + 1), + .conv_0_b = ctx.get_weight("decoder.layers.%d.conv.bias", i_start + 1), + .stride = mimi_config.upsampling_ratio[i], + .is_transposed_conv = true, + }); + // residual layers + layers.push_back({ + .is_resnet = true, + .conv_0_w = ctx.get_weight("decoder.layers.%d.block.1.conv.weight", i_start + 2), + .conv_0_b = ctx.get_weight("decoder.layers.%d.block.1.conv.bias", i_start + 2), + .conv_1_w = ctx.get_weight("decoder.layers.%d.block.3.conv.weight", i_start + 2), + .conv_1_b = ctx.get_weight("decoder.layers.%d.block.3.conv.bias", i_start + 2), + }); + } + layers.push_back({ + .is_elu = true, // layer 13 + }); + layers.push_back({ + .conv_0_w = ctx.get_weight("decoder.layers.14.conv.weight"), + .conv_0_b = ctx.get_weight("decoder.layers.14.conv.bias"), + }); + } + + ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input) { + ggml_tensor * x = input; + + for (auto & layer : layers) { + if (layer.is_elu) { + x = ggml_elu(ctx0, x); + } else if (layer.is_resnet) { + ggml_tensor * residual = x; + x = ggml_elu(ctx0, x); + x = mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, 1, 1); + x = ggml_elu(ctx0, x); + x = mimi_conv_1d(ctx0, x, 
layer.conv_1_w, layer.conv_1_b, 1, 1); + x = ggml_add(ctx0, x, residual); + } else { + x = layer.is_transposed_conv + ? mimi_conv_transpose_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1, false) + : mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1); + } + } + + return x; + } +}; + +struct mimi_transformer { + struct layer { + ggml_tensor * inp_norm_w; + ggml_tensor * inp_norm_b; + + ggml_tensor * attn_q; + ggml_tensor * attn_k; + ggml_tensor * attn_v; + ggml_tensor * attn_o; + ggml_tensor * attn_post_norm_w; + ggml_tensor * attn_post_norm_b; + ggml_tensor * attn_layer_scale; + + ggml_tensor * ffn_up; + ggml_tensor * ffn_down; + ggml_tensor * mlp_layer_scale; + }; + std::vector layers; + + mimi_transformer(mimi_ggml_ctx & ctx, const char * prefix, int n_layers) { + for (int il = 0; il < n_layers; il++) { + layers.push_back({ + .inp_norm_w = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.weight", prefix, il), + .inp_norm_b = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.bias", prefix, il), + + .attn_q = ctx.get_weight("%s_transformer.layers.%d.self_attn.q_proj.weight", prefix, il), + .attn_k = ctx.get_weight("%s_transformer.layers.%d.self_attn.k_proj.weight", prefix, il), + .attn_v = ctx.get_weight("%s_transformer.layers.%d.self_attn.v_proj.weight", prefix, il), + .attn_o = ctx.get_weight("%s_transformer.layers.%d.self_attn.o_proj.weight", prefix, il), + .attn_post_norm_w = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.weight", prefix, il), + .attn_post_norm_b = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.bias", prefix, il), + .attn_layer_scale = ctx.get_weight("%s_transformer.layers.%d.self_attn_layer_scale.scale", prefix, il), + + .ffn_up = ctx.get_weight("%s_transformer.layers.%d.mlp.fc1.weight", prefix, il), + .ffn_down = ctx.get_weight("%s_transformer.layers.%d.mlp.fc2.weight", prefix, il), + .mlp_layer_scale = 
ctx.get_weight("%s_transformer.layers.%d.mlp_layer_scale.scale", prefix, il), + }); + } + } + + ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input, ggml_tensor * inp_pos) { + int n_tokens = input->ne[1]; + ggml_tensor * x = input; + + auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) { + x = ggml_norm(ctx0, x, mimi_config.norm_eps); + x = ggml_mul(ctx0, x, w); + x = ggml_add(ctx0, x, b); + return x; + }; + + ggml_tensor * residual = input; + + for (auto & layer : layers) { + residual = x; + + // input layer norm + x = layer_norm(x, layer.inp_norm_w, layer.inp_norm_b); + + // self attention + { + ggml_tensor * q = ggml_mul_mat(ctx0, layer.attn_q, x); + ggml_tensor * k = ggml_mul_mat(ctx0, layer.attn_k, x); + ggml_tensor * v = ggml_mul_mat(ctx0, layer.attn_v, x); + + int n_embd_head = mimi_config.n_embd / mimi_config.n_head; + q = ggml_reshape_3d(ctx0, q, n_embd_head, mimi_config.n_head, n_tokens); + k = ggml_reshape_3d(ctx0, k, n_embd_head, mimi_config.n_head_kv, n_tokens); + v = ggml_reshape_3d(ctx0, v, n_embd_head, mimi_config.n_head_kv, n_tokens); + + int n_rot = n_embd_head; + q = ggml_rope_inplace(ctx0, q, inp_pos, n_rot, 0); + q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3)); + + k = ggml_rope_inplace(ctx0, k, inp_pos, n_rot, 0); + k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3)); + + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); // mimic behavior of llama.cpp + kq = ggml_scale_inplace(ctx0, kq, 1.0f / std::sqrt(n_embd_head)); + ggml_tensor * kq_masked = ggml_diag_mask_inf_inplace(ctx0, kq, n_tokens); + kq = ggml_soft_max_inplace(ctx0, kq_masked); + + v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3)); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + kqv = ggml_reshape_3d(ctx0, kqv, n_embd_head, n_tokens, mimi_config.n_head); + kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + kqv = ggml_cont_2d(ctx0, kqv, mimi_config.n_embd, n_tokens); + + x = ggml_mul_mat(ctx0, 
layer.attn_o, kqv); + } + + // residual + x = ggml_mul(ctx0, x, layer.attn_layer_scale); + x = ggml_add(ctx0, x, residual); + + residual = x; + x = layer_norm(x, layer.attn_post_norm_w, layer.attn_post_norm_b); + + // mlp + { + x = ggml_mul_mat(ctx0, layer.ffn_up, x); + x = ggml_gelu(ctx0, x); + x = ggml_mul_mat(ctx0, layer.ffn_down, x); + } + + // residual + x = ggml_mul(ctx0, x, layer.mlp_layer_scale); + x = ggml_add(ctx0, x, residual); + } + + return x; + } +}; + +struct mimi_residual_vector_quantizer { + struct component { + ggml_tensor * codebook; + }; + + ggml_tensor * semantic_inp_proj; + std::vector semantic_components; + ggml_tensor * semantic_out_proj; + + ggml_tensor * acoustic_inp_proj; + std::vector acoustic_components; + ggml_tensor * acoustic_out_proj; + + mimi_residual_vector_quantizer(mimi_ggml_ctx & ctx) { + semantic_inp_proj = ctx.get_weight("quantizer.semantic_rvq.input_proj.weight"); + semantic_out_proj = ctx.get_weight("quantizer.semantic_rvq.output_proj.weight"); + for (int i = 0; i < mimi_config.n_semantic_components; i++) { + semantic_components.push_back({ + .codebook = ctx.get_weight("quantizer.semantic_rvq.layers.%d.codebook", i), + }); + } + acoustic_inp_proj = ctx.get_weight("quantizer.acoustic_rvq.input_proj.weight"); + acoustic_out_proj = ctx.get_weight("quantizer.acoustic_rvq.output_proj.weight"); + for (int i = 0; i < mimi_config.n_acoustic_components; i++) { + acoustic_components.push_back({ + .codebook = ctx.get_weight("quantizer.acoustic_rvq.layers.%d.codebook", i), + }); + } + } + + // the input has shape [n_codes, n_codes_per_embd] + // first row is semantic, the rest are acoustic + // example: [ [semantic], [acoustic1], [acoustic2], ... 
] + ggml_tensor * decode(ggml_context * ctx0, ggml_tensor * input) { + GGML_ASSERT(input->type == GGML_TYPE_I32); + + size_t n_semantic = semantic_components.size(); + int64_t n_codes_per_embd = (n_semantic + acoustic_components.size()); + int64_t n_codes = input->ne[0] / n_codes_per_embd; + + GGML_ASSERT(input->ne[0] % n_codes_per_embd == 0); + + ggml_tensor * out_s = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes); + ggml_tensor * out_a = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes); + out_s = ggml_scale(ctx0, out_s, 0.0f); // clear + out_a = ggml_scale(ctx0, out_a, 0.0f); // clear + + for (size_t ir = 0; ir < (size_t)n_codes_per_embd; ir++) { + ggml_tensor * row = ggml_view_1d(ctx0, input, n_codes, ir*n_codes*ggml_element_size(input)); + if (ir < n_semantic) { + // semantic + ggml_tensor * codebook = semantic_components[ir].codebook; + ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row); + out_s = ggml_add(ctx0, out_s, embd); + } else { + // acoustic + ggml_tensor * codebook = acoustic_components[ir-n_semantic].codebook; + ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row); + out_a = ggml_add(ctx0, out_a, embd); + } + } + + out_s = ggml_mul_mat(ctx0, semantic_out_proj, out_s); + out_a = ggml_mul_mat(ctx0, acoustic_out_proj, out_a); + + return ggml_add(ctx0, out_s, out_a); + } +}; + + + +/////////////////////////////////////////////////////////////////////////// +// main program + +int main(int argc, const char ** argv) { + if (argc < 3) { + fprintf(stderr, "Usage: %s model.gguf codes.txt [output.wav]\n", argv[0]); + fprintf(stderr, " Format of codes.txt file: one code per line\n"); + fprintf(stderr, " Replace codes.txt with dummy0 and dummy1 for testing\n"); + fprintf(stderr, " dummy0: using code 1, 2, 3,..., 96, used for logits matching\n"); + fprintf(stderr, " dummy1: using code that will outputs 'hey hello there' sound\n"); + return 1; + } + + const char * model_path = argv[1]; + const char * 
codes_path = argv[2]; + const char * out_path = argc < 4 ? "output.wav" : argv[3]; + + mimi_ggml_ctx ctx; + ctx.load_gguf(model_path); + + // initialize components + mimi_encoder_decoder decoder(ctx); + mimi_transformer transformer(ctx, "decoder", mimi_config.num_hidden_layers); + mimi_residual_vector_quantizer quantizer(ctx); + + // load codes + std::vector codes; + if (strcmp(codes_path, "dummy0") == 0) { + printf("Using dummy0 codes\n"); + codes.resize(32 * 3); // [n_codes = 3, n_codes_per_embd = 32] + int n = 0; + for (int c = 0; c < 32; c++) { + for (int r = 0; r < 3; r++) { + codes[r*32 + c] = n++; + } + } + } else if (strcmp(codes_path, "dummy1") == 0) { + printf("Using dummy1 codes\n"); + codes = { + 1263 ,1597 ,1596 ,1477 ,1540 ,1720 ,1433 ,118 ,1066 ,1968 ,1096 ,232 ,418 ,566 ,1653 ,2010 , + 1029 ,1874 ,77 ,1803 ,123 ,908 ,97 ,1616 ,595 ,1170 ,1654 ,1211 ,1967 ,1579 ,1846 ,1462 , + 1962 ,175 ,1539 ,742 ,1065 ,1226 ,19 ,955 ,528 ,1031 ,659 ,1687 ,1173 ,1802 ,1031 ,1714 , + 1986 ,582 ,367 ,112 ,1245 ,1386 ,759 ,532 ,1472 ,1790 ,802 ,1213 ,1543 ,1916 ,1251 ,309 , + 1962 ,1280 ,1943 ,878 ,1588 ,1989 ,568 ,1463 ,1814 ,1095 ,103 ,583 ,976 ,998 ,871 ,587 , + 247 ,1698 ,1817 ,1024 ,268 ,597 ,45 ,1608 ,1880 ,2047 ,759 ,1578 ,1612 ,49 ,1031 ,1076 , + 927 ,1202 ,1601 ,1719 ,1670 ,412 ,568 ,1838 ,341 ,1265 ,1279 ,830 ,1997 ,32 ,1369 ,1686 , + 1307 ,419 ,1143 ,324 ,325 ,572 ,1597 ,1920 ,795 ,915 ,610 ,2000 ,819 ,718 ,1235 ,282 , + 1912 ,1911 ,141 ,1069 ,1485 ,642 ,1370 ,732 ,284 ,1407 ,1591 ,1002 ,939 ,671 ,951 ,1411 , + 1887 ,460 ,1588 ,1636 ,1312 ,232 ,969 ,1513 ,1336 ,1185 ,1660 ,4 ,926 ,1243 ,1077 ,1379 , + 704 ,85 ,257 ,1302 ,1029 ,1717 ,899 ,1345 ,355 ,1915 ,1007 ,315 ,1283 ,779 ,415 ,335 , + 1848 ,1786 ,469 ,295 ,380 ,1736 ,393 ,765 ,1921 ,836 ,374 ,1649 ,52 ,1633 ,759 ,548 , + 1922 ,47 ,564 ,893 ,34 ,131 ,1063 ,1657 ,474 ,1960 ,1255 ,1275 ,92 ,976 ,1217 ,483 , + 105 ,1746 ,1158 ,1557 ,1001 ,512 ,1668 ,1255 ,1045 ,1596 ,613 ,1272 ,1366 ,1147 ,411 ,831 , + 349 
,692 ,1435 ,2005 ,1465 ,37 ,892 ,95 ,460 ,557 ,1315 ,259 ,1978 ,1838 ,1232 ,2003 , + 1197 ,111 ,1953 ,1297 ,1843 ,671 ,1687 ,91 ,1788 ,1138 ,1896 ,399 ,615 ,758 ,1423 ,365 , + 288 ,632 ,876 ,875 ,1156 ,345 ,1189 ,638 ,1527 ,1981 ,1925 ,333 ,1353 ,473 ,1913 ,1443 , + 1634 ,1373 ,803 ,420 ,192 ,1440 ,1593 ,1925 ,784 ,831 ,552 ,807 ,1942 ,1289 ,612 ,511 , + 968 ,1091 ,30 ,828 ,1611 ,1241 ,1985 ,596 ,273 ,529 ,1182 ,302 ,726 ,1942 ,733 ,1590 , + 1564 ,214 ,1156 ,1722 ,1215 ,1837 ,1729 ,1823 ,672 ,116 ,340 ,396 ,721 ,462 ,1615 ,1380 , + 1459 ,1553 ,636 ,586 ,1148 ,1147 ,1941 ,471 ,876 ,127 ,1938 ,2002 ,1563 ,1121 ,857 ,1179 , + 1983 ,1324 ,1726 ,1445 ,295 ,270 ,896 ,1947 ,1740 ,1211 ,128 ,1266 ,734 ,715 ,1562 ,285 , + 1139 ,304 ,526 ,653 ,1270 ,320 ,484 ,22 ,687 ,1065 ,489 ,827 ,993 ,1654 ,431 ,1552 , + 1418 ,1604 ,455 ,841 ,412 ,848 ,475 ,540 ,1903 ,575 ,584 ,300 ,1079 ,189 ,1481 ,893 , + 228 ,1577 ,429 ,635 ,106 ,1536 ,176 ,348 ,1733 ,1570 ,537 ,1840 ,798 ,410 ,1714 ,1318 , + 487 ,332 ,1109 ,1744 ,283 ,692 ,681 ,1744 ,1008 ,1715 ,1956 ,1066 ,1768 ,1645 ,139 ,1967 , + 897 ,132 ,1010 ,1932 ,277 ,1536 ,1541 ,952 ,19 ,88 ,1663 ,1232 ,1681 ,1878 ,1241 ,1805 , + 89 ,1401 ,544 ,1061 ,1166 ,267 ,1351 ,1998 ,1623 ,1898 ,425 ,1320 ,2006 ,865 ,1981 ,823 , + 1243 ,471 ,485 ,1765 ,391 ,1281 ,1607 ,1418 ,116 ,1702 ,1725 ,512 ,1088 ,1375 ,1994 ,1738 , + 725 ,1471 ,811 ,1251 ,1156 ,1664 ,898 ,1511 ,1872 ,1717 ,444 ,1005 ,254 ,103 ,202 ,1769 , + 1511 ,433 ,284 ,721 ,1741 ,56 ,615 ,916 ,887 ,1253 ,916 ,535 ,1666 ,1713 ,741 ,873 , + 447 ,492 ,388 ,321 ,1860 ,1456 ,1658 ,1682 ,848 ,462 ,2034 ,1368 ,1609 ,1887 ,510 ,1516 , + }; + } else { + std::ifstream fin(codes_path); + if (!fin) { + fprintf(stderr, "Error: cannot open codes file: %s\n", codes_path); + return 1; + } + std::string line; + while (std::getline(fin, line)) { + // Skip empty lines + if (line.empty()) continue; + try { + int code = std::stoi(line); + codes.push_back(code); + } catch (const std::exception& e) { + 
fprintf(stderr, "Error parsing code: %s\n", line.c_str()); + return 1; + } + } + if (codes.empty()) { + fprintf(stderr, "Error: no codes found in file: %s\n", codes_path); + return 1; + } + + printf("Loaded %d codes from %s\n", (int)codes.size(), codes_path); + } + + // build cgraph + int n_pos = -1; + int n_codes = codes.size(); + int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components; + GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiple of n_codes_per_embd"); + + ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf) { + ggml_tensor * inp_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_codes); + ggml_set_name(inp_dec, "inp_dec"); + ggml_set_input(inp_dec); + + // RVQ + ggml_tensor * embeddings = quantizer.decode(ctx_gf, inp_dec); + + // upsample + embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); + embeddings = mimi_conv_transpose_1d(ctx_gf, embeddings, ctx.get_weight("upsample.conv.weight"), nullptr, 2, 1, true); + + // transformer + n_pos = embeddings->ne[0]; + ggml_tensor * pos_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_pos); + ggml_set_name(pos_dec, "pos_dec"); + ggml_set_input(pos_dec); + embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); + embeddings = transformer.forward(ctx_gf, embeddings, pos_dec); + + // SEANET decoder + embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); + ggml_tensor * output = decoder.forward(ctx_gf, embeddings); + + ggml_set_name(output, "output"); + ggml_set_output(output); + ggml_build_forward_expand(gf, output); + }); + + // position data + std::vector pos_data(1024); + for (int i = 0; i < (int)pos_data.size(); i++) { + pos_data[i] = i; + } + ctx.set_tensor_data("pos_dec", pos_data.data()); + + // code data (need to transpose it) + // code [n_codes, n_codes_per_embd] -> [n_codes_per_embd, n_codes] + std::vector codes_t(n_codes_per_embd * n_codes); + for (int i = 0; i < n_codes / n_codes_per_embd; 
i++) { + for (int j = 0; j < n_codes_per_embd; j++) { + int src_idx = i * n_codes_per_embd + j; + int dst_idx = j * (n_codes / n_codes_per_embd) + i; + codes_t[dst_idx] = codes[src_idx]; + } + } + ctx.set_tensor_data("inp_dec", codes_t.data()); + + ctx.compute(); + + auto output = ctx.get_tensor_data("output"); + auto output_tensor = output.first; + auto output_data = output.second; + printf("Output shape: [%lld, %lld]\n", output_tensor->ne[0], output_tensor->ne[1]); + + // print first 20 values + for (int i = 0; i < 20; i++) { + printf("%2.4f, ", ((float *)output_data.data())[i]); + } + printf("...\n"); + + // write to wav + std::vector wav_data(output_data.size() / sizeof(float)); + for (size_t i = 0; i < wav_data.size(); i++) { + wav_data[i] = ((float *)output_data.data())[i]; + } + printf("Writing to %s\n", out_path); + save_wav16(out_path, wav_data, 24000); +} From efeaa5712cb6489b9a704daf670d043e5e758347 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 29 Mar 2025 09:06:00 +0100 Subject: [PATCH 02/10] fix llama-tts --- examples/tts/tts.cpp | 40 ---------------------------------------- 1 file changed, 40 deletions(-) diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 4cc42e1674c..b3461b5d273 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -71,46 +71,6 @@ static void print_usage(int, char ** argv) { LOG("\n"); } -struct wav_header { - char riff[4] = {'R', 'I', 'F', 'F'}; - uint32_t chunk_size; - char wave[4] = {'W', 'A', 'V', 'E'}; - char fmt[4] = {'f', 'm', 't', ' '}; - uint32_t fmt_chunk_size = 16; - uint16_t audio_format = 1; // PCM - uint16_t num_channels = 1; // Mono - uint32_t sample_rate; - uint32_t byte_rate; - uint16_t block_align; - uint16_t bits_per_sample = 16; - char data[4] = {'d', 'a', 't', 'a'}; - uint32_t data_size; -}; - -static bool save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { - std::ofstream file(fname, std::ios::binary); - if (!file) { - LOG_ERR("%s: Failed to open 
file '%s' for writing.\n", __func__, fname.c_str()); - return false; - } - - wav_header header; - header.sample_rate = sample_rate; - header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); - header.block_align = header.num_channels * (header.bits_per_sample / 8); - header.data_size = data.size() * (header.bits_per_sample / 8); - header.chunk_size = 36 + header.data_size; - - file.write(reinterpret_cast(&header), sizeof(header)); - - for (const auto & sample : data) { - int16_t pcm_sample = static_cast(std::clamp(sample * 32767.0, -32768.0, 32767.0)); - file.write(reinterpret_cast(&pcm_sample), sizeof(pcm_sample)); - } - - return file.good(); -} - static void fill_hann_window(int length, bool periodic, float * output) { int offset = -1; if (periodic) { From a98f19918d7e6cff600d1bf0db15ea9cb9bff0da Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 29 Mar 2025 09:51:10 +0100 Subject: [PATCH 03/10] put mimi_model into a shared header --- examples/tts/CMakeLists.txt | 2 +- examples/tts/README-mimi.md | 2 +- examples/tts/mimi-model.cpp | 720 ++++++++++++++++++++++++++++++++++++ examples/tts/mimi-model.h | 32 ++ examples/tts/mimi.cpp | 677 +-------------------------------- 5 files changed, 762 insertions(+), 671 deletions(-) create mode 100644 examples/tts/mimi-model.cpp create mode 100644 examples/tts/mimi-model.h diff --git a/examples/tts/CMakeLists.txt b/examples/tts/CMakeLists.txt index f76d834b18f..39e0a92c5ac 100644 --- a/examples/tts/CMakeLists.txt +++ b/examples/tts/CMakeLists.txt @@ -5,7 +5,7 @@ target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-mimi) -add_executable(${TARGET} mimi.cpp) +add_executable(${TARGET} mimi.cpp mimi-model.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git 
a/examples/tts/README-mimi.md b/examples/tts/README-mimi.md index b46f5f77b95..6576a118291 100644 --- a/examples/tts/README-mimi.md +++ b/examples/tts/README-mimi.md @@ -24,7 +24,7 @@ cmake --build build -j --target llama-mimi # output: output.wav -# alternatively, use "dummy1" to get a "hey hello there" sample output file +# alternatively, use "dummy1" to get a "wah hello there" sample output file ./build/bin/llama-mimi kyutai-mimi.gguf dummy1 ``` diff --git a/examples/tts/mimi-model.cpp b/examples/tts/mimi-model.cpp new file mode 100644 index 00000000000..31ff86256ae --- /dev/null +++ b/examples/tts/mimi-model.cpp @@ -0,0 +1,720 @@ +#include "ggml.h" +#include "ggml-cpp.h" +#include "ggml-cpu.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "gguf.h" + +#include "common.h" +#include "mimi-model.h" + +#include +#include +#include +#include +#include +#include +#include + +/** + * Implementation of Kyutai's Mimi model using GGML. + * Based on this research: https://github.com/ngxson/ggml-easy/blob/master/demo/kyutai-mimi.cpp + * + * NOTE: only decoder is working for now. + * + * Background: + * - The audio codes can be generated using any Mimi-based model, for example: Moshi, Hibiki, Sesame, etc + * - Audio codes must be in the order: (1 semantic component, 31 acoustic components) repeated N times + * + * How it works? + * 1. Audio code passed to RVQ (mimi_residual_vector_quantizer) to get the latent code + * 2. The latent code is passed to a mimi_conv_transpose_1d (depthwise) to upscale + * 3. The upscaled code is passed to transformer, it converts N frames to N frames + * 4. The output embeddings is then passed to SEANet (mimi_encoder_decoder) to get the final waveform + * 5. 
Waveform is written to a file + */ + +// copied from https://huggingface.co/kyutai/mimi/blob/main/config.json +struct mimi_config_t { + bool causal = true; + int sample_rate = 24000; + int max_position_embeddings = 8000; + int num_hidden_layers = 8; + int n_embd = 512; + int n_ffn = 2048; + int n_head = 8; + int n_head_kv = 8; + int n_rot = 64; + float norm_eps = 1e-5; + float rope_theta = 10000.0f; + int sliding_window = 250; + std::array upsampling_ratio = {8, 6, 5, 4}; + std::array downsampling_ratio = {4, 5, 6, 8}; // reverse of upsampling_ratio + // vector quantizer + float frame_rate = 12.5; + int audio_channels = 1; + int codebook_size = 2048; + int codebook_dim = 256; + int n_semantic_components = 1; + int n_acoustic_components = 31; + // decode + float trim_right_ratio = 1.0f; + int n_codes_per_frame = (sliding_window / 2) * (n_semantic_components + n_acoustic_components); +} mimi_config; + +// Adapted from https://github.com/ngxson/ggml-easy/blob/master/ggml-easy.h +struct mimi_ggml_ctx { + gguf_context * ctx_gguf = nullptr; + ggml_context * ctx_data = nullptr; + ggml_context * ctx_gf = nullptr; + + // CPU-only for now, as many kernels are missing and we actually get less performance with GPU + ggml_backend_t backend = nullptr; + ggml_backend_buffer_t buf = nullptr; + ggml_backend_sched_ptr sched; + + ggml_cgraph * gf = nullptr; + std::vector buf_compute_meta; + int max_nodes = 16 * 1024; + + std::unordered_map tensors; + + mimi_ggml_ctx() { + backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + auto buft = ggml_backend_get_default_buffer_type(backend); + sched.reset( + ggml_backend_sched_new(&backend, &buft, 1, max_nodes, false) + ); + buf_compute_meta.resize(max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); + } + + void load_gguf(const char * fname) { + ggml_context * meta = nullptr; + + gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &meta, + }; + + ctx_gguf = gguf_init_from_file(fname, params); + 
+ // load tensors + const int n_tensors = gguf_get_n_tensors(ctx_gguf); + + std::vector read_buf; + ggml_init_params ggml_params = { + /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ctx_data = ggml_init(ggml_params); + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + ggml_free(meta); + throw std::runtime_error("cannot open model file for loading tensors"); + } + + // add tensors to context + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf, i); + ggml_tensor * t = ggml_get_tensor(meta, name); + ggml_tensor * cur = ggml_dup_tensor(ctx_data, t); + ggml_set_name(cur, name); + tensors.insert({name, cur}); + } + + // alloc memory and offload data + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_data, buft); + ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf, i); + ggml_tensor * cur = ggml_get_tensor(ctx_data, name); + const size_t offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i); + // printf("%s: Loading tensor \"%s\"\n", __func__, name); + fin.seekg(offset, std::ios::beg); + if (!fin) { + ggml_free(meta); + throw std::runtime_error(string_format("failed to seek for tensor: %s", name)); + } + int num_bytes = ggml_nbytes(cur); + if (ggml_backend_buft_is_host(buft)) { + // for the CPU and Metal backend, we can read directly into the tensor + fin.read(reinterpret_cast(cur->data), num_bytes); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(num_bytes); + fin.read(reinterpret_cast(read_buf.data()), num_bytes); + ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); + } + } + printf("%s: Loaded %d tensors from %s\n", __func__, n_tensors, fname); + fin.close(); + + 
ggml_free(meta); + } + + /** + * Build a cgraph using the given builder function. + * + * The built cgraph will be stored in `ctx.gf` + */ + void build_graph(std::function builder_fn) { + ggml_free(ctx_gf); + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ctx_gf = ggml_init(params); + ggml_backend_sched_reset(sched.get()); + gf = ggml_new_graph_custom(ctx_gf, max_nodes, false); + + builder_fn(ctx_gf, gf); + ggml_backend_sched_alloc_graph(sched.get(), gf); + } + + ggml_status compute() { + ggml_status status = ggml_backend_sched_graph_compute(sched.get(), gf); + return status; + } + + void set_tensor_data(const std::string & name, const void * data) { + ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); + if (!t) { + throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); + } + ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); + } + + std::pair> get_tensor_data(const std::string & name) { + ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); + if (!t) { + throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); + } + std::vector data(ggml_nbytes(t)); + ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); + return std::make_pair(t, data); + } + + ggml_tensor * get_weight(const char *fmt, ...) 
{ + std::vector str(128); + va_list va; + va_start(va, fmt); + vsnprintf(str.data(), 128, fmt, va); + va_end(va); + auto it = tensors.find(str.data()); + if (it == tensors.end()) { + throw std::runtime_error(string_format("weight tensor not found: %s", str.data())); + } + return it->second; + } + + ~mimi_ggml_ctx() { + ggml_free(ctx_data); + gguf_free(ctx_gguf); + ggml_backend_buffer_free(buf); + } +}; + +/////////////////////////////////////////////////////////////////////////// +// extension to ggml.h +// TODO: add these ops to the library (ofc with a more optimized kernel) + + +// mode: (0) constant, (1) reflect, (2) replicate, (3) circular +// value is only used in "constant" +// only "constant" with 0.0f and "replicate" are implemented here +static ggml_tensor * ggml_pad_ext(ggml_context * ctx0, ggml_tensor * x, int mode, + int64_t pad_left, int64_t pad_right, float value = 0.0f) { + GGML_ASSERT(value == 0.0f); // we can technically use ggml_arange, but for simplication we only support 0.0f + GGML_ASSERT(mode == 0 || mode == 2); + if (pad_left > 0) { + ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_left, x->ne[1]); + if (mode == 0) { + tmp = ggml_scale(ctx0, tmp, value); + } else if (mode == 2) { + ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], 0); // get first column + tmp = ggml_repeat(ctx0, elem, tmp); + } + x = ggml_concat(ctx0, tmp, x, 0); + } + if (pad_right > 0) { + ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_right, x->ne[1]); + if (mode == 0) { + tmp = ggml_scale(ctx0, tmp, value); + } else if (mode == 2) { + int64_t last = x->ne[0] - 1; + ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], last * ggml_element_size(x)); // get last column + tmp = ggml_repeat(ctx0, elem, tmp); + } + x = ggml_concat(ctx0, x, tmp, 0); + } + return x; +} + + + + +/////////////////////////////////////////////////////////////////////////// +// MimiConv and MimiConvTranspose + +static int64_t div_ceil(int64_t a, 
int64_t b) { + return a / b + (a % b ? 1 : 0); +} + +static ggml_tensor * mimi_conv_1d(ggml_context * ctx0, ggml_tensor * x, + ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool pad_zero = true) { + int64_t kernel_size = (kernel->ne[0] - 1) * dilation + 1; + int64_t p_total = kernel_size - stride; // padding total + int64_t p_half = p_total / 2; + + int64_t n_frames = div_ceil(x->ne[0] - kernel_size + p_total, stride); + int64_t ideal_len = n_frames * stride + kernel_size - p_total; + int64_t p_extra = ideal_len - x->ne[0]; + + int64_t p_right = (mimi_config.causal ? 0 : p_half) + p_extra; + int64_t p_left = p_total - (mimi_config.causal ? 0 : p_half); + + x = ggml_pad_ext(ctx0, x, pad_zero ? 0 : 2, p_left, p_right); + + x = ggml_conv_1d(ctx0, kernel, x, stride, 0, dilation); + if (bias) { + x = ggml_add(ctx0, x, bias); + } + ggml_set_name(x, "mimi_conv_1d"); + return x; +} + +static ggml_tensor * mimi_conv_transpose_1d(ggml_context * ctx0, ggml_tensor * x, + ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool depthwise) { + GGML_ASSERT(x->ne[1] == kernel->ne[2]); + int64_t n_rows = x->ne[1]; + int64_t kernel_size = kernel->ne[0]; + int64_t p_total = kernel_size - stride; // padding total + + int64_t p_right = mimi_config.causal + ? (float)p_total / mimi_config.trim_right_ratio + : p_total / 2; + int64_t p_left = p_total - p_right; + + ggml_tensor * out = nullptr; + + if (depthwise) { + for (int64_t ir = 0; ir < n_rows; ir++) { + ggml_tensor * row = ggml_view_1d(ctx0, x, + x->ne[0], ir*x->ne[0]*ggml_element_size(x)); + ggml_tensor * krn = ggml_view_1d(ctx0, kernel, + kernel->ne[0], ir*kernel->ne[0]*ggml_element_size(kernel)); + row = ggml_conv_transpose_1d(ctx0, krn, row, stride, 0, dilation); + // unpad (remove p_right and p_left columns) + row = ggml_view_1d(ctx0, row, row->ne[0] - p_total, p_left*ggml_element_size(row)); + + // TODO: concat can be slow, we should use ggml_view_1d/ggml_cpy to avoid realloc + out = out ? 
ggml_concat(ctx0, out, row, 1) : row; + } + + } else { + out = ggml_conv_transpose_1d(ctx0, kernel, x, stride, 0, dilation); + // unpad + out = ggml_view_2d(ctx0, out, + out->ne[0] - p_total, out->ne[1], + out->nb[1], p_left*ggml_element_size(out)); + } + + if (bias) { + out = ggml_add(ctx0, out, bias); + } + + return out; +} + + + +/////////////////////////////////////////////////////////////////////////// + +// based on MimiEncoder +// SEANet encoder as used by Mimi. +struct mimi_encoder_decoder { + mimi_ggml_ctx & ctx; + struct layer { + bool is_elu = false; + bool is_resnet = false; + bool is_transposed_conv = false; + ggml_tensor * conv_0_w; + ggml_tensor * conv_0_b; + ggml_tensor * conv_1_w; + ggml_tensor * conv_1_b; + int stride = 1; + }; + std::vector layers; + + std::array repeated_pattern = {1, 4, 7, 10}; + + mimi_encoder_decoder(mimi_ggml_ctx & ctx): ctx(ctx) { + layers.push_back({ + .conv_0_w = ctx.get_weight("decoder.layers.0.conv.weight"), + .conv_0_b = ctx.get_weight("decoder.layers.0.conv.bias"), + }); + for (int i = 0; i < (int)repeated_pattern.size(); ++i) { + int i_start = repeated_pattern[i]; + // upsampling layers + layers.push_back({ + .is_elu = true, // layer (i_start) + }); + layers.push_back({ + .conv_0_w = ctx.get_weight("decoder.layers.%d.conv.weight", i_start + 1), + .conv_0_b = ctx.get_weight("decoder.layers.%d.conv.bias", i_start + 1), + .stride = mimi_config.upsampling_ratio[i], + .is_transposed_conv = true, + }); + // residual layers + layers.push_back({ + .is_resnet = true, + .conv_0_w = ctx.get_weight("decoder.layers.%d.block.1.conv.weight", i_start + 2), + .conv_0_b = ctx.get_weight("decoder.layers.%d.block.1.conv.bias", i_start + 2), + .conv_1_w = ctx.get_weight("decoder.layers.%d.block.3.conv.weight", i_start + 2), + .conv_1_b = ctx.get_weight("decoder.layers.%d.block.3.conv.bias", i_start + 2), + }); + } + layers.push_back({ + .is_elu = true, // layer 13 + }); + layers.push_back({ + .conv_0_w = 
ctx.get_weight("decoder.layers.14.conv.weight"), + .conv_0_b = ctx.get_weight("decoder.layers.14.conv.bias"), + }); + } + + ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input) { + ggml_tensor * x = input; + + for (auto & layer : layers) { + if (layer.is_elu) { + x = ggml_elu(ctx0, x); + } else if (layer.is_resnet) { + ggml_tensor * residual = x; + x = ggml_elu(ctx0, x); + x = mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, 1, 1); + x = ggml_elu(ctx0, x); + x = mimi_conv_1d(ctx0, x, layer.conv_1_w, layer.conv_1_b, 1, 1); + x = ggml_add(ctx0, x, residual); + } else { + x = layer.is_transposed_conv + ? mimi_conv_transpose_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1, false) + : mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1); + } + } + + return x; + } +}; + +struct mimi_transformer { + struct layer { + ggml_tensor * inp_norm_w; + ggml_tensor * inp_norm_b; + + ggml_tensor * attn_q; + ggml_tensor * attn_k; + ggml_tensor * attn_v; + ggml_tensor * attn_o; + ggml_tensor * attn_post_norm_w; + ggml_tensor * attn_post_norm_b; + ggml_tensor * attn_layer_scale; + + ggml_tensor * ffn_up; + ggml_tensor * ffn_down; + ggml_tensor * mlp_layer_scale; + }; + std::vector layers; + + mimi_transformer(mimi_ggml_ctx & ctx, const char * prefix, int n_layers) { + for (int il = 0; il < n_layers; il++) { + layers.push_back({ + .inp_norm_w = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.weight", prefix, il), + .inp_norm_b = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.bias", prefix, il), + + .attn_q = ctx.get_weight("%s_transformer.layers.%d.self_attn.q_proj.weight", prefix, il), + .attn_k = ctx.get_weight("%s_transformer.layers.%d.self_attn.k_proj.weight", prefix, il), + .attn_v = ctx.get_weight("%s_transformer.layers.%d.self_attn.v_proj.weight", prefix, il), + .attn_o = ctx.get_weight("%s_transformer.layers.%d.self_attn.o_proj.weight", prefix, il), + .attn_post_norm_w = 
ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.weight", prefix, il), + .attn_post_norm_b = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.bias", prefix, il), + .attn_layer_scale = ctx.get_weight("%s_transformer.layers.%d.self_attn_layer_scale.scale", prefix, il), + + .ffn_up = ctx.get_weight("%s_transformer.layers.%d.mlp.fc1.weight", prefix, il), + .ffn_down = ctx.get_weight("%s_transformer.layers.%d.mlp.fc2.weight", prefix, il), + .mlp_layer_scale = ctx.get_weight("%s_transformer.layers.%d.mlp_layer_scale.scale", prefix, il), + }); + } + } + + ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input, ggml_tensor * inp_pos) { + int n_tokens = input->ne[1]; + ggml_tensor * x = input; + + auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) { + x = ggml_norm(ctx0, x, mimi_config.norm_eps); + x = ggml_mul(ctx0, x, w); + x = ggml_add(ctx0, x, b); + return x; + }; + + ggml_tensor * residual = input; + + for (auto & layer : layers) { + residual = x; + + // input layer norm + x = layer_norm(x, layer.inp_norm_w, layer.inp_norm_b); + + // self attention + { + ggml_tensor * q = ggml_mul_mat(ctx0, layer.attn_q, x); + ggml_tensor * k = ggml_mul_mat(ctx0, layer.attn_k, x); + ggml_tensor * v = ggml_mul_mat(ctx0, layer.attn_v, x); + + int n_embd_head = mimi_config.n_embd / mimi_config.n_head; + q = ggml_reshape_3d(ctx0, q, n_embd_head, mimi_config.n_head, n_tokens); + k = ggml_reshape_3d(ctx0, k, n_embd_head, mimi_config.n_head_kv, n_tokens); + v = ggml_reshape_3d(ctx0, v, n_embd_head, mimi_config.n_head_kv, n_tokens); + + int n_rot = n_embd_head; + q = ggml_rope_inplace(ctx0, q, inp_pos, n_rot, 0); + q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3)); + + k = ggml_rope_inplace(ctx0, k, inp_pos, n_rot, 0); + k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3)); + + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); // mimic behavior of llama.cpp + kq = 
ggml_scale_inplace(ctx0, kq, 1.0f / std::sqrt(n_embd_head)); + ggml_tensor * kq_masked = ggml_diag_mask_inf_inplace(ctx0, kq, n_tokens); + kq = ggml_soft_max_inplace(ctx0, kq_masked); + + v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3)); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + kqv = ggml_reshape_3d(ctx0, kqv, n_embd_head, n_tokens, mimi_config.n_head); + kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + kqv = ggml_cont_2d(ctx0, kqv, mimi_config.n_embd, n_tokens); + + x = ggml_mul_mat(ctx0, layer.attn_o, kqv); + } + + // residual + x = ggml_mul(ctx0, x, layer.attn_layer_scale); + x = ggml_add(ctx0, x, residual); + + residual = x; + x = layer_norm(x, layer.attn_post_norm_w, layer.attn_post_norm_b); + + // mlp + { + x = ggml_mul_mat(ctx0, layer.ffn_up, x); + x = ggml_gelu(ctx0, x); + x = ggml_mul_mat(ctx0, layer.ffn_down, x); + } + + // residual + x = ggml_mul(ctx0, x, layer.mlp_layer_scale); + x = ggml_add(ctx0, x, residual); + } + + return x; + } +}; + +struct mimi_residual_vector_quantizer { + struct component { + ggml_tensor * codebook; + }; + + ggml_tensor * semantic_inp_proj; + std::vector semantic_components; + ggml_tensor * semantic_out_proj; + + ggml_tensor * acoustic_inp_proj; + std::vector acoustic_components; + ggml_tensor * acoustic_out_proj; + + mimi_residual_vector_quantizer(mimi_ggml_ctx & ctx) { + semantic_inp_proj = ctx.get_weight("quantizer.semantic_rvq.input_proj.weight"); + semantic_out_proj = ctx.get_weight("quantizer.semantic_rvq.output_proj.weight"); + for (int i = 0; i < mimi_config.n_semantic_components; i++) { + semantic_components.push_back({ + .codebook = ctx.get_weight("quantizer.semantic_rvq.layers.%d.codebook", i), + }); + } + acoustic_inp_proj = ctx.get_weight("quantizer.acoustic_rvq.input_proj.weight"); + acoustic_out_proj = ctx.get_weight("quantizer.acoustic_rvq.output_proj.weight"); + for (int i = 0; i < mimi_config.n_acoustic_components; i++) { + acoustic_components.push_back({ + .codebook = 
ctx.get_weight("quantizer.acoustic_rvq.layers.%d.codebook", i), + }); + } + } + + // the input has shape [n_codes, n_codes_per_embd] + // first row is semantic, the rest are acoustic + // example: [ [semantic], [acoustic1], [acoustic2], ... ] + ggml_tensor * decode(ggml_context * ctx0, ggml_tensor * input) { + GGML_ASSERT(input->type == GGML_TYPE_I32); + + size_t n_semantic = semantic_components.size(); + int64_t n_codes_per_embd = (n_semantic + acoustic_components.size()); + int64_t n_codes = input->ne[0] / n_codes_per_embd; + + GGML_ASSERT(input->ne[0] % n_codes_per_embd == 0); + + ggml_tensor * out_s = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes); + ggml_tensor * out_a = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes); + out_s = ggml_scale(ctx0, out_s, 0.0f); // clear + out_a = ggml_scale(ctx0, out_a, 0.0f); // clear + + for (size_t ir = 0; ir < (size_t)n_codes_per_embd; ir++) { + ggml_tensor * row = ggml_view_1d(ctx0, input, n_codes, ir*n_codes*ggml_element_size(input)); + if (ir < n_semantic) { + // semantic + ggml_tensor * codebook = semantic_components[ir].codebook; + ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row); + out_s = ggml_add(ctx0, out_s, embd); + } else { + // acoustic + ggml_tensor * codebook = acoustic_components[ir-n_semantic].codebook; + ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row); + out_a = ggml_add(ctx0, out_a, embd); + } + } + + out_s = ggml_mul_mat(ctx0, semantic_out_proj, out_s); + out_a = ggml_mul_mat(ctx0, acoustic_out_proj, out_a); + + return ggml_add(ctx0, out_s, out_a); + } +}; + + +mimi_model::mimi_model(const char * fname, bool verbose) : verbose(verbose) { + ctx.reset(new mimi_ggml_ctx()); + ctx->load_gguf(fname); + + // initialize components + seanet_dec .reset(new mimi_encoder_decoder(*ctx)); + transformer_dec.reset(new mimi_transformer(*ctx, "decoder", mimi_config.num_hidden_layers)); + quantizer .reset(new mimi_residual_vector_quantizer(*ctx)); +} + 
+mimi_model::~mimi_model() { +} + +std::vector mimi_model::decode_frame(const std::vector & codes, int & n_past) { + // build cgraph + int n_pos = -1; + int n_codes = codes.size(); + int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components; + GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiple of n_codes_per_embd"); + + ctx->build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf) { + ggml_tensor * inp_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_codes); + ggml_set_name(inp_dec, "inp_dec"); + ggml_set_input(inp_dec); + + // RVQ + ggml_tensor * embeddings = quantizer->decode(ctx_gf, inp_dec); + + // upsample + embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); + embeddings = mimi_conv_transpose_1d(ctx_gf, embeddings, ctx->get_weight("upsample.conv.weight"), nullptr, 2, 1, true); + + // transformer + n_pos = embeddings->ne[0]; + ggml_tensor * pos_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_pos); + ggml_set_name(pos_dec, "pos_dec"); + ggml_set_input(pos_dec); + embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); + embeddings = transformer_dec->forward(ctx_gf, embeddings, pos_dec); + + // SEANET decoder + embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); + ggml_tensor * output = seanet_dec->forward(ctx_gf, embeddings); + + ggml_set_name(output, "output"); + ggml_set_output(output); + ggml_build_forward_expand(gf, output); + }); + + // position data + GGML_ASSERT(n_pos <= mimi_config.sliding_window); + std::vector pos_data(n_pos); + for (int i = 0; i < (int)pos_data.size(); i++) { + pos_data[i] = i + n_past; + } + n_past += n_pos; + if (verbose) { + printf("%s: n_pos: %d, n_past: %d\n", __func__, n_pos, n_past); + } + ctx->set_tensor_data("pos_dec", pos_data.data()); + + // code data (need to transpose it) + // code [n_codes, n_codes_per_embd] -> [n_codes_per_embd, n_codes] + std::vector codes_t(n_codes_per_embd * n_codes); + for (int i = 0; i 
< n_codes / n_codes_per_embd; i++) { + for (int j = 0; j < n_codes_per_embd; j++) { + int src_idx = i * n_codes_per_embd + j; + int dst_idx = j * (n_codes / n_codes_per_embd) + i; + codes_t[dst_idx] = codes[src_idx]; + } + } + ctx->set_tensor_data("inp_dec", codes_t.data()); + + ctx->compute(); + + auto output = ctx->get_tensor_data("output"); + // auto output_tensor = output.first; + auto output_data = output.second; + // printf("Output shape: [%lld, %lld]\n", output_tensor->ne[0], output_tensor->ne[1]); + + std::vector wav_data(output_data.size() / sizeof(float)); + for (size_t i = 0; i < wav_data.size(); i++) { + wav_data[i] = ((float *)output_data.data())[i]; + } + + return wav_data; +} + +std::vector mimi_model::decode(const std::vector & codes) { + std::vector output; + + if (verbose) { + printf("%s: n_codes: %zu\n", __func__, codes.size()); + } + + int64_t t_start = ggml_time_ms(); + int n_frames = 0; + + int n_past = 0; + for (size_t i = 0; i < codes.size(); i += mimi_config.n_codes_per_frame) { + size_t remaining = std::min((size_t)mimi_config.n_codes_per_frame, codes.size() - i); + std::vector frame(codes.begin() + i, codes.begin() + i + remaining); + + auto wav_data = decode_frame(frame, n_past); + output.insert(output.end(), wav_data.begin(), wav_data.end()); + + n_frames++; + } + + int64_t t_end = ggml_time_ms(); + if (verbose) { + printf("%s: n_frames: %d, time: %" PRId64 "ms, per_frame: %" PRId64 "ms\n", __func__, n_frames, t_end - t_start, (t_end - t_start) / n_frames); + } + + return output; +} + +int mimi_model::get_sample_rate() const { + return mimi_config.sample_rate; +} diff --git a/examples/tts/mimi-model.h b/examples/tts/mimi-model.h new file mode 100644 index 00000000000..d48c19b5476 --- /dev/null +++ b/examples/tts/mimi-model.h @@ -0,0 +1,32 @@ +#pragma once + +#include "ggml.h" +#include +#include + +struct mimi_ggml_ctx; +struct mimi_encoder_decoder; +struct mimi_transformer; +struct mimi_residual_vector_quantizer; + +struct mimi_model { 
+ bool verbose = false; + std::unique_ptr ctx; + + std::unique_ptr seanet_dec; + std::unique_ptr transformer_dec; + std::unique_ptr quantizer; + + mimi_model(const char * fname, bool verbose = false); + ~mimi_model(); + + int get_sample_rate() const; + + std::vector decode(const std::vector & codes); + + // TODO: implement encoding pass + // std::vector encode(const std::vector & wav_data); + +private: + std::vector decode_frame(const std::vector & codes, int & n_past); +}; diff --git a/examples/tts/mimi.cpp b/examples/tts/mimi.cpp index 2c5833faa27..052f546b43a 100644 --- a/examples/tts/mimi.cpp +++ b/examples/tts/mimi.cpp @@ -1,610 +1,17 @@ -#include "ggml.h" -#include "ggml-cpp.h" -#include "ggml-cpu.h" -#include "ggml-alloc.h" -#include "ggml-backend.h" -#include "gguf.h" - #include "common.h" +#include "mimi-model.h" -#include #include -#include #include -#include -#include - -/** - * Implementation of Kyutai's Mimi model using GGML. - * Based on this research: https://github.com/ngxson/ggml-easy/blob/master/demo/kyutai-mimi.cpp - * - * NOTE: only decoder is working for now. - * - * Background: - * - The audio codes can be generated using any Mimi-based model, for example: Moshi, Hibiki, Sesame, etc - * - Audio codes must be in the order: (1 semantic component, 31 acoustic components) repeated N times - * - * How it works? - * 1. Audio code passed to RVQ (mimi_residual_vector_quantizer) to get the latent code - * 2. The latent code is passed to a mimi_conv_transpose_1d (depthwise) to upscale - * 3. The upscaled code is passed to transformer, it converts N frames to N frames - * 4. The output embeddings is then passed to SEANet (mimi_encoder_decoder) to get the final waveform - * 5. 
Waveform is written to a file - */ - -// copied from https://huggingface.co/kyutai/mimi/blob/main/config.json -struct mimi_config_t { - bool causal = true; - int max_position_embeddings = 8000; - int num_hidden_layers = 8; - int n_embd = 512; - int n_ffn = 2048; - int n_head = 8; - int n_head_kv = 8; - int n_rot = 64; - float norm_eps = 1e-5; - float rope_theta = 10000.0f; - int sliding_window = 250; - std::array upsampling_ratio = {8, 6, 5, 4}; - std::array downsampling_ratio = {4, 5, 6, 8}; // reverse of upsampling_ratio - // vector quantizer - float frame_rate = 12.5; - int audio_channels = 1; - int codebook_size = 2048; - int codebook_dim = 256; - int n_semantic_components = 1; - int n_acoustic_components = 31; - // decode - float trim_right_ratio = 1.0f; -} mimi_config; - -// Adapted from https://github.com/ngxson/ggml-easy/blob/master/ggml-easy.h -struct mimi_ggml_ctx { - gguf_context * ctx_gguf = nullptr; - ggml_context * ctx_data = nullptr; - ggml_context * ctx_gf = nullptr; - - // CPU-only for now, as many kernels are missing and we actually get less performance with GPU - ggml_backend_t backend = nullptr; - ggml_backend_buffer_t buf = nullptr; - ggml_backend_sched_ptr sched; - - ggml_cgraph * gf = nullptr; - std::vector buf_compute_meta; - int max_nodes = 16 * 1024; - - std::unordered_map tensors; - - mimi_ggml_ctx() { - backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - auto buft = ggml_backend_get_default_buffer_type(backend); - sched.reset( - ggml_backend_sched_new(&backend, &buft, 1, max_nodes, false) - ); - buf_compute_meta.resize(max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); - } - - void load_gguf(const char * fname) { - ggml_context * meta = nullptr; - - gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &meta, - }; - - ctx_gguf = gguf_init_from_file(fname, params); - - // load tensors - const int n_tensors = gguf_get_n_tensors(ctx_gguf); - - std::vector read_buf; - ggml_init_params 
ggml_params = { - /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - - ctx_data = ggml_init(ggml_params); - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - ggml_free(meta); - throw std::runtime_error("cannot open model file for loading tensors"); - } - - // add tensors to context - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(ctx_gguf, i); - ggml_tensor * t = ggml_get_tensor(meta, name); - ggml_tensor * cur = ggml_dup_tensor(ctx_data, t); - ggml_set_name(cur, name); - tensors.insert({name, cur}); - } - - // alloc memory and offload data - ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); - buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_data, buft); - ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(ctx_gguf, i); - ggml_tensor * cur = ggml_get_tensor(ctx_data, name); - const size_t offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i); - // printf("%s: Loading tensor \"%s\"\n", __func__, name); - fin.seekg(offset, std::ios::beg); - if (!fin) { - ggml_free(meta); - throw std::runtime_error(string_format("failed to seek for tensor: %s", name)); - } - int num_bytes = ggml_nbytes(cur); - if (ggml_backend_buft_is_host(buft)) { - // for the CPU and Metal backend, we can read directly into the tensor - fin.read(reinterpret_cast(cur->data), num_bytes); - } else { - // read into a temporary buffer first, then copy to device memory - read_buf.resize(num_bytes); - fin.read(reinterpret_cast(read_buf.data()), num_bytes); - ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); - } - } - printf("%s: Loaded %d tensors from %s\n", __func__, n_tensors, fname); - fin.close(); - - ggml_free(meta); - } - - /** - * Build a cgraph using the given builder function. 
- * - * The built cgraph will be stored in `ctx.gf` - */ - void build_graph(std::function builder_fn) { - ggml_free(ctx_gf); - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ctx_gf = ggml_init(params); - ggml_backend_sched_reset(sched.get()); - gf = ggml_new_graph_custom(ctx_gf, max_nodes, false); - - builder_fn(ctx_gf, gf); - ggml_backend_sched_alloc_graph(sched.get(), gf); - } - - ggml_status compute() { - ggml_status status = ggml_backend_sched_graph_compute(sched.get(), gf); - return status; - } - - void set_tensor_data(const std::string & name, const void * data) { - ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); - if (!t) { - throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); - } - ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); - } - - std::pair> get_tensor_data(const std::string & name) { - ggml_tensor * t = ggml_get_tensor(ctx_gf, name.c_str()); - if (!t) { - throw std::runtime_error(string_format("tensor not found: %s", name.c_str())); - } - std::vector data(ggml_nbytes(t)); - ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t)); - return std::make_pair(t, data); - } - - ggml_tensor * get_weight(const char *fmt, ...) 
{ - std::vector str(128); - va_list va; - va_start(va, fmt); - vsnprintf(str.data(), 128, fmt, va); - va_end(va); - auto it = tensors.find(str.data()); - if (it == tensors.end()) { - throw std::runtime_error(string_format("weight tensor not found: %s", str.data())); - } - return it->second; - } - - ~mimi_ggml_ctx() { - ggml_free(ctx_data); - gguf_free(ctx_gguf); - ggml_backend_buffer_free(buf); - } -}; - -/////////////////////////////////////////////////////////////////////////// -// extension to ggml.h -// TODO: add these ops to the library (ofc with a more optimized kernel) - - -// mode: (0) constant, (1) reflect, (2) replicate, (3) circular -// value is only used in "constant" -// only "constant" with 0.0f and "replicate" are implemented here -static ggml_tensor * ggml_pad_ext(ggml_context * ctx0, ggml_tensor * x, int mode, - int64_t pad_left, int64_t pad_right, float value = 0.0f) { - GGML_ASSERT(value == 0.0f); // we can technically use ggml_arange, but for simplication we only support 0.0f - GGML_ASSERT(mode == 0 || mode == 2); - if (pad_left > 0) { - ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_left, x->ne[1]); - if (mode == 0) { - tmp = ggml_scale(ctx0, tmp, value); - } else if (mode == 2) { - ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], 0); // get first column - tmp = ggml_repeat(ctx0, elem, tmp); - } - x = ggml_concat(ctx0, tmp, x, 0); - } - if (pad_right > 0) { - ggml_tensor * tmp = ggml_new_tensor_2d(ctx0, x->type, pad_right, x->ne[1]); - if (mode == 0) { - tmp = ggml_scale(ctx0, tmp, value); - } else if (mode == 2) { - int64_t last = x->ne[0] - 1; - ggml_tensor * elem = ggml_view_2d(ctx0, x, 1, x->ne[1], x->nb[1], last * ggml_element_size(x)); // get last column - tmp = ggml_repeat(ctx0, elem, tmp); - } - x = ggml_concat(ctx0, x, tmp, 0); - } - return x; -} - - - - -/////////////////////////////////////////////////////////////////////////// -// MimiConv and MimiConvTranspose - -static int64_t div_ceil(int64_t a, 
int64_t b) { - return a / b + (a % b ? 1 : 0); -} - -static ggml_tensor * mimi_conv_1d(ggml_context * ctx0, ggml_tensor * x, - ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool pad_zero = true) { - int64_t kernel_size = (kernel->ne[0] - 1) * dilation + 1; - int64_t p_total = kernel_size - stride; // padding total - int64_t p_half = p_total / 2; - - int64_t n_frames = div_ceil(x->ne[0] - kernel_size + p_total, stride); - int64_t ideal_len = n_frames * stride + kernel_size - p_total; - int64_t p_extra = ideal_len - x->ne[0]; - - int64_t p_right = (mimi_config.causal ? 0 : p_half) + p_extra; - int64_t p_left = p_total - (mimi_config.causal ? 0 : p_half); - - x = ggml_pad_ext(ctx0, x, pad_zero ? 0 : 2, p_left, p_right); - - x = ggml_conv_1d(ctx0, kernel, x, stride, 0, dilation); - if (bias) { - x = ggml_add(ctx0, x, bias); - } - ggml_set_name(x, "mimi_conv_1d"); - return x; -} - -static ggml_tensor * mimi_conv_transpose_1d(ggml_context * ctx0, ggml_tensor * x, - ggml_tensor * kernel, ggml_tensor * bias, int stride, int dilation, bool depthwise) { - GGML_ASSERT(x->ne[1] == kernel->ne[2]); - int64_t n_rows = x->ne[1]; - int64_t kernel_size = kernel->ne[0]; - int64_t p_total = kernel_size - stride; // padding total - - int64_t p_right = mimi_config.causal - ? (float)p_total / mimi_config.trim_right_ratio - : p_total / 2; - int64_t p_left = p_total - p_right; - - ggml_tensor * out = nullptr; - - if (depthwise) { - for (int64_t ir = 0; ir < n_rows; ir++) { - ggml_tensor * row = ggml_view_1d(ctx0, x, - x->ne[0], ir*x->ne[0]*ggml_element_size(x)); - ggml_tensor * krn = ggml_view_1d(ctx0, kernel, - kernel->ne[0], ir*kernel->ne[0]*ggml_element_size(kernel)); - row = ggml_conv_transpose_1d(ctx0, krn, row, stride, 0, dilation); - // unpad (remove p_right and p_left columns) - row = ggml_view_1d(ctx0, row, row->ne[0] - p_total, p_left*ggml_element_size(row)); - - // TODO: concat can be slow, we should use ggml_view_1d/ggml_cpy to avoid realloc - out = out ? 
ggml_concat(ctx0, out, row, 1) : row; - } - - } else { - out = ggml_conv_transpose_1d(ctx0, kernel, x, stride, 0, dilation); - // unpad - out = ggml_view_2d(ctx0, out, - out->ne[0] - p_total, out->ne[1], - out->nb[1], p_left*ggml_element_size(out)); - } - - if (bias) { - out = ggml_add(ctx0, out, bias); - } - - return out; -} - -/////////////////////////////////////////////////////////////////////////// - -// based on MimiEncoder -// SEANet encoder as used by Mimi. -struct mimi_encoder_decoder { - mimi_ggml_ctx & ctx; - struct layer { - bool is_elu = false; - bool is_resnet = false; - bool is_transposed_conv = false; - ggml_tensor * conv_0_w; - ggml_tensor * conv_0_b; - ggml_tensor * conv_1_w; - ggml_tensor * conv_1_b; - int stride = 1; - }; - std::vector layers; - - std::array repeated_pattern = {1, 4, 7, 10}; - - mimi_encoder_decoder(mimi_ggml_ctx & ctx): ctx(ctx) { - layers.push_back({ - .conv_0_w = ctx.get_weight("decoder.layers.0.conv.weight"), - .conv_0_b = ctx.get_weight("decoder.layers.0.conv.bias"), - }); - for (int i = 0; i < (int)repeated_pattern.size(); ++i) { - int i_start = repeated_pattern[i]; - // upsampling layers - layers.push_back({ - .is_elu = true, // layer (i_start) - }); - layers.push_back({ - .conv_0_w = ctx.get_weight("decoder.layers.%d.conv.weight", i_start + 1), - .conv_0_b = ctx.get_weight("decoder.layers.%d.conv.bias", i_start + 1), - .stride = mimi_config.upsampling_ratio[i], - .is_transposed_conv = true, - }); - // residual layers - layers.push_back({ - .is_resnet = true, - .conv_0_w = ctx.get_weight("decoder.layers.%d.block.1.conv.weight", i_start + 2), - .conv_0_b = ctx.get_weight("decoder.layers.%d.block.1.conv.bias", i_start + 2), - .conv_1_w = ctx.get_weight("decoder.layers.%d.block.3.conv.weight", i_start + 2), - .conv_1_b = ctx.get_weight("decoder.layers.%d.block.3.conv.bias", i_start + 2), - }); - } - layers.push_back({ - .is_elu = true, // layer 13 - }); - layers.push_back({ - .conv_0_w = 
ctx.get_weight("decoder.layers.14.conv.weight"), - .conv_0_b = ctx.get_weight("decoder.layers.14.conv.bias"), - }); - } - - ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input) { - ggml_tensor * x = input; - - for (auto & layer : layers) { - if (layer.is_elu) { - x = ggml_elu(ctx0, x); - } else if (layer.is_resnet) { - ggml_tensor * residual = x; - x = ggml_elu(ctx0, x); - x = mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, 1, 1); - x = ggml_elu(ctx0, x); - x = mimi_conv_1d(ctx0, x, layer.conv_1_w, layer.conv_1_b, 1, 1); - x = ggml_add(ctx0, x, residual); - } else { - x = layer.is_transposed_conv - ? mimi_conv_transpose_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1, false) - : mimi_conv_1d(ctx0, x, layer.conv_0_w, layer.conv_0_b, layer.stride, 1); - } - } - - return x; - } -}; - -struct mimi_transformer { - struct layer { - ggml_tensor * inp_norm_w; - ggml_tensor * inp_norm_b; - - ggml_tensor * attn_q; - ggml_tensor * attn_k; - ggml_tensor * attn_v; - ggml_tensor * attn_o; - ggml_tensor * attn_post_norm_w; - ggml_tensor * attn_post_norm_b; - ggml_tensor * attn_layer_scale; - - ggml_tensor * ffn_up; - ggml_tensor * ffn_down; - ggml_tensor * mlp_layer_scale; - }; - std::vector layers; - - mimi_transformer(mimi_ggml_ctx & ctx, const char * prefix, int n_layers) { - for (int il = 0; il < n_layers; il++) { - layers.push_back({ - .inp_norm_w = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.weight", prefix, il), - .inp_norm_b = ctx.get_weight("%s_transformer.layers.%d.input_layernorm.bias", prefix, il), - - .attn_q = ctx.get_weight("%s_transformer.layers.%d.self_attn.q_proj.weight", prefix, il), - .attn_k = ctx.get_weight("%s_transformer.layers.%d.self_attn.k_proj.weight", prefix, il), - .attn_v = ctx.get_weight("%s_transformer.layers.%d.self_attn.v_proj.weight", prefix, il), - .attn_o = ctx.get_weight("%s_transformer.layers.%d.self_attn.o_proj.weight", prefix, il), - .attn_post_norm_w = 
ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.weight", prefix, il), - .attn_post_norm_b = ctx.get_weight("%s_transformer.layers.%d.post_attention_layernorm.bias", prefix, il), - .attn_layer_scale = ctx.get_weight("%s_transformer.layers.%d.self_attn_layer_scale.scale", prefix, il), - - .ffn_up = ctx.get_weight("%s_transformer.layers.%d.mlp.fc1.weight", prefix, il), - .ffn_down = ctx.get_weight("%s_transformer.layers.%d.mlp.fc2.weight", prefix, il), - .mlp_layer_scale = ctx.get_weight("%s_transformer.layers.%d.mlp_layer_scale.scale", prefix, il), - }); - } - } - - ggml_tensor * forward(ggml_context * ctx0, ggml_tensor * input, ggml_tensor * inp_pos) { - int n_tokens = input->ne[1]; - ggml_tensor * x = input; - - auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) { - x = ggml_norm(ctx0, x, mimi_config.norm_eps); - x = ggml_mul(ctx0, x, w); - x = ggml_add(ctx0, x, b); - return x; - }; - - ggml_tensor * residual = input; - - for (auto & layer : layers) { - residual = x; - - // input layer norm - x = layer_norm(x, layer.inp_norm_w, layer.inp_norm_b); - - // self attention - { - ggml_tensor * q = ggml_mul_mat(ctx0, layer.attn_q, x); - ggml_tensor * k = ggml_mul_mat(ctx0, layer.attn_k, x); - ggml_tensor * v = ggml_mul_mat(ctx0, layer.attn_v, x); - - int n_embd_head = mimi_config.n_embd / mimi_config.n_head; - q = ggml_reshape_3d(ctx0, q, n_embd_head, mimi_config.n_head, n_tokens); - k = ggml_reshape_3d(ctx0, k, n_embd_head, mimi_config.n_head_kv, n_tokens); - v = ggml_reshape_3d(ctx0, v, n_embd_head, mimi_config.n_head_kv, n_tokens); - - int n_rot = n_embd_head; - q = ggml_rope_inplace(ctx0, q, inp_pos, n_rot, 0); - q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3)); - - k = ggml_rope_inplace(ctx0, k, inp_pos, n_rot, 0); - k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3)); - - ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); // mimic behavior of llama.cpp - kq = 
ggml_scale_inplace(ctx0, kq, 1.0f / std::sqrt(n_embd_head)); - ggml_tensor * kq_masked = ggml_diag_mask_inf_inplace(ctx0, kq, n_tokens); - kq = ggml_soft_max_inplace(ctx0, kq_masked); - - v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3)); - - ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - kqv = ggml_reshape_3d(ctx0, kqv, n_embd_head, n_tokens, mimi_config.n_head); - kqv = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - kqv = ggml_cont_2d(ctx0, kqv, mimi_config.n_embd, n_tokens); - - x = ggml_mul_mat(ctx0, layer.attn_o, kqv); - } - - // residual - x = ggml_mul(ctx0, x, layer.attn_layer_scale); - x = ggml_add(ctx0, x, residual); - - residual = x; - x = layer_norm(x, layer.attn_post_norm_w, layer.attn_post_norm_b); - - // mlp - { - x = ggml_mul_mat(ctx0, layer.ffn_up, x); - x = ggml_gelu(ctx0, x); - x = ggml_mul_mat(ctx0, layer.ffn_down, x); - } - - // residual - x = ggml_mul(ctx0, x, layer.mlp_layer_scale); - x = ggml_add(ctx0, x, residual); - } - - return x; - } -}; - -struct mimi_residual_vector_quantizer { - struct component { - ggml_tensor * codebook; - }; - - ggml_tensor * semantic_inp_proj; - std::vector semantic_components; - ggml_tensor * semantic_out_proj; - - ggml_tensor * acoustic_inp_proj; - std::vector acoustic_components; - ggml_tensor * acoustic_out_proj; - - mimi_residual_vector_quantizer(mimi_ggml_ctx & ctx) { - semantic_inp_proj = ctx.get_weight("quantizer.semantic_rvq.input_proj.weight"); - semantic_out_proj = ctx.get_weight("quantizer.semantic_rvq.output_proj.weight"); - for (int i = 0; i < mimi_config.n_semantic_components; i++) { - semantic_components.push_back({ - .codebook = ctx.get_weight("quantizer.semantic_rvq.layers.%d.codebook", i), - }); - } - acoustic_inp_proj = ctx.get_weight("quantizer.acoustic_rvq.input_proj.weight"); - acoustic_out_proj = ctx.get_weight("quantizer.acoustic_rvq.output_proj.weight"); - for (int i = 0; i < mimi_config.n_acoustic_components; i++) { - acoustic_components.push_back({ - .codebook = 
ctx.get_weight("quantizer.acoustic_rvq.layers.%d.codebook", i), - }); - } - } - - // the input has shape [n_codes, n_codes_per_embd] - // first row is semantic, the rest are acoustic - // example: [ [semantic], [acoustic1], [acoustic2], ... ] - ggml_tensor * decode(ggml_context * ctx0, ggml_tensor * input) { - GGML_ASSERT(input->type == GGML_TYPE_I32); - - size_t n_semantic = semantic_components.size(); - int64_t n_codes_per_embd = (n_semantic + acoustic_components.size()); - int64_t n_codes = input->ne[0] / n_codes_per_embd; - - GGML_ASSERT(input->ne[0] % n_codes_per_embd == 0); - - ggml_tensor * out_s = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes); - ggml_tensor * out_a = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, mimi_config.codebook_dim, n_codes); - out_s = ggml_scale(ctx0, out_s, 0.0f); // clear - out_a = ggml_scale(ctx0, out_a, 0.0f); // clear - - for (size_t ir = 0; ir < (size_t)n_codes_per_embd; ir++) { - ggml_tensor * row = ggml_view_1d(ctx0, input, n_codes, ir*n_codes*ggml_element_size(input)); - if (ir < n_semantic) { - // semantic - ggml_tensor * codebook = semantic_components[ir].codebook; - ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row); - out_s = ggml_add(ctx0, out_s, embd); - } else { - // acoustic - ggml_tensor * codebook = acoustic_components[ir-n_semantic].codebook; - ggml_tensor * embd = ggml_get_rows(ctx0, codebook, row); - out_a = ggml_add(ctx0, out_a, embd); - } - } - - out_s = ggml_mul_mat(ctx0, semantic_out_proj, out_s); - out_a = ggml_mul_mat(ctx0, acoustic_out_proj, out_a); - - return ggml_add(ctx0, out_s, out_a); - } -}; - - - -/////////////////////////////////////////////////////////////////////////// -// main program - int main(int argc, const char ** argv) { if (argc < 3) { fprintf(stderr, "Usage: %s model.gguf codes.txt [output.wav]\n", argv[0]); fprintf(stderr, " Format of codes.txt file: one code per line\n"); fprintf(stderr, " Replace codes.txt with dummy0 and dummy1 for testing\n"); 
fprintf(stderr, " dummy0: using code 1, 2, 3,..., 96, used for logits matching\n"); - fprintf(stderr, " dummy1: using code that will outputs 'hey hello there' sound\n"); + fprintf(stderr, " dummy1: using code that will outputs 'wah hello there' sound\n"); return 1; } @@ -612,14 +19,6 @@ int main(int argc, const char ** argv) { const char * codes_path = argv[2]; const char * out_path = argc < 4 ? "output.wav" : argv[3]; - mimi_ggml_ctx ctx; - ctx.load_gguf(model_path); - - // initialize components - mimi_encoder_decoder decoder(ctx); - mimi_transformer transformer(ctx, "decoder", mimi_config.num_hidden_layers); - mimi_residual_vector_quantizer quantizer(ctx); - // load codes std::vector codes; if (strcmp(codes_path, "dummy0") == 0) { @@ -693,78 +92,18 @@ int main(int argc, const char ** argv) { printf("Loaded %d codes from %s\n", (int)codes.size(), codes_path); } - // build cgraph - int n_pos = -1; - int n_codes = codes.size(); - int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components; - GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiple of n_codes_per_embd"); - - ctx.build_graph([&](ggml_context * ctx_gf, ggml_cgraph * gf) { - ggml_tensor * inp_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_codes); - ggml_set_name(inp_dec, "inp_dec"); - ggml_set_input(inp_dec); - - // RVQ - ggml_tensor * embeddings = quantizer.decode(ctx_gf, inp_dec); - - // upsample - embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); - embeddings = mimi_conv_transpose_1d(ctx_gf, embeddings, ctx.get_weight("upsample.conv.weight"), nullptr, 2, 1, true); - - // transformer - n_pos = embeddings->ne[0]; - ggml_tensor * pos_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_pos); - ggml_set_name(pos_dec, "pos_dec"); - ggml_set_input(pos_dec); - embeddings = ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); - embeddings = transformer.forward(ctx_gf, embeddings, pos_dec); - - // SEANET decoder - embeddings = 
ggml_cont(ctx_gf, ggml_transpose(ctx_gf, embeddings)); - ggml_tensor * output = decoder.forward(ctx_gf, embeddings); - - ggml_set_name(output, "output"); - ggml_set_output(output); - ggml_build_forward_expand(gf, output); - }); - - // position data - std::vector pos_data(1024); - for (int i = 0; i < (int)pos_data.size(); i++) { - pos_data[i] = i; - } - ctx.set_tensor_data("pos_dec", pos_data.data()); - - // code data (need to transpose it) - // code [n_codes, n_codes_per_embd] -> [n_codes_per_embd, n_codes] - std::vector codes_t(n_codes_per_embd * n_codes); - for (int i = 0; i < n_codes / n_codes_per_embd; i++) { - for (int j = 0; j < n_codes_per_embd; j++) { - int src_idx = i * n_codes_per_embd + j; - int dst_idx = j * (n_codes / n_codes_per_embd) + i; - codes_t[dst_idx] = codes[src_idx]; - } - } - ctx.set_tensor_data("inp_dec", codes_t.data()); - - ctx.compute(); - - auto output = ctx.get_tensor_data("output"); - auto output_tensor = output.first; - auto output_data = output.second; - printf("Output shape: [%lld, %lld]\n", output_tensor->ne[0], output_tensor->ne[1]); + mimi_model model(model_path, true); + std::vector wav_data = model.decode(codes); // print first 20 values + printf("Number of output samples: %d\n", (int)wav_data.size()); + printf("First 20 samples:\n"); for (int i = 0; i < 20; i++) { - printf("%2.4f, ", ((float *)output_data.data())[i]); + printf("%2.4f, ", wav_data[i]); } printf("...\n"); // write to wav - std::vector wav_data(output_data.size() / sizeof(float)); - for (size_t i = 0; i < wav_data.size(); i++) { - wav_data[i] = ((float *)output_data.data())[i]; - } printf("Writing to %s\n", out_path); - save_wav16(out_path, wav_data, 24000); + save_wav16(out_path, wav_data, model.get_sample_rate()); } From 891273cf3a678ea4fb4845c35f60af49360b0dbf Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 29 Mar 2025 13:08:42 +0100 Subject: [PATCH 04/10] mimi : non-transposed input codes --- examples/tts/mimi-model.cpp | 14 +++---- 
examples/tts/mimi-model.h | 1 + examples/tts/mimi.cpp | 78 +++++++++++++++++++------------------ 3 files changed, 48 insertions(+), 45 deletions(-) diff --git a/examples/tts/mimi-model.cpp b/examples/tts/mimi-model.cpp index 31ff86256ae..92bb47a8365 100644 --- a/examples/tts/mimi-model.cpp +++ b/examples/tts/mimi-model.cpp @@ -24,7 +24,8 @@ * * Background: * - The audio codes can be generated using any Mimi-based model, for example: Moshi, Hibiki, Sesame, etc - * - Audio codes must be in the order: (1 semantic component, 31 acoustic components) repeated N times + * - Audio codes must be in the order: N semantic codes followed by (N*31) acoustic codes + * (In other words, input matrix has shape 32 cols x N rows) * * How it works? * 1. Audio code passed to RVQ (mimi_residual_vector_quantizer) to get the latent code @@ -653,23 +654,22 @@ std::vector mimi_model::decode_frame(const std::vector & codes, int for (int i = 0; i < (int)pos_data.size(); i++) { pos_data[i] = i + n_past; } - n_past += n_pos; if (verbose) { printf("%s: n_pos: %d, n_past: %d\n", __func__, n_pos, n_past); } + n_past += n_pos; ctx->set_tensor_data("pos_dec", pos_data.data()); - // code data (need to transpose it) - // code [n_codes, n_codes_per_embd] -> [n_codes_per_embd, n_codes] - std::vector codes_t(n_codes_per_embd * n_codes); + // code data + /*std::vector codes_t(n_codes_per_embd * n_codes); for (int i = 0; i < n_codes / n_codes_per_embd; i++) { for (int j = 0; j < n_codes_per_embd; j++) { int src_idx = i * n_codes_per_embd + j; int dst_idx = j * (n_codes / n_codes_per_embd) + i; codes_t[dst_idx] = codes[src_idx]; } - } - ctx->set_tensor_data("inp_dec", codes_t.data()); + }*/ + ctx->set_tensor_data("inp_dec", codes.data()); ctx->compute(); diff --git a/examples/tts/mimi-model.h b/examples/tts/mimi-model.h index d48c19b5476..c26fd3bc08e 100644 --- a/examples/tts/mimi-model.h +++ b/examples/tts/mimi-model.h @@ -22,6 +22,7 @@ struct mimi_model { int get_sample_rate() const; + // layout of codes: 
N semantic codes followed by (N*31) acoustic codes std::vector decode(const std::vector & codes); // TODO: implement encoding pass diff --git a/examples/tts/mimi.cpp b/examples/tts/mimi.cpp index 052f546b43a..421c9e418ec 100644 --- a/examples/tts/mimi.cpp +++ b/examples/tts/mimi.cpp @@ -5,6 +5,11 @@ #include +/** + * This file is used for testing and showcase how to use "mimi_model" class. + * Please keep it simple and easy to understand. + */ + int main(int argc, const char ** argv) { if (argc < 3) { fprintf(stderr, "Usage: %s model.gguf codes.txt [output.wav]\n", argv[0]); @@ -23,48 +28,45 @@ int main(int argc, const char ** argv) { std::vector codes; if (strcmp(codes_path, "dummy0") == 0) { printf("Using dummy0 codes\n"); - codes.resize(32 * 3); // [n_codes = 3, n_codes_per_embd = 32] - int n = 0; - for (int c = 0; c < 32; c++) { - for (int r = 0; r < 3; r++) { - codes[r*32 + c] = n++; - } + codes.resize(32 * 3); // [n_codes_per_embd = 32, n_codes = 3] + for (int i = 0; i < (int)codes.size(); i++) { + codes[i] = i; } } else if (strcmp(codes_path, "dummy1") == 0) { printf("Using dummy1 codes\n"); codes = { - 1263 ,1597 ,1596 ,1477 ,1540 ,1720 ,1433 ,118 ,1066 ,1968 ,1096 ,232 ,418 ,566 ,1653 ,2010 , - 1029 ,1874 ,77 ,1803 ,123 ,908 ,97 ,1616 ,595 ,1170 ,1654 ,1211 ,1967 ,1579 ,1846 ,1462 , - 1962 ,175 ,1539 ,742 ,1065 ,1226 ,19 ,955 ,528 ,1031 ,659 ,1687 ,1173 ,1802 ,1031 ,1714 , - 1986 ,582 ,367 ,112 ,1245 ,1386 ,759 ,532 ,1472 ,1790 ,802 ,1213 ,1543 ,1916 ,1251 ,309 , - 1962 ,1280 ,1943 ,878 ,1588 ,1989 ,568 ,1463 ,1814 ,1095 ,103 ,583 ,976 ,998 ,871 ,587 , - 247 ,1698 ,1817 ,1024 ,268 ,597 ,45 ,1608 ,1880 ,2047 ,759 ,1578 ,1612 ,49 ,1031 ,1076 , - 927 ,1202 ,1601 ,1719 ,1670 ,412 ,568 ,1838 ,341 ,1265 ,1279 ,830 ,1997 ,32 ,1369 ,1686 , - 1307 ,419 ,1143 ,324 ,325 ,572 ,1597 ,1920 ,795 ,915 ,610 ,2000 ,819 ,718 ,1235 ,282 , - 1912 ,1911 ,141 ,1069 ,1485 ,642 ,1370 ,732 ,284 ,1407 ,1591 ,1002 ,939 ,671 ,951 ,1411 , - 1887 ,460 ,1588 ,1636 ,1312 ,232 ,969 ,1513 
,1336 ,1185 ,1660 ,4 ,926 ,1243 ,1077 ,1379 , - 704 ,85 ,257 ,1302 ,1029 ,1717 ,899 ,1345 ,355 ,1915 ,1007 ,315 ,1283 ,779 ,415 ,335 , - 1848 ,1786 ,469 ,295 ,380 ,1736 ,393 ,765 ,1921 ,836 ,374 ,1649 ,52 ,1633 ,759 ,548 , - 1922 ,47 ,564 ,893 ,34 ,131 ,1063 ,1657 ,474 ,1960 ,1255 ,1275 ,92 ,976 ,1217 ,483 , - 105 ,1746 ,1158 ,1557 ,1001 ,512 ,1668 ,1255 ,1045 ,1596 ,613 ,1272 ,1366 ,1147 ,411 ,831 , - 349 ,692 ,1435 ,2005 ,1465 ,37 ,892 ,95 ,460 ,557 ,1315 ,259 ,1978 ,1838 ,1232 ,2003 , - 1197 ,111 ,1953 ,1297 ,1843 ,671 ,1687 ,91 ,1788 ,1138 ,1896 ,399 ,615 ,758 ,1423 ,365 , - 288 ,632 ,876 ,875 ,1156 ,345 ,1189 ,638 ,1527 ,1981 ,1925 ,333 ,1353 ,473 ,1913 ,1443 , - 1634 ,1373 ,803 ,420 ,192 ,1440 ,1593 ,1925 ,784 ,831 ,552 ,807 ,1942 ,1289 ,612 ,511 , - 968 ,1091 ,30 ,828 ,1611 ,1241 ,1985 ,596 ,273 ,529 ,1182 ,302 ,726 ,1942 ,733 ,1590 , - 1564 ,214 ,1156 ,1722 ,1215 ,1837 ,1729 ,1823 ,672 ,116 ,340 ,396 ,721 ,462 ,1615 ,1380 , - 1459 ,1553 ,636 ,586 ,1148 ,1147 ,1941 ,471 ,876 ,127 ,1938 ,2002 ,1563 ,1121 ,857 ,1179 , - 1983 ,1324 ,1726 ,1445 ,295 ,270 ,896 ,1947 ,1740 ,1211 ,128 ,1266 ,734 ,715 ,1562 ,285 , - 1139 ,304 ,526 ,653 ,1270 ,320 ,484 ,22 ,687 ,1065 ,489 ,827 ,993 ,1654 ,431 ,1552 , - 1418 ,1604 ,455 ,841 ,412 ,848 ,475 ,540 ,1903 ,575 ,584 ,300 ,1079 ,189 ,1481 ,893 , - 228 ,1577 ,429 ,635 ,106 ,1536 ,176 ,348 ,1733 ,1570 ,537 ,1840 ,798 ,410 ,1714 ,1318 , - 487 ,332 ,1109 ,1744 ,283 ,692 ,681 ,1744 ,1008 ,1715 ,1956 ,1066 ,1768 ,1645 ,139 ,1967 , - 897 ,132 ,1010 ,1932 ,277 ,1536 ,1541 ,952 ,19 ,88 ,1663 ,1232 ,1681 ,1878 ,1241 ,1805 , - 89 ,1401 ,544 ,1061 ,1166 ,267 ,1351 ,1998 ,1623 ,1898 ,425 ,1320 ,2006 ,865 ,1981 ,823 , - 1243 ,471 ,485 ,1765 ,391 ,1281 ,1607 ,1418 ,116 ,1702 ,1725 ,512 ,1088 ,1375 ,1994 ,1738 , - 725 ,1471 ,811 ,1251 ,1156 ,1664 ,898 ,1511 ,1872 ,1717 ,444 ,1005 ,254 ,103 ,202 ,1769 , - 1511 ,433 ,284 ,721 ,1741 ,56 ,615 ,916 ,887 ,1253 ,916 ,535 ,1666 ,1713 ,741 ,873 , - 447 ,492 ,388 ,321 ,1860 ,1456 ,1658 ,1682 ,848 ,462 
,2034 ,1368 ,1609 ,1887 ,510 ,1516 , + 1049 ,1415 ,1962 ,914 ,1372 ,704 ,1922 ,2036 ,288 ,968 ,193 ,1139 ,897 ,897 ,1243 ,1511 , + 1597 ,175 ,1280 ,1202 ,1911 ,85 ,47 ,692 ,632 ,251 ,1553 ,1735 ,1577 ,132 ,471 ,433 , + 1325 ,1539 ,1943 ,1601 ,141 ,257 ,564 ,1435 ,876 ,1096 ,636 ,61 ,1497 ,1010 ,485 ,284 , + 839 ,776 ,878 ,1719 ,1069 ,1302 ,893 ,2005 ,875 ,908 ,586 ,2001 ,186 ,1932 ,1765 ,721 , + 592 ,1046 ,1588 ,1670 ,1485 ,1141 ,34 ,1465 ,1156 ,1938 ,435 ,753 ,1418 ,277 ,391 ,1741 , + 1440 ,117 ,723 ,412 ,642 ,1717 ,131 ,37 ,345 ,112 ,1979 ,2034 ,1822 ,1536 ,1281 ,56 , + 1341 ,803 ,568 ,568 ,1370 ,1995 ,1063 ,892 ,273 ,895 ,1226 ,354 ,1726 ,1541 ,1607 ,615 , + 985 ,1499 ,1736 ,1838 ,702 ,1345 ,1657 ,511 ,1774 ,1787 ,945 ,1927 ,947 ,952 ,1418 ,916 , + 1239 ,1457 ,1021 ,341 ,284 ,882 ,474 ,1559 ,1923 ,273 ,1330 ,1406 ,1782 ,19 ,116 ,887 , + 1146 ,1307 ,983 ,1237 ,1407 ,1350 ,1960 ,1255 ,878 ,1979 ,1500 ,1939 ,1415 ,88 ,1702 ,1253 , + 1778 ,2 ,10 ,1279 ,999 ,1549 ,1049 ,373 ,1355 ,1200 ,1466 ,1009 ,75 ,2042 ,1725 ,916 , + 1636 ,1135 ,833 ,830 ,1758 ,2015 ,1275 ,1675 ,287 ,744 ,89 ,430 ,1724 ,1232 ,1692 ,535 , + 1485 ,1287 ,973 ,1815 ,314 ,2020 ,424 ,1085 ,982 ,1994 ,1563 ,1269 ,1769 ,1681 ,1082 ,1666 , + 1622 ,1039 ,1209 ,32 ,679 ,732 ,976 ,1462 ,805 ,402 ,1150 ,170 ,1529 ,2013 ,350 ,1175 , + 757 ,1124 ,1091 ,1369 ,1061 ,415 ,1217 ,1135 ,1360 ,1578 ,1205 ,1785 ,1835 ,1241 ,14 ,716 , + 480 ,716 ,681 ,1686 ,1624 ,335 ,865 ,1356 ,1688 ,307 ,366 ,541 ,1262 ,1167 ,59 ,269 , + 1899 ,1798 ,1606 ,1307 ,1549 ,1814 ,114 ,483 ,958 ,1919 ,1179 ,898 ,834 ,1526 ,386 ,447 , + 1481 ,201 ,779 ,419 ,430 ,1451 ,1000 ,156 ,1062 ,615 ,1353 ,414 ,1214 ,1487 ,882 ,32 , + 840 ,1517 ,334 ,1143 ,823 ,454 ,725 ,1298 ,1325 ,649 ,1737 ,913 ,685 ,761 ,2010 ,63 , + 1397 ,1299 ,765 ,1158 ,1809 ,1299 ,1585 ,1776 ,625 ,1539 ,830 ,1563 ,461 ,308 ,1438 ,321 , + 82 ,886 ,1836 ,325 ,1976 ,761 ,359 ,1136 ,1720 ,2036 ,904 ,719 ,526 ,1567 ,145 ,1860 , + 1565 ,1786 ,1400 ,1696 ,232 ,1736 ,512 ,518 ,1895 
,1854 ,1584 ,1393 ,1869 ,1702 ,789 ,1986 , + 116 ,521 ,150 ,1597 ,727 ,1916 ,815 ,1826 ,1382 ,653 ,1596 ,286 ,1373 ,177 ,1397 ,1009 , + 1449 ,353 ,877 ,93 ,266 ,1853 ,1255 ,872 ,1974 ,556 ,1885 ,857 ,992 ,5 ,1921 ,1849 , + 1038 ,1912 ,464 ,795 ,747 ,56 ,124 ,431 ,1868 ,609 ,855 ,1522 ,912 ,1709 ,1507 ,1062 , + 1015 ,1357 ,1487 ,4 ,253 ,1871 ,933 ,215 ,1228 ,633 ,1306 ,2024 ,1453 ,900 ,457 ,471 , + 436 ,1311 ,870 ,1032 ,134 ,984 ,1983 ,1103 ,1627 ,1627 ,414 ,1845 ,583 ,1699 ,1458 ,2018 , + 150 ,450 ,1114 ,369 ,267 ,1273 ,1136 ,1578 ,1063 ,1820 ,120 ,779 ,652 ,1266 ,1929 ,1213 , + 159 ,297 ,1703 ,819 ,93 ,247 ,1366 ,144 ,1617 ,1428 ,812 ,121 ,1637 ,1620 ,289 ,1557 , + 1414 ,971 ,476 ,1685 ,428 ,1802 ,653 ,1290 ,614 ,1663 ,1528 ,1344 ,798 ,1027 ,1305 ,990 , + 1740 ,1154 ,1839 ,912 ,731 ,602 ,1064 ,1508 ,834 ,1387 ,252 ,745 ,1034 ,1102 ,965 ,696 , + 1971 ,1729 ,666 ,282 ,1993 ,1551 ,1703 ,1124 ,1628 ,1725 ,107 ,808 ,1096 ,1753 ,500 ,677 , }; } else { std::ifstream fin(codes_path); From eae5f0e1ced91eaffc5f148147c982b3d38877e2 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 10:50:35 +0200 Subject: [PATCH 05/10] add mimi_model::transpose_input --- examples/tts/mimi-model.cpp | 27 ++++++++++++++++++--------- examples/tts/mimi-model.h | 5 +++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/examples/tts/mimi-model.cpp b/examples/tts/mimi-model.cpp index 92bb47a8365..ded56ff317d 100644 --- a/examples/tts/mimi-model.cpp +++ b/examples/tts/mimi-model.cpp @@ -617,7 +617,7 @@ std::vector mimi_model::decode_frame(const std::vector & codes, int int n_pos = -1; int n_codes = codes.size(); int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components; - GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiple of n_codes_per_embd"); + GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiply of n_codes_per_embd"); ctx->build_graph([&](ggml_context * ctx_gf, 
ggml_cgraph * gf) { ggml_tensor * inp_dec = ggml_new_tensor_1d(ctx_gf, GGML_TYPE_I32, n_codes); @@ -661,14 +661,6 @@ std::vector mimi_model::decode_frame(const std::vector & codes, int ctx->set_tensor_data("pos_dec", pos_data.data()); // code data - /*std::vector codes_t(n_codes_per_embd * n_codes); - for (int i = 0; i < n_codes / n_codes_per_embd; i++) { - for (int j = 0; j < n_codes_per_embd; j++) { - int src_idx = i * n_codes_per_embd + j; - int dst_idx = j * (n_codes / n_codes_per_embd) + i; - codes_t[dst_idx] = codes[src_idx]; - } - }*/ ctx->set_tensor_data("inp_dec", codes.data()); ctx->compute(); @@ -715,6 +707,23 @@ std::vector mimi_model::decode(const std::vector & codes) { return output; } +std::vector mimi_model::transpose_input(const std::vector & codes) { + int n_codes = codes.size(); + int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components; + GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiply of n_codes_per_embd"); + + std::vector codes_T(n_codes_per_embd * n_codes); + for (int i = 0; i < n_codes / n_codes_per_embd; i++) { + for (int j = 0; j < n_codes_per_embd; j++) { + int src_idx = i * n_codes_per_embd + j; + int dst_idx = j * (n_codes / n_codes_per_embd) + i; + codes_T[dst_idx] = codes[src_idx]; + } + } + + return codes_T; +} + int mimi_model::get_sample_rate() const { return mimi_config.sample_rate; } diff --git a/examples/tts/mimi-model.h b/examples/tts/mimi-model.h index c26fd3bc08e..96945981513 100644 --- a/examples/tts/mimi-model.h +++ b/examples/tts/mimi-model.h @@ -22,6 +22,11 @@ struct mimi_model { int get_sample_rate() const; + // transpose layout: + // - from: (1 semantic code followed by 31 acoustic codes) repeated N times + // - to: N semantic codes followed by (N*31) acoustic codes + std::vector transpose_input(const std::vector & codes); + // layout of codes: N semantic codes followed by (N*31) acoustic codes std::vector decode(const std::vector & codes); From
43bf237e3975a80fe0a52204ad08c7d43999c594 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 10:52:18 +0200 Subject: [PATCH 06/10] fix build --- examples/tts/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/tts/CMakeLists.txt b/examples/tts/CMakeLists.txt index 39e0a92c5ac..371c3bbf743 100644 --- a/examples/tts/CMakeLists.txt +++ b/examples/tts/CMakeLists.txt @@ -8,4 +8,5 @@ set(TARGET llama-mimi) add_executable(${TARGET} mimi.cpp mimi-model.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +# for using C++ designated initializers, TODO: can be changed back to C++17 in the future +target_compile_features(${TARGET} PRIVATE cxx_std_20) From e618405d4b9040f9536e2acc6761eac004146969 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 11:07:18 +0200 Subject: [PATCH 07/10] fix build (2) --- examples/tts/mimi-model.cpp | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/examples/tts/mimi-model.cpp b/examples/tts/mimi-model.cpp index ded56ff317d..141dd104392 100644 --- a/examples/tts/mimi-model.cpp +++ b/examples/tts/mimi-model.cpp @@ -15,6 +15,7 @@ #include #include #include +#include /** * Implementation of Kyutai's Mimi model using GGML. 
@@ -344,10 +345,10 @@ struct mimi_encoder_decoder { bool is_elu = false; bool is_resnet = false; bool is_transposed_conv = false; - ggml_tensor * conv_0_w; - ggml_tensor * conv_0_b; - ggml_tensor * conv_1_w; - ggml_tensor * conv_1_b; + ggml_tensor * conv_0_w = nullptr; + ggml_tensor * conv_0_b = nullptr; + ggml_tensor * conv_1_w = nullptr; + ggml_tensor * conv_1_b = nullptr; int stride = 1; }; std::vector layers; @@ -415,20 +416,20 @@ struct mimi_encoder_decoder { struct mimi_transformer { struct layer { - ggml_tensor * inp_norm_w; - ggml_tensor * inp_norm_b; - - ggml_tensor * attn_q; - ggml_tensor * attn_k; - ggml_tensor * attn_v; - ggml_tensor * attn_o; - ggml_tensor * attn_post_norm_w; - ggml_tensor * attn_post_norm_b; - ggml_tensor * attn_layer_scale; - - ggml_tensor * ffn_up; - ggml_tensor * ffn_down; - ggml_tensor * mlp_layer_scale; + ggml_tensor * inp_norm_w = nullptr; + ggml_tensor * inp_norm_b = nullptr; + + ggml_tensor * attn_q = nullptr; + ggml_tensor * attn_k = nullptr; + ggml_tensor * attn_v = nullptr; + ggml_tensor * attn_o = nullptr; + ggml_tensor * attn_post_norm_w = nullptr; + ggml_tensor * attn_post_norm_b = nullptr; + ggml_tensor * attn_layer_scale = nullptr; + + ggml_tensor * ffn_up = nullptr; + ggml_tensor * ffn_down = nullptr; + ggml_tensor * mlp_layer_scale = nullptr; }; std::vector layers; From e185e0ac7fb10b2b933caae2823a04f44d703967 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 11:44:34 +0200 Subject: [PATCH 08/10] fix build (3) --- examples/tts/mimi-model.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/tts/mimi-model.cpp b/examples/tts/mimi-model.cpp index 141dd104392..0b1fabe8608 100644 --- a/examples/tts/mimi-model.cpp +++ b/examples/tts/mimi-model.cpp @@ -16,6 +16,7 @@ #include #include #include +#include /** * Implementation of Kyutai's Mimi model using GGML. 
@@ -367,10 +368,10 @@ struct mimi_encoder_decoder { .is_elu = true, // layer (i_start) }); layers.push_back({ + .is_transposed_conv = true, .conv_0_w = ctx.get_weight("decoder.layers.%d.conv.weight", i_start + 1), .conv_0_b = ctx.get_weight("decoder.layers.%d.conv.bias", i_start + 1), .stride = mimi_config.upsampling_ratio[i], - .is_transposed_conv = true, }); // residual layers layers.push_back({ From ce83041ec3205b2586fca7d52ac9cef5c0ddc446 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 11:45:36 +0200 Subject: [PATCH 09/10] fix strcmp --- examples/tts/mimi.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tts/mimi.cpp b/examples/tts/mimi.cpp index 421c9e418ec..502e0150634 100644 --- a/examples/tts/mimi.cpp +++ b/examples/tts/mimi.cpp @@ -3,6 +3,7 @@ #include #include +#include // strcmp /** From 61d8ad6aef03879ca7193a302a0f549a40d761cb Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Mar 2025 12:04:33 +0200 Subject: [PATCH 10/10] fix compilation on linux --- examples/tts/convert_mimi_to_gguf.py | 4 ++-- examples/tts/mimi-model.cpp | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/tts/convert_mimi_to_gguf.py b/examples/tts/convert_mimi_to_gguf.py index 5b44ef62103..5dce72a398a 100644 --- a/examples/tts/convert_mimi_to_gguf.py +++ b/examples/tts/convert_mimi_to_gguf.py @@ -5,13 +5,13 @@ from typing import Union from pathlib import Path from torch import Tensor -from transformers import MimiModel +from transformers import MimiModel, PreTrainedModel logger = logging.getLogger("mimi") class MimiModelConverter: - mimi_model: MimiModel + mimi_model: PreTrainedModel gguf_writer: gguf.GGUFWriter fname_out: Path ftype: gguf.LlamaFileType diff --git a/examples/tts/mimi-model.cpp b/examples/tts/mimi-model.cpp index 0b1fabe8608..427aeff8658 100644 --- a/examples/tts/mimi-model.cpp +++ b/examples/tts/mimi-model.cpp @@ -17,6 +17,8 @@ #include #include #include +#include +#include /** * Implementation of 
Kyutai's Mimi model using GGML.