From e09151dcddfa440bb0a6c79ced1591dd8aed3e0a Mon Sep 17 00:00:00 2001
From: Philpax
Date: Mon, 24 Apr 2023 02:05:49 +0200
Subject: [PATCH 1/4] fix #149 - load tensors by type, ignoring filetype

---
 ggml-loader/Cargo.toml        |   2 +-
 ggml-loader/src/lib.rs        |  28 ++--
 ggml-loader/src/util.rs       |  29 +---
 ggml/src/lib.rs               |  23 ++++
 llama-cli/src/cli_args.rs     |  18 +--
 llama-cli/src/main.rs         |   2 +-
 llama-rs/src/convert.rs       |  19 +--
 llama-rs/src/lib.rs           |   2 +-
 llama-rs/src/loader.rs        |  17 ++-
 llama-rs/src/loader2.rs       | 252 ++++++++++++++--------------------
 llama-rs/src/loader_common.rs |  72 +++++++++-
 llama-rs/src/model.rs         | 173 +++++++++++++++++++++--
 12 files changed, 406 insertions(+), 231 deletions(-)

diff --git a/ggml-loader/Cargo.toml b/ggml-loader/Cargo.toml
index ab711363..2d088758 100644
--- a/ggml-loader/Cargo.toml
+++ b/ggml-loader/Cargo.toml
@@ -7,4 +7,4 @@ edition = "2021"

 [dependencies]
 ggml = { path = "../ggml" }
-thiserror = "*"
+thiserror = "1.0"

diff --git a/ggml-loader/src/lib.rs b/ggml-loader/src/lib.rs
index 7d29d4b3..47239f08 100644
--- a/ggml-loader/src/lib.rs
+++ b/ggml-loader/src/lib.rs
@@ -10,7 +10,7 @@ use util::*;

 pub type ElementType = ggml::Type;

-/// file type containing the model
+/// the format of the file containing the model
 #[derive(Debug, PartialEq, Clone, Copy)]
 #[allow(clippy::upper_case_acronyms)]
 pub enum ContainerType {
@@ -21,7 +21,6 @@ pub enum ContainerType {
     /// mmap-able format
     GGJT,
 }
-
 impl ContainerType {
     pub fn support_mmap(&self) -> bool {
         match self {
@@ -64,10 +63,19 @@ pub struct TensorInfo {
     pub n_dims: usize,
     pub dims: [usize; 2],
     pub n_elements: usize,
-    pub ftype: ElementType,
+    pub element_type: ElementType,
     /// start of tensor - start of file
     pub start_offset: u64,
 }
+impl TensorInfo {
+    pub fn calc_size(&self) -> usize {
+        let mut size = ggml::type_size(self.element_type);
+        for &dim in &self.dims[0..self.n_dims] {
+            size *= dim;
+        }
+        size / ggml::blck_size(self.element_type)
+    }
+}

 /// Info in hyperparameter used for later loading tasks. Used in callback.
 /// see [`LoadHandler::load_hyper_parameters`]
@@ -78,10 +86,7 @@ pub struct PartialHyperparameters {

 pub enum TensorDataTreatment<'a> {
     CopyInto(&'a mut [u8]),
-    SeekPast {
-        /// should be `tensor.nbytes`
-        n_bytes: usize,
-    },
+    Skip,
 }

 #[allow(unused_variables)]
@@ -173,7 +178,9 @@ pub fn load_weights(
         // load tensor header
         let n_dims: usize = read_i32(reader)?.try_into()?;
         let name_len = read_i32(reader)?;
-        let ftype = decode_element_type_res(read_i32(reader)?)?;
+        let ftype = read_i32(reader)?;
+        let ftype =
+            ggml::Type::try_from(ftype).map_err(|_| LoadError::UnsupportedElementType(ftype))?;

         let mut n_elements: usize = 1;
         let mut dims = [1usize, 1];
@@ -214,9 +221,10 @@ pub fn load_weights(
             dims,
             n_dims,
             n_elements,
-            ftype,
+            element_type: ftype,
             start_offset: offset_aligned,
         };
+        let n_bytes = tensor_info.calc_size();

         match controlflow_to_result(handler.tensor_buffer(tensor_info))? {
             TensorDataTreatment::CopyInto(buf) => {
@@ -225,7 +233,7 @@
                 }
                 reader.read_exact(buf)?;
             }
-            TensorDataTreatment::SeekPast { n_bytes } => {
+            TensorDataTreatment::Skip => {
                 // skip if no buffer is given
                 reader.seek(SeekFrom::Start(offset_aligned + n_bytes as u64))?;
             }
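For intuition, the new `TensorInfo::calc_size` computes a tensor's byte size from its own element type: multiply the per-block byte size by the element count, then divide by the elements per block. A standalone sketch of the same arithmetic, with ggml's Q4_0 layout (20-byte blocks covering 32 elements) inlined as assumed constants:

    // Mirrors TensorInfo::calc_size with the ggml::type_size/blck_size lookups
    // replaced by assumed Q4_0 constants; not part of the patch itself.
    fn calc_size(type_size: usize, blck_size: usize, dims: &[usize]) -> usize {
        let mut size = type_size;
        for &dim in dims {
            size *= dim;
        }
        size / blck_size
    }

    fn main() {
        // A 4096x4096 Q4_0 tensor: 16,777,216 elements / 32 per block * 20 bytes.
        assert_eq!(calc_size(20, 32, &[4096, 4096]), 10_485_760);
    }

This is what lets `load_weights` compute `n_bytes` up front and skip past tensor data without trusting a caller-supplied byte count.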
diff --git a/ggml-loader/src/util.rs b/ggml-loader/src/util.rs
index 33374fd6..9a759aac 100644
--- a/ggml-loader/src/util.rs
+++ b/ggml-loader/src/util.rs
@@ -1,7 +1,7 @@
 pub use std::io::{BufRead, Seek, SeekFrom};
 use std::ops::ControlFlow;

-use crate::{ElementType, LoadError};
+use crate::LoadError;

 pub fn read_bytes<const N: usize>(reader: &mut impl BufRead) -> Result<[u8; N], std::io::Error> {
     let mut bytes = [0u8; N];
@@ -35,33 +35,6 @@ pub fn has_data_left(reader: &mut impl BufRead) -> Result<bool, std::io::Error>
     reader.fill_buf().map(|b| !b.is_empty())
 }

-pub fn decode_element_type(ftype: i32) -> Option<ElementType> {
-    match ftype {
-        0 => Some(ggml::Type::F32),
-        1 => Some(ggml::Type::F16),
-        2 => Some(ggml::Type::Q4_0),
-        3 => Some(ggml::Type::Q4_1),
-        _ => None,
-    }
-}
-
-pub fn encode_element_type(element_type: ElementType) -> Option<i32> {
-    match element_type {
-        ggml::Type::F32 => Some(0),
-        ggml::Type::F16 => Some(1),
-        ggml::Type::Q4_0 => Some(2),
-        ggml::Type::Q4_1 => Some(3),
-        _ => None,
-    }
-}
-
-pub fn decode_element_type_res<T>(ftype: i32) -> Result<ElementType, LoadError<T>> {
-    match decode_element_type(ftype) {
-        Some(x) => Ok(x),
-        None => Err(LoadError::UnsupportedElementType(ftype)),
-    }
-}
-
 pub fn controlflow_to_result<A, B>(x: ControlFlow<A, B>) -> Result<B, A> {
     match x {
         ControlFlow::Continue(x) => Ok(x),

diff --git a/ggml/src/lib.rs b/ggml/src/lib.rs
index 11d4246b..37188a64 100644
--- a/ggml/src/lib.rs
+++ b/ggml/src/lib.rs
@@ -24,6 +24,9 @@ pub const FILE_MAGIC_UNVERSIONED: u32 = 0x67676d6c;
 /// The currently-supported format version for `ggml` files.
 pub const FORMAT_VERSION: u32 = 1;

+/// The size of a `ggml` object.
+pub const OBJECT_SIZE: usize = ggml_sys::GGML_OBJECT_SIZE;
+
 #[derive(Debug, Copy, Clone, PartialEq, Eq, Default)]
 /// The type of a value in `ggml`.
 pub enum Type {
@@ -32,6 +35,12 @@ pub enum Type {
     Q4_0,
     /// Quantized 4-bit (type 1); used by GPTQ.
     Q4_1,
+    /// Quantized 4-bit (type 2).
+    Q4_2,
+    /// Quantized 4-bit (type 3).
+    Q4_3,
+    /// Quantized 8-bit (type 0).
+    Q8_0,
     /// Integer 32-bit.
     I32,
     /// Float 16-bit.
@@ -44,6 +53,9 @@ impl From<Type> for ggml_sys::ggml_type {
         match t {
             Type::Q4_0 => ggml_sys::ggml_type_GGML_TYPE_Q4_0,
             Type::Q4_1 => ggml_sys::ggml_type_GGML_TYPE_Q4_1,
+            Type::Q4_2 => ggml_sys::ggml_type_GGML_TYPE_Q4_2,
+            Type::Q4_3 => ggml_sys::ggml_type_GGML_TYPE_Q4_3,
+            Type::Q8_0 => ggml_sys::ggml_type_GGML_TYPE_Q8_0,
             Type::I32 => ggml_sys::ggml_type_GGML_TYPE_I32,
             Type::F16 => ggml_sys::ggml_type_GGML_TYPE_F16,
             Type::F32 => ggml_sys::ggml_type_GGML_TYPE_F32,
@@ -56,6 +68,9 @@ impl TryFrom<ggml_sys::ggml_type> for Type {
         match t {
             ggml_sys::ggml_type_GGML_TYPE_Q4_0 => Ok(Type::Q4_0),
             ggml_sys::ggml_type_GGML_TYPE_Q4_1 => Ok(Type::Q4_1),
+            ggml_sys::ggml_type_GGML_TYPE_Q4_2 => Ok(Type::Q4_2),
+            ggml_sys::ggml_type_GGML_TYPE_Q4_3 => Ok(Type::Q4_3),
+            ggml_sys::ggml_type_GGML_TYPE_Q8_0 => Ok(Type::Q8_0),
             ggml_sys::ggml_type_GGML_TYPE_I32 => Ok(Type::I32),
             ggml_sys::ggml_type_GGML_TYPE_F16 => Ok(Type::F16),
             ggml_sys::ggml_type_GGML_TYPE_F32 => Ok(Type::F32),
@@ -68,6 +83,9 @@ impl std::fmt::Display for Type {
         match self {
             Type::Q4_0 => write!(f, "q4_0"),
             Type::Q4_1 => write!(f, "q4_1"),
+            Type::Q4_2 => write!(f, "q4_2"),
+            Type::Q4_3 => write!(f, "q4_3"),
+            Type::Q8_0 => write!(f, "q8_0"),
             Type::I32 => write!(f, "i32"),
             Type::F16 => write!(f, "f16"),
             Type::F32 => write!(f, "f32"),
@@ -510,6 +528,11 @@ pub struct Tensor {
 }

 impl Tensor {
+    /// Size of the `ggml_tensor` struct in bytes.
+    ///
+    /// Exposed for purposes of determining context size.
+    pub const C_TYPE_SIZE: usize = std::mem::size_of::<ggml_sys::ggml_tensor>();
+
     /// Creates a shared copy of this tensor pointer.
     pub fn share(&self) -> Self {
         Tensor {
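A quick usage sketch for the widened `Type` conversions (hypothetical check, assuming `ggml` and `ggml-sys` are both direct dependencies):

    fn main() {
        // Round-trip one of the new quantization types through the raw sys type.
        let t = ggml::Type::Q4_2;
        let raw: ggml_sys::ggml_type = t.into();
        assert_eq!(ggml::Type::try_from(raw).ok(), Some(t));
        assert_eq!(t.to_string(), "q4_2");
    }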
diff --git a/llama-cli/src/cli_args.rs b/llama-cli/src/cli_args.rs
index e31d4f48..fc064017 100644
--- a/llama-cli/src/cli_args.rs
+++ b/llama-cli/src/cli_args.rs
@@ -373,12 +373,12 @@ pub struct Convert {
     pub directory: PathBuf,

     /// File type to convert to
-    #[arg(long, short = 't', value_enum, default_value_t = ElementType::Q4_0)]
-    pub element_type: ElementType,
+    #[arg(long, short = 't', value_enum, default_value_t = FileType::Q4_0)]
+    pub file_type: FileType,
 }

 #[derive(Parser, Debug, ValueEnum, Clone, Copy)]
-pub enum ElementType {
+pub enum FileType {
     /// Quantized 4-bit (type 0).
     Q4_0,
     /// Quantized 4-bit (type 1); used by GPTQ.
@@ -388,13 +388,13 @@
     /// Float 32-bit.
     F32,
 }

-impl From<ElementType> for llama_rs::ElementType {
-    fn from(t: ElementType) -> Self {
+impl From<FileType> for llama_rs::FileType {
+    fn from(t: FileType) -> Self {
         match t {
-            ElementType::Q4_0 => llama_rs::ElementType::Q4_0,
-            ElementType::Q4_1 => llama_rs::ElementType::Q4_1,
-            ElementType::F16 => llama_rs::ElementType::F16,
-            ElementType::F32 => llama_rs::ElementType::F32,
+            FileType::Q4_0 => llama_rs::FileType::MostlyQ4_0,
+            FileType::Q4_1 => llama_rs::FileType::MostlyQ4_1,
+            FileType::F16 => llama_rs::FileType::MostlyF16,
+            FileType::F32 => llama_rs::FileType::F32,
         }
     }
 }

diff --git a/llama-cli/src/main.rs b/llama-cli/src/main.rs
index e0f5ced3..de8323b0 100644
--- a/llama-cli/src/main.rs
+++ b/llama-cli/src/main.rs
@@ -22,7 +22,7 @@ fn main() -> Result<()> {
         Args::DumpTokens(args) => dump_tokens(&args)?,
         Args::Repl(args) => interactive(&args, false)?,
         Args::ChatExperimental(args) => interactive(&args, true)?,
-        Args::Convert(args) => convert_pth_to_ggml(&args.directory, args.element_type.into()),
+        Args::Convert(args) => convert_pth_to_ggml(&args.directory, args.file_type.into()),
     }

     Ok(())

diff --git a/llama-rs/src/convert.rs b/llama-rs/src/convert.rs
index 67557b8f..07c4939c 100644
--- a/llama-rs/src/convert.rs
+++ b/llama-rs/src/convert.rs
@@ -16,20 +16,19 @@ use std::{
     vec,
 };

-use crate::{util, Hyperparameters, Vocabulary};
-use ggml_loader::util::encode_element_type;
+use crate::{loader_common::FileType, util, Hyperparameters, Vocabulary};

 /// Converts a `pth` file to a `ggml` file.
-pub fn convert_pth_to_ggml(model_directory: &Path, element_type: ggml::Type) {
+pub fn convert_pth_to_ggml(model_directory: &Path, file_type: FileType) {
     let tokenizer_path = model_directory.parent().unwrap().join("tokenizer.model");
     let vocab = load_vocabulary(tokenizer_path.as_path());

-    let hparams = load_hyperparameters(model_directory, element_type, &vocab);
+    let hparams = load_hyperparameters(model_directory, file_type, &vocab);

     let model_files = util::find_all_model_files(model_directory).unwrap();
     for (i, _file) in model_files.iter().enumerate() {
-        let fname_out = model_directory.join(format!("rust-model-{element_type}.bin"));
+        let fname_out = model_directory.join(format!("rust-model-{file_type}.bin"));
         let mut file = File::create(fname_out).expect("Unable to create file");
         write_header(file.borrow_mut(), &hparams).unwrap();
         write_tokens(file.borrow_mut(), &vocab).unwrap();
@@ -66,11 +65,7 @@ fn load_vocabulary(path: &Path) -> Vocabulary {
     }
 }

-fn load_hyperparameters(
-    path: &Path,
-    element_type: ggml::Type,
-    vocab: &Vocabulary,
-) -> Hyperparameters {
+fn load_hyperparameters(path: &Path, file_type: FileType, vocab: &Vocabulary) -> Hyperparameters {
     #[derive(Deserialize)]
     struct HyperParametersJson {
         dim: usize,
@@ -83,7 +78,7 @@ fn load_hyperparameters(
     let json = read_to_string(path.join("params.json")).expect("Unable to read file");
     let json: HyperParametersJson = serde_json::from_str(&json).expect("Unable to parse json");
     Hyperparameters {
-        element_type,
+        file_type,
         n_ctx: 0,
         n_embd: json.dim,
         n_head: json.n_heads,
@@ -107,7 +102,7 @@ fn write_header(fout: &mut File, hparams: &Hyperparameters) -> Result<(), String> {
         i32::try_from(hparams.n_head).unwrap(),
         i32::try_from(hparams.n_layer).unwrap(),
         i32::try_from(hparams.n_embd / hparams.n_head).unwrap(),
-        encode_element_type(hparams.element_type).unwrap(),
+        hparams.file_type.into(),
     ];

     let mut packed_values: Vec<i32> = vec![];
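`write_header` now serializes the `FileType` wire value directly instead of re-encoding an element type. A rough sketch of the resulting hyperparameter block (field order taken from the `values` array above; the little-endian packing and the concrete LLaMA-7B-style numbers are assumptions, not taken from the patch):

    // Pack hyperparameter i32s in declaration order, assuming the
    // little-endian layout used by ggml files.
    fn pack_i32s(values: &[i32]) -> Vec<u8> {
        values.iter().flat_map(|v| v.to_le_bytes()).collect()
    }

    fn main() {
        // n_vocab, n_embd, n_mult, n_head, n_layer, n_rot, file_type (Q4_0 => 2)
        let header = pack_i32s(&[32000, 4096, 256, 32, 32, 128, 2]);
        assert_eq!(header.len(), 7 * 4);
    }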
diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs
index 3f0a6c69..88d26d0c 100644
--- a/llama-rs/src/lib.rs
+++ b/llama-rs/src/lib.rs
@@ -19,7 +19,7 @@ pub use inference_session::{
     InferenceSession, InferenceSessionParameters, InferenceSnapshot, ModelKVMemoryType,
     SnapshotError,
 };
-pub use loader_common::{LoadError, LoadProgress};
+pub use loader_common::{FileType, LoadError, LoadProgress};
 pub use model::{Hyperparameters, Model};
 pub use util::TokenUtf8Buffer;
 pub use vocabulary::{TokenBias, TokenId, Vocabulary};

diff --git a/llama-rs/src/loader.rs b/llama-rs/src/loader.rs
index 9ef545e5..8b92378e 100644
--- a/llama-rs/src/loader.rs
+++ b/llama-rs/src/loader.rs
@@ -7,6 +7,7 @@ use std::{
 };

 use crate::{
+    loader_common::FileType,
     util::{self, mulf},
     LoadError, LoadProgress, Model, TokenId, Vocabulary,
 };
@@ -69,9 +70,9 @@ pub(crate) fn load(
         n_head: read_i32(&mut reader)?.try_into()?,
         n_layer: read_i32(&mut reader)?.try_into()?,
         n_rot: read_i32(&mut reader)?.try_into()?,
-        element_type: {
+        file_type: {
             let ftype = read_i32(&mut reader)?;
-            decode_element_type(ftype).ok_or_else(|| LoadError::UnsupportedElementType(ftype))
+            FileType::try_from(ftype).map_err(|_| LoadError::UnsupportedFileType(ftype))
         }?,
     };
@@ -108,7 +109,13 @@ pub(crate) fn load(
     // for the big tensors, we have the option to store the data in 16-bit
     // floats or quantized in order to save memory and also to speed up the
     // computation
-    let wtype = hparams.element_type;
+    let wtype = match hparams.file_type {
+        FileType::F32 => ggml::Type::F32,
+        FileType::MostlyF16 => ggml::Type::F16,
+        FileType::MostlyQ4_0 => ggml::Type::Q4_0,
+        FileType::MostlyQ4_1 => ggml::Type::Q4_1,
+        _ => unimplemented!(),
+    };

     let n_embd = hparams.n_embd;
     let n_layer = hparams.n_layer;
@@ -159,7 +166,7 @@ pub(crate) fn load(
         (None, None)
     };

-    let mut model = Model::new(context, hparams, vocabulary, n_ff, wtype, model_type, mmap);
+    let mut model = Model::new_loader1(context, hparams, vocabulary, n_ff, wtype, mmap);
     match model_type {
         ContainerType::GGMF | ContainerType::GGML => {
             let file_offset = reader.stream_position()?;
@@ -421,7 +428,7 @@ fn load_tensor_header_ggmf<'a>(
 }

 fn tensor_type_size(ftype: i32, ne: [i64; 2]) -> Option<usize> {
-    let ftype = decode_element_type(ftype)?;
+    let ftype = ggml::Type::try_from(ftype).ok()?;
     match ftype {
         ElementType::Q4_0 | ElementType::Q4_1 => {
             assert_eq!(ne[0] % 64, 0);

diff --git a/llama-rs/src/loader2.rs b/llama-rs/src/loader2.rs
index ff84e3b1..38413df0 100644
--- a/llama-rs/src/loader2.rs
+++ b/llama-rs/src/loader2.rs
@@ -3,6 +3,7 @@ use ggml_loader::*;
 use memmap2::Mmap;

 use std::{
+    collections::HashMap,
     fs::File,
     io::{BufRead, BufReader, Seek},
     ops::ControlFlow,
@@ -10,8 +11,8 @@ use std::{
 };

 use crate::{
-    util::{self, mulf},
-    Hyperparameters, LoadError, LoadProgress, Model, TokenId, Vocabulary,
+    loader_common::FileType, util, Hyperparameters, LoadError, LoadProgress, Model, TokenId,
+    Vocabulary,
 };

 impl LoadError {
@@ -38,7 +39,7 @@ pub(crate) fn load(
     path: impl AsRef<Path>,
     prefer_mmap: bool,
     n_context_tokens: usize,
-    load_progress_callback: impl FnMut(LoadProgress),
+    mut load_progress_callback: impl FnMut(LoadProgress),
 ) -> Result<Model, LoadError> {
     let main_path = path.as_ref();

@@ -47,45 +48,114 @@ pub(crate) fn load(
         return Err(LoadError::MultipartNotSupported { paths });
     }

-    let file = File::open(main_path).map_err(|e| LoadError::OpenFileFailed {
+    let mut file = File::open(main_path).map_err(|e| LoadError::OpenFileFailed {
         source: e,
         path: main_path.to_owned(),
     })?;
     let mut reader = BufReader::new(&file);

     let path = path.as_ref().to_owned();

-    let mut loader = Loader {
-        path: path.clone(),
-        vocab: Default::default(),
-        model: None,
-        n_ctx: n_context_tokens,
-        load_progress_callback,
-        prefer_mmap,
-        tensor_accumulator: 0,
-        hyperparameters: Hyperparameters::default(),
-        container_type: ContainerType::GGJT,
-    };
+    (load_progress_callback)(LoadProgress::PartLoading {
+        file: &path,
+        current_part: 0,
+        total_parts: 1,
+    });
+
+    let mut loader = Loader::new(
+        path.clone(),
+        n_context_tokens,
+        prefer_mmap,
+        load_progress_callback,
+    );
+    let use_mmap = loader.mmap_active();

     ggml_loader::load_model_from_reader(&mut reader, &mut loader)
         .map_err(|err| LoadError::from_ggml_loader_error(err, path.clone()))?;

-    loader.model.ok_or(LoadError::ModelNotCreated { path })
+    let Loader {
+        hyperparameters,
+        vocabulary,
+        tensors,
+        mut load_progress_callback,
+        ..
+    } = loader;
+
+    let Hyperparameters { n_embd, n_mult, .. } = hyperparameters;
+    let n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult;
+
+    let ctx_size = tensors
+        .values()
+        .map(|ti| {
+            ggml::Tensor::C_TYPE_SIZE
+                + ggml::OBJECT_SIZE
+                + if use_mmap { 0 } else { ti.calc_size() }
+        })
+        .sum::<usize>();
+    (load_progress_callback)(LoadProgress::ContextSize { bytes: ctx_size });
+    let context = ggml::Context::init(ctx_size, !use_mmap);
+
+    let mmap = if use_mmap {
+        let file = File::open(&path)?;
+        Some(unsafe { Mmap::map(&file)? })
+    } else {
+        None
+    };
+
+    let model = Model::new_loader2(
+        context,
+        hyperparameters,
+        vocabulary,
+        n_ff,
+        path.clone(),
+        &mut file,
+        &tensors,
+        mmap,
+        |tensor_index| {
+            (load_progress_callback)(LoadProgress::PartTensorLoaded {
+                file: &path,
+                current_tensor: tensor_index,
+                tensor_count: tensors.len(),
+            });
+        },
+    )?;
+
+    (load_progress_callback)(LoadProgress::PartLoaded {
+        file: &path,
+        byte_size: 0,
+        tensor_count: tensors.len(),
+    });
+
+    Ok(model)
 }

 struct Loader<F: FnMut(LoadProgress)> {
-    // input data and options
+    // Input
     path: PathBuf,
     n_ctx: usize,
     prefer_mmap: bool,
+    load_progress_callback: F,

-    // Internal state
-    tensor_accumulator: usize,
+    // Output
     container_type: ContainerType,
     hyperparameters: Hyperparameters,
-    model: Option<Model>,
-    vocab: Vocabulary,
-    load_progress_callback: F,
+    vocabulary: Vocabulary,
+    tensors: HashMap<String, TensorInfo>,
+}
+impl<F: FnMut(LoadProgress)> Loader<F> {
+    fn new(path: PathBuf, n_ctx: usize, prefer_mmap: bool, load_progress_callback: F) -> Self {
+        Self {
+            path,
+            n_ctx,
+            prefer_mmap,
+            load_progress_callback,
+
+            container_type: ContainerType::GGJT,
+            hyperparameters: Hyperparameters::default(),
+            vocabulary: Vocabulary::default(),
+            tensors: HashMap::default(),
+        }
+    }
 }

 impl<F: FnMut(LoadProgress)> ggml_loader::LoadHandler<LoadError> for Loader<F> {
@@ -118,150 +188,33 @@
             id,
             Err(err) => return ControlFlow::Break(LoadError::InvalidIntegerConversion(err)),
         };
-        self.vocab.push_token(id, token, score);
+        self.vocabulary.push_token(id, token, score);

         ControlFlow::Continue(())
     }

     fn tensor_buffer(&mut self, info: TensorInfo) -> ControlFlow<LoadError, TensorDataTreatment> {
-        let model = match &mut self.model {
-            Some(model) => model,
-            None => {
-                let model = result_to_controlflow(self.create_model(self.vocab.clone()))?;
-                self.model.insert(model)
-            }
-        };
-
-        let tensor_name = match String::from_utf8(info.name) {
+        let tensor_name = match String::from_utf8(info.name.clone()) {
             Ok(n) => n,
             Err(err) => return ControlFlow::Break(LoadError::InvalidUtf8(err)),
         };

-        let tensor_count = model.tensors_mut().len();
-
-        // to satisfy borrow checker
get_tensor { - () => { - match model.tensors_mut().get_mut(&tensor_name) { - Some(tensor) => tensor, - None => { - return ControlFlow::Break(LoadError::UnknownTensor { - path: self.path.clone(), - tensor_name, - }) - } - } - }; - } - - let ret = match &model.mmap { - Some(map) => unsafe { - let ptr = map.as_ptr().offset(info.start_offset as isize); - let tensor = get_tensor!(); - tensor.set_data(ptr as *mut std::ffi::c_void); - TensorDataTreatment::SeekPast { - n_bytes: tensor.nbytes(), - } - }, - None => { - let tensor = get_tensor!(); - let buf: &mut [u8] = unsafe { - std::slice::from_raw_parts_mut(tensor.data() as *mut u8, tensor.nbytes()) - }; - TensorDataTreatment::CopyInto(buf) - } - }; - (self.load_progress_callback)(LoadProgress::PartTensorLoaded { - file: &self.path, - current_tensor: self.tensor_accumulator, - tensor_count, - }); - self.tensor_accumulator += 1; - - ControlFlow::Continue(ret) + self.tensors.insert(tensor_name, info); + ControlFlow::Continue(TensorDataTreatment::Skip) } } impl Loader { - fn create_model(&mut self, vocabulary: Vocabulary) -> Result { - (self.load_progress_callback)(LoadProgress::PartLoading { - file: &self.path, - current_part: 0, - total_parts: 1, - }); - let alloc = !(self.use_mmap()); - let Hyperparameters { - n_vocab, - n_embd, - n_mult, - n_layer, - element_type, - .. - } = self.hyperparameters; - let n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult; - let wtype = element_type; - let ctx_size = { - // Use 64-bit math to prevent overflow. - let mut ctx_size: usize = (5 + 10 * n_layer) * 256; // object overhead - - if alloc { - let mut model_size: usize = 0; - - ctx_size += mulf!(n_embd, n_vocab, ggml::type_sizef(wtype)); // tok_embeddings - ctx_size += mulf!(n_embd, ggml::type_sizef(ggml::Type::F32)); // norm - ctx_size += mulf!(n_embd, n_vocab, ggml::type_sizef(wtype)); // output - - model_size += mulf!(n_layer, n_embd, ggml::type_sizef(ggml::Type::F32)); // attention_norm - - model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wq - model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wk - model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wv - model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wo - - model_size += mulf!(n_layer, n_embd, ggml::type_sizef(ggml::Type::F32)); // ffn_norm - - model_size += mulf!(n_layer, n_ff, n_embd, ggml::type_sizef(wtype)); // w1 - model_size += mulf!(n_layer, n_ff, n_embd, ggml::type_sizef(wtype)); // w2 - model_size += mulf!(n_layer, n_ff, n_embd, ggml::type_sizef(wtype)); // w3 - - ctx_size += model_size; - } - - (self.load_progress_callback)(LoadProgress::ContextSize { bytes: ctx_size }); - - ctx_size - }; - // Initialize the context - let context = ggml::Context::init(ctx_size, alloc); - - let mmap = if self.use_mmap() { - let file = File::open(&self.path)?; - Some(unsafe { Mmap::map(&file)? 
-        } else {
-            None
-        };
-
-        Ok(Model::new(
-            context,
-            self.hyperparameters,
-            vocabulary,
-            n_ff,
-            wtype,
-            self.container_type,
-            mmap,
-        ))
-    }
-
-    fn use_mmap(&mut self) -> bool {
+    fn mmap_active(&mut self) -> bool {
         self.prefer_mmap && self.container_type.support_mmap()
     }
 }

 /// use this to load params for llama model inside [`LoadHandler::load_hyper_parameters`]
-fn load_hyperparameters<T, R: BufRead>(
+fn load_hyperparameters<R: BufRead>(
     reader: &mut R,
     n_ctx: usize,
-) -> Result<(Hyperparameters, PartialHyperparameters), ggml_loader::LoadError<T>> {
+) -> Result<(Hyperparameters, PartialHyperparameters), ggml_loader::LoadError<LoadError>> {
     // NOTE: Field order matters! Data is laid out in the file exactly in this order.
     let hparams = Hyperparameters {
         n_vocab: read_i32(reader)?.try_into()?,
@@ -270,7 +223,12 @@ fn load_hyperparameters(
         n_head: read_i32(reader)?.try_into()?,
         n_layer: read_i32(reader)?.try_into()?,
         n_rot: read_i32(reader)?.try_into()?,
-        element_type: decode_element_type_res(read_i32(reader)?)?,
+        file_type: {
+            let ftype = read_i32(reader)?;
+            FileType::try_from(ftype).map_err(|_| {
+                ggml_loader::LoadError::UserInterrupted(LoadError::UnsupportedFileType(ftype))
+            })?
+        },
         n_ctx,
     };
     let partial = PartialHyperparameters {

diff --git a/llama-rs/src/loader_common.rs b/llama-rs/src/loader_common.rs
index 4a219642..fe44da46 100644
--- a/llama-rs/src/loader_common.rs
+++ b/llama-rs/src/loader_common.rs
@@ -1,9 +1,75 @@
-use std::path::{Path, PathBuf};
+use std::{
+    fmt::{Display, Formatter},
+    path::{Path, PathBuf},
+};

 use thiserror::Error;

 use crate::{util::FindAllModelFilesError, Hyperparameters};

+/// How the tensors are stored in the GGML LLaMA model.
+#[derive(Debug, PartialEq, Clone, Copy, Eq, Default)]
+pub enum FileType {
+    /// All tensors are stored as f32.
+    F32,
+    #[default]
+    /// All tensors are mostly stored as `f16`, except for the 1D tensors (32-bit).
+    MostlyF16,
+    /// All tensors are mostly stored as `Q4_0`, except for the 1D tensors (32-bit).
+    MostlyQ4_0,
+    /// All tensors are mostly stored as `Q4_1`, except for the 1D tensors (32-bit).
+    MostlyQ4_1,
+    /// All tensors are mostly stored as `Q4_1`, except for the 1D tensors (32-bit)
+    /// and the `tok_embeddings.weight` (f16) and `output.weight` tensors (f16).
+    MostlyQ4_1SomeF16,
+    /// All tensors are mostly stored as `Q4_2`, except for the 1D tensors (32-bit).
+    MostlyQ4_2,
+    /// All tensors are mostly stored as `Q4_3`, except for the 1D tensors (32-bit).
+    MostlyQ4_3,
+}
+impl From<FileType> for i32 {
+    fn from(value: FileType) -> Self {
+        match value {
+            FileType::F32 => 0,
+            FileType::MostlyF16 => 1,
+            FileType::MostlyQ4_0 => 2,
+            FileType::MostlyQ4_1 => 3,
+            FileType::MostlyQ4_1SomeF16 => 4,
+            FileType::MostlyQ4_2 => 5,
+            FileType::MostlyQ4_3 => 6,
+        }
+    }
+}
+impl TryFrom<i32> for FileType {
+    type Error = ();
+
+    fn try_from(value: i32) -> Result<Self, Self::Error> {
+        match value {
+            0 => Ok(FileType::F32),
+            1 => Ok(FileType::MostlyF16),
+            2 => Ok(FileType::MostlyQ4_0),
+            3 => Ok(FileType::MostlyQ4_1),
+            4 => Ok(FileType::MostlyQ4_1SomeF16),
+            5 => Ok(FileType::MostlyQ4_2),
+            6 => Ok(FileType::MostlyQ4_3),
+            _ => Err(()),
+        }
+    }
+}
+impl Display for FileType {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        match self {
+            FileType::F32 => write!(f, "f32"),
+            FileType::MostlyF16 => write!(f, "f16"),
+            FileType::MostlyQ4_0 => write!(f, "q4_0"),
+            FileType::MostlyQ4_1 => write!(f, "q4_1"),
+            FileType::MostlyQ4_1SomeF16 => write!(f, "q4_1_with_f16"),
+            FileType::MostlyQ4_2 => write!(f, "q4_2"),
+            FileType::MostlyQ4_3 => write!(f, "q4_3"),
+        }
+    }
+}
+
 /// Each variant represents a step within the process of loading the model.
 /// These can be used to report progress to the user.
 #[derive(Clone, PartialEq, Eq, Debug)]
@@ -79,8 +145,8 @@ pub enum LoadError {
     /// One of the integers encountered could not be converted to a more appropriate type.
     InvalidIntegerConversion(#[from] std::num::TryFromIntError),
     #[error("unsupported f16_: {0}")]
-    /// One of the integers encountered could not be converted to a more appropriate type.
-    UnsupportedElementType(i32),
+    /// The `f16_` hyperparameter had an invalid value.
+    UnsupportedFileType(i32),
     #[error("invalid magic number for {path:?}")]
     /// An invalid magic number was encountered during the loading process.
     InvalidMagic {
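A usage sketch for the new `FileType` conversions (hypothetical test; assumes `llama_rs::FileType` is in scope):

    use llama_rs::FileType;

    fn main() {
        // The i32 values mirror the `f16_` field written into model headers.
        assert_eq!(i32::from(FileType::MostlyQ4_2), 5);
        assert_eq!(FileType::try_from(3).ok(), Some(FileType::MostlyQ4_1));
        assert_eq!(FileType::MostlyF16.to_string(), "f16");
        // Unknown values surface as LoadError::UnsupportedFileType at load time.
        assert!(FileType::try_from(42).is_err());
    }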
diff --git a/llama-rs/src/model.rs b/llama-rs/src/model.rs
index 6cd64dc1..635ffaec 100644
--- a/llama-rs/src/model.rs
+++ b/llama-rs/src/model.rs
@@ -1,12 +1,18 @@
-use std::{collections::HashMap, path::Path};
+use std::{
+    collections::HashMap,
+    fs::File,
+    io::{Read, Seek, SeekFrom},
+    path::{Path, PathBuf},
+};

 use crate::{
-    loader, loader2, vocabulary::TokenId, EvaluateOutputRequest, InferenceParameters,
-    InferenceSession, InferenceSessionParameters, LoadError, LoadProgress, Vocabulary,
+    loader, loader2, loader_common::FileType, vocabulary::TokenId, EvaluateOutputRequest,
+    InferenceParameters, InferenceSession, InferenceSessionParameters, LoadError, LoadProgress,
+    Vocabulary,
 };
 use memmap2::Mmap;

-use ggml_loader::ContainerType;
+use ggml_loader::TensorInfo;

 /// The weights for the LLaMA model. All the mutable state is split into a
 /// separate struct `InferenceSession`.
@@ -25,21 +31,18 @@ pub struct Model {
     tensors: HashMap<String, ggml::Tensor>,

     /// Needs to kept alive while the model is alive
-    pub(crate) mmap: Option<Mmap>,
-
-    _version: ContainerType,
+    _mmap: Option<Mmap>,

     // Must be kept alive for the model
     _context: ggml::Context,
 }

 impl Model {
-    pub(crate) fn new(
+    pub(crate) fn new_loader1(
         context: ggml::Context,
         hparams: Hyperparameters,
         vocabulary: Vocabulary,
         n_ff: usize,
         wtype: ggml::Type,
-        container_type: ContainerType,
         mmap: Option<Mmap>,
     ) -> Model {
         let n_embd = hparams.n_embd;
@@ -110,9 +113,151 @@ impl Model {
             layers,
             tensors,
             _context: context,
-            mmap,
-            _version: container_type,
+            _mmap: mmap,
+        }
+    }
+
+    pub(crate) fn new_loader2(
+        context: ggml::Context,
+        hyperparameters: Hyperparameters,
+        vocabulary: Vocabulary,
+        n_ff: usize,
+        path: PathBuf,
+        file: &mut File,
+        tensors: &HashMap<String, TensorInfo>,
+        mmap: Option<Mmap>,
+        progress_callback: impl FnMut(usize),
+    ) -> Result<Model, LoadError> {
+        let n_embd = hyperparameters.n_embd;
+        let n_layer = hyperparameters.n_layer;
+        let n_vocab = hyperparameters.n_vocab;
+
+        struct TensorLoader<'a, F: FnMut(usize)> {
+            // Input
+            path: PathBuf,
+            file: &'a mut File,
+            tensors: &'a HashMap<String, TensorInfo>,
+            context: &'a ggml::Context,
+            mmap_ptr: Option<*const u8>,
+            progress_callback: F,
+
+            // Output
+            loaded_tensors: HashMap<String, ggml::Tensor>,
+        }
+        impl<F: FnMut(usize)> TensorLoader<'_, F> {
+            fn load(&mut self, name: &str, ne: &[usize]) -> Result<ggml::Tensor, LoadError> {
+                let info = self
+                    .tensors
+                    .get(name)
+                    .ok_or_else(|| LoadError::UnknownTensor {
+                        path: self.path.clone(),
+                        tensor_name: name.to_owned(),
+                    })?;
+
+                let ctx = self.context;
+                let mut tensor = match ne.len() {
+                    1 => ctx.new_tensor_1d(info.element_type, ne[0]),
+                    2 => ctx.new_tensor_2d(info.element_type, ne[0], ne[1]),
+                    3 => ctx.new_tensor_3d(info.element_type, ne[0], ne[1], ne[2]),
+                    _ => {
+                        return Err(LoadError::InvariantBroken {
+                            path: self.path.clone(),
+                            invariant: format!(
+                                "the tensor {name} had an unsupported dimension count: {ne:?}"
+                            ),
+                        })
+                    }
+                };
+
+                match self.mmap_ptr {
+                    Some(mmap) => unsafe {
+                        let ptr = mmap.offset(info.start_offset as isize);
+                        tensor.set_data(ptr as *mut std::ffi::c_void);
+                    },
+                    None => {
+                        let buf: &mut [u8] = unsafe {
+                            std::slice::from_raw_parts_mut(
+                                tensor.data() as *mut u8,
+                                tensor.nbytes(),
+                            )
+                        };
+                        self.file.seek(SeekFrom::Start(info.start_offset))?;
+                        self.file.read_exact(buf)?;
+                    }
+                }
+
+                self.loaded_tensors.insert(name.to_owned(), tensor.share());
+                (self.progress_callback)(self.loaded_tensors.len());
+
+                Ok(tensor)
+            }
+        }
+        let mut tl = TensorLoader {
+            path,
+            file,
+            tensors,
+            context: &context,
+            mmap_ptr: mmap.as_ref().map(|m| m.as_ptr()),
+            progress_callback,
+
+            loaded_tensors: Default::default(),
+        };
+
+        let tok_embeddings = tl.load("tok_embeddings.weight", &[n_embd, n_vocab])?;
+        let norm = tl.load("norm.weight", &[n_embd])?;
+        let output = tl.load("output.weight", &[n_embd, n_vocab])?;
+
+        let mut layers = Vec::new();
+        for i in 0..n_layer {
+            let layer = Layer {
+                attention_norm: tl.load(&format!("layers.{i}.attention_norm.weight"), &[n_embd])?,
+                wq: tl.load(
+                    &format!("layers.{i}.attention.wq.weight"),
+                    &[n_embd, n_embd],
+                )?,
+                wk: tl.load(
+                    &format!("layers.{i}.attention.wk.weight"),
+                    &[n_embd, n_embd],
+                )?,
+                wv: tl.load(
+                    &format!("layers.{i}.attention.wv.weight"),
+                    &[n_embd, n_embd],
+                )?,
+                wo: tl.load(
+                    &format!("layers.{i}.attention.wo.weight"),
+                    &[n_embd, n_embd],
+                )?,
+                ffn_norm: tl.load(&format!("layers.{i}.ffn_norm.weight"), &[n_embd])?,
+                w1: tl.load(
+                    &format!("layers.{i}.feed_forward.w1.weight"),
+                    &[n_embd, n_ff],
+                )?,
+                w2: tl.load(
+                    &format!("layers.{i}.feed_forward.w2.weight"),
+                    &[n_ff, n_embd],
+                )?,
+                w3: tl.load(
+                    &format!("layers.{i}.feed_forward.w3.weight"),
+                    &[n_embd, n_ff],
+                )?,
+            };
+
+            layers.push(layer);
         }
+
+        let tensors = tl.loaded_tensors;
+
+        Ok(Model {
+            hparams: hyperparameters,
+            vocabulary,
+            tok_embeddings,
+            norm,
+            output,
+            layers,
+            tensors,
+            _context: context,
+            _mmap: mmap,
+        })
     }

     /// Load the model from `path` with `n_context_tokens` context tokens.
@@ -180,7 +325,7 @@ impl Model {
             n_head,
             n_layer,
             n_rot,
-            element_type: _,
+            file_type: _,
         } = self.hparams;

         // For the first run, we need to guess a maximum buffer size so we can measure
@@ -472,8 +617,8 @@ pub struct Hyperparameters {
     pub n_layer: usize,
     /// n_rot
     pub n_rot: usize,
-    /// element_type
-    pub element_type: crate::ElementType,
+    /// file_type
+    pub file_type: FileType,
 }

 struct Layer {

From 5e5f3ccbef646b7f9170301d564e3f2310512e61 Mon Sep 17 00:00:00 2001
From: Philpax
Date: Mon, 24 Apr 2023 02:58:50 +0200
Subject: [PATCH 2/4] chore: ignore too many arguments

---
 llama-rs/src/model.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama-rs/src/model.rs b/llama-rs/src/model.rs
index 635ffaec..4b158f14 100644
--- a/llama-rs/src/model.rs
+++ b/llama-rs/src/model.rs
@@ -117,6 +117,7 @@ impl Model {
         }
     }

+    #[allow(clippy::too_many_arguments)]
     pub(crate) fn new_loader2(
         context: ggml::Context,
         hyperparameters: Hyperparameters,

From ecb9175ed38bfc2d962a2787f45aae914f57b4c4 Mon Sep 17 00:00:00 2001
From: Philpax
Date: Mon, 24 Apr 2023 03:09:33 +0200
Subject: [PATCH 3/4] chore: hide Model internals

---
 llama-rs/src/inference_session.rs |  4 ++--
 llama-rs/src/model.rs             | 20 ++++++++++++--------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/llama-rs/src/inference_session.rs b/llama-rs/src/inference_session.rs
index 428b9a7b..e4e9c244 100644
--- a/llama-rs/src/inference_session.rs
+++ b/llama-rs/src/inference_session.rs
@@ -68,7 +68,7 @@ impl InferenceSession {
             .map(|(_, tok)| *tok)
             .collect();

-        if self.n_past + prompt_tokens.len() >= model.hparams.n_ctx {
+        if self.n_past + prompt_tokens.len() >= model.n_ctx() {
             return Err(InferenceError::ContextFull);
         }

@@ -96,7 +96,7 @@ impl InferenceSession {
         params: &InferenceParameters,
         rng: &mut impl rand::Rng,
     ) -> Result<&'v [u8], InferenceError> {
-        if self.n_past + 1 >= model.hparams.n_ctx {
+        if self.n_past + 1 >= model.n_ctx() {
             return Err(InferenceError::ContextFull);
         }

diff --git a/llama-rs/src/model.rs b/llama-rs/src/model.rs
index 4b158f14..6be17b99 100644
--- a/llama-rs/src/model.rs
+++ b/llama-rs/src/model.rs
@@ -17,7 +17,7 @@ use ggml_loader::TensorInfo;

 /// The weights for the LLaMA model. All the mutable state is split into a
 /// separate struct `InferenceSession`.
 pub struct Model {
-    pub(crate) hparams: Hyperparameters,
+    hyperparameters: Hyperparameters,

     vocabulary: Vocabulary,

@@ -105,7 +105,7 @@ impl Model {
         }

         Model {
-            hparams,
+            hyperparameters: hparams,
             vocabulary,
             tok_embeddings,
             norm,
@@ -249,7 +249,7 @@ impl Model {
         let tensors = tl.loaded_tensors;

         Ok(Model {
-            hparams: hyperparameters,
+            hyperparameters,
             vocabulary,
             tok_embeddings,
             norm,
@@ -291,10 +291,10 @@ impl Model {
     pub fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession {
         InferenceSession::new(
             params,
-            self.hparams.n_ctx,
-            self.hparams.n_layer,
-            self.hparams.n_embd,
-            self.hparams.n_vocab,
+            self.hyperparameters.n_ctx,
+            self.hyperparameters.n_layer,
+            self.hyperparameters.n_embd,
+            self.hyperparameters.n_vocab,
         )
     }

@@ -327,7 +327,7 @@ impl Model {
             n_layer,
             n_rot,
             file_type: _,
-        } = self.hparams;
+        } = self.hyperparameters;

         // For the first run, we need to guess a maximum buffer size so we can measure
         // the actual memory consumption of the temporary ggml context.
@@ -599,6 +599,10 @@ impl Model {
     pub(crate) fn tensors_mut(&mut self) -> &mut HashMap<String, ggml::Tensor> {
         &mut self.tensors
     }
+
+    pub(crate) fn n_ctx(&self) -> usize {
+        self.hyperparameters.n_ctx
+    }
 }

 /// The hyperparameters of the model.

From c9e5c2659fd9aa3bdde685c36096b968eaa0d97a Mon Sep 17 00:00:00 2001
From: Philpax
Date: Tue, 25 Apr 2023 03:33:08 +0200
Subject: [PATCH 4/4] refactor: decouple loading from model

---
 llama-rs/src/loader2.rs |  98 ++++++++++++++++++++++++++++++--------
 llama-rs/src/model.rs   | 103 +++++----------------------------------
 2 files changed, 91 insertions(+), 110 deletions(-)

diff --git a/llama-rs/src/loader2.rs b/llama-rs/src/loader2.rs
index 38413df0..ead8bfb3 100644
--- a/llama-rs/src/loader2.rs
+++ b/llama-rs/src/loader2.rs
@@ -5,14 +5,14 @@ use memmap2::Mmap;
 use std::{
     collections::HashMap,
     fs::File,
-    io::{BufRead, BufReader, Seek},
+    io::{BufRead, BufReader, Read, Seek},
     ops::ControlFlow,
     path::{Path, PathBuf},
 };

 use crate::{
-    loader_common::FileType, util, Hyperparameters, LoadError, LoadProgress, Model, TokenId,
-    Vocabulary,
+    loader_common::FileType, model::TensorLoader, util, Hyperparameters, LoadError, LoadProgress,
+    Model, TokenId, Vocabulary,
 };

 impl LoadError {
@@ -48,7 +48,7 @@ pub(crate) fn load(
         return Err(LoadError::MultipartNotSupported { paths });
     }

-    let mut file = File::open(main_path).map_err(|e| LoadError::OpenFileFailed {
+    let file = File::open(main_path).map_err(|e| LoadError::OpenFileFailed {
         source: e,
         path: main_path.to_owned(),
     })?;
@@ -102,28 +102,86 @@ pub(crate) fn load(
         None
     };

+    struct TensorLoader2<'a> {
+        path: PathBuf,
+        file: File,
+        tensors: HashMap<String, TensorInfo>,
+        context: ggml::Context,
+        mmap: Option<Mmap>,
+        load_progress_callback: &'a mut dyn FnMut(LoadProgress),
+        loaded_tensors: HashMap<String, ggml::Tensor>,
+    }
+    impl TensorLoader<LoadError> for TensorLoader2<'_> {
+        fn load(&mut self, name: &str, ne: &[usize]) -> Result<ggml::Tensor, LoadError> {
+            let info = self
+                .tensors
+                .get(name)
+                .ok_or_else(|| LoadError::UnknownTensor {
+                    path: self.path.clone(),
+                    tensor_name: name.to_owned(),
+                })?;
+
+            let ctx = &self.context;
+            let mut tensor = match ne.len() {
+                1 => ctx.new_tensor_1d(info.element_type, ne[0]),
+                2 => ctx.new_tensor_2d(info.element_type, ne[0], ne[1]),
+                3 => ctx.new_tensor_3d(info.element_type, ne[0], ne[1], ne[2]),
+                _ => {
+                    return Err(LoadError::InvariantBroken {
+                        path: self.path.clone(),
+                        invariant: format!(
+                            "the tensor {name} had an unsupported dimension count: {ne:?}"
+                        ),
+                    })
+                }
+            };
+
+            match self.mmap.as_ref() {
+                Some(mmap) => unsafe {
+                    let ptr = mmap.as_ptr().offset(info.start_offset as isize);
+                    tensor.set_data(ptr as *mut std::ffi::c_void);
+                },
+                None => {
+                    let buf: &mut [u8] = unsafe {
+                        std::slice::from_raw_parts_mut(tensor.data() as *mut u8, tensor.nbytes())
+                    };
+                    self.file.seek(SeekFrom::Start(info.start_offset))?;
+                    self.file.read_exact(buf)?;
+                }
+            }
+
+            self.loaded_tensors.insert(name.to_owned(), tensor.share());
+            (self.load_progress_callback)(LoadProgress::PartTensorLoaded {
+                file: &self.path,
+                current_tensor: self.loaded_tensors.len(),
+                tensor_count: self.tensors.len(),
+            });
+
+            Ok(tensor)
+        }
+
+        fn finish(self) -> (ggml::Context, HashMap<String, ggml::Tensor>, Option<Mmap>) {
+            (self.context, self.loaded_tensors, self.mmap)
+        }
+    }
+
+    let tensors_len = tensors.len();
+    let tl = TensorLoader2 {
+        path: path.clone(),
+        file,
+        tensors,
         context,
-        hyperparameters,
-        vocabulary,
-        n_ff,
-        path.clone(),
-        &mut file,
-        &tensors,
         mmap,
-        |tensor_index| {
-            (load_progress_callback)(LoadProgress::PartTensorLoaded {
-                file: &path,
-                current_tensor: tensor_index,
-                tensor_count: tensors.len(),
-            });
-        },
-    )?;
+        load_progress_callback: &mut load_progress_callback,
+        loaded_tensors: Default::default(),
+    };
+
+    let model = Model::new_loader2(hyperparameters, vocabulary, n_ff, tl)?;

     (load_progress_callback)(LoadProgress::PartLoaded {
         file: &path,
         byte_size: 0,
-        tensor_count: tensors.len(),
+        tensor_count: tensors_len,
     });

     Ok(model)
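With loading now behind the `model::TensorLoader` trait, `Model::new_loader2` no longer cares where tensor data comes from. As an illustration of the decoupling, a hypothetical second implementor (names assumed; types as imported at the top of loader2.rs above) that allocates tensors but never touches a file, e.g. for tests:

    // Sketch: a TensorLoader that leaves every tensor zero-initialized.
    struct ZeroTensorLoader {
        context: ggml::Context,
        infos: HashMap<String, TensorInfo>,
        loaded: HashMap<String, ggml::Tensor>,
    }
    impl TensorLoader<LoadError> for ZeroTensorLoader {
        fn load(&mut self, name: &str, ne: &[usize]) -> Result<ggml::Tensor, LoadError> {
            let info = self
                .infos
                .get(name)
                .ok_or_else(|| LoadError::UnknownTensor {
                    path: PathBuf::new(),
                    tensor_name: name.to_owned(),
                })?;
            let tensor = match ne.len() {
                1 => self.context.new_tensor_1d(info.element_type, ne[0]),
                2 => self.context.new_tensor_2d(info.element_type, ne[0], ne[1]),
                _ => unimplemented!("LLaMA's graph only needs 1D/2D weights here"),
            };
            self.loaded.insert(name.to_owned(), tensor.share());
            Ok(tensor)
        }
        fn finish(self) -> (ggml::Context, HashMap<String, ggml::Tensor>, Option<Mmap>) {
            (self.context, self.loaded, None)
        }
    }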
diff --git a/llama-rs/src/model.rs b/llama-rs/src/model.rs
index 6be17b99..13488a2d 100644
--- a/llama-rs/src/model.rs
+++ b/llama-rs/src/model.rs
@@ -1,9 +1,4 @@
-use std::{
-    collections::HashMap,
-    fs::File,
-    io::{Read, Seek, SeekFrom},
-    path::{Path, PathBuf},
-};
+use std::{collections::HashMap, error::Error, path::Path};

 use crate::{
     loader, loader2, loader_common::FileType, vocabulary::TokenId, EvaluateOutputRequest,
     InferenceParameters, InferenceSession, InferenceSessionParameters, LoadError, LoadProgress,
     Vocabulary,
 };
 use memmap2::Mmap;

-use ggml_loader::TensorInfo;
-
 /// The weights for the LLaMA model. All the mutable state is split into a
 /// separate struct `InferenceSession`.
 pub struct Model {
@@ -117,92 +110,17 @@ impl Model {
         }
     }

-    #[allow(clippy::too_many_arguments)]
-    pub(crate) fn new_loader2(
-        context: ggml::Context,
+    pub(crate) fn new_loader2(
         hyperparameters: Hyperparameters,
         vocabulary: Vocabulary,
         n_ff: usize,
-        path: PathBuf,
-        file: &mut File,
-        tensors: &HashMap<String, TensorInfo>,
-        mmap: Option<Mmap>,
-        progress_callback: impl FnMut(usize),
-    ) -> Result<Model, LoadError> {
+        tensor_loader: impl TensorLoader<LoadError>,
+    ) -> Result<Model, LoadError> {
         let n_embd = hyperparameters.n_embd;
         let n_layer = hyperparameters.n_layer;
         let n_vocab = hyperparameters.n_vocab;

-        struct TensorLoader<'a, F: FnMut(usize)> {
-            // Input
-            path: PathBuf,
-            file: &'a mut File,
-            tensors: &'a HashMap<String, TensorInfo>,
-            context: &'a ggml::Context,
-            mmap_ptr: Option<*const u8>,
-            progress_callback: F,
-
-            // Output
-            loaded_tensors: HashMap<String, ggml::Tensor>,
-        }
-        impl<F: FnMut(usize)> TensorLoader<'_, F> {
-            fn load(&mut self, name: &str, ne: &[usize]) -> Result<ggml::Tensor, LoadError> {
-                let info = self
-                    .tensors
-                    .get(name)
-                    .ok_or_else(|| LoadError::UnknownTensor {
-                        path: self.path.clone(),
-                        tensor_name: name.to_owned(),
-                    })?;
-
-                let ctx = self.context;
-                let mut tensor = match ne.len() {
-                    1 => ctx.new_tensor_1d(info.element_type, ne[0]),
-                    2 => ctx.new_tensor_2d(info.element_type, ne[0], ne[1]),
-                    3 => ctx.new_tensor_3d(info.element_type, ne[0], ne[1], ne[2]),
-                    _ => {
-                        return Err(LoadError::InvariantBroken {
-                            path: self.path.clone(),
-                            invariant: format!(
-                                "the tensor {name} had an unsupported dimension count: {ne:?}"
-                            ),
-                        })
-                    }
-                };
-
-                match self.mmap_ptr {
-                    Some(mmap) => unsafe {
-                        let ptr = mmap.offset(info.start_offset as isize);
-                        tensor.set_data(ptr as *mut std::ffi::c_void);
-                    },
-                    None => {
-                        let buf: &mut [u8] = unsafe {
-                            std::slice::from_raw_parts_mut(
-                                tensor.data() as *mut u8,
-                                tensor.nbytes(),
-                            )
-                        };
-                        self.file.seek(SeekFrom::Start(info.start_offset))?;
-                        self.file.read_exact(buf)?;
-                    }
-                }
-
-                self.loaded_tensors.insert(name.to_owned(), tensor.share());
-                (self.progress_callback)(self.loaded_tensors.len());
-
-                Ok(tensor)
-            }
-        }
-        let mut tl = TensorLoader {
-            path,
-            file,
-            tensors,
-            context: &context,
-            mmap_ptr: mmap.as_ref().map(|m| m.as_ptr()),
-            progress_callback,
-
-            loaded_tensors: Default::default(),
-        };
+        let mut tl = tensor_loader;

         let tok_embeddings = tl.load("tok_embeddings.weight", &[n_embd, n_vocab])?;
         let norm = tl.load("norm.weight", &[n_embd])?;
@@ -246,7 +164,7 @@ impl Model {
             layers.push(layer);
         }

-        let tensors = tl.loaded_tensors;
+        let (_context, tensors, _mmap) = tl.finish();

         Ok(Model {
             hyperparameters,
@@ -256,8 +174,8 @@ impl Model {
             output,
             layers,
             tensors,
-            _context: context,
-            _mmap: mmap,
+            _context,
+            _mmap,
         })
     }

@@ -626,6 +544,11 @@ pub struct Hyperparameters {
     pub file_type: FileType,
 }

+pub(crate) trait TensorLoader<E: Error> {
+    fn load(&mut self, name: &str, ne: &[usize]) -> Result<ggml::Tensor, E>;
+    fn finish(self) -> (ggml::Context, HashMap<String, ggml::Tensor>, Option<Mmap>);
+}
+
 struct Layer {
     attention_norm: ggml::Tensor,